From c3cc4a1be02d4514988f9a8473ba5ac18db161ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Sat, 11 Oct 2025 12:09:44 -0300 Subject: [PATCH 01/65] refactor(parser): use integer parsing functions from stdlib --- .../_libs/include/pandas/parser/tokenizer.h | 1 + pandas/_libs/parsers.pyx | 6 +- pandas/_libs/src/parser/tokenizer.c | 282 ++++++++---------- 3 files changed, 136 insertions(+), 153 deletions(-) diff --git a/pandas/_libs/include/pandas/parser/tokenizer.h b/pandas/_libs/include/pandas/parser/tokenizer.h index 209f375a5bf6c..0d6bb6df3d123 100644 --- a/pandas/_libs/include/pandas/parser/tokenizer.h +++ b/pandas/_libs/include/pandas/parser/tokenizer.h @@ -17,6 +17,7 @@ See LICENSE for the license #define ERROR_NO_DIGITS 1 #define ERROR_OVERFLOW 2 #define ERROR_INVALID_CHARS 3 +#define ERROR_NO_MEMORY 4 #include diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 442891949dfd2..0c93d1c565f1c 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -149,7 +149,7 @@ cdef extern from "pandas/parser/tokenizer.h": SKIP_LINE FINISHED - enum: ERROR_OVERFLOW + enum: ERROR_OVERFLOW, ERROR_NO_MEMORY ctypedef enum BadLineHandleMethod: ERROR, @@ -1822,6 +1822,8 @@ cdef _try_uint64(parser_t *parser, int64_t col, if error == ERROR_OVERFLOW: # Can't get the word variable raise OverflowError("Overflow") + if error == ERROR_NO_MEMORY: + raise MemoryError() return None if uint64_conflict(&state): @@ -1892,6 +1894,8 @@ cdef _try_int64(parser_t *parser, int64_t col, if error == ERROR_OVERFLOW: # Can't get the word variable raise OverflowError("Overflow") + if error == ERROR_NO_MEMORY: + raise MemoryError() return None, None return result, na_count diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 61e96fc835e4d..8a3dd7986c5e8 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -23,6 +23,7 @@ GitHub. 
See Python Software Foundation License and BSD licenses for these. #include #include #include +#include #include "pandas/portable.h" #include "pandas/vendored/klib/khash.h" // for kh_int64_t, kh_destroy_int64 @@ -1834,201 +1835,178 @@ int uint64_conflict(uint_state *self) { return self->seen_uint && (self->seen_sint || self->seen_null); } +/** + * @brief Check if the character in the pointer indicates a number. + * It expects that you consumed all leading whitespace. + * + * @param p_item Pointer to verify + * @return Non-zero integer indicating that has a digit 0 otherwise. + */ +static inline int has_digit_int(const char *str) { + if (!str || *str == '\0') { + return 0; + } + + switch (*str) { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + return 1; + case '+': + case '-': + return isdigit_ascii(str[1]); + default: + return 0; + } +} + +static inline int has_only_spaces(const char *str) { + while (*str != '\0' && isspace_ascii(*str)) { + str++; + } + return *str == '\0'; +} + +/* Copy a string without `char_to_remove`. + * The returned memory should be free-d with a call to `free`. + */ +static char *copy_string_without_char(const char *str, char char_to_remove) { + size_t chars_to_copy = 0; + for (const char *src = str; *src != '\0'; src++) { + if (*src != char_to_remove) { + chars_to_copy++; + } + } + + char *start = malloc((chars_to_copy + 1) * sizeof(char)); + if (!start) { + return NULL; + } + + char *dst = start; + for (const char *src = str; *src != '\0'; src++) { + if (*src != char_to_remove) { + *dst++ = *src; + } + } + *dst = '\0'; + + return start; +} + int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, int *error, char tsep) { - const char *p = p_item; - // Skip leading spaces. - while (isspace_ascii(*p)) { - ++p; + if (!p_item || *p_item == '\0') { + *error = ERROR_NO_DIGITS; + return 0; } - // Handle sign. - const bool isneg = *p == '-' ? 
true : false; - // Handle sign. - if (isneg || (*p == '+')) { - p++; + while (isspace_ascii(*p_item)) { + ++p_item; } - // Check that there is a first digit. - if (!isdigit_ascii(*p)) { - // Error... + if (!has_digit_int(p_item)) { *error = ERROR_NO_DIGITS; return 0; } - int64_t number = 0; - if (isneg) { - // If number is greater than pre_min, at least one more digit - // can be processed without overflowing. - int dig_pre_min = -(int_min % 10); - int64_t pre_min = int_min / 10; - - // Process the digits. - char d = *p; - if (tsep != '\0') { - while (1) { - if (d == tsep) { - d = *++p; - continue; - } else if (!isdigit_ascii(d)) { - break; - } - if ((number > pre_min) || - ((number == pre_min) && (d - '0' <= dig_pre_min))) { - number = number * 10 - (d - '0'); - d = *++p; - } else { - *error = ERROR_OVERFLOW; - return 0; - } - } - } else { - while (isdigit_ascii(d)) { - if ((number > pre_min) || - ((number == pre_min) && (d - '0' <= dig_pre_min))) { - number = number * 10 - (d - '0'); - d = *++p; - } else { - *error = ERROR_OVERFLOW; - return 0; - } - } - } - } else { - // If number is less than pre_max, at least one more digit - // can be processed without overflowing. - int64_t pre_max = int_max / 10; - int dig_pre_max = int_max % 10; - - // Process the digits. 
- char d = *p; - if (tsep != '\0') { - while (1) { - if (d == tsep) { - d = *++p; - continue; - } else if (!isdigit_ascii(d)) { - break; - } - if ((number < pre_max) || - ((number == pre_max) && (d - '0' <= dig_pre_max))) { - number = number * 10 + (d - '0'); - d = *++p; - - } else { - *error = ERROR_OVERFLOW; - return 0; - } - } - } else { - while (isdigit_ascii(d)) { - if ((number < pre_max) || - ((number == pre_max) && (d - '0' <= dig_pre_max))) { - number = number * 10 + (d - '0'); - d = *++p; + char *processed_str = NULL; - } else { - *error = ERROR_OVERFLOW; - return 0; - } - } + if (tsep != '\0' && strchr(p_item, tsep) != NULL) { + processed_str = copy_string_without_char(p_item, tsep); + if (!processed_str) { + *error = ERROR_NO_MEMORY; + return 0; } + p_item = processed_str; } - // Skip trailing spaces. - while (isspace_ascii(*p)) { - ++p; - } + char *endptr = NULL; + errno = 0; + int64_t result = strtoll(p_item, &endptr, 10); - // Did we use up all the characters? - if (*p) { + if (!has_only_spaces(endptr)) { + // Check first for invalid characters because we may + // want to skip integer parsing if we find one. *error = ERROR_INVALID_CHARS; - return 0; + result = 0; + } else if (errno == ERANGE || result > int_max || result < int_min) { + *error = ERROR_OVERFLOW; + result = 0; + } else { + *error = 0; } - *error = 0; - return number; + // free processed_str that + // was either allocated due to the presence of tsep + // or is NULL + free(processed_str); + + return result; } uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max, uint64_t uint_max, int *error, char tsep) { - const char *p = p_item; - // Skip leading spaces. - while (isspace_ascii(*p)) { - ++p; + if (!p_item || *p_item == '\0') { + *error = ERROR_NO_DIGITS; + return 0; } - // Handle sign. 
- if (*p == '-') { + while (isspace_ascii(*p_item)) { + ++p_item; + } + + if (*p_item == '-') { state->seen_sint = 1; *error = 0; return 0; - } else if (*p == '+') { - p++; + } else if (*p_item == '+') { + p_item++; } // Check that there is a first digit. - if (!isdigit_ascii(*p)) { - // Error... + if (!isdigit_ascii(*p_item)) { *error = ERROR_NO_DIGITS; return 0; } - // If number is less than pre_max, at least one more digit - // can be processed without overflowing. - // - // Process the digits. - uint64_t number = 0; - const uint64_t pre_max = uint_max / 10; - const uint64_t dig_pre_max = uint_max % 10; - char d = *p; - if (tsep != '\0') { - while (1) { - if (d == tsep) { - d = *++p; - continue; - } else if (!isdigit_ascii(d)) { - break; - } - if ((number < pre_max) || - ((number == pre_max) && ((uint64_t)(d - '0') <= dig_pre_max))) { - number = number * 10 + (d - '0'); - d = *++p; + char *processed_str = NULL; - } else { - *error = ERROR_OVERFLOW; - return 0; - } - } - } else { - while (isdigit_ascii(d)) { - if ((number < pre_max) || - ((number == pre_max) && ((uint64_t)(d - '0') <= dig_pre_max))) { - number = number * 10 + (d - '0'); - d = *++p; - - } else { - *error = ERROR_OVERFLOW; - return 0; - } + if (tsep != '\0' && strchr(p_item, tsep) != NULL) { + processed_str = copy_string_without_char(p_item, tsep); + if (!processed_str) { + *error = ERROR_NO_MEMORY; + return 0; } + p_item = processed_str; } - // Skip trailing spaces. - while (isspace_ascii(*p)) { - ++p; - } + errno = 0; + char *endptr = NULL; + uint64_t result = strtoull(p_item, &endptr, 10); - // Did we use up all the characters? 
- if (*p) { + if (!has_only_spaces(endptr)) { *error = ERROR_INVALID_CHARS; - return 0; + result = 0; + } else if (errno == ERANGE || result > uint_max) { + *error = ERROR_OVERFLOW; + result = 0; + } else { + *error = 0; } - if (number > (uint64_t)int_max) { + if (result > (uint64_t)int_max) { state->seen_uint = 1; } - *error = 0; - return number; + free(processed_str); + + return result; } From 24593130034e4747ecea9291e83130ef6854ae74 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Mon, 13 Oct 2025 15:22:22 -0300 Subject: [PATCH 02/65] perf: use a local buffer to store the processed string --- .../_libs/include/pandas/parser/tokenizer.h | 1 - pandas/_libs/parsers.pyx | 6 +- pandas/_libs/src/parser/tokenizer.c | 77 ++++++++----------- 3 files changed, 33 insertions(+), 51 deletions(-) diff --git a/pandas/_libs/include/pandas/parser/tokenizer.h b/pandas/_libs/include/pandas/parser/tokenizer.h index 0d6bb6df3d123..209f375a5bf6c 100644 --- a/pandas/_libs/include/pandas/parser/tokenizer.h +++ b/pandas/_libs/include/pandas/parser/tokenizer.h @@ -17,7 +17,6 @@ See LICENSE for the license #define ERROR_NO_DIGITS 1 #define ERROR_OVERFLOW 2 #define ERROR_INVALID_CHARS 3 -#define ERROR_NO_MEMORY 4 #include diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 0c93d1c565f1c..442891949dfd2 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -149,7 +149,7 @@ cdef extern from "pandas/parser/tokenizer.h": SKIP_LINE FINISHED - enum: ERROR_OVERFLOW, ERROR_NO_MEMORY + enum: ERROR_OVERFLOW ctypedef enum BadLineHandleMethod: ERROR, @@ -1822,8 +1822,6 @@ cdef _try_uint64(parser_t *parser, int64_t col, if error == ERROR_OVERFLOW: # Can't get the word variable raise OverflowError("Overflow") - if error == ERROR_NO_MEMORY: - raise MemoryError() return None if uint64_conflict(&state): @@ -1894,8 +1892,6 @@ cdef _try_int64(parser_t *parser, int64_t col, if error == ERROR_OVERFLOW: # Can't get the word variable raise 
OverflowError("Overflow") - if error == ERROR_NO_MEMORY: - raise MemoryError() return None, None return result, na_count diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 8a3dd7986c5e8..69c1c141e09cd 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -28,6 +28,8 @@ GitHub. See Python Software Foundation License and BSD licenses for these. #include "pandas/portable.h" #include "pandas/vendored/klib/khash.h" // for kh_int64_t, kh_destroy_int64 +static const int PROCESSED_WORD_CAPACITY = 128; + void coliter_setup(coliter_t *self, parser_t *parser, int64_t i, int64_t start) { // column i, starting at 0 @@ -1874,31 +1876,23 @@ static inline int has_only_spaces(const char *str) { return *str == '\0'; } -/* Copy a string without `char_to_remove`. - * The returned memory should be free-d with a call to `free`. +/* Copy a string without `char_to_remove` into `output`, + * while ensuring it's null terminated. */ -static char *copy_string_without_char(const char *str, char char_to_remove) { - size_t chars_to_copy = 0; - for (const char *src = str; *src != '\0'; src++) { +static void copy_string_without_char(char *output, const char *str, + char char_to_remove, size_t output_size) { + size_t i = 0; + for (const char *src = str; *src != '\0' && i < output_size; src++) { if (*src != char_to_remove) { - chars_to_copy++; + output[i++] = *src; } } - - char *start = malloc((chars_to_copy + 1) * sizeof(char)); - if (!start) { - return NULL; - } - - char *dst = start; - for (const char *src = str; *src != '\0'; src++) { - if (*src != char_to_remove) { - *dst++ = *src; - } + if (i < output_size) { + output[i] = '\0'; + } else { + // str is too big, probably would overflow + errno = ERANGE; } - *dst = '\0'; - - return start; } int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, @@ -1917,19 +1911,19 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, 
return 0; } - char *processed_str = NULL; - + errno = 0; if (tsep != '\0' && strchr(p_item, tsep) != NULL) { - processed_str = copy_string_without_char(p_item, tsep); - if (!processed_str) { - *error = ERROR_NO_MEMORY; - return 0; - } - p_item = processed_str; + char buffer[PROCESSED_WORD_CAPACITY]; + copy_string_without_char(buffer, p_item, tsep, PROCESSED_WORD_CAPACITY); + p_item = buffer; + } + + if (errno == ERANGE) { + *error = ERROR_OVERFLOW; + return 0; } char *endptr = NULL; - errno = 0; int64_t result = strtoll(p_item, &endptr, 10); if (!has_only_spaces(endptr)) { @@ -1944,11 +1938,6 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, *error = 0; } - // free processed_str that - // was either allocated due to the presence of tsep - // or is NULL - free(processed_str); - return result; } @@ -1977,18 +1966,18 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max, return 0; } - char *processed_str = NULL; - + errno = 0; if (tsep != '\0' && strchr(p_item, tsep) != NULL) { - processed_str = copy_string_without_char(p_item, tsep); - if (!processed_str) { - *error = ERROR_NO_MEMORY; - return 0; - } - p_item = processed_str; + char buffer[PROCESSED_WORD_CAPACITY]; + copy_string_without_char(buffer, p_item, tsep, PROCESSED_WORD_CAPACITY); + p_item = buffer; + } + + if (errno == ERANGE) { + *error = ERROR_OVERFLOW; + return 0; } - errno = 0; char *endptr = NULL; uint64_t result = strtoull(p_item, &endptr, 10); @@ -2006,7 +1995,5 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max, state->seen_uint = 1; } - free(processed_str); - return result; } From d8a454e7d21153cf2ef46dc095d7fe18b014b71d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Mon, 13 Oct 2025 15:51:13 -0300 Subject: [PATCH 03/65] fix: use macro to fix MSVC build error --- pandas/_libs/src/parser/tokenizer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 69c1c141e09cd..08805d7146549 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -28,7 +28,7 @@ GitHub. See Python Software Foundation License and BSD licenses for these. #include "pandas/portable.h" #include "pandas/vendored/klib/khash.h" // for kh_int64_t, kh_destroy_int64 -static const int PROCESSED_WORD_CAPACITY = 128; +#define PROCESSED_WORD_CAPACITY 128 void coliter_setup(coliter_t *self, parser_t *parser, int64_t i, int64_t start) { From 87789e6483b3e9f4cd037ad7f229d0dd54d68654 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Mon, 13 Oct 2025 16:11:45 -0300 Subject: [PATCH 04/65] fix: use `bool` --- pandas/_libs/src/parser/tokenizer.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 08805d7146549..9ef3c0c5c2197 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -1844,9 +1844,9 @@ int uint64_conflict(uint_state *self) { * @param p_item Pointer to verify * @return Non-zero integer indicating that has a digit 0 otherwise. 
*/ -static inline int has_digit_int(const char *str) { +static inline bool has_digit_int(const char *str) { if (!str || *str == '\0') { - return 0; + return false; } switch (*str) { @@ -1860,16 +1860,16 @@ static inline int has_digit_int(const char *str) { case '7': case '8': case '9': - return 1; + return true; case '+': case '-': return isdigit_ascii(str[1]); default: - return 0; + return false; } } -static inline int has_only_spaces(const char *str) { +static inline bool has_only_spaces(const char *str) { while (*str != '\0' && isspace_ascii(*str)) { str++; } From 228794473ed04330ea8cf71c9b96603b9f2275ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Mon, 13 Oct 2025 16:14:40 -0300 Subject: [PATCH 05/65] refactor: don't pass PROCESSED_WORD_CAPACITY as a separate argument --- pandas/_libs/src/parser/tokenizer.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 9ef3c0c5c2197..f94e97f51db83 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -1879,15 +1879,16 @@ static inline bool has_only_spaces(const char *str) { /* Copy a string without `char_to_remove` into `output`, * while ensuring it's null terminated. 
*/ -static void copy_string_without_char(char *output, const char *str, - char char_to_remove, size_t output_size) { +static void copy_string_without_char(char output[PROCESSED_WORD_CAPACITY], + const char *str, char char_to_remove) { size_t i = 0; - for (const char *src = str; *src != '\0' && i < output_size; src++) { + for (const char *src = str; *src != '\0' && i < PROCESSED_WORD_CAPACITY; + src++) { if (*src != char_to_remove) { output[i++] = *src; } } - if (i < output_size) { + if (i < PROCESSED_WORD_CAPACITY) { output[i] = '\0'; } else { // str is too big, probably would overflow @@ -1914,7 +1915,7 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, errno = 0; if (tsep != '\0' && strchr(p_item, tsep) != NULL) { char buffer[PROCESSED_WORD_CAPACITY]; - copy_string_without_char(buffer, p_item, tsep, PROCESSED_WORD_CAPACITY); + copy_string_without_char(buffer, p_item, tsep); p_item = buffer; } @@ -1969,7 +1970,7 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max, errno = 0; if (tsep != '\0' && strchr(p_item, tsep) != NULL) { char buffer[PROCESSED_WORD_CAPACITY]; - copy_string_without_char(buffer, p_item, tsep, PROCESSED_WORD_CAPACITY); + copy_string_without_char(buffer, p_item, tsep); p_item = buffer; } From 2bea3c228d13c426f20cb9c40e4c97a8b4b7deec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Mon, 13 Oct 2025 17:24:43 -0300 Subject: [PATCH 06/65] perf: write in chunks --- pandas/_libs/src/parser/tokenizer.c | 41 ++++++++++++++++++++--------- 1 file changed, 29 insertions(+), 12 deletions(-) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index f94e97f51db83..77b97d7b421e8 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -1877,22 +1877,37 @@ static inline bool has_only_spaces(const char *str) { } /* Copy a string without `char_to_remove` into `output`, - * while ensuring it's null terminated. 
+ * it assumes that output is filled with `\0`, + * so it won't null terminate the result. */ static void copy_string_without_char(char output[PROCESSED_WORD_CAPACITY], const char *str, char char_to_remove) { - size_t i = 0; - for (const char *src = str; *src != '\0' && i < PROCESSED_WORD_CAPACITY; - src++) { - if (*src != char_to_remove) { - output[i++] = *src; + char *dst = output; + const char *src = str; + // last character is reserved for null terminator. + const char *end = output + PROCESSED_WORD_CAPACITY - 1; + + while (*src != '\0' && dst < end) { + const char *next = src; + // find EOS or char_to_remove + while (*next != '\0' && *next != char_to_remove) { + next++; } - } - if (i < PROCESSED_WORD_CAPACITY) { - output[i] = '\0'; - } else { - // str is too big, probably would overflow - errno = ERANGE; + + size_t len = next - src; + if (dst + len > end) { + // Can't write here, str is too big + errno = ERANGE; + return; + } + + // copy block + memcpy(dst, src, len); + + // go to next available location to write + dst += len; + // Move past char to remove + src = *next == char_to_remove ? 
next + 1 : next; } } @@ -1915,6 +1930,7 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, errno = 0; if (tsep != '\0' && strchr(p_item, tsep) != NULL) { char buffer[PROCESSED_WORD_CAPACITY]; + memset(buffer, '\0', sizeof(buffer)); copy_string_without_char(buffer, p_item, tsep); p_item = buffer; } @@ -1970,6 +1986,7 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max, errno = 0; if (tsep != '\0' && strchr(p_item, tsep) != NULL) { char buffer[PROCESSED_WORD_CAPACITY]; + memset(buffer, '\0', sizeof(buffer)); copy_string_without_char(buffer, p_item, tsep); p_item = buffer; } From fb386798e5b9c3848b8cb4981fbfa3f32efdccec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Mon, 13 Oct 2025 17:40:28 -0300 Subject: [PATCH 07/65] hack: try bigger buffer size for arm error --- pandas/_libs/src/parser/tokenizer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 77b97d7b421e8..cab072269fc46 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -28,7 +28,7 @@ GitHub. See Python Software Foundation License and BSD licenses for these. 
#include "pandas/portable.h" #include "pandas/vendored/klib/khash.h" // for kh_int64_t, kh_destroy_int64 -#define PROCESSED_WORD_CAPACITY 128 +#define PROCESSED_WORD_CAPACITY 256 void coliter_setup(coliter_t *self, parser_t *parser, int64_t i, int64_t start) { From f9ede5c979a73b0af78609c8036234ec9498d50f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Mon, 13 Oct 2025 20:09:06 -0300 Subject: [PATCH 08/65] fix: solution without manipulating the string --- pandas/_libs/src/parser/tokenizer.c | 119 ++++++++++++++++------------ 1 file changed, 68 insertions(+), 51 deletions(-) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index cab072269fc46..0475f1aec7938 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -21,6 +21,7 @@ GitHub. See Python Software Foundation License and BSD licenses for these. #include #include +#include #include #include #include @@ -1876,39 +1877,51 @@ static inline bool has_only_spaces(const char *str) { return *str == '\0'; } -/* Copy a string without `char_to_remove` into `output`, - * it assumes that output is filled with `\0`, - * so it won't null terminate the result. - */ -static void copy_string_without_char(char output[PROCESSED_WORD_CAPACITY], - const char *str, char char_to_remove) { - char *dst = output; - const char *src = str; - // last character is reserved for null terminator. 
- const char *end = output + PROCESSED_WORD_CAPACITY - 1; - - while (*src != '\0' && dst < end) { - const char *next = src; - // find EOS or char_to_remove - while (*next != '\0' && *next != char_to_remove) { - next++; +static inline int power_int(int base, int exponent) { + // https://en.wikipedia.org/wiki/Exponentiation_by_squaring + int result = 1; + + while (exponent > 1) { + if (exponent % 2 == 1) { + result *= base; + exponent--; } + result *= result; + exponent /= 2; + } + + return result * base; +} - size_t len = next - src; - if (dst + len > end) { - // Can't write here, str is too big +static inline int64_t add_int_check_overflow(int64_t lhs, int64_t rhs, + int64_t mul_lhs) { + // rhs will always be positive, because this function + // only executes after the first parse, hence the sign will always go to lhs. + // if lhs > 0: + // Will overflow if (mul_lhs * lhs) + rhs > INT_MAX + // iff lhs > (INT_MAX - rhs) / mul_lhs + // if lhs < 0: + // Will underflow if (mul_lhs * lhs) - rhs < INT_MIN + // iff lhs < (INT_MIN + rhs) / mul_lhs + if (lhs >= 0) { + if (lhs > (INT_MAX - rhs) / mul_lhs) { errno = ERANGE; - return; } + } else { + if (lhs < (INT_MIN + rhs) / mul_lhs) { + errno = ERANGE; + } + rhs = -rhs; + } + return lhs * mul_lhs + rhs; +} - // copy block - memcpy(dst, src, len); - - // go to next available location to write - dst += len; - // Move past char to remove - src = *next == char_to_remove ? 
next + 1 : next; +static inline uint64_t add_uint_check_overflow(uint64_t lhs, uint64_t rhs, + uint64_t mul_lhs) { + if (lhs > (UINT_MAX - rhs) / mul_lhs) { + errno = ERANGE; } + return lhs * mul_lhs + rhs; } int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, @@ -1928,21 +1941,23 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, } errno = 0; - if (tsep != '\0' && strchr(p_item, tsep) != NULL) { - char buffer[PROCESSED_WORD_CAPACITY]; - memset(buffer, '\0', sizeof(buffer)); - copy_string_without_char(buffer, p_item, tsep); - p_item = buffer; - } - - if (errno == ERANGE) { - *error = ERROR_OVERFLOW; - return 0; - } - char *endptr = NULL; int64_t result = strtoll(p_item, &endptr, 10); + while (errno == 0 && tsep != '\0' && *endptr == tsep) { + // Skip multiple consecutive tsep + while (*endptr == tsep) { + endptr++; + } + + char *new_end = NULL; + int64_t next_part = strtoll(endptr, &new_end, 10); + int digits = new_end - endptr; + int mul_result = power_int(10, digits); + result = add_int_check_overflow(result, next_part, mul_result); + endptr = new_end; + } + if (!has_only_spaces(endptr)) { // Check first for invalid characters because we may // want to skip integer parsing if we find one. 
@@ -1984,21 +1999,23 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max, } errno = 0; - if (tsep != '\0' && strchr(p_item, tsep) != NULL) { - char buffer[PROCESSED_WORD_CAPACITY]; - memset(buffer, '\0', sizeof(buffer)); - copy_string_without_char(buffer, p_item, tsep); - p_item = buffer; - } - - if (errno == ERANGE) { - *error = ERROR_OVERFLOW; - return 0; - } - char *endptr = NULL; uint64_t result = strtoull(p_item, &endptr, 10); + while (errno == 0 && tsep != '\0' && *endptr == tsep) { + // Skip multiple consecutive tsep + while (*endptr == tsep) { + endptr++; + } + + char *new_end = NULL; + uint64_t next_part = strtoull(endptr, &new_end, 10); + int digits = new_end - endptr; + int mul_result = power_int(10, digits); + result = add_uint_check_overflow(result, next_part, mul_result); + endptr = new_end; + } + if (!has_only_spaces(endptr)) { *error = ERROR_INVALID_CHARS; result = 0; From 798c263f4e6bbb5158947c41a75261ed8ffd4a83 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Mon, 13 Oct 2025 20:23:24 -0300 Subject: [PATCH 09/65] some cleanup --- pandas/_libs/src/parser/tokenizer.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 0475f1aec7938..f36607bffe5b8 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -29,8 +29,6 @@ GitHub. See Python Software Foundation License and BSD licenses for these. 
#include "pandas/portable.h" #include "pandas/vendored/klib/khash.h" // for kh_int64_t, kh_destroy_int64 -#define PROCESSED_WORD_CAPACITY 256 - void coliter_setup(coliter_t *self, parser_t *parser, int64_t i, int64_t start) { // column i, starting at 0 @@ -1877,7 +1875,7 @@ static inline bool has_only_spaces(const char *str) { return *str == '\0'; } -static inline int power_int(int base, int exponent) { +static int power_int(int base, int exponent) { // https://en.wikipedia.org/wiki/Exponentiation_by_squaring int result = 1; @@ -1953,7 +1951,7 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, char *new_end = NULL; int64_t next_part = strtoll(endptr, &new_end, 10); int digits = new_end - endptr; - int mul_result = power_int(10, digits); + int64_t mul_result = power_int(10, digits); result = add_int_check_overflow(result, next_part, mul_result); endptr = new_end; } @@ -2011,7 +2009,7 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max, char *new_end = NULL; uint64_t next_part = strtoull(endptr, &new_end, 10); int digits = new_end - endptr; - int mul_result = power_int(10, digits); + uint64_t mul_result = power_int(10, digits); result = add_uint_check_overflow(result, next_part, mul_result); endptr = new_end; } From 2f06f192e70e28200499160e4e58c242f9eb2476 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Mon, 13 Oct 2025 20:33:44 -0300 Subject: [PATCH 10/65] fix: use ptrdiff_t to fix MSVC build error --- pandas/_libs/src/parser/tokenizer.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index f36607bffe5b8..52ade83592702 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -24,6 +24,7 @@ GitHub. See Python Software Foundation License and BSD licenses for these. 
#include #include #include +#include #include #include "pandas/portable.h" @@ -1950,8 +1951,8 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, char *new_end = NULL; int64_t next_part = strtoll(endptr, &new_end, 10); - int digits = new_end - endptr; - int64_t mul_result = power_int(10, digits); + ptrdiff_t digits = new_end - endptr; + int64_t mul_result = power_int(10, (int)digits); result = add_int_check_overflow(result, next_part, mul_result); endptr = new_end; } @@ -2008,8 +2009,8 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max, char *new_end = NULL; uint64_t next_part = strtoull(endptr, &new_end, 10); - int digits = new_end - endptr; - uint64_t mul_result = power_int(10, digits); + ptrdiff_t digits = new_end - endptr; + uint64_t mul_result = power_int(10, (int)digits); result = add_uint_check_overflow(result, next_part, mul_result); endptr = new_end; } From 280b55e22b24b47109e1d90a2125c9dc22e81057 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Mon, 13 Oct 2025 22:15:18 -0300 Subject: [PATCH 11/65] add other exponent cases for completion --- pandas/_libs/src/parser/tokenizer.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 52ade83592702..c4409bbf0803f 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -1878,6 +1878,12 @@ static inline bool has_only_spaces(const char *str) { static int power_int(int base, int exponent) { // https://en.wikipedia.org/wiki/Exponentiation_by_squaring + if (exponent == 0) { + return 1; + } else if (exponent < 0) { + return 0; + } + int result = 1; while (exponent > 1) { From d85aaf02bd85f12459ca7dd0039f4a9a1ee468b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Tue, 14 Oct 2025 09:29:54 -0300 Subject: [PATCH 12/65] fix: use builtin overflow check verification --- pandas/_libs/src/parser/tokenizer.c | 
59 +++++++++++++---------------- 1 file changed, 26 insertions(+), 33 deletions(-) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index c4409bbf0803f..8d3608ee81afa 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -24,6 +24,7 @@ GitHub. See Python Software Foundation License and BSD licenses for these. #include #include #include +#include #include #include @@ -1898,37 +1899,6 @@ static int power_int(int base, int exponent) { return result * base; } -static inline int64_t add_int_check_overflow(int64_t lhs, int64_t rhs, - int64_t mul_lhs) { - // rhs will always be positive, because this function - // only executes after the first parse, hence the sign will always go to lhs. - // if lhs > 0: - // Will overflow if (mul_lhs * lhs) + rhs > INT_MAX - // iff lhs > (INT_MAX - rhs) / mul_lhs - // if lhs < 0: - // Will underflow if (mul_lhs * lhs) - rhs < INT_MIN - // iff lhs < (INT_MIN + rhs) / mul_lhs - if (lhs >= 0) { - if (lhs > (INT_MAX - rhs) / mul_lhs) { - errno = ERANGE; - } - } else { - if (lhs < (INT_MIN + rhs) / mul_lhs) { - errno = ERANGE; - } - rhs = -rhs; - } - return lhs * mul_lhs + rhs; -} - -static inline uint64_t add_uint_check_overflow(uint64_t lhs, uint64_t rhs, - uint64_t mul_lhs) { - if (lhs > (UINT_MAX - rhs) / mul_lhs) { - errno = ERANGE; - } - return lhs * mul_lhs + rhs; -} - int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, int *error, char tsep) { if (!p_item || *p_item == '\0') { @@ -1948,6 +1918,7 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, errno = 0; char *endptr = NULL; int64_t result = strtoll(p_item, &endptr, 10); + bool is_negative = result < 0; while (errno == 0 && tsep != '\0' && *endptr == tsep) { // Skip multiple consecutive tsep @@ -1957,9 +1928,22 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, char *new_end = NULL; int64_t next_part = strtoll(endptr, &new_end, 
10); + if (is_negative) { + next_part = -next_part; + } + ptrdiff_t digits = new_end - endptr; int64_t mul_result = power_int(10, (int)digits); - result = add_int_check_overflow(result, next_part, mul_result); + // result * mul_result + if (ckd_mul(&result, result, mul_result)) { + // overflow + errno = ERANGE; + } + // result + next_part + if (ckd_add(&result, result, next_part)) { + // overflow + errno = ERANGE; + } endptr = new_end; } @@ -2017,7 +2001,16 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max, uint64_t next_part = strtoull(endptr, &new_end, 10); ptrdiff_t digits = new_end - endptr; uint64_t mul_result = power_int(10, (int)digits); - result = add_uint_check_overflow(result, next_part, mul_result); + // result * mul_result + if (ckd_mul(&result, result, mul_result)) { + // overflow + errno = ERANGE; + } + // result + next_part + if (ckd_add(&result, result, next_part)) { + // overflow + errno = ERANGE; + } endptr = new_end; } From 9046eccb8e49c945ac9f6184fa8a9a54944372e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Tue, 14 Oct 2025 09:34:47 -0300 Subject: [PATCH 13/65] fix: change std to c2x --- meson.build | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/meson.build b/meson.build index 6a00e52481108..c774be30d8562 100644 --- a/meson.build +++ b/meson.build @@ -7,7 +7,7 @@ project( version: run_command(['generate_version.py', '--print'], check: true).stdout().strip(), license: 'BSD-3', meson_version: '>=1.2.1', - default_options: ['buildtype=release', 'c_std=c11', 'warning_level=2'], + default_options: ['buildtype=release', 'c_std=c2x', 'warning_level=2'], ) fs = import('fs') From a4e2fb8ec720e8bb363a5832611cd02cd2272e2b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Tue, 14 Oct 2025 09:45:59 -0300 Subject: [PATCH 14/65] Revert previous commits Revert "fix: change std to c2x" This reverts commit 9046eccb8e49c945ac9f6184fa8a9a54944372e7. 
Revert "fix: use builtin overflow check verification" This reverts commit d85aaf02bd85f12459ca7dd0039f4a9a1ee468b5. --- meson.build | 2 +- pandas/_libs/src/parser/tokenizer.c | 59 ++++++++++++++++------------- 2 files changed, 34 insertions(+), 27 deletions(-) diff --git a/meson.build b/meson.build index c774be30d8562..6a00e52481108 100644 --- a/meson.build +++ b/meson.build @@ -7,7 +7,7 @@ project( version: run_command(['generate_version.py', '--print'], check: true).stdout().strip(), license: 'BSD-3', meson_version: '>=1.2.1', - default_options: ['buildtype=release', 'c_std=c2x', 'warning_level=2'], + default_options: ['buildtype=release', 'c_std=c11', 'warning_level=2'], ) fs = import('fs') diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 8d3608ee81afa..c4409bbf0803f 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -24,7 +24,6 @@ GitHub. See Python Software Foundation License and BSD licenses for these. #include #include #include -#include #include #include @@ -1899,6 +1898,37 @@ static int power_int(int base, int exponent) { return result * base; } +static inline int64_t add_int_check_overflow(int64_t lhs, int64_t rhs, + int64_t mul_lhs) { + // rhs will always be positive, because this function + // only executes after the first parse, hence the sign will always go to lhs. 
+ // if lhs > 0: + // Will overflow if (mul_lhs * lhs) + rhs > INT_MAX + // iff lhs > (INT_MAX - rhs) / mul_lhs + // if lhs < 0: + // Will underflow if (mul_lhs * lhs) - rhs < INT_MIN + // iff lhs < (INT_MIN + rhs) / mul_lhs + if (lhs >= 0) { + if (lhs > (INT_MAX - rhs) / mul_lhs) { + errno = ERANGE; + } + } else { + if (lhs < (INT_MIN + rhs) / mul_lhs) { + errno = ERANGE; + } + rhs = -rhs; + } + return lhs * mul_lhs + rhs; +} + +static inline uint64_t add_uint_check_overflow(uint64_t lhs, uint64_t rhs, + uint64_t mul_lhs) { + if (lhs > (UINT_MAX - rhs) / mul_lhs) { + errno = ERANGE; + } + return lhs * mul_lhs + rhs; +} + int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, int *error, char tsep) { if (!p_item || *p_item == '\0') { @@ -1918,7 +1948,6 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, errno = 0; char *endptr = NULL; int64_t result = strtoll(p_item, &endptr, 10); - bool is_negative = result < 0; while (errno == 0 && tsep != '\0' && *endptr == tsep) { // Skip multiple consecutive tsep @@ -1928,22 +1957,9 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, char *new_end = NULL; int64_t next_part = strtoll(endptr, &new_end, 10); - if (is_negative) { - next_part = -next_part; - } - ptrdiff_t digits = new_end - endptr; int64_t mul_result = power_int(10, (int)digits); - // result * mul_result - if (ckd_mul(&result, result, mul_result)) { - // overflow - errno = ERANGE; - } - // result + next_part - if (ckd_add(&result, result, next_part)) { - // overflow - errno = ERANGE; - } + result = add_int_check_overflow(result, next_part, mul_result); endptr = new_end; } @@ -2001,16 +2017,7 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max, uint64_t next_part = strtoull(endptr, &new_end, 10); ptrdiff_t digits = new_end - endptr; uint64_t mul_result = power_int(10, (int)digits); - // result * mul_result - if (ckd_mul(&result, result, mul_result)) { - // overflow - 
errno = ERANGE; - } - // result + next_part - if (ckd_add(&result, result, next_part)) { - // overflow - errno = ERANGE; - } + result = add_uint_check_overflow(result, next_part, mul_result); endptr = new_end; } From ef82cf4b218aa7df99ac277f88b14a94ad596a87 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Tue, 14 Oct 2025 09:58:39 -0300 Subject: [PATCH 15/65] refactor: move overflow check to header --- .../vendored/numpy/datetime/np_datetime.h | 29 +++++++++++++++++++ .../src/vendored/numpy/datetime/np_datetime.c | 29 ------------------- 2 files changed, 29 insertions(+), 29 deletions(-) diff --git a/pandas/_libs/include/pandas/vendored/numpy/datetime/np_datetime.h b/pandas/_libs/include/pandas/vendored/numpy/datetime/np_datetime.h index e4e90a7ea24cf..6ae24bf58cf24 100644 --- a/pandas/_libs/include/pandas/vendored/numpy/datetime/np_datetime.h +++ b/pandas/_libs/include/pandas/vendored/numpy/datetime/np_datetime.h @@ -20,6 +20,35 @@ This file is derived from NumPy 1.7. 
See NUMPY_LICENSE.txt #define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION #endif // NPY_NO_DEPRECATED_API +#if defined(_WIN32) +#ifndef ENABLE_INTSAFE_SIGNED_FUNCTIONS +#define ENABLE_INTSAFE_SIGNED_FUNCTIONS +#endif +#include +#define checked_int64_add(a, b, res) LongLongAdd(a, b, res) +#define checked_int64_sub(a, b, res) LongLongSub(a, b, res) +#define checked_int64_mul(a, b, res) LongLongMult(a, b, res) +#else +#if defined __has_builtin +#if __has_builtin(__builtin_add_overflow) +#define checked_int64_add(a, b, res) __builtin_add_overflow(a, b, res) +#define checked_int64_sub(a, b, res) __builtin_sub_overflow(a, b, res) +#define checked_int64_mul(a, b, res) __builtin_mul_overflow(a, b, res) +#else +_Static_assert(0, + "Overflow checking not detected; please try a newer compiler"); +#endif +// __has_builtin was added in gcc 10, but our muslinux_1_1 build environment +// only has gcc-9.3, so fall back to __GNUC__ macro as long as we have that +#elif __GNUC__ > 7 +#define checked_int64_add(a, b, res) __builtin_add_overflow(a, b, res) +#define checked_int64_sub(a, b, res) __builtin_sub_overflow(a, b, res) +#define checked_int64_mul(a, b, res) __builtin_mul_overflow(a, b, res) +#else +_Static_assert(0, "__has_builtin not detected; please try a newer compiler"); +#endif +#endif + #include typedef struct { diff --git a/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c b/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c index 9a022095feee9..eab58e915e247 100644 --- a/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c +++ b/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c @@ -27,35 +27,6 @@ This file is derived from NumPy 1.7. 
See NUMPY_LICENSE.txt #include #include -#if defined(_WIN32) -#ifndef ENABLE_INTSAFE_SIGNED_FUNCTIONS -#define ENABLE_INTSAFE_SIGNED_FUNCTIONS -#endif -#include -#define checked_int64_add(a, b, res) LongLongAdd(a, b, res) -#define checked_int64_sub(a, b, res) LongLongSub(a, b, res) -#define checked_int64_mul(a, b, res) LongLongMult(a, b, res) -#else -#if defined __has_builtin -#if __has_builtin(__builtin_add_overflow) -#define checked_int64_add(a, b, res) __builtin_add_overflow(a, b, res) -#define checked_int64_sub(a, b, res) __builtin_sub_overflow(a, b, res) -#define checked_int64_mul(a, b, res) __builtin_mul_overflow(a, b, res) -#else -_Static_assert(0, - "Overflow checking not detected; please try a newer compiler"); -#endif -// __has_builtin was added in gcc 10, but our muslinux_1_1 build environment -// only has gcc-9.3, so fall back to __GNUC__ macro as long as we have that -#elif __GNUC__ > 7 -#define checked_int64_add(a, b, res) __builtin_add_overflow(a, b, res) -#define checked_int64_sub(a, b, res) __builtin_sub_overflow(a, b, res) -#define checked_int64_mul(a, b, res) __builtin_mul_overflow(a, b, res) -#else -_Static_assert(0, "__has_builtin not detected; please try a newer compiler"); -#endif -#endif - #define XSTR(a) STR(a) #define STR(a) #a From 5afeb1101709bdd5d66678d439bbde7ade7a70f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Tue, 14 Oct 2025 09:59:02 -0300 Subject: [PATCH 16/65] refactor: use overflow check from numpy --- pandas/_libs/src/parser/tokenizer.c | 35 +++++++++-------------------- 1 file changed, 11 insertions(+), 24 deletions(-) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index c4409bbf0803f..4dac197596af5 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -18,6 +18,7 @@ GitHub. See Python Software Foundation License and BSD licenses for these. 
*/ #include "pandas/parser/tokenizer.h" #include "pandas/portable.h" +#include "pandas/vendored/numpy/datetime/np_datetime.h" #include #include @@ -1898,29 +1899,6 @@ static int power_int(int base, int exponent) { return result * base; } -static inline int64_t add_int_check_overflow(int64_t lhs, int64_t rhs, - int64_t mul_lhs) { - // rhs will always be positive, because this function - // only executes after the first parse, hence the sign will always go to lhs. - // if lhs > 0: - // Will overflow if (mul_lhs * lhs) + rhs > INT_MAX - // iff lhs > (INT_MAX - rhs) / mul_lhs - // if lhs < 0: - // Will underflow if (mul_lhs * lhs) - rhs < INT_MIN - // iff lhs < (INT_MIN + rhs) / mul_lhs - if (lhs >= 0) { - if (lhs > (INT_MAX - rhs) / mul_lhs) { - errno = ERANGE; - } - } else { - if (lhs < (INT_MIN + rhs) / mul_lhs) { - errno = ERANGE; - } - rhs = -rhs; - } - return lhs * mul_lhs + rhs; -} - static inline uint64_t add_uint_check_overflow(uint64_t lhs, uint64_t rhs, uint64_t mul_lhs) { if (lhs > (UINT_MAX - rhs) / mul_lhs) { @@ -1959,7 +1937,16 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, int64_t next_part = strtoll(endptr, &new_end, 10); ptrdiff_t digits = new_end - endptr; int64_t mul_result = power_int(10, (int)digits); - result = add_int_check_overflow(result, next_part, mul_result); + // result * mul_result + if (checked_int64_mul(result, mul_result, &result)) { + // overflow + errno = ERANGE; + } + // result + next_part + if (checked_int64_add(result, next_part, &result)) { + // overflow + errno = ERANGE; + } endptr = new_end; } From 0ef47a7b3acdc582a1ee507cab7b035b5d625e5c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Tue, 14 Oct 2025 10:30:37 -0300 Subject: [PATCH 17/65] fix: handle negative check --- pandas/_libs/src/parser/tokenizer.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 4dac197596af5..2d7fb19aa8800 100644 --- 
a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -1926,6 +1926,7 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, errno = 0; char *endptr = NULL; int64_t result = strtoll(p_item, &endptr, 10); + bool is_negative = result < 0; while (errno == 0 && tsep != '\0' && *endptr == tsep) { // Skip multiple consecutive tsep @@ -1935,6 +1936,10 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, char *new_end = NULL; int64_t next_part = strtoll(endptr, &new_end, 10); + if (is_negative) { + next_part = -next_part; + } + ptrdiff_t digits = new_end - endptr; int64_t mul_result = power_int(10, (int)digits); // result * mul_result From c840ef01037809aba092556e08a8211113c0ac53 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Tue, 14 Oct 2025 10:34:39 -0300 Subject: [PATCH 18/65] fix: add test for thousand separator with negative number --- pandas/tests/io/parser/common/test_common_basic.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py index 3680273f5e98a..487520b7a9359 100644 --- a/pandas/tests/io/parser/common/test_common_basic.py +++ b/pandas/tests/io/parser/common/test_common_basic.py @@ -72,13 +72,16 @@ def test_read_csv_local(all_parsers, csv1): tm.assert_frame_equal(result, expected) -def test_1000_sep(all_parsers): +@pytest.mark.parametrize( + "number_csv, expected_number", [("2,334", 2334), ("-2,334", -2334)] +) +def test_1000_sep(all_parsers, number_csv, expected_number): parser = all_parsers - data = """A|B|C -1|2,334|5 + data = f"""A|B|C +1|{number_csv}|5 10|13|10. 
""" - expected = DataFrame({"A": [1, 10], "B": [2334, 13], "C": [5, 10.0]}) + expected = DataFrame({"A": [1, 10], "B": [expected_number, 13], "C": [5, 10.0]}) if parser.engine == "pyarrow": msg = "The 'thousands' option is not supported with the 'pyarrow' engine" From 132342b1f73b2956ffb9ab59826c78b436d693a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Tue, 14 Oct 2025 10:37:32 -0300 Subject: [PATCH 19/65] move to portable --- pandas/_libs/include/pandas/portable.h | 29 +++++++++++++++++++ .../vendored/numpy/datetime/np_datetime.h | 29 ------------------- pandas/_libs/src/parser/tokenizer.c | 1 - .../src/vendored/numpy/datetime/np_datetime.c | 1 + 4 files changed, 30 insertions(+), 30 deletions(-) diff --git a/pandas/_libs/include/pandas/portable.h b/pandas/_libs/include/pandas/portable.h index 1d0509d9e9724..f9f1e96d7dc7a 100644 --- a/pandas/_libs/include/pandas/portable.h +++ b/pandas/_libs/include/pandas/portable.h @@ -35,3 +35,32 @@ The full license is in the LICENSE file, distributed with this software. 
do { \ } while (0) /* fallthrough */ #endif + +#if defined(_WIN32) +#ifndef ENABLE_INTSAFE_SIGNED_FUNCTIONS +#define ENABLE_INTSAFE_SIGNED_FUNCTIONS +#endif +#include +#define checked_int64_add(a, b, res) LongLongAdd(a, b, res) +#define checked_int64_sub(a, b, res) LongLongSub(a, b, res) +#define checked_int64_mul(a, b, res) LongLongMult(a, b, res) +#else +#if defined __has_builtin +#if __has_builtin(__builtin_add_overflow) +#define checked_int64_add(a, b, res) __builtin_add_overflow(a, b, res) +#define checked_int64_sub(a, b, res) __builtin_sub_overflow(a, b, res) +#define checked_int64_mul(a, b, res) __builtin_mul_overflow(a, b, res) +#else +_Static_assert(0, + "Overflow checking not detected; please try a newer compiler"); +#endif +// __has_builtin was added in gcc 10, but our muslinux_1_1 build environment +// only has gcc-9.3, so fall back to __GNUC__ macro as long as we have that +#elif __GNUC__ > 7 +#define checked_int64_add(a, b, res) __builtin_add_overflow(a, b, res) +#define checked_int64_sub(a, b, res) __builtin_sub_overflow(a, b, res) +#define checked_int64_mul(a, b, res) __builtin_mul_overflow(a, b, res) +#else +_Static_assert(0, "__has_builtin not detected; please try a newer compiler"); +#endif +#endif diff --git a/pandas/_libs/include/pandas/vendored/numpy/datetime/np_datetime.h b/pandas/_libs/include/pandas/vendored/numpy/datetime/np_datetime.h index 6ae24bf58cf24..e4e90a7ea24cf 100644 --- a/pandas/_libs/include/pandas/vendored/numpy/datetime/np_datetime.h +++ b/pandas/_libs/include/pandas/vendored/numpy/datetime/np_datetime.h @@ -20,35 +20,6 @@ This file is derived from NumPy 1.7. 
See NUMPY_LICENSE.txt #define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION #endif // NPY_NO_DEPRECATED_API -#if defined(_WIN32) -#ifndef ENABLE_INTSAFE_SIGNED_FUNCTIONS -#define ENABLE_INTSAFE_SIGNED_FUNCTIONS -#endif -#include -#define checked_int64_add(a, b, res) LongLongAdd(a, b, res) -#define checked_int64_sub(a, b, res) LongLongSub(a, b, res) -#define checked_int64_mul(a, b, res) LongLongMult(a, b, res) -#else -#if defined __has_builtin -#if __has_builtin(__builtin_add_overflow) -#define checked_int64_add(a, b, res) __builtin_add_overflow(a, b, res) -#define checked_int64_sub(a, b, res) __builtin_sub_overflow(a, b, res) -#define checked_int64_mul(a, b, res) __builtin_mul_overflow(a, b, res) -#else -_Static_assert(0, - "Overflow checking not detected; please try a newer compiler"); -#endif -// __has_builtin was added in gcc 10, but our muslinux_1_1 build environment -// only has gcc-9.3, so fall back to __GNUC__ macro as long as we have that -#elif __GNUC__ > 7 -#define checked_int64_add(a, b, res) __builtin_add_overflow(a, b, res) -#define checked_int64_sub(a, b, res) __builtin_sub_overflow(a, b, res) -#define checked_int64_mul(a, b, res) __builtin_mul_overflow(a, b, res) -#else -_Static_assert(0, "__has_builtin not detected; please try a newer compiler"); -#endif -#endif - #include typedef struct { diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 2d7fb19aa8800..4884964fef0f2 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -18,7 +18,6 @@ GitHub. See Python Software Foundation License and BSD licenses for these. 
*/ #include "pandas/parser/tokenizer.h" #include "pandas/portable.h" -#include "pandas/vendored/numpy/datetime/np_datetime.h" #include #include diff --git a/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c b/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c index eab58e915e247..043fa033df272 100644 --- a/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c +++ b/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c @@ -21,6 +21,7 @@ This file is derived from NumPy 1.7. See NUMPY_LICENSE.txt #endif // NPY_NO_DEPRECATED_API #include "pandas/vendored/numpy/datetime/np_datetime.h" +#include "pandas/portable.h" #define NO_IMPORT_ARRAY #define PY_ARRAY_UNIQUE_SYMBOL PANDAS_DATETIME_NUMPY #include From e6977cc64033629fb6389e59db417385e226a377 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Tue, 14 Oct 2025 10:46:45 -0300 Subject: [PATCH 20/65] perf: use builtin unsigned long overflow check --- pandas/_libs/include/pandas/portable.h | 6 ++++++ pandas/_libs/src/parser/tokenizer.c | 21 ++++++++++++--------- 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/pandas/_libs/include/pandas/portable.h b/pandas/_libs/include/pandas/portable.h index f9f1e96d7dc7a..af5a821f890fd 100644 --- a/pandas/_libs/include/pandas/portable.h +++ b/pandas/_libs/include/pandas/portable.h @@ -44,12 +44,18 @@ The full license is in the LICENSE file, distributed with this software. 
#define checked_int64_add(a, b, res) LongLongAdd(a, b, res) #define checked_int64_sub(a, b, res) LongLongSub(a, b, res) #define checked_int64_mul(a, b, res) LongLongMult(a, b, res) +#define checked_uint64_add(a, b, res) ULongLongAdd(a, b, res) +#define checked_uint64_sub(a, b, res) ULongLongSub(a, b, res) +#define checked_uint64_mul(a, b, res) ULongLongMult(a, b, res) #else #if defined __has_builtin #if __has_builtin(__builtin_add_overflow) #define checked_int64_add(a, b, res) __builtin_add_overflow(a, b, res) #define checked_int64_sub(a, b, res) __builtin_sub_overflow(a, b, res) #define checked_int64_mul(a, b, res) __builtin_mul_overflow(a, b, res) +#define checked_uint64_add(a, b, res) __builtin_add_overflow(a, b, res) +#define checked_uint64_sub(a, b, res) __builtin_sub_overflow(a, b, res) +#define checked_uint64_mul(a, b, res) __builtin_mul_overflow(a, b, res) #else _Static_assert(0, "Overflow checking not detected; please try a newer compiler"); diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 4884964fef0f2..812b396f12344 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -1898,14 +1898,6 @@ static int power_int(int base, int exponent) { return result * base; } -static inline uint64_t add_uint_check_overflow(uint64_t lhs, uint64_t rhs, - uint64_t mul_lhs) { - if (lhs > (UINT_MAX - rhs) / mul_lhs) { - errno = ERANGE; - } - return lhs * mul_lhs + rhs; -} - int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, int *error, char tsep) { if (!p_item || *p_item == '\0') { @@ -2008,7 +2000,18 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max, uint64_t next_part = strtoull(endptr, &new_end, 10); ptrdiff_t digits = new_end - endptr; uint64_t mul_result = power_int(10, (int)digits); - result = add_uint_check_overflow(result, next_part, mul_result); + + // result * mul_result + if (checked_uint64_mul(result, mul_result, &result)) { + 
// overflow + errno = ERANGE; + } + // result + next_part + if (checked_uint64_add(result, next_part, &result)) { + // overflow + errno = ERANGE; + } + endptr = new_end; } From 8120eea51c18ecef546bd1d58f6992cedd6dbd74 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Tue, 14 Oct 2025 11:04:16 -0300 Subject: [PATCH 21/65] refactor: combine builting and gnuc branches --- pandas/_libs/include/pandas/portable.h | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/pandas/_libs/include/pandas/portable.h b/pandas/_libs/include/pandas/portable.h index af5a821f890fd..b462f1e8ca0e6 100644 --- a/pandas/_libs/include/pandas/portable.h +++ b/pandas/_libs/include/pandas/portable.h @@ -48,8 +48,8 @@ The full license is in the LICENSE file, distributed with this software. #define checked_uint64_sub(a, b, res) ULongLongSub(a, b, res) #define checked_uint64_mul(a, b, res) ULongLongMult(a, b, res) #else -#if defined __has_builtin -#if __has_builtin(__builtin_add_overflow) +#if (defined __has_builtin && __has_builtin(__builtin_add_overflow)) || \ + __GNUC__ > 7 #define checked_int64_add(a, b, res) __builtin_add_overflow(a, b, res) #define checked_int64_sub(a, b, res) __builtin_sub_overflow(a, b, res) #define checked_int64_mul(a, b, res) __builtin_mul_overflow(a, b, res) @@ -60,13 +60,4 @@ The full license is in the LICENSE file, distributed with this software. 
_Static_assert(0, "Overflow checking not detected; please try a newer compiler"); #endif -// __has_builtin was added in gcc 10, but our muslinux_1_1 build environment -// only has gcc-9.3, so fall back to __GNUC__ macro as long as we have that -#elif __GNUC__ > 7 -#define checked_int64_add(a, b, res) __builtin_add_overflow(a, b, res) -#define checked_int64_sub(a, b, res) __builtin_sub_overflow(a, b, res) -#define checked_int64_mul(a, b, res) __builtin_mul_overflow(a, b, res) -#else -_Static_assert(0, "__has_builtin not detected; please try a newer compiler"); -#endif #endif From 1f5d506c9a1bb5baa1d685c951ab658c6504edfb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Tue, 14 Oct 2025 11:19:30 -0300 Subject: [PATCH 22/65] don't assign null --- pandas/_libs/src/parser/tokenizer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 812b396f12344..2ff5fb3b4116c 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -1925,7 +1925,7 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, endptr++; } - char *new_end = NULL; + char *new_end; int64_t next_part = strtoll(endptr, &new_end, 10); if (is_negative) { next_part = -next_part; @@ -1996,7 +1996,7 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max, endptr++; } - char *new_end = NULL; + char *new_end; uint64_t next_part = strtoull(endptr, &new_end, 10); ptrdiff_t digits = new_end - endptr; uint64_t mul_result = power_int(10, (int)digits); From 479a2abeccc0f7a870a539baa09caeff05c5879d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Tue, 14 Oct 2025 11:27:16 -0300 Subject: [PATCH 23/65] fix: perform bound check --- pandas/_libs/src/parser/tokenizer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 
2ff5fb3b4116c..72b7fa8bb9154 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -1863,7 +1863,7 @@ static inline bool has_digit_int(const char *str) { return true; case '+': case '-': - return isdigit_ascii(str[1]); + return str[1] != '\0' && isdigit_ascii(str[1]); default: return false; } From c37c35500213c539e5baeb4a77f14fe8b3f8acac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Tue, 14 Oct 2025 11:44:41 -0300 Subject: [PATCH 24/65] fix: assign error if doesn't have a digit after tsep It stills permits ending in tsep --- pandas/_libs/src/parser/tokenizer.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 72b7fa8bb9154..5508429e78f05 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -1920,9 +1920,12 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, bool is_negative = result < 0; while (errno == 0 && tsep != '\0' && *endptr == tsep) { - // Skip multiple consecutive tsep - while (*endptr == tsep) { - endptr++; + // move after tsep + endptr++; + if (*endptr == '\0' || !isdigit_ascii(*endptr)) { + // stop parsing and let the remaining of the function + // assign an error code + break; } char *new_end; @@ -1991,9 +1994,12 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max, uint64_t result = strtoull(p_item, &endptr, 10); while (errno == 0 && tsep != '\0' && *endptr == tsep) { - // Skip multiple consecutive tsep - while (*endptr == tsep) { - endptr++; + // move after tsep + endptr++; + if (*endptr == '\0' || !isdigit_ascii(*endptr)) { + // stop parsing and let the remaining of the function + // assign an error code + break; } char *new_end; From 7d552833f30e9c6026c44099b90bfd6c57653b9c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Tue, 14 Oct 2025 11:51:24 -0300 
Subject: [PATCH 25/65] fix: go back to buffer solution --- pandas/_libs/src/parser/tokenizer.c | 134 ++++++++++++---------------- 1 file changed, 56 insertions(+), 78 deletions(-) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 5508429e78f05..d947cffe7e38c 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -21,15 +21,15 @@ GitHub. See Python Software Foundation License and BSD licenses for these. #include #include -#include #include #include -#include #include #include "pandas/portable.h" #include "pandas/vendored/klib/khash.h" // for kh_int64_t, kh_destroy_int64 +#define PROCESSED_WORD_CAPACITY 128 + void coliter_setup(coliter_t *self, parser_t *parser, int64_t i, int64_t start) { // column i, starting at 0 @@ -1876,26 +1876,39 @@ static inline bool has_only_spaces(const char *str) { return *str == '\0'; } -static int power_int(int base, int exponent) { - // https://en.wikipedia.org/wiki/Exponentiation_by_squaring - if (exponent == 0) { - return 1; - } else if (exponent < 0) { - return 0; - } - - int result = 1; +/* Copy a string without `char_to_remove` into `output`, + * it assumes that output is filled with `\0`, + * so it won't null terminate the result. + */ +static void copy_string_without_char(char output[PROCESSED_WORD_CAPACITY], + const char *str, char char_to_remove) { + char *dst = output; + const char *src = str; + // last character is reserved for null terminator. 
+ const char *end = output + PROCESSED_WORD_CAPACITY - 1; + + while (*src != '\0' && dst < end) { + const char *next = src; + // find EOS or char_to_remove + while (*next != '\0' && *next != char_to_remove) { + next++; + } - while (exponent > 1) { - if (exponent % 2 == 1) { - result *= base; - exponent--; + size_t len = next - src; + if (dst + len > end) { + // Can't write here, str is too big + errno = ERANGE; + return; } - result *= result; - exponent /= 2; - } - return result * base; + // copy block + memcpy(dst, src, len); + + // go to next available location to write + dst += len; + // Move past char to remove + src = *next == char_to_remove ? next + 1 : next; + } } int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, @@ -1915,40 +1928,21 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, } errno = 0; - char *endptr = NULL; - int64_t result = strtoll(p_item, &endptr, 10); - bool is_negative = result < 0; - - while (errno == 0 && tsep != '\0' && *endptr == tsep) { - // move after tsep - endptr++; - if (*endptr == '\0' || !isdigit_ascii(*endptr)) { - // stop parsing and let the remaining of the function - // assign an error code - break; - } - - char *new_end; - int64_t next_part = strtoll(endptr, &new_end, 10); - if (is_negative) { - next_part = -next_part; - } + if (tsep != '\0' && strchr(p_item, tsep) != NULL) { + char buffer[PROCESSED_WORD_CAPACITY]; + memset(buffer, '\0', sizeof(buffer)); + copy_string_without_char(buffer, p_item, tsep); + p_item = buffer; + } - ptrdiff_t digits = new_end - endptr; - int64_t mul_result = power_int(10, (int)digits); - // result * mul_result - if (checked_int64_mul(result, mul_result, &result)) { - // overflow - errno = ERANGE; - } - // result + next_part - if (checked_int64_add(result, next_part, &result)) { - // overflow - errno = ERANGE; - } - endptr = new_end; + if (errno == ERANGE) { + *error = ERROR_OVERFLOW; + return 0; } + char *endptr = NULL; + int64_t result = 
strtoll(p_item, &endptr, 10); + if (!has_only_spaces(endptr)) { // Check first for invalid characters because we may // want to skip integer parsing if we find one. @@ -1990,37 +1984,21 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max, } errno = 0; - char *endptr = NULL; - uint64_t result = strtoull(p_item, &endptr, 10); - - while (errno == 0 && tsep != '\0' && *endptr == tsep) { - // move after tsep - endptr++; - if (*endptr == '\0' || !isdigit_ascii(*endptr)) { - // stop parsing and let the remaining of the function - // assign an error code - break; - } - - char *new_end; - uint64_t next_part = strtoull(endptr, &new_end, 10); - ptrdiff_t digits = new_end - endptr; - uint64_t mul_result = power_int(10, (int)digits); - - // result * mul_result - if (checked_uint64_mul(result, mul_result, &result)) { - // overflow - errno = ERANGE; - } - // result + next_part - if (checked_uint64_add(result, next_part, &result)) { - // overflow - errno = ERANGE; - } + if (tsep != '\0' && strchr(p_item, tsep) != NULL) { + char buffer[PROCESSED_WORD_CAPACITY]; + memset(buffer, '\0', sizeof(buffer)); + copy_string_without_char(buffer, p_item, tsep); + p_item = buffer; + } - endptr = new_end; + if (errno == ERANGE) { + *error = ERROR_OVERFLOW; + return 0; } + char *endptr = NULL; + uint64_t result = strtoull(p_item, &endptr, 10); + if (!has_only_spaces(endptr)) { *error = ERROR_INVALID_CHARS; result = 0; From ffe50cee1028a87f2061135e506baabe2268b4e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Tue, 14 Oct 2025 11:51:50 -0300 Subject: [PATCH 26/65] refactor: undo refactor in np_datetime.c --- pandas/_libs/include/pandas/portable.h | 26 ---------------- .../src/vendored/numpy/datetime/np_datetime.c | 30 ++++++++++++++++++- 2 files changed, 29 insertions(+), 27 deletions(-) diff --git a/pandas/_libs/include/pandas/portable.h b/pandas/_libs/include/pandas/portable.h index b462f1e8ca0e6..1d0509d9e9724 100644 --- 
a/pandas/_libs/include/pandas/portable.h +++ b/pandas/_libs/include/pandas/portable.h @@ -35,29 +35,3 @@ The full license is in the LICENSE file, distributed with this software. do { \ } while (0) /* fallthrough */ #endif - -#if defined(_WIN32) -#ifndef ENABLE_INTSAFE_SIGNED_FUNCTIONS -#define ENABLE_INTSAFE_SIGNED_FUNCTIONS -#endif -#include -#define checked_int64_add(a, b, res) LongLongAdd(a, b, res) -#define checked_int64_sub(a, b, res) LongLongSub(a, b, res) -#define checked_int64_mul(a, b, res) LongLongMult(a, b, res) -#define checked_uint64_add(a, b, res) ULongLongAdd(a, b, res) -#define checked_uint64_sub(a, b, res) ULongLongSub(a, b, res) -#define checked_uint64_mul(a, b, res) ULongLongMult(a, b, res) -#else -#if (defined __has_builtin && __has_builtin(__builtin_add_overflow)) || \ - __GNUC__ > 7 -#define checked_int64_add(a, b, res) __builtin_add_overflow(a, b, res) -#define checked_int64_sub(a, b, res) __builtin_sub_overflow(a, b, res) -#define checked_int64_mul(a, b, res) __builtin_mul_overflow(a, b, res) -#define checked_uint64_add(a, b, res) __builtin_add_overflow(a, b, res) -#define checked_uint64_sub(a, b, res) __builtin_sub_overflow(a, b, res) -#define checked_uint64_mul(a, b, res) __builtin_mul_overflow(a, b, res) -#else -_Static_assert(0, - "Overflow checking not detected; please try a newer compiler"); -#endif -#endif diff --git a/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c b/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c index 043fa033df272..9a022095feee9 100644 --- a/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c +++ b/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c @@ -21,13 +21,41 @@ This file is derived from NumPy 1.7. 
See NUMPY_LICENSE.txt #endif // NPY_NO_DEPRECATED_API #include "pandas/vendored/numpy/datetime/np_datetime.h" -#include "pandas/portable.h" #define NO_IMPORT_ARRAY #define PY_ARRAY_UNIQUE_SYMBOL PANDAS_DATETIME_NUMPY #include #include #include +#if defined(_WIN32) +#ifndef ENABLE_INTSAFE_SIGNED_FUNCTIONS +#define ENABLE_INTSAFE_SIGNED_FUNCTIONS +#endif +#include +#define checked_int64_add(a, b, res) LongLongAdd(a, b, res) +#define checked_int64_sub(a, b, res) LongLongSub(a, b, res) +#define checked_int64_mul(a, b, res) LongLongMult(a, b, res) +#else +#if defined __has_builtin +#if __has_builtin(__builtin_add_overflow) +#define checked_int64_add(a, b, res) __builtin_add_overflow(a, b, res) +#define checked_int64_sub(a, b, res) __builtin_sub_overflow(a, b, res) +#define checked_int64_mul(a, b, res) __builtin_mul_overflow(a, b, res) +#else +_Static_assert(0, + "Overflow checking not detected; please try a newer compiler"); +#endif +// __has_builtin was added in gcc 10, but our muslinux_1_1 build environment +// only has gcc-9.3, so fall back to __GNUC__ macro as long as we have that +#elif __GNUC__ > 7 +#define checked_int64_add(a, b, res) __builtin_add_overflow(a, b, res) +#define checked_int64_sub(a, b, res) __builtin_sub_overflow(a, b, res) +#define checked_int64_mul(a, b, res) __builtin_mul_overflow(a, b, res) +#else +_Static_assert(0, "__has_builtin not detected; please try a newer compiler"); +#endif +#endif + #define XSTR(a) STR(a) #define STR(a) #a From af5ad7120c5271ed0e50d1e60d9ff8478341cc1d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Tue, 14 Oct 2025 12:08:39 -0300 Subject: [PATCH 27/65] fix: fix undefined behavior --- pandas/_libs/src/parser/tokenizer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index d947cffe7e38c..28d901f5f5a6a 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -1928,8 
+1928,8 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, } errno = 0; + char buffer[PROCESSED_WORD_CAPACITY]; if (tsep != '\0' && strchr(p_item, tsep) != NULL) { - char buffer[PROCESSED_WORD_CAPACITY]; memset(buffer, '\0', sizeof(buffer)); copy_string_without_char(buffer, p_item, tsep); p_item = buffer; From 92117041c200fe33f446e7bbc77c63d034e0a757 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Tue, 14 Oct 2025 12:55:36 -0300 Subject: [PATCH 28/65] fix: fix leftover undefined behaviour --- pandas/_libs/src/parser/tokenizer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 28d901f5f5a6a..514482055c64d 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -1984,8 +1984,8 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max, } errno = 0; + char buffer[PROCESSED_WORD_CAPACITY]; if (tsep != '\0' && strchr(p_item, tsep) != NULL) { - char buffer[PROCESSED_WORD_CAPACITY]; memset(buffer, '\0', sizeof(buffer)); copy_string_without_char(buffer, p_item, tsep); p_item = buffer; From d026b016d4e550c70f378671212ce8f29d7e8a6d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Tue, 14 Oct 2025 13:26:22 -0300 Subject: [PATCH 29/65] rewrite `copy_string_without_char` - Returns status code - Simplify loop for a more common sliding window - use string length --- .../_libs/include/pandas/parser/tokenizer.h | 1 + pandas/_libs/src/parser/tokenizer.c | 81 +++++++++++-------- 2 files changed, 48 insertions(+), 34 deletions(-) diff --git a/pandas/_libs/include/pandas/parser/tokenizer.h b/pandas/_libs/include/pandas/parser/tokenizer.h index 209f375a5bf6c..141d883220f1e 100644 --- a/pandas/_libs/include/pandas/parser/tokenizer.h +++ b/pandas/_libs/include/pandas/parser/tokenizer.h @@ -17,6 +17,7 @@ See LICENSE for the license #define ERROR_NO_DIGITS 1 #define 
ERROR_OVERFLOW 2 #define ERROR_INVALID_CHARS 3 +#define ERROR_WORD2BIG 4 #include diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 514482055c64d..e416a334a244b 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -1876,39 +1876,48 @@ static inline bool has_only_spaces(const char *str) { return *str == '\0'; } -/* Copy a string without `char_to_remove` into `output`, - * it assumes that output is filled with `\0`, - * so it won't null terminate the result. +/* Copy a string without `char_to_remove` into `output`. */ -static void copy_string_without_char(char output[PROCESSED_WORD_CAPACITY], - const char *str, char char_to_remove) { - char *dst = output; - const char *src = str; +static int copy_string_without_char(char output[PROCESSED_WORD_CAPACITY], + const char *str, size_t str_len, + char char_to_remove) { // last character is reserved for null terminator. - const char *end = output + PROCESSED_WORD_CAPACITY - 1; - - while (*src != '\0' && dst < end) { - const char *next = src; - // find EOS or char_to_remove - while (*next != '\0' && *next != char_to_remove) { - next++; + size_t max_str_size = PROCESSED_WORD_CAPACITY - 1; + if (str_len > max_str_size) { + // str_len is too big. + // Check if it's possible to write after removing all `char_to_remove`. 
+ size_t count_char_to_remove = 0; + for (const char *src = str; *src != '\0'; src++) { + if (*src == char_to_remove) { + count_char_to_remove++; + } } - size_t len = next - src; - if (dst + len > end) { - // Can't write here, str is too big - errno = ERANGE; - return; + if (str_len - count_char_to_remove > max_str_size) { + return ERROR_WORD2BIG; } + } + + char *dst = output; + const char *left = str; + + // sliding window + for (const char *right = str; *left != '\0'; right++) { + if (*right == '\0' || *right == char_to_remove) { + size_t len = right - left; - // copy block - memcpy(dst, src, len); + // copy block + memcpy(dst, left, len); - // go to next available location to write - dst += len; - // Move past char to remove - src = *next == char_to_remove ? next + 1 : next; + // go to next available location to write + dst += len; + left = *right == '\0' ? right : right + 1; + } } + + // null terminate + *dst = '\0'; + return 0; } int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, @@ -1930,8 +1939,13 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, errno = 0; char buffer[PROCESSED_WORD_CAPACITY]; if (tsep != '\0' && strchr(p_item, tsep) != NULL) { - memset(buffer, '\0', sizeof(buffer)); - copy_string_without_char(buffer, p_item, tsep); + int status = copy_string_without_char(buffer, p_item, strlen(p_item), tsep); + + if (status != 0) { + *error = status; + return 0; + } + p_item = buffer; } @@ -1986,14 +2000,13 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max, errno = 0; char buffer[PROCESSED_WORD_CAPACITY]; if (tsep != '\0' && strchr(p_item, tsep) != NULL) { - memset(buffer, '\0', sizeof(buffer)); - copy_string_without_char(buffer, p_item, tsep); - p_item = buffer; - } + int status = copy_string_without_char(buffer, p_item, strlen(p_item), tsep); - if (errno == ERANGE) { - *error = ERROR_OVERFLOW; - return 0; + if (status != 0) { + *error = status; + return 0; + } + p_item = 
buffer; } char *endptr = NULL; From d76ff5f01bcda2cc3f6949a52dda40fb17fe0255 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Tue, 14 Oct 2025 17:19:50 -0300 Subject: [PATCH 30/65] fix: change solution to safe guard against end_ptr --- pandas/_libs/src/parser/tokenizer.c | 53 ++++++++++++++--------------- 1 file changed, 26 insertions(+), 27 deletions(-) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index e416a334a244b..a1457e2e385c7 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -1881,42 +1881,41 @@ static inline bool has_only_spaces(const char *str) { static int copy_string_without_char(char output[PROCESSED_WORD_CAPACITY], const char *str, size_t str_len, char char_to_remove) { - // last character is reserved for null terminator. - size_t max_str_size = PROCESSED_WORD_CAPACITY - 1; - if (str_len > max_str_size) { - // str_len is too big. - // Check if it's possible to write after removing all `char_to_remove`. - size_t count_char_to_remove = 0; - for (const char *src = str; *src != '\0'; src++) { - if (*src == char_to_remove) { - count_char_to_remove++; - } - } + const char *left = str; + const char *right; + const char *end_ptr = str + str_len; + size_t bytes_read = 0; - if (str_len - count_char_to_remove > max_str_size) { + while ((right = memchr(left, char_to_remove, end_ptr - left)) != NULL) { + size_t nbytes = right - left; + + // check if we have enough space, including the null terminator. 
+ if (nbytes + bytes_read >= PROCESSED_WORD_CAPACITY) { return ERROR_WORD2BIG; } - } - - char *dst = output; - const char *left = str; - - // sliding window - for (const char *right = str; *left != '\0'; right++) { - if (*right == '\0' || *right == char_to_remove) { - size_t len = right - left; + // copy block + memcpy(&output[bytes_read], left, nbytes); + bytes_read += nbytes; + left = right + 1; - // copy block - memcpy(dst, left, len); + // Exit after processing the entire string + if (left >= end_ptr) { + break; + } + } - // go to next available location to write - dst += len; - left = *right == '\0' ? right : right + 1; + // copy final chunk that doesn't contain char_to_remove + if (end_ptr > left) { + size_t nbytes = nbytes = end_ptr - left; + if (nbytes + bytes_read >= PROCESSED_WORD_CAPACITY) { + return ERROR_WORD2BIG; } + memcpy(&output[bytes_read], left, nbytes); + bytes_read += nbytes; } // null terminate - *dst = '\0'; + output[bytes_read] = '\0'; return 0; } From b523a19a2c9758d4af3e95f904289fe2b9338096 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Tue, 14 Oct 2025 17:20:19 -0300 Subject: [PATCH 31/65] test: add some edge cases tests with thousand separator --- pandas/tests/io/parser/common/test_common_basic.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py index 487520b7a9359..f38844e167222 100644 --- a/pandas/tests/io/parser/common/test_common_basic.py +++ b/pandas/tests/io/parser/common/test_common_basic.py @@ -73,7 +73,14 @@ def test_read_csv_local(all_parsers, csv1): @pytest.mark.parametrize( - "number_csv, expected_number", [("2,334", 2334), ("-2,334", -2334)] + "number_csv, expected_number", + [ + ("2,334", 2334), + ("-2,334", -2334), + ("-2,334,", -2334), + ("2,,,,,,,,,,,,,,,5", 25), + ("2,,3,4,,,,,,,,,,,,5", 2345), + ], ) def test_1000_sep(all_parsers, number_csv, expected_number): parser = 
all_parsers From 6265172aa08ffed4b27e27794875f7e74bb8e87a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Tue, 14 Oct 2025 18:17:41 -0300 Subject: [PATCH 32/65] Update pandas/_libs/src/parser/tokenizer.c Co-authored-by: William Ayd --- pandas/_libs/src/parser/tokenizer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index a1457e2e385c7..74e6a5cbc6d51 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -1886,7 +1886,7 @@ static int copy_string_without_char(char output[PROCESSED_WORD_CAPACITY], const char *end_ptr = str + str_len; size_t bytes_read = 0; - while ((right = memchr(left, char_to_remove, end_ptr - left)) != NULL) { + while ((right = memchr(left, char_to_remove, str_len - bytes_read)) != NULL) { size_t nbytes = right - left; // check if we have enough space, including the null terminator. From abed6c1c7ec038cfd692b3c2cd2eba49643f82a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Tue, 14 Oct 2025 18:19:00 -0300 Subject: [PATCH 33/65] fix: error to -1 --- pandas/_libs/include/pandas/parser/tokenizer.h | 1 - pandas/_libs/src/parser/tokenizer.c | 7 ++++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/include/pandas/parser/tokenizer.h b/pandas/_libs/include/pandas/parser/tokenizer.h index 141d883220f1e..209f375a5bf6c 100644 --- a/pandas/_libs/include/pandas/parser/tokenizer.h +++ b/pandas/_libs/include/pandas/parser/tokenizer.h @@ -17,7 +17,6 @@ See LICENSE for the license #define ERROR_NO_DIGITS 1 #define ERROR_OVERFLOW 2 #define ERROR_INVALID_CHARS 3 -#define ERROR_WORD2BIG 4 #include diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 74e6a5cbc6d51..8970fe1871248 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -1891,7 +1891,7 @@ static int 
copy_string_without_char(char output[PROCESSED_WORD_CAPACITY], // check if we have enough space, including the null terminator. if (nbytes + bytes_read >= PROCESSED_WORD_CAPACITY) { - return ERROR_WORD2BIG; + return -1; } // copy block memcpy(&output[bytes_read], left, nbytes); @@ -1908,7 +1908,7 @@ static int copy_string_without_char(char output[PROCESSED_WORD_CAPACITY], if (end_ptr > left) { size_t nbytes = nbytes = end_ptr - left; if (nbytes + bytes_read >= PROCESSED_WORD_CAPACITY) { - return ERROR_WORD2BIG; + return -1; } memcpy(&output[bytes_read], left, nbytes); bytes_read += nbytes; @@ -1941,7 +1941,8 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, int status = copy_string_without_char(buffer, p_item, strlen(p_item), tsep); if (status != 0) { - *error = status; + // Word is too big, probably will cause an overflow + *error = ERROR_OVERFLOW; return 0; } From 8616f9ffe1967a8f58a11ac295673aff3a8cf415 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Tue, 14 Oct 2025 19:12:46 -0300 Subject: [PATCH 34/65] fix: leftover status check --- pandas/_libs/src/parser/tokenizer.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 8970fe1871248..fa7a1c755e3b9 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -2003,7 +2003,8 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max, int status = copy_string_without_char(buffer, p_item, strlen(p_item), tsep); if (status != 0) { - *error = status; + // Word is too big, probably will cause an overflow + *error = ERROR_OVERFLOW; return 0; } p_item = buffer; From c4e0e25758c45dd4393effdda6f6af39d8b36071 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Tue, 14 Oct 2025 19:45:27 -0300 Subject: [PATCH 35/65] fix: remove duplicate nbytes declaration --- pandas/_libs/src/parser/tokenizer.c | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index fa7a1c755e3b9..0fdde3a6ecc25 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -1906,7 +1906,7 @@ static int copy_string_without_char(char output[PROCESSED_WORD_CAPACITY], // copy final chunk that doesn't contain char_to_remove if (end_ptr > left) { - size_t nbytes = nbytes = end_ptr - left; + size_t nbytes = end_ptr - left; if (nbytes + bytes_read >= PROCESSED_WORD_CAPACITY) { return -1; } From 0b192085fa46b49bd159cc7800e766cfca979b95 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Wed, 15 Oct 2025 08:58:18 -0300 Subject: [PATCH 36/65] fix: use memchr to find if need to process the word --- pandas/_libs/src/parser/tokenizer.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 0fdde3a6ecc25..70e0096f788b8 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -1937,8 +1937,9 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, errno = 0; char buffer[PROCESSED_WORD_CAPACITY]; - if (tsep != '\0' && strchr(p_item, tsep) != NULL) { - int status = copy_string_without_char(buffer, p_item, strlen(p_item), tsep); + size_t str_len = strlen(p_item); + if (tsep != '\0' && memchr(p_item, tsep, str_len) != NULL) { + int status = copy_string_without_char(buffer, p_item, str_len, tsep); if (status != 0) { // Word is too big, probably will cause an overflow @@ -1999,7 +2000,8 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max, errno = 0; char buffer[PROCESSED_WORD_CAPACITY]; - if (tsep != '\0' && strchr(p_item, tsep) != NULL) { + size_t str_len = strlen(p_item); + if (tsep != '\0' && memchr(p_item, tsep, str_len) != NULL) { int status = copy_string_without_char(buffer, p_item, strlen(p_item), 
tsep); if (status != 0) { From 29d74f7396739735f4bc57794131d085ede53f51 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Wed, 15 Oct 2025 09:02:47 -0300 Subject: [PATCH 37/65] chore: add comment explaining why 128 bytes for capacity --- pandas/_libs/src/parser/tokenizer.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 70e0096f788b8..b0ac55768295f 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -28,6 +28,8 @@ GitHub. See Python Software Foundation License and BSD licenses for these. #include "pandas/portable.h" #include "pandas/vendored/klib/khash.h" // for kh_int64_t, kh_destroy_int64 +// Arrow256 allows up to 76 decimal digits. +// We rounded up to the next power of 2. #define PROCESSED_WORD_CAPACITY 128 void coliter_setup(coliter_t *self, parser_t *parser, int64_t i, From b5b8f3be4fa9f7e681f604f36662c9f9a51a20dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Wed, 15 Oct 2025 10:49:38 -0300 Subject: [PATCH 38/65] rename bytes_read to bytes_written --- pandas/_libs/src/parser/tokenizer.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index b0ac55768295f..b3fadfa0a3e27 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -1886,18 +1886,19 @@ static int copy_string_without_char(char output[PROCESSED_WORD_CAPACITY], const char *left = str; const char *right; const char *end_ptr = str + str_len; - size_t bytes_read = 0; + size_t bytes_written = 0; - while ((right = memchr(left, char_to_remove, str_len - bytes_read)) != NULL) { + while ((right = memchr(left, char_to_remove, str_len - bytes_written)) != + NULL) { size_t nbytes = right - left; // check if we have enough space, including the null terminator. 
- if (nbytes + bytes_read >= PROCESSED_WORD_CAPACITY) { + if (nbytes + bytes_written >= PROCESSED_WORD_CAPACITY) { return -1; } // copy block - memcpy(&output[bytes_read], left, nbytes); - bytes_read += nbytes; + memcpy(&output[bytes_written], left, nbytes); + bytes_written += nbytes; left = right + 1; // Exit after processing the entire string @@ -1909,15 +1910,15 @@ static int copy_string_without_char(char output[PROCESSED_WORD_CAPACITY], // copy final chunk that doesn't contain char_to_remove if (end_ptr > left) { size_t nbytes = end_ptr - left; - if (nbytes + bytes_read >= PROCESSED_WORD_CAPACITY) { + if (nbytes + bytes_written >= PROCESSED_WORD_CAPACITY) { return -1; } - memcpy(&output[bytes_read], left, nbytes); - bytes_read += nbytes; + memcpy(&output[bytes_written], left, nbytes); + bytes_written += nbytes; } // null terminate - output[bytes_read] = '\0'; + output[bytes_written] = '\0'; return 0; } From 803a8bfb7db3477f5e3226c0d05002ebb85401d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Wed, 15 Oct 2025 11:17:25 -0300 Subject: [PATCH 39/65] chore: move errno assignment --- pandas/_libs/src/parser/tokenizer.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index b3fadfa0a3e27..a0f339ef112c0 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -1938,7 +1938,6 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, return 0; } - errno = 0; char buffer[PROCESSED_WORD_CAPACITY]; size_t str_len = strlen(p_item); if (tsep != '\0' && memchr(p_item, tsep, str_len) != NULL) { @@ -1959,6 +1958,9 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, } char *endptr = NULL; + // strtoll sets errno if it finds an overflow. + // It's value is reset to don't pollute the verification below. 
+ errno = 0; int64_t result = strtoll(p_item, &endptr, 10); if (!has_only_spaces(endptr)) { @@ -2001,7 +2003,6 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max, return 0; } - errno = 0; char buffer[PROCESSED_WORD_CAPACITY]; size_t str_len = strlen(p_item); if (tsep != '\0' && memchr(p_item, tsep, str_len) != NULL) { @@ -2016,6 +2017,9 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max, } char *endptr = NULL; + // strtoull sets errno if it finds an overflow. + // It's value is reset to don't pollute the verification below. + errno = 0; uint64_t result = strtoull(p_item, &endptr, 10); if (!has_only_spaces(endptr)) { From 31f26cf0d13b720b2a489442c17763e37da58d98 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Wed, 15 Oct 2025 11:52:49 -0300 Subject: [PATCH 40/65] fix: fix error logic by comparing pointers --- pandas/_libs/src/parser/tokenizer.c | 31 ++++++++++------------------- 1 file changed, 10 insertions(+), 21 deletions(-) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index a0f339ef112c0..52e1105138549 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -1888,33 +1888,22 @@ static int copy_string_without_char(char output[PROCESSED_WORD_CAPACITY], const char *end_ptr = str + str_len; size_t bytes_written = 0; - while ((right = memchr(left, char_to_remove, str_len - bytes_written)) != - NULL) { - size_t nbytes = right - left; + while (left < end_ptr) { + right = memchr(left, char_to_remove, str_len - bytes_written); + + // If it doesn't find the char to remove, just copy until EOS. + size_t chunk_size = right ? right - left : end_ptr - left; // check if we have enough space, including the null terminator. 
- if (nbytes + bytes_written >= PROCESSED_WORD_CAPACITY) { + if (chunk_size + bytes_written >= PROCESSED_WORD_CAPACITY) { return -1; } // copy block - memcpy(&output[bytes_written], left, nbytes); - bytes_written += nbytes; - left = right + 1; + memcpy(&output[bytes_written], left, chunk_size); + bytes_written += chunk_size; - // Exit after processing the entire string - if (left >= end_ptr) { - break; - } - } - - // copy final chunk that doesn't contain char_to_remove - if (end_ptr > left) { - size_t nbytes = end_ptr - left; - if (nbytes + bytes_written >= PROCESSED_WORD_CAPACITY) { - return -1; - } - memcpy(&output[bytes_written], left, nbytes); - bytes_written += nbytes; + // Advance past the removed character if we found it. + left = right ? right + 1 : end_ptr; } // null terminate From 171b553deb41f590cd219b32ae01e9417b809ccd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Wed, 15 Oct 2025 12:45:00 -0300 Subject: [PATCH 41/65] fix: use pointers --- pandas/_libs/src/parser/tokenizer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 52e1105138549..a2295489af109 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -1889,7 +1889,7 @@ static int copy_string_without_char(char output[PROCESSED_WORD_CAPACITY], size_t bytes_written = 0; while (left < end_ptr) { - right = memchr(left, char_to_remove, str_len - bytes_written); + right = memchr(left, char_to_remove, end_ptr - left); // If it doesn't find the char to remove, just copy until EOS. size_t chunk_size = right ? 
right - left : end_ptr - left; From 30f6bdc4cab5319d8a2b2c0e86eaf99df3325b75 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Wed, 15 Oct 2025 13:26:10 -0300 Subject: [PATCH 42/65] fix: keep track on how many bytes to read --- pandas/_libs/src/parser/tokenizer.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index a2295489af109..dbbed27c1ee82 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -1887,9 +1887,10 @@ static int copy_string_without_char(char output[PROCESSED_WORD_CAPACITY], const char *right; const char *end_ptr = str + str_len; size_t bytes_written = 0; + size_t remaining_bytes_to_read = str_len; - while (left < end_ptr) { - right = memchr(left, char_to_remove, end_ptr - left); + while (remaining_bytes_to_read > 0) { + right = memchr(left, char_to_remove, remaining_bytes_to_read); // If it doesn't find the char to remove, just copy until EOS. size_t chunk_size = right ? right - left : end_ptr - left; @@ -1904,6 +1905,7 @@ static int copy_string_without_char(char output[PROCESSED_WORD_CAPACITY], // Advance past the removed character if we found it. left = right ? right + 1 : end_ptr; + remaining_bytes_to_read -= right ? 
chunk_size + 1 : chunk_size; } // null terminate From 554675bc990418543aeaca80af797e1116b71e4e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Wed, 15 Oct 2025 13:28:35 -0300 Subject: [PATCH 43/65] chore: cast to size_t --- pandas/_libs/src/parser/tokenizer.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index dbbed27c1ee82..373ab0b9e70fe 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -1893,7 +1893,10 @@ static int copy_string_without_char(char output[PROCESSED_WORD_CAPACITY], right = memchr(left, char_to_remove, remaining_bytes_to_read); // If it doesn't find the char to remove, just copy until EOS. - size_t chunk_size = right ? right - left : end_ptr - left; + // We are also casting directly to size_t because + // `left` never goes beyond `right` or `end_ptr`. + size_t chunk_size = + right ? (size_t)(right - left) : (size_t)(end_ptr - left); // check if we have enough space, including the null terminator. 
if (chunk_size + bytes_written >= PROCESSED_WORD_CAPACITY) { From 7ea445425926931c61f8212ef04eb72a957d66b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Wed, 15 Oct 2025 13:30:44 -0300 Subject: [PATCH 44/65] fix: cast pointer to fix Wc++-compat warning --- pandas/_libs/src/parser/tokenizer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 373ab0b9e70fe..17db4e3b433c6 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -1890,7 +1890,7 @@ static int copy_string_without_char(char output[PROCESSED_WORD_CAPACITY], size_t remaining_bytes_to_read = str_len; while (remaining_bytes_to_read > 0) { - right = memchr(left, char_to_remove, remaining_bytes_to_read); + right = (const char *)memchr(left, char_to_remove, remaining_bytes_to_read); // If it doesn't find the char to remove, just copy until EOS. // We are also casting directly to size_t because From 82dc037e4921be4725e35c24653fe04247ed05d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Wed, 15 Oct 2025 15:09:18 -0300 Subject: [PATCH 45/65] fix: remove casts for -Weverything --- pandas/_libs/src/parser/tokenizer.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 17db4e3b433c6..dbbed27c1ee82 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -1890,13 +1890,10 @@ static int copy_string_without_char(char output[PROCESSED_WORD_CAPACITY], size_t remaining_bytes_to_read = str_len; while (remaining_bytes_to_read > 0) { - right = (const char *)memchr(left, char_to_remove, remaining_bytes_to_read); + right = memchr(left, char_to_remove, remaining_bytes_to_read); // If it doesn't find the char to remove, just copy until EOS. 
- // We are also casting directly to size_t because - // `left` never goes beyond `right` or `end_ptr`. - size_t chunk_size = - right ? (size_t)(right - left) : (size_t)(end_ptr - left); + size_t chunk_size = right ? right - left : end_ptr - left; // check if we have enough space, including the null terminator. if (chunk_size + bytes_written >= PROCESSED_WORD_CAPACITY) { From 627df4cc97c1ed352c5146be21179208bddbd222 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Wed, 15 Oct 2025 15:37:05 -0300 Subject: [PATCH 46/65] fix: move remaining_bytes_to_read to the start of loop --- pandas/_libs/src/parser/tokenizer.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index dbbed27c1ee82..3efe4a5b3d865 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -1887,9 +1887,9 @@ static int copy_string_without_char(char output[PROCESSED_WORD_CAPACITY], const char *right; const char *end_ptr = str + str_len; size_t bytes_written = 0; - size_t remaining_bytes_to_read = str_len; - while (remaining_bytes_to_read > 0) { + while (left < end_ptr) { + size_t remaining_bytes_to_read = end_ptr - left; right = memchr(left, char_to_remove, remaining_bytes_to_read); // If it doesn't find the char to remove, just copy until EOS. @@ -1905,7 +1905,6 @@ static int copy_string_without_char(char output[PROCESSED_WORD_CAPACITY], // Advance past the removed character if we found it. left = right ? right + 1 : end_ptr; - remaining_bytes_to_read -= right ? 
chunk_size + 1 : chunk_size; } // null terminate From 1b3eba0fde5bc06937a4eb6766792216b5645212 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Wed, 15 Oct 2025 15:48:00 -0300 Subject: [PATCH 47/65] fix: consolidate it even further --- pandas/_libs/src/parser/tokenizer.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 3efe4a5b3d865..716154f3a06c7 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -1892,8 +1892,12 @@ static int copy_string_without_char(char output[PROCESSED_WORD_CAPACITY], size_t remaining_bytes_to_read = end_ptr - left; right = memchr(left, char_to_remove, remaining_bytes_to_read); - // If it doesn't find the char to remove, just copy until EOS. - size_t chunk_size = right ? right - left : end_ptr - left; + if (!right) { + // If it doesn't find the char to remove, just copy until EOS. + right = end_ptr; + } + + size_t chunk_size = right - left; // check if we have enough space, including the null terminator. if (chunk_size + bytes_written >= PROCESSED_WORD_CAPACITY) { @@ -1904,7 +1908,7 @@ static int copy_string_without_char(char output[PROCESSED_WORD_CAPACITY], bytes_written += chunk_size; // Advance past the removed character if we found it. - left = right ? 
right + 1 : end_ptr; + left = right + 1; } // null terminate From e1667faab453bdc5538b65e1565113266da0349a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Wed, 15 Oct 2025 16:55:40 -0300 Subject: [PATCH 48/65] fix: move right definition --- pandas/_libs/src/parser/tokenizer.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 716154f3a06c7..db816f5001588 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -1884,13 +1884,12 @@ static int copy_string_without_char(char output[PROCESSED_WORD_CAPACITY], const char *str, size_t str_len, char char_to_remove) { const char *left = str; - const char *right; const char *end_ptr = str + str_len; size_t bytes_written = 0; while (left < end_ptr) { size_t remaining_bytes_to_read = end_ptr - left; - right = memchr(left, char_to_remove, remaining_bytes_to_read); + const char *right = memchr(left, char_to_remove, remaining_bytes_to_read); if (!right) { // If it doesn't find the char to remove, just copy until EOS. From bee776a41056658a4c62a57258213521c584e671 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Wed, 15 Oct 2025 16:55:51 -0300 Subject: [PATCH 49/65] chore: remove superfluous comments --- pandas/_libs/src/parser/tokenizer.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index db816f5001588..065139d22bda5 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -1898,15 +1898,12 @@ static int copy_string_without_char(char output[PROCESSED_WORD_CAPACITY], size_t chunk_size = right - left; - // check if we have enough space, including the null terminator. 
if (chunk_size + bytes_written >= PROCESSED_WORD_CAPACITY) { return -1; } - // copy block memcpy(&output[bytes_written], left, chunk_size); bytes_written += chunk_size; - // Advance past the removed character if we found it. left = right + 1; } From d6933450a0d766ede4dd4297fec63bc4faf986c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Wed, 15 Oct 2025 16:58:54 -0300 Subject: [PATCH 50/65] fix: add const qualifier --- pandas/_libs/src/parser/tokenizer.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 065139d22bda5..7c0df5abf144d 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -1888,7 +1888,7 @@ static int copy_string_without_char(char output[PROCESSED_WORD_CAPACITY], size_t bytes_written = 0; while (left < end_ptr) { - size_t remaining_bytes_to_read = end_ptr - left; + const size_t remaining_bytes_to_read = end_ptr - left; const char *right = memchr(left, char_to_remove, remaining_bytes_to_read); if (!right) { @@ -1896,7 +1896,7 @@ static int copy_string_without_char(char output[PROCESSED_WORD_CAPACITY], right = end_ptr; } - size_t chunk_size = right - left; + const size_t chunk_size = right - left; if (chunk_size + bytes_written >= PROCESSED_WORD_CAPACITY) { return -1; @@ -1929,9 +1929,9 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, } char buffer[PROCESSED_WORD_CAPACITY]; - size_t str_len = strlen(p_item); + const size_t str_len = strlen(p_item); if (tsep != '\0' && memchr(p_item, tsep, str_len) != NULL) { - int status = copy_string_without_char(buffer, p_item, str_len, tsep); + const int status = copy_string_without_char(buffer, p_item, str_len, tsep); if (status != 0) { // Word is too big, probably will cause an overflow @@ -1994,9 +1994,10 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max, } char 
buffer[PROCESSED_WORD_CAPACITY]; - size_t str_len = strlen(p_item); + const size_t str_len = strlen(p_item); if (tsep != '\0' && memchr(p_item, tsep, str_len) != NULL) { - int status = copy_string_without_char(buffer, p_item, strlen(p_item), tsep); + const int status = + copy_string_without_char(buffer, p_item, strlen(p_item), tsep); if (status != 0) { // Word is too big, probably will cause an overflow From 593c614d7ac79c7cb989b83abc76e6fda6e7706e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Wed, 15 Oct 2025 17:06:15 -0300 Subject: [PATCH 51/65] fix: remove unnecessary NULL and null-byte checks --- pandas/_libs/src/parser/tokenizer.c | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 7c0df5abf144d..32274c3a727ce 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -1847,10 +1847,6 @@ int uint64_conflict(uint_state *self) { * @return Non-zero integer indicating that has a digit 0 otherwise. 
*/ static inline bool has_digit_int(const char *str) { - if (!str || *str == '\0') { - return false; - } - switch (*str) { case '0': case '1': @@ -1914,11 +1910,6 @@ static int copy_string_without_char(char output[PROCESSED_WORD_CAPACITY], int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, int *error, char tsep) { - if (!p_item || *p_item == '\0') { - *error = ERROR_NO_DIGITS; - return 0; - } - while (isspace_ascii(*p_item)) { ++p_item; } @@ -1970,11 +1961,6 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max, uint64_t uint_max, int *error, char tsep) { - if (!p_item || *p_item == '\0') { - *error = ERROR_NO_DIGITS; - return 0; - } - while (isspace_ascii(*p_item)) { ++p_item; } From ff4d48b672a8641961689466f6e58e337e5959f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Wed, 15 Oct 2025 17:09:45 -0300 Subject: [PATCH 52/65] fix: remove unnecessary errno verification --- pandas/_libs/src/parser/tokenizer.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 32274c3a727ce..a05d8f98a6673 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -1933,11 +1933,6 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, p_item = buffer; } - if (errno == ERANGE) { - *error = ERROR_OVERFLOW; - return 0; - } - char *endptr = NULL; // strtoll sets errno if it finds an overflow. // It's value is reset to don't pollute the verification below. 
From e3a88d31221d2b9e99bb4d34b912b36e1263bb3c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Wed, 15 Oct 2025 17:15:37 -0300 Subject: [PATCH 53/65] chore: remove NULL assignment --- pandas/_libs/src/parser/tokenizer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index a05d8f98a6673..65f6877357c14 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -1933,7 +1933,7 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, p_item = buffer; } - char *endptr = NULL; + char *endptr; // strtoll sets errno if it finds an overflow. // It's value is reset to don't pollute the verification below. errno = 0; @@ -1988,7 +1988,7 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max, p_item = buffer; } - char *endptr = NULL; + char *endptr; // strtoull sets errno if it finds an overflow. // It's value is reset to don't pollute the verification below. 
errno = 0; From b135738d85825f2eff0cdb4cd805e1b50e879bb4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Wed, 15 Oct 2025 17:31:58 -0300 Subject: [PATCH 54/65] fix: don't recompute strlen --- pandas/_libs/src/parser/tokenizer.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 65f6877357c14..0609e07362850 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -1977,8 +1977,7 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max, char buffer[PROCESSED_WORD_CAPACITY]; const size_t str_len = strlen(p_item); if (tsep != '\0' && memchr(p_item, tsep, str_len) != NULL) { - const int status = - copy_string_without_char(buffer, p_item, strlen(p_item), tsep); + const int status = copy_string_without_char(buffer, p_item, str_len, tsep); if (status != 0) { // Word is too big, probably will cause an overflow From ba8c9b3b4f4ba68cfa23475326796c0f16e6533b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Wed, 15 Oct 2025 17:38:54 -0300 Subject: [PATCH 55/65] chore: add some comments back to simplify diff --- pandas/_libs/src/parser/tokenizer.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 0609e07362850..ca48aecf84fc7 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -1910,11 +1910,14 @@ static int copy_string_without_char(char output[PROCESSED_WORD_CAPACITY], int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, int *error, char tsep) { + // Skip leading spaces. while (isspace_ascii(*p_item)) { ++p_item; } + // Check that there is a first digit. if (!has_digit_int(p_item)) { + // Error... 
*error = ERROR_NO_DIGITS; return 0; } @@ -1939,6 +1942,7 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, errno = 0; int64_t result = strtoll(p_item, &endptr, 10); + // Did we use up all the characters? if (!has_only_spaces(endptr)) { // Check first for invalid characters because we may // want to skip integer parsing if we find one. @@ -1956,10 +1960,12 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max, uint64_t uint_max, int *error, char tsep) { + // Skip leading spaces. while (isspace_ascii(*p_item)) { ++p_item; } + // Handle sign. if (*p_item == '-') { state->seen_sint = 1; *error = 0; @@ -1970,6 +1976,7 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max, // Check that there is a first digit. if (!isdigit_ascii(*p_item)) { + // Error... *error = ERROR_NO_DIGITS; return 0; } @@ -1993,6 +2000,7 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max, errno = 0; uint64_t result = strtoull(p_item, &endptr, 10); + // Did we use up all the characters? if (!has_only_spaces(endptr)) { *error = ERROR_INVALID_CHARS; result = 0; From 818921fd3ff49fd8e02ba25de15f4cb7055ad275 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Wed, 15 Oct 2025 18:14:21 -0300 Subject: [PATCH 56/65] fix: reset errno after handling it --- pandas/_libs/src/parser/tokenizer.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index ca48aecf84fc7..a1b0444386fe3 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -1937,9 +1937,6 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, } char *endptr; - // strtoll sets errno if it finds an overflow. - // It's value is reset to don't pollute the verification below. 
- errno = 0; int64_t result = strtoll(p_item, &endptr, 10); // Did we use up all the characters? @@ -1950,6 +1947,7 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, result = 0; } else if (errno == ERANGE || result > int_max || result < int_min) { *error = ERROR_OVERFLOW; + errno = 0; result = 0; } else { *error = 0; @@ -1995,9 +1993,6 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max, } char *endptr; - // strtoull sets errno if it finds an overflow. - // It's value is reset to don't pollute the verification below. - errno = 0; uint64_t result = strtoull(p_item, &endptr, 10); // Did we use up all the characters? @@ -2006,6 +2001,7 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max, result = 0; } else if (errno == ERANGE || result > uint_max) { *error = ERROR_OVERFLOW; + errno = 0; result = 0; } else { *error = 0; From 3e067f76de1997cd1558ef2aa47d06e40c913d54 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Wed, 15 Oct 2025 18:27:59 -0300 Subject: [PATCH 57/65] fix: put back const char p --- pandas/_libs/src/parser/tokenizer.c | 40 +++++++++++++++-------------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index a1b0444386fe3..45d73a2f7d39f 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -1910,22 +1910,23 @@ static int copy_string_without_char(char output[PROCESSED_WORD_CAPACITY], int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, int *error, char tsep) { + const char *p = p_item; // Skip leading spaces. - while (isspace_ascii(*p_item)) { - ++p_item; + while (isspace_ascii(*p)) { + ++p; } // Check that there is a first digit. - if (!has_digit_int(p_item)) { + if (!has_digit_int(p)) { // Error... 
*error = ERROR_NO_DIGITS; return 0; } char buffer[PROCESSED_WORD_CAPACITY]; - const size_t str_len = strlen(p_item); - if (tsep != '\0' && memchr(p_item, tsep, str_len) != NULL) { - const int status = copy_string_without_char(buffer, p_item, str_len, tsep); + const size_t str_len = strlen(p); + if (tsep != '\0' && memchr(p, tsep, str_len) != NULL) { + const int status = copy_string_without_char(buffer, p, str_len, tsep); if (status != 0) { // Word is too big, probably will cause an overflow @@ -1933,11 +1934,11 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, return 0; } - p_item = buffer; + p = buffer; } char *endptr; - int64_t result = strtoll(p_item, &endptr, 10); + int64_t result = strtoll(p, &endptr, 10); // Did we use up all the characters? if (!has_only_spaces(endptr)) { @@ -1958,42 +1959,43 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max, uint64_t uint_max, int *error, char tsep) { + const char *p = p_item; // Skip leading spaces. - while (isspace_ascii(*p_item)) { - ++p_item; + while (isspace_ascii(*p)) { + ++p; } // Handle sign. - if (*p_item == '-') { + if (*p == '-') { state->seen_sint = 1; *error = 0; return 0; - } else if (*p_item == '+') { - p_item++; + } else if (*p == '+') { + p++; } // Check that there is a first digit. - if (!isdigit_ascii(*p_item)) { + if (!isdigit_ascii(*p)) { // Error... 
*error = ERROR_NO_DIGITS; return 0; } char buffer[PROCESSED_WORD_CAPACITY]; - const size_t str_len = strlen(p_item); - if (tsep != '\0' && memchr(p_item, tsep, str_len) != NULL) { - const int status = copy_string_without_char(buffer, p_item, str_len, tsep); + const size_t str_len = strlen(p); + if (tsep != '\0' && memchr(p, tsep, str_len) != NULL) { + const int status = copy_string_without_char(buffer, p, str_len, tsep); if (status != 0) { // Word is too big, probably will cause an overflow *error = ERROR_OVERFLOW; return 0; } - p_item = buffer; + p = buffer; } char *endptr; - uint64_t result = strtoull(p_item, &endptr, 10); + uint64_t result = strtoull(p, &endptr, 10); // Did we use up all the characters? if (!has_only_spaces(endptr)) { From c0ed83c886b774feacae39cbaab79991423305a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Wed, 15 Oct 2025 19:04:03 -0300 Subject: [PATCH 58/65] fix: improve diff for sign handling --- pandas/_libs/src/parser/tokenizer.c | 35 +++++------------------------ 1 file changed, 6 insertions(+), 29 deletions(-) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 45d73a2f7d39f..7f3a632982f54 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -1839,34 +1839,6 @@ int uint64_conflict(uint_state *self) { return self->seen_uint && (self->seen_sint || self->seen_null); } -/** - * @brief Check if the character in the pointer indicates a number. - * It expects that you consumed all leading whitespace. - * - * @param p_item Pointer to verify - * @return Non-zero integer indicating that has a digit 0 otherwise. 
- */ -static inline bool has_digit_int(const char *str) { - switch (*str) { - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - return true; - case '+': - case '-': - return str[1] != '\0' && isdigit_ascii(str[1]); - default: - return false; - } -} - static inline bool has_only_spaces(const char *str) { while (*str != '\0' && isspace_ascii(*str)) { str++; @@ -1916,8 +1888,13 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, ++p; } + // Handle sign. + const bool has_sign = *p == '-' || *p == '+'; + // Handle sign. + const char *digit_start = has_sign ? p + 1 : p; + // Check that there is a first digit. - if (!has_digit_int(p)) { + if (!isdigit_ascii(*digit_start)) { // Error... *error = ERROR_NO_DIGITS; return 0; From cb60adb10ab9f6d1cb9bc5f6f6f0180d047281b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Wed, 15 Oct 2025 19:07:56 -0300 Subject: [PATCH 59/65] fix: improve diff for trailing whitespace --- pandas/_libs/src/parser/tokenizer.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 7f3a632982f54..5a9f3bfedb8a6 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -1839,13 +1839,6 @@ int uint64_conflict(uint_state *self) { return self->seen_uint && (self->seen_sint || self->seen_null); } -static inline bool has_only_spaces(const char *str) { - while (*str != '\0' && isspace_ascii(*str)) { - str++; - } - return *str == '\0'; -} - /* Copy a string without `char_to_remove` into `output`. */ static int copy_string_without_char(char output[PROCESSED_WORD_CAPACITY], @@ -1917,10 +1910,13 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, char *endptr; int64_t result = strtoll(p, &endptr, 10); + // Skip trailing spaces. 
+ while (isspace_ascii(*endptr)) { + ++endptr; + } + // Did we use up all the characters? - if (!has_only_spaces(endptr)) { - // Check first for invalid characters because we may - // want to skip integer parsing if we find one. + if (*endptr) { *error = ERROR_INVALID_CHARS; result = 0; } else if (errno == ERANGE || result > int_max || result < int_min) { @@ -1974,8 +1970,13 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max, char *endptr; uint64_t result = strtoull(p, &endptr, 10); + // Skip trailing spaces. + while (isspace_ascii(*endptr)) { + ++endptr; + } + // Did we use up all the characters? - if (!has_only_spaces(endptr)) { + if (*endptr) { *error = ERROR_INVALID_CHARS; result = 0; } else if (errno == ERANGE || result > uint_max) { From 5117e89658f53b720ac1ed3544d670127378b18f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Wed, 15 Oct 2025 19:24:56 -0300 Subject: [PATCH 60/65] chore: remove newline to simplify diff even more --- pandas/_libs/src/parser/tokenizer.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 5a9f3bfedb8a6..37d77de06fba6 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -1897,13 +1897,11 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, const size_t str_len = strlen(p); if (tsep != '\0' && memchr(p, tsep, str_len) != NULL) { const int status = copy_string_without_char(buffer, p, str_len, tsep); - if (status != 0) { // Word is too big, probably will cause an overflow *error = ERROR_OVERFLOW; return 0; } - p = buffer; } @@ -1958,7 +1956,6 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max, const size_t str_len = strlen(p); if (tsep != '\0' && memchr(p, tsep, str_len) != NULL) { const int status = copy_string_without_char(buffer, p, str_len, tsep); - if (status != 0) { // Word is too big, probably will cause 
an overflow *error = ERROR_OVERFLOW; From e1e327a614c151c93e494c9f345c59571c66bac3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Wed, 15 Oct 2025 19:37:28 -0300 Subject: [PATCH 61/65] chore: drop another superfluous comment --- pandas/_libs/src/parser/tokenizer.c | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 37d77de06fba6..d9a754522251d 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -1868,7 +1868,6 @@ static int copy_string_without_char(char output[PROCESSED_WORD_CAPACITY], left = right + 1; } - // null terminate output[bytes_written] = '\0'; return 0; } From ffcb7c21f2bee19edabebd0775f919e5165886d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Wed, 15 Oct 2025 20:00:11 -0300 Subject: [PATCH 62/65] test: xfail python engine with consecutive thousand separators --- pandas/tests/io/parser/common/test_common_basic.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py index f38844e167222..766dabba851e0 100644 --- a/pandas/tests/io/parser/common/test_common_basic.py +++ b/pandas/tests/io/parser/common/test_common_basic.py @@ -82,7 +82,7 @@ def test_read_csv_local(all_parsers, csv1): ("2,,3,4,,,,,,,,,,,,5", 2345), ], ) -def test_1000_sep(all_parsers, number_csv, expected_number): +def test_1000_sep(all_parsers, number_csv, expected_number, request): parser = all_parsers data = f"""A|B|C 1|{number_csv}|5 @@ -95,6 +95,11 @@ def test_1000_sep(all_parsers, number_csv, expected_number): with pytest.raises(ValueError, match=msg): parser.read_csv(StringIO(data), sep="|", thousands=",") return + elif parser.engine == "python" and ",," in number_csv: + mark = pytest.mark.xfail( + reason="Python engine doesn't allow consecutive thousands separators" + ) + request.applymarker(mark) 
result = parser.read_csv(StringIO(data), sep="|", thousands=",") tm.assert_frame_equal(result, expected) From 47b87f9aeed4c0bb836505173a4bef7a661e23ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Wed, 15 Oct 2025 21:52:01 -0300 Subject: [PATCH 63/65] fix: move errno handling to avoid pollution and early return --- pandas/_libs/src/parser/tokenizer.c | 30 +++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index d9a754522251d..4c2a52d6198da 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -1907,6 +1907,12 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, char *endptr; int64_t result = strtoll(p, &endptr, 10); + if (errno == ERANGE || result > int_max || result < int_min) { + *error = ERROR_OVERFLOW; + errno = 0; + return 0; + } + // Skip trailing spaces. while (isspace_ascii(*endptr)) { ++endptr; @@ -1915,15 +1921,10 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, // Did we use up all the characters? if (*endptr) { *error = ERROR_INVALID_CHARS; - result = 0; - } else if (errno == ERANGE || result > int_max || result < int_min) { - *error = ERROR_OVERFLOW; - errno = 0; - result = 0; - } else { - *error = 0; + return 0; } + *error = 0; return result; } @@ -1966,6 +1967,12 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max, char *endptr; uint64_t result = strtoull(p, &endptr, 10); + if (errno == ERANGE || result > uint_max) { + *error = ERROR_OVERFLOW; + errno = 0; + return 0; + } + // Skip trailing spaces. while (isspace_ascii(*endptr)) { ++endptr; @@ -1974,18 +1981,13 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max, // Did we use up all the characters? 
if (*endptr) { *error = ERROR_INVALID_CHARS; - result = 0; - } else if (errno == ERANGE || result > uint_max) { - *error = ERROR_OVERFLOW; - errno = 0; - result = 0; - } else { - *error = 0; + return 0; } if (result > (uint64_t)int_max) { state->seen_uint = 1; } + *error = 0; return result; } From cd536fbd4893cfdafaef8a07cf60c91da3d52aa4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Wed, 15 Oct 2025 22:04:08 -0300 Subject: [PATCH 64/65] chore: rename to number for diff --- pandas/_libs/src/parser/tokenizer.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 4c2a52d6198da..b77e8ab2254a3 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -1905,9 +1905,9 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, } char *endptr; - int64_t result = strtoll(p, &endptr, 10); + int64_t number = strtoll(p, &endptr, 10); - if (errno == ERANGE || result > int_max || result < int_min) { + if (errno == ERANGE || number > int_max || number < int_min) { *error = ERROR_OVERFLOW; errno = 0; return 0; @@ -1925,7 +1925,7 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, } *error = 0; - return result; + return number; } uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max, @@ -1965,9 +1965,9 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max, } char *endptr; - uint64_t result = strtoull(p, &endptr, 10); + uint64_t number = strtoull(p, &endptr, 10); - if (errno == ERANGE || result > uint_max) { + if (errno == ERANGE || number > uint_max) { *error = ERROR_OVERFLOW; errno = 0; return 0; @@ -1984,10 +1984,10 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max, return 0; } - if (result > (uint64_t)int_max) { + if (number > (uint64_t)int_max) { state->seen_uint = 1; } *error = 0; - return 
result; + return number; } From 1ef92593e3891859982426d67daf728c8621b362 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Thu, 16 Oct 2025 10:56:43 -0300 Subject: [PATCH 65/65] chore: add comment explaining consecutive thousand separators in C engine --- pandas/tests/io/parser/common/test_common_basic.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py index 766dabba851e0..88dd0543b9020 100644 --- a/pandas/tests/io/parser/common/test_common_basic.py +++ b/pandas/tests/io/parser/common/test_common_basic.py @@ -78,6 +78,8 @@ def test_read_csv_local(all_parsers, csv1): ("2,334", 2334), ("-2,334", -2334), ("-2,334,", -2334), + # Multiple consecutive thousand separators are allowed in C engine, + # but it's not necessarily intended behavior and may change in the future. ("2,,,,,,,,,,,,,,,5", 25), ("2,,3,4,,,,,,,,,,,,5", 2345), ],