From 7a2882bfdd6afde2fa206b71c917eddf898c5382 Mon Sep 17 00:00:00 2001 From: heoh Date: Mon, 20 Oct 2025 15:11:45 +0000 Subject: [PATCH 01/12] refactor(_str_copy_decimal_str_c): Use parsing functions from stdlib --- pandas/_libs/src/parser/tokenizer.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index a5cfd0e13ceec..c5913d73b0ff4 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -1740,20 +1740,25 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, char sci, static char *_str_copy_decimal_str_c(const char *s, char **endpos, char decimal, char tsep) { + const char *digits = "0123456789"; + const char *whitespaces = " \t\n\v\f\r"; + const char *p = s; const size_t length = strlen(s); char *s_copy = malloc(length + 1); char *dst = s_copy; + size_t n_digits; // Skip leading whitespace. - while (isspace_ascii(*p)) - p++; + p += strspn(p, whitespaces); // Copy Leading sign if (*p == '+' || *p == '-') { *dst++ = *p++; } // Copy integer part dropping `tsep` - while (isdigit_ascii(*p)) { - *dst++ = *p++; + while ((n_digits = strspn(p, digits))) { + memcpy(dst, p, n_digits); + dst += n_digits; + p += n_digits; p += (tsep != '\0' && *p == tsep); } // Replace `decimal` with '.' @@ -1762,8 +1767,10 @@ static char *_str_copy_decimal_str_c(const char *s, char **endpos, char decimal, p++; } // Copy fractional part after decimal (if any) - while (isdigit_ascii(*p)) { - *dst++ = *p++; + if ((n_digits = strspn(p, digits))) { + memcpy(dst, p, n_digits); + dst += n_digits; + p += n_digits; } // Copy exponent if any if (toupper_ascii(*p) == toupper_ascii('E')) { @@ -1773,8 +1780,10 @@ static char *_str_copy_decimal_str_c(const char *s, char **endpos, char decimal, *dst++ = *p++; } // Copy exponent digits - while (isdigit_ascii(*p)) { - *dst++ = *p++; + if ((n_digits = strspn(p, digits))) { + memcpy(dst, p, n_digits); + dst += n_digits; + p += n_digits; } } *dst++ = '\0'; // terminate From 4a6b66bb3a0a3c8d8d3cee8174afb387e927bd09 Mon Sep 17 00:00:00 2001 From: heoh Date: Mon, 20 Oct 2025 16:07:59 +0000 Subject: [PATCH 02/12] refactor: extract str_consume_span() function --- pandas/_libs/src/parser/tokenizer.c | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index c5913d73b0ff4..1fc1da4424637 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -1732,6 +1732,17 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, char sci, return number; } +static inline size_t str_consume_span(char **dst, const char **src, + const char *charset) { + size_t n = strspn(*src, charset); + if (n) { + memcpy(*dst, *src, n); + *dst += n; + *src += n; + } + return n; +} + /* copy a decimal number string with `decimal`, `tsep` as decimal point and thousands separator to an equivalent c-locale decimal string (striping `tsep`, replacing `decimal` with '.'). The returned memory should be free-d @@ -1747,7 +1758,6 @@ static char *_str_copy_decimal_str_c(const char *s, char **endpos, char decimal, const size_t length = strlen(s); char *s_copy = malloc(length + 1); char *dst = s_copy; - size_t n_digits; // Skip leading whitespace. p += strspn(p, whitespaces); // Copy Leading sign @@ -1755,10 +1765,7 @@ static char *_str_copy_decimal_str_c(const char *s, char **endpos, char decimal, *dst++ = *p++; } // Copy integer part dropping `tsep` - while ((n_digits = strspn(p, digits))) { - memcpy(dst, p, n_digits); - dst += n_digits; - p += n_digits; + while (str_consume_span(&dst, &p, digits)) { p += (tsep != '\0' && *p == tsep); } // Replace `decimal` with '.' @@ -1767,11 +1774,7 @@ static char *_str_copy_decimal_str_c(const char *s, char **endpos, char decimal, p++; } // Copy fractional part after decimal (if any) - if ((n_digits = strspn(p, digits))) { - memcpy(dst, p, n_digits); - dst += n_digits; - p += n_digits; - } + str_consume_span(&dst, &p, digits); // Copy exponent if any if (toupper_ascii(*p) == toupper_ascii('E')) { *dst++ = *p++; @@ -1780,11 +1783,7 @@ static char *_str_copy_decimal_str_c(const char *s, char **endpos, char decimal, *dst++ = *p++; } // Copy exponent digits - if ((n_digits = strspn(p, digits))) { - memcpy(dst, p, n_digits); - dst += n_digits; - p += n_digits; - } + str_consume_span(&dst, &p, digits); } *dst++ = '\0'; // terminate if (endpos != NULL) From d2ce67de2476660ac3a9488d8261820d62a5f485 Mon Sep 17 00:00:00 2001 From: heoh Date: Mon, 20 Oct 2025 16:14:09 +0000 Subject: [PATCH 03/12] refactor: Remove unnecessary conversions --- pandas/_libs/src/parser/tokenizer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 1fc1da4424637..88fa15ee2736f 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -1776,7 +1776,7 @@ static char *_str_copy_decimal_str_c(const char *s, char **endpos, char decimal, // Copy fractional part after decimal (if any) str_consume_span(&dst, &p, digits); // Copy exponent if any - if (toupper_ascii(*p) == toupper_ascii('E')) { + if (toupper_ascii(*p) == 'E') { *dst++ = *p++; // Copy leading exponent sign (if any) if (*p == '+' || *p == '-') { From 7b0be60969f7a120da17cbf2bb05b4a7a795f857 Mon Sep 17 00:00:00 2001 From: heoh Date: Mon, 20 Oct 2025 17:22:28 +0000 Subject: [PATCH 04/12] refactor: Simplify parsing logic by `str_consume_span()` --- pandas/_libs/src/parser/tokenizer.c | 48 ++++++++++++++++------------- 1 file changed, 27 insertions(+), 21 deletions(-) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 88fa15ee2736f..4c0ffda6fa95c 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -1732,15 +1732,25 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, char sci, return number; } +static inline size_t str_consume_nspan(char **dst, const char **src, + const char *charset, size_t count) { + size_t size = strspn(*src, charset); + if (size > count) { + size = count; + } + if (size) { + if (dst) { + memcpy(*dst, *src, size); + *dst += size; + } + *src += size; + } + return size; +} + static inline size_t str_consume_span(char **dst, const char **src, const char *charset) { - size_t n = strspn(*src, charset); - if (n) { - memcpy(*dst, *src, n); - *dst += n; - *src += n; - } - return n; + return str_consume_nspan(dst, src, charset, SIZE_MAX); } /* copy a decimal number string with `decimal`, `tsep` as decimal point @@ -1752,37 +1762,33 @@ static inline size_t str_consume_span(char **dst, const char **src, static char *_str_copy_decimal_str_c(const char *s, char **endpos, char decimal, char tsep) { const char *digits = "0123456789"; + const char *exponents = "Ee"; + const char *signs = "+-"; const char *whitespaces = " \t\n\v\f\r"; + const char decimals[] = {decimal, '\0'}; + const char tseps[] = {tsep, '\0'}; const char *p = s; const size_t length = strlen(s); char *s_copy = malloc(length + 1); char *dst = s_copy; // Skip leading whitespace. - p += strspn(p, whitespaces); + str_consume_span(NULL, &p, whitespaces); // Copy Leading sign - if (*p == '+' || *p == '-') { - *dst++ = *p++; - } + str_consume_nspan(&dst, &p, signs, 1); // Copy integer part dropping `tsep` while (str_consume_span(&dst, &p, digits)) { - p += (tsep != '\0' && *p == tsep); + str_consume_nspan(NULL, &p, tseps, 1); } // Replace `decimal` with '.' - if (*p == decimal) { + if (str_consume_nspan(NULL, &p, decimals, 1)) { *dst++ = '.'; - p++; } // Copy fractional part after decimal (if any) str_consume_span(&dst, &p, digits); // Copy exponent if any - if (toupper_ascii(*p) == 'E') { - *dst++ = *p++; - // Copy leading exponent sign (if any) - if (*p == '+' || *p == '-') { - *dst++ = *p++; - } - // Copy exponent digits + if (str_consume_nspan(&dst, &p, exponents, 1)) { + str_consume_nspan(&dst, &p, signs, 1); str_consume_span(&dst, &p, digits); } *dst++ = '\0'; // terminate From 4cfc5ebf639bd3be5a97adc28a201e74485bbd65 Mon Sep 17 00:00:00 2001 From: heoh Date: Thu, 23 Oct 2025 15:44:26 +0000 Subject: [PATCH 05/12] Prevent overflow for dst --- pandas/_libs/src/parser/tokenizer.c | 50 +++++++++++++++-------------- 1 file changed, 26 insertions(+), 24 deletions(-) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 4c0ffda6fa95c..1289f59abf42e 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -1732,25 +1732,26 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, char sci, return number; } -static inline size_t str_consume_nspan(char **dst, const char **src, - const char *charset, size_t count) { - size_t size = strspn(*src, charset); - if (size > count) { - size = count; - } - if (size) { - if (dst) { - memcpy(*dst, *src, size); - *dst += size; +static inline int str_consume_nspan(char **dst, size_t dst_sz, const char **src, + size_t src_sz, const char *charset) { + size_t span_sz = strspn(*src, charset); + if (span_sz > src_sz) { + span_sz = src_sz; + } + if (dst) { + if (span_sz > dst_sz) { + return -1; } - *src += size; + memcpy(*dst, *src, span_sz); + *dst += span_sz; } - return size; + *src += span_sz; + return span_sz; } -static inline size_t str_consume_span(char **dst, const char **src, - const char *charset) { - return str_consume_nspan(dst, src, charset, SIZE_MAX); +static inline int str_consume_span(char **dst, size_t dst_sz, const char **src, + const char *charset) { + return str_consume_nspan(dst, dst_sz, src, SIZE_MAX, charset); } /* copy a decimal number string with `decimal`, `tsep` as decimal point @@ -1772,24 +1773,25 @@ static char *_str_copy_decimal_str_c(const char *s, char **endpos, char decimal, const size_t length = strlen(s); char *s_copy = malloc(length + 1); char *dst = s_copy; + char *dst_end = dst + length; // Skip leading whitespace. - str_consume_span(NULL, &p, whitespaces); + str_consume_span(NULL, 0, &p, whitespaces); // Copy Leading sign - str_consume_nspan(&dst, &p, signs, 1); + str_consume_nspan(&dst, dst_end - dst, &p, 1, signs); // Copy integer part dropping `tsep` - while (str_consume_span(&dst, &p, digits)) { - str_consume_nspan(NULL, &p, tseps, 1); + while (str_consume_span(&dst, dst_end - dst, &p, digits)) { + str_consume_nspan(NULL, 0, &p, 1, tseps); } // Replace `decimal` with '.' - if (str_consume_nspan(NULL, &p, decimals, 1)) { + if (str_consume_nspan(NULL, 0, &p, 1, decimals)) { *dst++ = '.'; } // Copy fractional part after decimal (if any) - str_consume_span(&dst, &p, digits); + str_consume_span(&dst, dst_end - dst, &p, digits); // Copy exponent if any - if (str_consume_nspan(&dst, &p, exponents, 1)) { - str_consume_nspan(&dst, &p, signs, 1); - str_consume_span(&dst, &p, digits); + if (str_consume_nspan(&dst, dst_end - dst, &p, 1, exponents)) { + str_consume_nspan(&dst, dst_end - dst, &p, 1, signs); + str_consume_span(&dst, dst_end - dst, &p, digits); } *dst++ = '\0'; // terminate if (endpos != NULL) From 4fbac546aead27b5493c9001eb15b1794874e1e1 Mon Sep 17 00:00:00 2001 From: heoh Date: Thu, 23 Oct 2025 21:07:24 +0000 Subject: [PATCH 06/12] refactor: Remove malloc in _str_copy_decimal_str_c and use caller buffer --- pandas/_libs/src/parser/tokenizer.c | 47 ++++++++++++++--------------- 1 file changed, 23 insertions(+), 24 deletions(-) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 1289f59abf42e..32c1d0bdf6cd8 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -1756,12 +1756,11 @@ static inline int str_consume_span(char **dst, size_t dst_sz, const char **src, /* copy a decimal number string with `decimal`, `tsep` as decimal point and thousands separator to an equivalent c-locale decimal string (striping - `tsep`, replacing `decimal` with '.'). The returned memory should be free-d - with a call to `free`. -*/ + `tsep`, replacing `decimal` with '.'). The result is written into `dst` + and null-terminated. */ -static char *_str_copy_decimal_str_c(const char *s, char **endpos, char decimal, - char tsep) { +static int _str_copy_decimal_str_c(char *dst, size_t dst_sz, const char *src, + char **endpos, char decimal, char tsep) { const char *digits = "0123456789"; const char *exponents = "Ee"; const char *signs = "+-"; @@ -1769,42 +1768,42 @@ static char *_str_copy_decimal_str_c(const char *s, char **endpos, char decimal, const char decimals[] = {decimal, '\0'}; const char tseps[] = {tsep, '\0'}; - const char *p = s; - const size_t length = strlen(s); - char *s_copy = malloc(length + 1); - char *dst = s_copy; - char *dst_end = dst + length; + const char *s = src; + char *d = dst; + char *de = dst + dst_sz; // Skip leading whitespace. - str_consume_span(NULL, 0, &p, whitespaces); + str_consume_span(NULL, 0, &s, whitespaces); // Copy Leading sign - str_consume_nspan(&dst, dst_end - dst, &p, 1, signs); + str_consume_nspan(&d, de - d, &s, 1, signs); // Copy integer part dropping `tsep` - while (str_consume_span(&dst, dst_end - dst, &p, digits)) { - str_consume_nspan(NULL, 0, &p, 1, tseps); + while (str_consume_span(&d, de - d, &s, digits)) { + str_consume_nspan(NULL, 0, &s, 1, tseps); } // Replace `decimal` with '.' - if (str_consume_nspan(NULL, 0, &p, 1, decimals)) { - *dst++ = '.'; + if (str_consume_nspan(NULL, 0, &s, 1, decimals)) { + *d++ = '.'; } // Copy fractional part after decimal (if any) - str_consume_span(&dst, dst_end - dst, &p, digits); + str_consume_span(&d, de - d, &s, digits); // Copy exponent if any - if (str_consume_nspan(&dst, dst_end - dst, &p, 1, exponents)) { - str_consume_nspan(&dst, dst_end - dst, &p, 1, signs); - str_consume_span(&dst, dst_end - dst, &p, digits); + if (str_consume_nspan(&d, de - d, &s, 1, exponents)) { + str_consume_nspan(&d, de - d, &s, 1, signs); + str_consume_span(&d, de - d, &s, digits); } - *dst++ = '\0'; // terminate + *d++ = '\0'; // terminate if (endpos != NULL) - *endpos = (char *)p; - return s_copy; + *endpos = (char *)s; + return 0; } double round_trip(const char *p, char **q, char decimal, char Py_UNUSED(sci), char tsep, int skip_trailing, int *error, int *maybe_int) { // 'normalize' representation to C-locale; replace decimal with '.' and // remove thousands separator. + const size_t length = strlen(p); + char *pc = malloc(length + 1); char *endptr; - char *pc = _str_copy_decimal_str_c(p, &endptr, decimal, tsep); + _str_copy_decimal_str_c(pc, length + 1, p, &endptr, decimal, tsep); // This is called from a nogil block in parsers.pyx // so need to explicitly get GIL before Python calls PyGILState_STATE gstate = PyGILState_Ensure(); From 05504cfbf31825cd2e23e1c715e16f884ca92757 Mon Sep 17 00:00:00 2001 From: heoh Date: Fri, 24 Oct 2025 16:28:28 +0000 Subject: [PATCH 07/12] refactor: Extract macro for readability --- pandas/_libs/src/parser/tokenizer.c | 81 +++++++++++++++++++++++------ 1 file changed, 65 insertions(+), 16 deletions(-) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 32c1d0bdf6cd8..5b0036050fd9f 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -1732,6 +1732,8 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, char sci, return number; } +/* Helper functions and macros for string consumption and buffer management */ + static inline int str_consume_nspan(char **dst, size_t dst_sz, const char **src, size_t src_sz, const char *charset) { size_t span_sz = strspn(*src, charset); @@ -1754,6 +1756,32 @@ static inline int str_consume_span(char **dst, size_t dst_sz, const char **src, return str_consume_nspan(dst, dst_sz, src, SIZE_MAX, charset); } +#define SKIP_SPAN(s, charset) str_consume_span(NULL, 0, &(s), (charset)) + +#define SKIP_NSPAN(s, n, charset) \ + str_consume_nspan(NULL, 0, &(s), (n), (charset)) + +#define SAFE_CONSUME_SPAN(d, de, s, charset) \ + do { \ + size_t _remaining = (de) - (d); \ + int _ret = str_consume_span(&(d), _remaining, &(s), (charset)); \ + if (_ret < 0) \ + return ERROR_OVERFLOW; \ + } while (0) + +#define SAFE_CONSUME_NSPAN(d, de, s, n, charset) \ + do { \ + size_t _remaining = (de) - (d); \ + if (str_consume_nspan(&(d), _remaining, &(s), (n), (charset)) < 0) \ + return ERROR_OVERFLOW; \ + } while (0) + +#define CHECK_BUFFER_SPACE(d, de) \ + do { \ + if ((d) >= (de)) \ + return ERROR_OVERFLOW; \ + } while (0) + /* copy a decimal number string with `decimal`, `tsep` as decimal point and thousands separator to an equivalent c-locale decimal string (striping `tsep`, replacing `decimal` with '.'). The result is written into `dst` @@ -1761,38 +1789,59 @@ static inline int str_consume_span(char **dst, size_t dst_sz, const char **src, static int _str_copy_decimal_str_c(char *dst, size_t dst_sz, const char *src, char **endpos, char decimal, char tsep) { - const char *digits = "0123456789"; - const char *exponents = "Ee"; - const char *signs = "+-"; - const char *whitespaces = " \t\n\v\f\r"; + static const char *digits = "0123456789"; + static const char *exponents = "Ee"; + static const char *signs = "+-"; + static const char *whitespaces = " \t\n\v\f\r"; + const char decimals[] = {decimal, '\0'}; const char tseps[] = {tsep, '\0'}; const char *s = src; char *d = dst; - char *de = dst + dst_sz; + const char *de = dst + dst_sz; + int ret; + + if (endpos != NULL) + *endpos = (char *)s; + // Skip leading whitespace. - str_consume_span(NULL, 0, &s, whitespaces); - // Copy Leading sign - str_consume_nspan(&d, de - d, &s, 1, signs); + SKIP_SPAN(s, whitespaces); + + // Copy leading sign (optional) + SAFE_CONSUME_NSPAN(d, de, s, 1, signs); + // Copy integer part dropping `tsep` - while (str_consume_span(&d, de - d, &s, digits)) { - str_consume_nspan(NULL, 0, &s, 1, tseps); + while ((ret = str_consume_span(&d, de - d, &s, digits))) { + if (ret < 0) + return ERROR_OVERFLOW; + SKIP_NSPAN(s, 1, tseps); } + // Replace `decimal` with '.' - if (str_consume_nspan(NULL, 0, &s, 1, decimals)) { + if (SKIP_NSPAN(s, 1, decimals)) { + CHECK_BUFFER_SPACE(d, de); *d++ = '.'; } + // Copy fractional part after decimal (if any) - str_consume_span(&d, de - d, &s, digits); + SAFE_CONSUME_SPAN(d, de, s, digits); + // Copy exponent if any - if (str_consume_nspan(&d, de - d, &s, 1, exponents)) { - str_consume_nspan(&d, de - d, &s, 1, signs); - str_consume_span(&d, de - d, &s, digits); + if ((ret = str_consume_nspan(&d, de - d, &s, 1, exponents)) > 0) { + SAFE_CONSUME_NSPAN(d, de, s, 1, signs); + SAFE_CONSUME_SPAN(d, de, s, digits); + } else if (ret < 0) { + return ERROR_OVERFLOW; } - *d++ = '\0'; // terminate + + // Terminate string + CHECK_BUFFER_SPACE(d, de); + *d++ = '\0'; + if (endpos != NULL) *endpos = (char *)s; + return 0; } From 9de4b2aff21dc5916bee82d918aa0a04c82cd8fa Mon Sep 17 00:00:00 2001 From: heoh Date: Fri, 24 Oct 2025 17:47:19 +0000 Subject: [PATCH 08/12] refactor: Consolidate `_str_copy_decimal_str_c` and `copy_string_without_char` --- pandas/_libs/src/parser/tokenizer.c | 157 ++++++++-------------------- 1 file changed, 41 insertions(+), 116 deletions(-) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 5b0036050fd9f..99f70bf2267e3 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -1800,22 +1800,28 @@ static int _str_copy_decimal_str_c(char *dst, size_t dst_sz, const char *src, const char *s = src; char *d = dst; const char *de = dst + dst_sz; + bool seen_digit = false; int ret; if (endpos != NULL) *endpos = (char *)s; - // Skip leading whitespace. + // Skip leading whitespace SKIP_SPAN(s, whitespaces); // Copy leading sign (optional) SAFE_CONSUME_NSPAN(d, de, s, 1, signs); + // Check that there is a first digit or decimal point. + if (!isdigit_ascii(*s) && *s != decimal) + return ERROR_NO_DIGITS; + // Copy integer part dropping `tsep` while ((ret = str_consume_span(&d, de - d, &s, digits))) { if (ret < 0) return ERROR_OVERFLOW; - SKIP_NSPAN(s, 1, tseps); + seen_digit = true; + SKIP_SPAN(s, tseps); } // Replace `decimal` with '.' @@ -1825,7 +1831,16 @@ static int _str_copy_decimal_str_c(char *dst, size_t dst_sz, const char *src, } // Copy fractional part after decimal (if any) - SAFE_CONSUME_SPAN(d, de, s, digits); + if ((ret = str_consume_span(&d, de - d, &s, digits)) > 0) { + seen_digit = true; + } else if (ret < 0) { + return ERROR_OVERFLOW; + } + + if (!seen_digit) { + // No digits found in integer or fractional part + return ERROR_NO_DIGITS; + } // Copy exponent if any if ((ret = str_consume_nspan(&d, de - d, &s, 1, exponents)) > 0) { @@ -1835,10 +1850,17 @@ static int _str_copy_decimal_str_c(char *dst, size_t dst_sz, const char *src, return ERROR_OVERFLOW; } + // Skip trailing whitespace + SKIP_SPAN(s, whitespaces); + // Terminate string CHECK_BUFFER_SPACE(d, de); *d++ = '\0'; + // Did we use up all the characters? + if (*s) + return ERROR_INVALID_CHARS; + if (endpos != NULL) *endpos = (char *)s; @@ -1903,72 +1925,17 @@ int uint64_conflict(uint_state *self) { return self->seen_uint && (self->seen_sint || self->seen_null); } -/* Copy a string without `char_to_remove` into `output`. - */ -static int copy_string_without_char(char output[PROCESSED_WORD_CAPACITY], - const char *str, size_t str_len, - char char_to_remove) { - const char *left = str; - const char *end_ptr = str + str_len; - size_t bytes_written = 0; - - while (left < end_ptr) { - const size_t remaining_bytes_to_read = end_ptr - left; - const char *right = memchr(left, char_to_remove, remaining_bytes_to_read); - - if (!right) { - // If it doesn't find the char to remove, just copy until EOS. - right = end_ptr; - } - - const size_t chunk_size = right - left; - - if (chunk_size + bytes_written >= PROCESSED_WORD_CAPACITY) { - return -1; - } - memcpy(&output[bytes_written], left, chunk_size); - bytes_written += chunk_size; - - left = right + 1; - } - - output[bytes_written] = '\0'; - return 0; -} - int64_t str_to_int64(const char *p_item, int *error, char tsep) { - const char *p = p_item; - // Skip leading spaces. - while (isspace_ascii(*p)) { - ++p; - } - - // Handle sign. - const bool has_sign = *p == '-' || *p == '+'; - // Handle sign. - const char *digit_start = has_sign ? p + 1 : p; - - // Check that there is a first digit. - if (!isdigit_ascii(*digit_start)) { - // Error... - *error = ERROR_NO_DIGITS; - return 0; - } - char buffer[PROCESSED_WORD_CAPACITY]; - const size_t str_len = strlen(p); - if (tsep != '\0' && memchr(p, tsep, str_len) != NULL) { - const int status = copy_string_without_char(buffer, p, str_len, tsep); - if (status != 0) { - // Word is too big, probably will cause an overflow - *error = ERROR_OVERFLOW; - return 0; - } - p = buffer; + char *endptr; + int status = _str_copy_decimal_str_c(buffer, PROCESSED_WORD_CAPACITY, p_item, + &endptr, '\0', tsep); + if (status != 0) { + *error = status; + return 0; } - char *endptr; - int64_t number = strtoll(p, &endptr, 10); + int64_t number = strtoll(buffer, &endptr, 10); if (errno == ERANGE) { *error = ERROR_OVERFLOW; @@ -1976,59 +1943,28 @@ int64_t str_to_int64(const char *p_item, int *error, char tsep) { return 0; } - // Skip trailing spaces. - while (isspace_ascii(*endptr)) { - ++endptr; - } - - // Did we use up all the characters? - if (*endptr) { - *error = ERROR_INVALID_CHARS; - return 0; - } - *error = 0; return number; } uint64_t str_to_uint64(uint_state *state, const char *p_item, int *error, char tsep) { - const char *p = p_item; - // Skip leading spaces. - while (isspace_ascii(*p)) { - ++p; + char buffer[PROCESSED_WORD_CAPACITY]; + char *endptr; + int status = _str_copy_decimal_str_c(buffer, PROCESSED_WORD_CAPACITY, p_item, + &endptr, '\0', tsep); + if (status != 0) { + *error = status; + return 0; } - // Handle sign. - if (*p == '-') { + if (buffer[0] == '-') { state->seen_sint = 1; *error = 0; return 0; - } else if (*p == '+') { - p++; } - // Check that there is a first digit. - if (!isdigit_ascii(*p)) { - // Error... - *error = ERROR_NO_DIGITS; - return 0; - } - - char buffer[PROCESSED_WORD_CAPACITY]; - const size_t str_len = strlen(p); - if (tsep != '\0' && memchr(p, tsep, str_len) != NULL) { - const int status = copy_string_without_char(buffer, p, str_len, tsep); - if (status != 0) { - // Word is too big, probably will cause an overflow - *error = ERROR_OVERFLOW; - return 0; - } - p = buffer; - } - - char *endptr; - uint64_t number = strtoull(p, &endptr, 10); + uint64_t number = strtoull(buffer, &endptr, 10); if (errno == ERANGE) { *error = ERROR_OVERFLOW; @@ -2036,17 +1972,6 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int *error, return 0; } - // Skip trailing spaces. - while (isspace_ascii(*endptr)) { - ++endptr; - } - - // Did we use up all the characters? - if (*endptr) { - *error = ERROR_INVALID_CHARS; - return 0; - } - if (number > (uint64_t)INT64_MAX) { state->seen_uint = 1; } From 9ff0af16bab0d3977b5d71156295fd1b28a73135 Mon Sep 17 00:00:00 2001 From: heoh Date: Fri, 24 Oct 2025 18:07:25 +0000 Subject: [PATCH 09/12] refactor: Use stack buffer in round_trip function Remove heap allocation (malloc/free) in round_trip by using stack-allocated buffer with PROCESSED_WORD_CAPACITY. This improves performance and simplifies memory management. --- pandas/_libs/src/parser/tokenizer.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 99f70bf2267e3..de3665c7af6b6 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -1871,10 +1871,9 @@ double round_trip(const char *p, char **q, char decimal, char Py_UNUSED(sci), char tsep, int skip_trailing, int *error, int *maybe_int) { // 'normalize' representation to C-locale; replace decimal with '.' and // remove thousands separator. - const size_t length = strlen(p); - char *pc = malloc(length + 1); + char pc[PROCESSED_WORD_CAPACITY]; char *endptr; - _str_copy_decimal_str_c(pc, length + 1, p, &endptr, decimal, tsep); + _str_copy_decimal_str_c(pc, sizeof(pc), p, &endptr, decimal, tsep); // This is called from a nogil block in parsers.pyx // so need to explicitly get GIL before Python calls PyGILState_STATE gstate = PyGILState_Ensure(); @@ -1903,7 +1902,6 @@ double round_trip(const char *p, char **q, char decimal, char Py_UNUSED(sci), PyErr_Clear(); PyGILState_Release(gstate); - free(pc); if (skip_trailing && q != NULL && *q != p) { while (isspace_ascii(**q)) { (*q)++; From ea2c2067e9be9ca3d7b8641cc8ee03c63ff7c0ac Mon Sep 17 00:00:00 2001 From: heoh Date: Fri, 24 Oct 2025 18:23:29 +0000 Subject: [PATCH 10/12] Fix implicit type casting warning --- pandas/_libs/src/parser/tokenizer.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index de3665c7af6b6..aaf5c1f055d80 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -1740,6 +1740,10 @@ static inline int str_consume_nspan(char **dst, size_t dst_sz, const char **src, if (span_sz > src_sz) { span_sz = src_sz; } + // Assuming the span size is within the expected range. + if (span_sz > INT_MAX) { + return -1; + } if (dst) { if (span_sz > dst_sz) { return -1; @@ -1748,7 +1752,7 @@ static inline int str_consume_nspan(char **dst, size_t dst_sz, const char **src, *dst += span_sz; } *src += span_sz; - return span_sz; + return (int)span_sz; } static inline int str_consume_span(char **dst, size_t dst_sz, const char **src, From b48f81b5bc06fee6c5e924b681cbbd924a1a0196 Mon Sep 17 00:00:00 2001 From: heoh Date: Fri, 24 Oct 2025 18:50:32 +0000 Subject: [PATCH 11/12] Parse exponential expressions only when decimal is present --- pandas/_libs/src/parser/tokenizer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 4c4589a19f96a..1988c3246cc5b 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -1831,7 +1831,7 @@ static int _str_copy_decimal_str_c(char *dst, size_t dst_sz, const char *src, } // Copy exponent if any - if ((ret = str_consume_nspan(&d, de - d, &s, 1, exponents)) > 0) { + if (decimal && (ret = str_consume_nspan(&d, de - d, &s, 1, exponents)) > 0) { SAFE_CONSUME_NSPAN(d, de, s, 1, signs); SAFE_CONSUME_SPAN(d, de, s, digits); } else if (ret < 0) { From e7fcb653a4b1d988ee03ed0ecbd265c8981799d8 Mon Sep 17 00:00:00 2001 From: heoh Date: Fri, 24 Oct 2025 18:51:32 +0000 Subject: [PATCH 12/12] Remove duplicate guard conditions --- pandas/_libs/src/parser/tokenizer.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 1988c3246cc5b..27a7d2b4c99ec 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -1800,10 +1800,6 @@ static int _str_copy_decimal_str_c(char *dst, size_t dst_sz, const char *src, // Copy leading sign (optional) SAFE_CONSUME_NSPAN(d, de, s, 1, signs); - // Check that there is a first digit or decimal point. - if (!isdigit_ascii(*s) && *s != decimal) - return ERROR_NO_DIGITS; - // Copy integer part dropping `tsep` while ((ret = str_consume_span(&d, de - d, &s, digits))) { if (ret < 0)