pandas-dev · heoh · Oct 20, 2025 · Oct 20, 2025 · Oct 20, 2025 · Oct 20, 2025
diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
@@ -1716,63 +1716,148 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, char sci,
   return number;
 }
 
+/* Helper functions and macros for string consumption and buffer management */
+
+static inline int str_consume_nspan(char **dst, size_t dst_sz, const char **src,
+                                    size_t src_sz, const char *charset) {
+  size_t span_sz = strspn(*src, charset);
+  if (span_sz > src_sz) {
+    span_sz = src_sz;
+  }
+  // Assuming the span size is within the expected range.
+  if (span_sz > INT_MAX) {
+    return -1;
+  }
+  if (dst) {
+    if (span_sz > dst_sz) {
+      return -1;
+    }
+    memcpy(*dst, *src, span_sz);
+    *dst += span_sz;
+  }
+  *src += span_sz;
+  return (int)span_sz;
+}
+
+static inline int str_consume_span(char **dst, size_t dst_sz, const char **src,
+                                   const char *charset) {
+  return str_consume_nspan(dst, dst_sz, src, SIZE_MAX, charset);
+}
+
+#define SKIP_SPAN(s, charset) str_consume_span(NULL, 0, &(s), (charset))
+
+#define SKIP_NSPAN(s, n, charset)                                              \
+  str_consume_nspan(NULL, 0, &(s), (n), (charset))
+
+#define SAFE_CONSUME_SPAN(d, de, s, charset)                                   \
+  do {                                                                         \
+    size_t _remaining = (de) - (d);                                            \
+    int _ret = str_consume_span(&(d), _remaining, &(s), (charset));            \
+    if (_ret < 0)                                                              \
+      return ERROR_OVERFLOW;                                                   \
+  } while (0)
+
+#define SAFE_CONSUME_NSPAN(d, de, s, n, charset)                               \
+  do {                                                                         \
+    size_t _remaining = (de) - (d);                                            \
+    if (str_consume_nspan(&(d), _remaining, &(s), (n), (charset)) < 0)         \
+      return ERROR_OVERFLOW;                                                   \
+  } while (0)
+
+#define CHECK_BUFFER_SPACE(d, de)                                              \
+  do {                                                                         \
+    if ((d) >= (de))                                                           \
+      return ERROR_OVERFLOW;                                                   \
+  } while (0)
+
 /* copy a decimal number string with `decimal`, `tsep` as decimal point
    and thousands separator to an equivalent c-locale decimal string (striping
-   `tsep`, replacing `decimal` with '.'). The returned memory should be free-d
-   with a call to `free`.
-*/
+   `tsep`, replacing `decimal` with '.'). The result is written into `dst`
+   and null-terminated. */
+
+static int _str_copy_decimal_str_c(char *dst, size_t dst_sz, const char *src,
+                                   char **endpos, char decimal, char tsep) {
+  static const char *digits = "0123456789";
+  static const char *exponents = "Ee";
+  static const char *signs = "+-";
+  static const char *whitespaces = " \t\n\v\f\r";
+
+  const char decimals[] = {decimal, '\0'};
+  const char tseps[] = {tsep, '\0'};
+
+  const char *s = src;
+  char *d = dst;
+  const char *de = dst + dst_sz;
+  bool seen_digit = false;
+  int ret;
+
+  if (endpos != NULL)
+    *endpos = (char *)s;
+
+  // Skip leading whitespace
+  SKIP_SPAN(s, whitespaces);
+
+  // Copy leading sign (optional)
+  SAFE_CONSUME_NSPAN(d, de, s, 1, signs);
 
-static char *_str_copy_decimal_str_c(const char *s, char **endpos, char decimal,
-                                     char tsep) {
-  const char *p = s;
-  const size_t length = strlen(s);
-  char *s_copy = malloc(length + 1);
-  char *dst = s_copy;
-  // Skip leading whitespace.
-  while (isspace_ascii(*p))
-    p++;
-  // Copy Leading sign
-  if (*p == '+' || *p == '-') {
-    *dst++ = *p++;
-  }
   // Copy integer part dropping `tsep`
-  while (isdigit_ascii(*p)) {
-    *dst++ = *p++;
-    p += (tsep != '\0' && *p == tsep);
+  while ((ret = str_consume_span(&d, de - d, &s, digits))) {
+    if (ret < 0)
+      return ERROR_OVERFLOW;
+    seen_digit = true;
+    SKIP_SPAN(s, tseps);
   }
+
   // Replace `decimal` with '.'
-  if (*p == decimal) {
-    *dst++ = '.';
-    p++;
+  if (SKIP_NSPAN(s, 1, decimals)) {
+    CHECK_BUFFER_SPACE(d, de);
+    *d++ = '.';
   }
+
   // Copy fractional part after decimal (if any)
-  while (isdigit_ascii(*p)) {
-    *dst++ = *p++;
+  if ((ret = str_consume_span(&d, de - d, &s, digits)) > 0) {
+    seen_digit = true;
+  } else if (ret < 0) {
+    return ERROR_OVERFLOW;
+  }
+
+  if (!seen_digit) {
+    // No digits found in integer or fractional part
+    return ERROR_NO_DIGITS;
   }
+
   // Copy exponent if any
-  if (toupper_ascii(*p) == toupper_ascii('E')) {
-    *dst++ = *p++;
-    // Copy leading exponent sign (if any)
-    if (*p == '+' || *p == '-') {
-      *dst++ = *p++;
-    }
-    // Copy exponent digits
-    while (isdigit_ascii(*p)) {
-      *dst++ = *p++;
-    }
+  if (decimal && (ret = str_consume_nspan(&d, de - d, &s, 1, exponents)) > 0) {
+    SAFE_CONSUME_NSPAN(d, de, s, 1, signs);
+    SAFE_CONSUME_SPAN(d, de, s, digits);
+  } else if (ret < 0) {
+    return ERROR_OVERFLOW;
   }
-  *dst++ = '\0'; // terminate
+
+  // Skip trailing whitespace
+  SKIP_SPAN(s, whitespaces);
+
+  // Terminate string
+  CHECK_BUFFER_SPACE(d, de);
+  *d++ = '\0';
+
+  // Did we use up all the characters?
+  if (*s)
+    return ERROR_INVALID_CHARS;
+
   if (endpos != NULL)
-    *endpos = (char *)p;
-  return s_copy;
+    *endpos = (char *)s;
+
+  return 0;
 }
 
 double round_trip(const char *p, char **q, char decimal, char Py_UNUSED(sci),
                   char tsep, int skip_trailing, int *error, int *maybe_int) {
   // 'normalize' representation to C-locale; replace decimal with '.' and
   // remove thousands separator.
+  char pc[PROCESSED_WORD_CAPACITY];
   char *endptr;
-  char *pc = _str_copy_decimal_str_c(p, &endptr, decimal, tsep);
+  _str_copy_decimal_str_c(pc, sizeof(pc), p, &endptr, decimal, tsep);
   // This is called from a nogil block in parsers.pyx
   // so need to explicitly get GIL before Python calls
   PyGILState_STATE gstate = PyGILState_Ensure();
@@ -1799,7 +1884,6 @@ double round_trip(const char *p, char **q, char decimal, char Py_UNUSED(sci),
   PyErr_Clear();
 
   PyGILState_Release(gstate);
-  free(pc);
   if (skip_trailing && q != NULL && *q != p) {
     while (isspace_ascii(**q)) {
       (*q)++;
@@ -1821,150 +1905,53 @@ int uint64_conflict(uint_state *self) {
   return self->seen_uint && (self->seen_sint || self->seen_null);
 }
 
-/* Copy a string without `char_to_remove` into `output`.
- */
-static int copy_string_without_char(char output[PROCESSED_WORD_CAPACITY],
-                                    const char *str, size_t str_len,
-                                    char char_to_remove) {
-  const char *left = str;
-  const char *end_ptr = str + str_len;
-  size_t bytes_written = 0;
-
-  while (left < end_ptr) {
-    const size_t remaining_bytes_to_read = end_ptr - left;
-    const char *right = memchr(left, char_to_remove, remaining_bytes_to_read);
-
-    if (!right) {
-      // If it doesn't find the char to remove, just copy until EOS.
-      right = end_ptr;
-    }
-
-    const size_t chunk_size = right - left;
-
-    if (chunk_size + bytes_written >= PROCESSED_WORD_CAPACITY) {
-      return -1;
-    }
-    memcpy(&output[bytes_written], left, chunk_size);
-    bytes_written += chunk_size;
-
-    left = right + 1;
-  }
-
-  output[bytes_written] = '\0';
-  return 0;
-}
-
 int64_t str_to_int64(const char *p_item, int *error, char tsep) {
-  const char *p = p_item;
-  // Skip leading spaces.
-  while (isspace_ascii(*p)) {
-    ++p;
-  }
-
-  // Handle sign.
-  const bool has_sign = *p == '-' || *p == '+';
-  // Handle sign.
-  const char *digit_start = has_sign ? p + 1 : p;
-
-  // Check that there is a first digit.
-  if (!isdigit_ascii(*digit_start)) {
-    // Error...
-    *error = ERROR_NO_DIGITS;
-    return 0;
-  }
-
   char buffer[PROCESSED_WORD_CAPACITY];
-  const size_t str_len = strlen(p);
-  if (tsep != '\0' && memchr(p, tsep, str_len) != NULL) {
-    const int status = copy_string_without_char(buffer, p, str_len, tsep);
-    if (status != 0) {
-      // Word is too big, probably will cause an overflow
-      *error = ERROR_OVERFLOW;
-      return 0;
-    }
-    p = buffer;
+  char *endptr;
+  int status = _str_copy_decimal_str_c(buffer, PROCESSED_WORD_CAPACITY, p_item,
+                                       &endptr, '\0', tsep);
+  if (status != 0) {
+    *error = status;
+    return 0;
   }
 
-  char *endptr;
-  int64_t number = strtoll(p, &endptr, 10);
+  int64_t number = strtoll(buffer, &endptr, 10);
 
   if (errno == ERANGE) {
     *error = *endptr ? ERROR_INVALID_CHARS : ERROR_OVERFLOW;
     errno = 0;
     return 0;
   }
 
-  // Skip trailing spaces.
-  while (isspace_ascii(*endptr)) {
-    ++endptr;
-  }
-
-  // Did we use up all the characters?
-  if (*endptr) {
-    *error = ERROR_INVALID_CHARS;
-    return 0;
-  }
-
   *error = 0;
   return number;
 }
 
 uint64_t str_to_uint64(uint_state *state, const char *p_item, int *error,
                        char tsep) {
-  const char *p = p_item;
-  // Skip leading spaces.
-  while (isspace_ascii(*p)) {
-    ++p;
+  char buffer[PROCESSED_WORD_CAPACITY];
+  char *endptr;
+  int status = _str_copy_decimal_str_c(buffer, PROCESSED_WORD_CAPACITY, p_item,
+                                       &endptr, '\0', tsep);
+  if (status != 0) {
+    *error = status;
+    return 0;
   }
 
-  // Handle sign.
-  if (*p == '-') {
+  if (buffer[0] == '-') {
     state->seen_sint = 1;
     *error = 0;
     return 0;
-  } else if (*p == '+') {
-    p++;
-  }
-
-  // Check that there is a first digit.
-  if (!isdigit_ascii(*p)) {
-    // Error...
-    *error = ERROR_NO_DIGITS;
-    return 0;
-  }
-
-  char buffer[PROCESSED_WORD_CAPACITY];
-  const size_t str_len = strlen(p);
-  if (tsep != '\0' && memchr(p, tsep, str_len) != NULL) {
-    const int status = copy_string_without_char(buffer, p, str_len, tsep);
-    if (status != 0) {
-      // Word is too big, probably will cause an overflow
-      *error = ERROR_OVERFLOW;
-      return 0;
-    }
-    p = buffer;
   }
 
-  char *endptr;
-  uint64_t number = strtoull(p, &endptr, 10);
+  uint64_t number = strtoull(buffer, &endptr, 10);
 
   if (errno == ERANGE) {
     *error = *endptr ? ERROR_INVALID_CHARS : ERROR_OVERFLOW;
     errno = 0;
     return 0;
   }
 
-  // Skip trailing spaces.
-  while (isspace_ascii(*endptr)) {
-    ++endptr;
-  }
-
-  // Did we use up all the characters?
-  if (*endptr) {
-    *error = ERROR_INVALID_CHARS;
-    return 0;
-  }
-
   if (number > (uint64_t)INT64_MAX) {
     state->seen_uint = 1;
   }