From c3cc4a1be02d4514988f9a8473ba5ac18db161ac Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Kothe?= <kothe65@gmail.com>
Date: Sat, 11 Oct 2025 12:09:44 -0300
Subject: [PATCH 01/65] refactor(parser): use integer parsing functions from
 stdlib

---
 .../_libs/include/pandas/parser/tokenizer.h   |   1 +
 pandas/_libs/parsers.pyx                      |   6 +-
 pandas/_libs/src/parser/tokenizer.c           | 282 ++++++++----------
 3 files changed, 136 insertions(+), 153 deletions(-)

diff --git a/pandas/_libs/include/pandas/parser/tokenizer.h b/pandas/_libs/include/pandas/parser/tokenizer.h
index 209f375a5bf6c..0d6bb6df3d123 100644
--- a/pandas/_libs/include/pandas/parser/tokenizer.h
+++ b/pandas/_libs/include/pandas/parser/tokenizer.h
@@ -17,6 +17,7 @@ See LICENSE for the license
 #define ERROR_NO_DIGITS 1
 #define ERROR_OVERFLOW 2
 #define ERROR_INVALID_CHARS 3
+#define ERROR_NO_MEMORY 4
 
 #include <stdint.h>
 
diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
index 442891949dfd2..0c93d1c565f1c 100644
--- a/pandas/_libs/parsers.pyx
+++ b/pandas/_libs/parsers.pyx
@@ -149,7 +149,7 @@ cdef extern from "pandas/parser/tokenizer.h":
         SKIP_LINE
         FINISHED
 
-    enum: ERROR_OVERFLOW
+    enum: ERROR_OVERFLOW, ERROR_NO_MEMORY
 
     ctypedef enum BadLineHandleMethod:
         ERROR,
@@ -1822,6 +1822,8 @@ cdef _try_uint64(parser_t *parser, int64_t col,
         if error == ERROR_OVERFLOW:
             # Can't get the word variable
             raise OverflowError("Overflow")
+        if error == ERROR_NO_MEMORY:
+            raise MemoryError()
         return None
 
     if uint64_conflict(&state):
@@ -1892,6 +1894,8 @@ cdef _try_int64(parser_t *parser, int64_t col,
         if error == ERROR_OVERFLOW:
             # Can't get the word variable
             raise OverflowError("Overflow")
+        if error == ERROR_NO_MEMORY:
+            raise MemoryError()
         return None, None
 
     return result, na_count
diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
index 61e96fc835e4d..8a3dd7986c5e8 100644
--- a/pandas/_libs/src/parser/tokenizer.c
+++ b/pandas/_libs/src/parser/tokenizer.c
@@ -23,6 +23,7 @@ GitHub. See Python Software Foundation License and BSD licenses for these.
 #include <float.h>
 #include <math.h>
 #include <stdbool.h>
+#include <stdlib.h>
 
 #include "pandas/portable.h"
 #include "pandas/vendored/klib/khash.h" // for kh_int64_t, kh_destroy_int64
@@ -1834,201 +1835,178 @@ int uint64_conflict(uint_state *self) {
   return self->seen_uint && (self->seen_sint || self->seen_null);
 }
 
+/**
+ * @brief Check if the character in the pointer indicates a number.
+ * It expects that you consumed all leading whitespace.
+ *
+ * @param p_item Pointer to verify
+ * @return Non-zero integer indicating that has a digit 0 otherwise.
+ */
+static inline int has_digit_int(const char *str) {
+  if (!str || *str == '\0') {
+    return 0;
+  }
+
+  switch (*str) {
+  case '0':
+  case '1':
+  case '2':
+  case '3':
+  case '4':
+  case '5':
+  case '6':
+  case '7':
+  case '8':
+  case '9':
+    return 1;
+  case '+':
+  case '-':
+    return isdigit_ascii(str[1]);
+  default:
+    return 0;
+  }
+}
+
+static inline int has_only_spaces(const char *str) {
+  while (*str != '\0' && isspace_ascii(*str)) {
+    str++;
+  }
+  return *str == '\0';
+}
+
+/* Copy a string without `char_to_remove`.
+ * The returned memory should be free-d with a call to `free`.
+ */
+static char *copy_string_without_char(const char *str, char char_to_remove) {
+  size_t chars_to_copy = 0;
+  for (const char *src = str; *src != '\0'; src++) {
+    if (*src != char_to_remove) {
+      chars_to_copy++;
+    }
+  }
+
+  char *start = malloc((chars_to_copy + 1) * sizeof(char));
+  if (!start) {
+    return NULL;
+  }
+
+  char *dst = start;
+  for (const char *src = str; *src != '\0'; src++) {
+    if (*src != char_to_remove) {
+      *dst++ = *src;
+    }
+  }
+  *dst = '\0';
+
+  return start;
+}
+
 int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
                      int *error, char tsep) {
-  const char *p = p_item;
-  // Skip leading spaces.
-  while (isspace_ascii(*p)) {
-    ++p;
+  if (!p_item || *p_item == '\0') {
+    *error = ERROR_NO_DIGITS;
+    return 0;
   }
 
-  // Handle sign.
-  const bool isneg = *p == '-' ? true : false;
-  // Handle sign.
-  if (isneg || (*p == '+')) {
-    p++;
+  while (isspace_ascii(*p_item)) {
+    ++p_item;
   }
 
-  // Check that there is a first digit.
-  if (!isdigit_ascii(*p)) {
-    // Error...
+  if (!has_digit_int(p_item)) {
     *error = ERROR_NO_DIGITS;
     return 0;
   }
 
-  int64_t number = 0;
-  if (isneg) {
-    // If number is greater than pre_min, at least one more digit
-    // can be processed without overflowing.
-    int dig_pre_min = -(int_min % 10);
-    int64_t pre_min = int_min / 10;
-
-    // Process the digits.
-    char d = *p;
-    if (tsep != '\0') {
-      while (1) {
-        if (d == tsep) {
-          d = *++p;
-          continue;
-        } else if (!isdigit_ascii(d)) {
-          break;
-        }
-        if ((number > pre_min) ||
-            ((number == pre_min) && (d - '0' <= dig_pre_min))) {
-          number = number * 10 - (d - '0');
-          d = *++p;
-        } else {
-          *error = ERROR_OVERFLOW;
-          return 0;
-        }
-      }
-    } else {
-      while (isdigit_ascii(d)) {
-        if ((number > pre_min) ||
-            ((number == pre_min) && (d - '0' <= dig_pre_min))) {
-          number = number * 10 - (d - '0');
-          d = *++p;
-        } else {
-          *error = ERROR_OVERFLOW;
-          return 0;
-        }
-      }
-    }
-  } else {
-    // If number is less than pre_max, at least one more digit
-    // can be processed without overflowing.
-    int64_t pre_max = int_max / 10;
-    int dig_pre_max = int_max % 10;
-
-    // Process the digits.
-    char d = *p;
-    if (tsep != '\0') {
-      while (1) {
-        if (d == tsep) {
-          d = *++p;
-          continue;
-        } else if (!isdigit_ascii(d)) {
-          break;
-        }
-        if ((number < pre_max) ||
-            ((number == pre_max) && (d - '0' <= dig_pre_max))) {
-          number = number * 10 + (d - '0');
-          d = *++p;
-
-        } else {
-          *error = ERROR_OVERFLOW;
-          return 0;
-        }
-      }
-    } else {
-      while (isdigit_ascii(d)) {
-        if ((number < pre_max) ||
-            ((number == pre_max) && (d - '0' <= dig_pre_max))) {
-          number = number * 10 + (d - '0');
-          d = *++p;
+  char *processed_str = NULL;
 
-        } else {
-          *error = ERROR_OVERFLOW;
-          return 0;
-        }
-      }
+  if (tsep != '\0' && strchr(p_item, tsep) != NULL) {
+    processed_str = copy_string_without_char(p_item, tsep);
+    if (!processed_str) {
+      *error = ERROR_NO_MEMORY;
+      return 0;
     }
+    p_item = processed_str;
   }
 
-  // Skip trailing spaces.
-  while (isspace_ascii(*p)) {
-    ++p;
-  }
+  char *endptr = NULL;
+  errno = 0;
+  int64_t result = strtoll(p_item, &endptr, 10);
 
-  // Did we use up all the characters?
-  if (*p) {
+  if (!has_only_spaces(endptr)) {
+    // Check first for invalid characters because we may
+    // want to skip integer parsing if we find one.
     *error = ERROR_INVALID_CHARS;
-    return 0;
+    result = 0;
+  } else if (errno == ERANGE || result > int_max || result < int_min) {
+    *error = ERROR_OVERFLOW;
+    result = 0;
+  } else {
+    *error = 0;
   }
 
-  *error = 0;
-  return number;
+  // free processed_str that
+  // was either allocated due to the presence of tsep
+  // or is NULL
+  free(processed_str);
+
+  return result;
 }
 
 uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
                        uint64_t uint_max, int *error, char tsep) {
-  const char *p = p_item;
-  // Skip leading spaces.
-  while (isspace_ascii(*p)) {
-    ++p;
+  if (!p_item || *p_item == '\0') {
+    *error = ERROR_NO_DIGITS;
+    return 0;
   }
 
-  // Handle sign.
-  if (*p == '-') {
+  while (isspace_ascii(*p_item)) {
+    ++p_item;
+  }
+
+  if (*p_item == '-') {
     state->seen_sint = 1;
     *error = 0;
     return 0;
-  } else if (*p == '+') {
-    p++;
+  } else if (*p_item == '+') {
+    p_item++;
   }
 
   // Check that there is a first digit.
-  if (!isdigit_ascii(*p)) {
-    // Error...
+  if (!isdigit_ascii(*p_item)) {
     *error = ERROR_NO_DIGITS;
     return 0;
   }
 
-  // If number is less than pre_max, at least one more digit
-  // can be processed without overflowing.
-  //
-  // Process the digits.
-  uint64_t number = 0;
-  const uint64_t pre_max = uint_max / 10;
-  const uint64_t dig_pre_max = uint_max % 10;
-  char d = *p;
-  if (tsep != '\0') {
-    while (1) {
-      if (d == tsep) {
-        d = *++p;
-        continue;
-      } else if (!isdigit_ascii(d)) {
-        break;
-      }
-      if ((number < pre_max) ||
-          ((number == pre_max) && ((uint64_t)(d - '0') <= dig_pre_max))) {
-        number = number * 10 + (d - '0');
-        d = *++p;
+  char *processed_str = NULL;
 
-      } else {
-        *error = ERROR_OVERFLOW;
-        return 0;
-      }
-    }
-  } else {
-    while (isdigit_ascii(d)) {
-      if ((number < pre_max) ||
-          ((number == pre_max) && ((uint64_t)(d - '0') <= dig_pre_max))) {
-        number = number * 10 + (d - '0');
-        d = *++p;
-
-      } else {
-        *error = ERROR_OVERFLOW;
-        return 0;
-      }
+  if (tsep != '\0' && strchr(p_item, tsep) != NULL) {
+    processed_str = copy_string_without_char(p_item, tsep);
+    if (!processed_str) {
+      *error = ERROR_NO_MEMORY;
+      return 0;
     }
+    p_item = processed_str;
   }
 
-  // Skip trailing spaces.
-  while (isspace_ascii(*p)) {
-    ++p;
-  }
+  errno = 0;
+  char *endptr = NULL;
+  uint64_t result = strtoull(p_item, &endptr, 10);
 
-  // Did we use up all the characters?
-  if (*p) {
+  if (!has_only_spaces(endptr)) {
     *error = ERROR_INVALID_CHARS;
-    return 0;
+    result = 0;
+  } else if (errno == ERANGE || result > uint_max) {
+    *error = ERROR_OVERFLOW;
+    result = 0;
+  } else {
+    *error = 0;
   }
 
-  if (number > (uint64_t)int_max) {
+  if (result > (uint64_t)int_max) {
     state->seen_uint = 1;
   }
 
-  *error = 0;
-  return number;
+  free(processed_str);
+
+  return result;
 }

From 24593130034e4747ecea9291e83130ef6854ae74 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Kothe?= <kothe65@gmail.com>
Date: Mon, 13 Oct 2025 15:22:22 -0300
Subject: [PATCH 02/65] perf: use a local buffer to store the processed string

---
 .../_libs/include/pandas/parser/tokenizer.h   |  1 -
 pandas/_libs/parsers.pyx                      |  6 +-
 pandas/_libs/src/parser/tokenizer.c           | 77 ++++++++-----------
 3 files changed, 33 insertions(+), 51 deletions(-)

diff --git a/pandas/_libs/include/pandas/parser/tokenizer.h b/pandas/_libs/include/pandas/parser/tokenizer.h
index 0d6bb6df3d123..209f375a5bf6c 100644
--- a/pandas/_libs/include/pandas/parser/tokenizer.h
+++ b/pandas/_libs/include/pandas/parser/tokenizer.h
@@ -17,7 +17,6 @@ See LICENSE for the license
 #define ERROR_NO_DIGITS 1
 #define ERROR_OVERFLOW 2
 #define ERROR_INVALID_CHARS 3
-#define ERROR_NO_MEMORY 4
 
 #include <stdint.h>
 
diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
index 0c93d1c565f1c..442891949dfd2 100644
--- a/pandas/_libs/parsers.pyx
+++ b/pandas/_libs/parsers.pyx
@@ -149,7 +149,7 @@ cdef extern from "pandas/parser/tokenizer.h":
         SKIP_LINE
         FINISHED
 
-    enum: ERROR_OVERFLOW, ERROR_NO_MEMORY
+    enum: ERROR_OVERFLOW
 
     ctypedef enum BadLineHandleMethod:
         ERROR,
@@ -1822,8 +1822,6 @@ cdef _try_uint64(parser_t *parser, int64_t col,
         if error == ERROR_OVERFLOW:
             # Can't get the word variable
             raise OverflowError("Overflow")
-        if error == ERROR_NO_MEMORY:
-            raise MemoryError()
         return None
 
     if uint64_conflict(&state):
@@ -1894,8 +1892,6 @@ cdef _try_int64(parser_t *parser, int64_t col,
         if error == ERROR_OVERFLOW:
             # Can't get the word variable
             raise OverflowError("Overflow")
-        if error == ERROR_NO_MEMORY:
-            raise MemoryError()
         return None, None
 
     return result, na_count
diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
index 8a3dd7986c5e8..69c1c141e09cd 100644
--- a/pandas/_libs/src/parser/tokenizer.c
+++ b/pandas/_libs/src/parser/tokenizer.c
@@ -28,6 +28,8 @@ GitHub. See Python Software Foundation License and BSD licenses for these.
 #include "pandas/portable.h"
 #include "pandas/vendored/klib/khash.h" // for kh_int64_t, kh_destroy_int64
 
+static const int PROCESSED_WORD_CAPACITY = 128;
+
 void coliter_setup(coliter_t *self, parser_t *parser, int64_t i,
                    int64_t start) {
   // column i, starting at 0
@@ -1874,31 +1876,23 @@ static inline int has_only_spaces(const char *str) {
   return *str == '\0';
 }
 
-/* Copy a string without `char_to_remove`.
- * The returned memory should be free-d with a call to `free`.
+/* Copy a string without `char_to_remove` into `output`,
+ * while ensuring it's null terminated.
  */
-static char *copy_string_without_char(const char *str, char char_to_remove) {
-  size_t chars_to_copy = 0;
-  for (const char *src = str; *src != '\0'; src++) {
+static void copy_string_without_char(char *output, const char *str,
+                                     char char_to_remove, size_t output_size) {
+  size_t i = 0;
+  for (const char *src = str; *src != '\0' && i < output_size; src++) {
     if (*src != char_to_remove) {
-      chars_to_copy++;
+      output[i++] = *src;
     }
   }
-
-  char *start = malloc((chars_to_copy + 1) * sizeof(char));
-  if (!start) {
-    return NULL;
-  }
-
-  char *dst = start;
-  for (const char *src = str; *src != '\0'; src++) {
-    if (*src != char_to_remove) {
-      *dst++ = *src;
-    }
+  if (i < output_size) {
+    output[i] = '\0';
+  } else {
+    // str is too big, probably would overflow
+    errno = ERANGE;
   }
-  *dst = '\0';
-
-  return start;
 }
 
 int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
@@ -1917,19 +1911,19 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
     return 0;
   }
 
-  char *processed_str = NULL;
-
+  errno = 0;
   if (tsep != '\0' && strchr(p_item, tsep) != NULL) {
-    processed_str = copy_string_without_char(p_item, tsep);
-    if (!processed_str) {
-      *error = ERROR_NO_MEMORY;
-      return 0;
-    }
-    p_item = processed_str;
+    char buffer[PROCESSED_WORD_CAPACITY];
+    copy_string_without_char(buffer, p_item, tsep, PROCESSED_WORD_CAPACITY);
+    p_item = buffer;
+  }
+
+  if (errno == ERANGE) {
+    *error = ERROR_OVERFLOW;
+    return 0;
   }
 
   char *endptr = NULL;
-  errno = 0;
   int64_t result = strtoll(p_item, &endptr, 10);
 
   if (!has_only_spaces(endptr)) {
@@ -1944,11 +1938,6 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
     *error = 0;
   }
 
-  // free processed_str that
-  // was either allocated due to the presence of tsep
-  // or is NULL
-  free(processed_str);
-
   return result;
 }
 
@@ -1977,18 +1966,18 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
     return 0;
   }
 
-  char *processed_str = NULL;
-
+  errno = 0;
   if (tsep != '\0' && strchr(p_item, tsep) != NULL) {
-    processed_str = copy_string_without_char(p_item, tsep);
-    if (!processed_str) {
-      *error = ERROR_NO_MEMORY;
-      return 0;
-    }
-    p_item = processed_str;
+    char buffer[PROCESSED_WORD_CAPACITY];
+    copy_string_without_char(buffer, p_item, tsep, PROCESSED_WORD_CAPACITY);
+    p_item = buffer;
+  }
+
+  if (errno == ERANGE) {
+    *error = ERROR_OVERFLOW;
+    return 0;
   }
 
-  errno = 0;
   char *endptr = NULL;
   uint64_t result = strtoull(p_item, &endptr, 10);
 
@@ -2006,7 +1995,5 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
     state->seen_uint = 1;
   }
 
-  free(processed_str);
-
   return result;
 }

From d8a454e7d21153cf2ef46dc095d7fe18b014b71d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Kothe?= <kothe65@gmail.com>
Date: Mon, 13 Oct 2025 15:51:13 -0300
Subject: [PATCH 03/65] fix: use macro to fix MSVC build error

---
 pandas/_libs/src/parser/tokenizer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
index 69c1c141e09cd..08805d7146549 100644
--- a/pandas/_libs/src/parser/tokenizer.c
+++ b/pandas/_libs/src/parser/tokenizer.c
@@ -28,7 +28,7 @@ GitHub. See Python Software Foundation License and BSD licenses for these.
 #include "pandas/portable.h"
 #include "pandas/vendored/klib/khash.h" // for kh_int64_t, kh_destroy_int64
 
-static const int PROCESSED_WORD_CAPACITY = 128;
+#define PROCESSED_WORD_CAPACITY 128
 
 void coliter_setup(coliter_t *self, parser_t *parser, int64_t i,
                    int64_t start) {

From 87789e6483b3e9f4cd037ad7f229d0dd54d68654 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Kothe?= <kothe65@gmail.com>
Date: Mon, 13 Oct 2025 16:11:45 -0300
Subject: [PATCH 04/65] fix: use `bool`

---
 pandas/_libs/src/parser/tokenizer.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
index 08805d7146549..9ef3c0c5c2197 100644
--- a/pandas/_libs/src/parser/tokenizer.c
+++ b/pandas/_libs/src/parser/tokenizer.c
@@ -1844,9 +1844,9 @@ int uint64_conflict(uint_state *self) {
  * @param p_item Pointer to verify
  * @return Non-zero integer indicating that has a digit 0 otherwise.
  */
-static inline int has_digit_int(const char *str) {
+static inline bool has_digit_int(const char *str) {
   if (!str || *str == '\0') {
-    return 0;
+    return false;
   }
 
   switch (*str) {
@@ -1860,16 +1860,16 @@ static inline int has_digit_int(const char *str) {
   case '7':
   case '8':
   case '9':
-    return 1;
+    return true;
   case '+':
   case '-':
     return isdigit_ascii(str[1]);
   default:
-    return 0;
+    return false;
   }
 }
 
-static inline int has_only_spaces(const char *str) {
+static inline bool has_only_spaces(const char *str) {
   while (*str != '\0' && isspace_ascii(*str)) {
     str++;
   }

From 228794473ed04330ea8cf71c9b96603b9f2275ff Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Kothe?= <kothe65@gmail.com>
Date: Mon, 13 Oct 2025 16:14:40 -0300
Subject: [PATCH 05/65] refactor: don't pass PROCESSED_WORD_CAPACITY as a
 separate argument

---
 pandas/_libs/src/parser/tokenizer.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
index 9ef3c0c5c2197..f94e97f51db83 100644
--- a/pandas/_libs/src/parser/tokenizer.c
+++ b/pandas/_libs/src/parser/tokenizer.c
@@ -1879,15 +1879,16 @@ static inline bool has_only_spaces(const char *str) {
 /* Copy a string without `char_to_remove` into `output`,
  * while ensuring it's null terminated.
  */
-static void copy_string_without_char(char *output, const char *str,
-                                     char char_to_remove, size_t output_size) {
+static void copy_string_without_char(char output[PROCESSED_WORD_CAPACITY],
+                                     const char *str, char char_to_remove) {
   size_t i = 0;
-  for (const char *src = str; *src != '\0' && i < output_size; src++) {
+  for (const char *src = str; *src != '\0' && i < PROCESSED_WORD_CAPACITY;
+       src++) {
     if (*src != char_to_remove) {
       output[i++] = *src;
     }
   }
-  if (i < output_size) {
+  if (i < PROCESSED_WORD_CAPACITY) {
     output[i] = '\0';
   } else {
     // str is too big, probably would overflow
@@ -1914,7 +1915,7 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
   errno = 0;
   if (tsep != '\0' && strchr(p_item, tsep) != NULL) {
     char buffer[PROCESSED_WORD_CAPACITY];
-    copy_string_without_char(buffer, p_item, tsep, PROCESSED_WORD_CAPACITY);
+    copy_string_without_char(buffer, p_item, tsep);
     p_item = buffer;
   }
 
@@ -1969,7 +1970,7 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
   errno = 0;
   if (tsep != '\0' && strchr(p_item, tsep) != NULL) {
     char buffer[PROCESSED_WORD_CAPACITY];
-    copy_string_without_char(buffer, p_item, tsep, PROCESSED_WORD_CAPACITY);
+    copy_string_without_char(buffer, p_item, tsep);
     p_item = buffer;
   }
 

From 2bea3c228d13c426f20cb9c40e4c97a8b4b7deec Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Kothe?= <kothe65@gmail.com>
Date: Mon, 13 Oct 2025 17:24:43 -0300
Subject: [PATCH 06/65] perf: write in chunks

---
 pandas/_libs/src/parser/tokenizer.c | 41 ++++++++++++++++++++---------
 1 file changed, 29 insertions(+), 12 deletions(-)

diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
index f94e97f51db83..77b97d7b421e8 100644
--- a/pandas/_libs/src/parser/tokenizer.c
+++ b/pandas/_libs/src/parser/tokenizer.c
@@ -1877,22 +1877,37 @@ static inline bool has_only_spaces(const char *str) {
 }
 
 /* Copy a string without `char_to_remove` into `output`,
- * while ensuring it's null terminated.
+ * it assumes that output is filled with `\0`,
+ * so it won't null terminate the result.
  */
 static void copy_string_without_char(char output[PROCESSED_WORD_CAPACITY],
                                      const char *str, char char_to_remove) {
-  size_t i = 0;
-  for (const char *src = str; *src != '\0' && i < PROCESSED_WORD_CAPACITY;
-       src++) {
-    if (*src != char_to_remove) {
-      output[i++] = *src;
+  char *dst = output;
+  const char *src = str;
+  // last character is reserved for null terminator.
+  const char *end = output + PROCESSED_WORD_CAPACITY - 1;
+
+  while (*src != '\0' && dst < end) {
+    const char *next = src;
+    // find EOS or char_to_remove
+    while (*next != '\0' && *next != char_to_remove) {
+      next++;
     }
-  }
-  if (i < PROCESSED_WORD_CAPACITY) {
-    output[i] = '\0';
-  } else {
-    // str is too big, probably would overflow
-    errno = ERANGE;
+
+    size_t len = next - src;
+    if (dst + len > end) {
+      // Can't write here, str is too big
+      errno = ERANGE;
+      return;
+    }
+
+    // copy block
+    memcpy(dst, src, len);
+
+    // go to next available location to write
+    dst += len;
+    // Move past char to remove
+    src = *next == char_to_remove ? next + 1 : next;
   }
 }
 
@@ -1915,6 +1930,7 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
   errno = 0;
   if (tsep != '\0' && strchr(p_item, tsep) != NULL) {
     char buffer[PROCESSED_WORD_CAPACITY];
+    memset(buffer, '\0', sizeof(buffer));
     copy_string_without_char(buffer, p_item, tsep);
     p_item = buffer;
   }
@@ -1970,6 +1986,7 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
   errno = 0;
   if (tsep != '\0' && strchr(p_item, tsep) != NULL) {
     char buffer[PROCESSED_WORD_CAPACITY];
+    memset(buffer, '\0', sizeof(buffer));
     copy_string_without_char(buffer, p_item, tsep);
     p_item = buffer;
   }

From fb386798e5b9c3848b8cb4981fbfa3f32efdccec Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Kothe?= <kothe65@gmail.com>
Date: Mon, 13 Oct 2025 17:40:28 -0300
Subject: [PATCH 07/65] hack: try bigger buffer size for arm error

---
 pandas/_libs/src/parser/tokenizer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
index 77b97d7b421e8..cab072269fc46 100644
--- a/pandas/_libs/src/parser/tokenizer.c
+++ b/pandas/_libs/src/parser/tokenizer.c
@@ -28,7 +28,7 @@ GitHub. See Python Software Foundation License and BSD licenses for these.
 #include "pandas/portable.h"
 #include "pandas/vendored/klib/khash.h" // for kh_int64_t, kh_destroy_int64
 
-#define PROCESSED_WORD_CAPACITY 128
+#define PROCESSED_WORD_CAPACITY 256
 
 void coliter_setup(coliter_t *self, parser_t *parser, int64_t i,
                    int64_t start) {

From f9ede5c979a73b0af78609c8036234ec9498d50f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Kothe?= <kothe65@gmail.com>
Date: Mon, 13 Oct 2025 20:09:06 -0300
Subject: [PATCH 08/65] fix: solution without manipulating the string

---
 pandas/_libs/src/parser/tokenizer.c | 119 ++++++++++++++++------------
 1 file changed, 68 insertions(+), 51 deletions(-)

diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
index cab072269fc46..0475f1aec7938 100644
--- a/pandas/_libs/src/parser/tokenizer.c
+++ b/pandas/_libs/src/parser/tokenizer.c
@@ -21,6 +21,7 @@ GitHub. See Python Software Foundation License and BSD licenses for these.
 
 #include <ctype.h>
 #include <float.h>
+#include <limits.h>
 #include <math.h>
 #include <stdbool.h>
 #include <stdlib.h>
@@ -1876,39 +1877,51 @@ static inline bool has_only_spaces(const char *str) {
   return *str == '\0';
 }
 
-/* Copy a string without `char_to_remove` into `output`,
- * it assumes that output is filled with `\0`,
- * so it won't null terminate the result.
- */
-static void copy_string_without_char(char output[PROCESSED_WORD_CAPACITY],
-                                     const char *str, char char_to_remove) {
-  char *dst = output;
-  const char *src = str;
-  // last character is reserved for null terminator.
-  const char *end = output + PROCESSED_WORD_CAPACITY - 1;
-
-  while (*src != '\0' && dst < end) {
-    const char *next = src;
-    // find EOS or char_to_remove
-    while (*next != '\0' && *next != char_to_remove) {
-      next++;
+static inline int power_int(int base, int exponent) {
+  // https://en.wikipedia.org/wiki/Exponentiation_by_squaring
+  int result = 1;
+
+  while (exponent > 1) {
+    if (exponent % 2 == 1) {
+      result *= base;
+      exponent--;
     }
+    result *= result;
+    exponent /= 2;
+  }
+
+  return result * base;
+}
 
-    size_t len = next - src;
-    if (dst + len > end) {
-      // Can't write here, str is too big
+static inline int64_t add_int_check_overflow(int64_t lhs, int64_t rhs,
+                                             int64_t mul_lhs) {
+  // rhs will always be positive, because this function
+  // only executes after the first parse, hence the sign will always go to lhs.
+  // if lhs > 0:
+  // Will overflow if (mul_lhs * lhs) + rhs > INT_MAX
+  // iff lhs > (INT_MAX - rhs) / mul_lhs
+  // if lhs < 0:
+  // Will underflow if (mul_lhs * lhs) - rhs < INT_MIN
+  // iff lhs < (INT_MIN + rhs) / mul_lhs
+  if (lhs >= 0) {
+    if (lhs > (INT_MAX - rhs) / mul_lhs) {
       errno = ERANGE;
-      return;
     }
+  } else {
+    if (lhs < (INT_MIN + rhs) / mul_lhs) {
+      errno = ERANGE;
+    }
+    rhs = -rhs;
+  }
+  return lhs * mul_lhs + rhs;
+}
 
-    // copy block
-    memcpy(dst, src, len);
-
-    // go to next available location to write
-    dst += len;
-    // Move past char to remove
-    src = *next == char_to_remove ? next + 1 : next;
+static inline uint64_t add_uint_check_overflow(uint64_t lhs, uint64_t rhs,
+                                               uint64_t mul_lhs) {
+  if (lhs > (UINT_MAX - rhs) / mul_lhs) {
+    errno = ERANGE;
   }
+  return lhs * mul_lhs + rhs;
 }
 
 int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
@@ -1928,21 +1941,23 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
   }
 
   errno = 0;
-  if (tsep != '\0' && strchr(p_item, tsep) != NULL) {
-    char buffer[PROCESSED_WORD_CAPACITY];
-    memset(buffer, '\0', sizeof(buffer));
-    copy_string_without_char(buffer, p_item, tsep);
-    p_item = buffer;
-  }
-
-  if (errno == ERANGE) {
-    *error = ERROR_OVERFLOW;
-    return 0;
-  }
-
   char *endptr = NULL;
   int64_t result = strtoll(p_item, &endptr, 10);
 
+  while (errno == 0 && tsep != '\0' && *endptr == tsep) {
+    // Skip multiple consecutive tsep
+    while (*endptr == tsep) {
+      endptr++;
+    }
+
+    char *new_end = NULL;
+    int64_t next_part = strtoll(endptr, &new_end, 10);
+    int digits = new_end - endptr;
+    int mul_result = power_int(10, digits);
+    result = add_int_check_overflow(result, next_part, mul_result);
+    endptr = new_end;
+  }
+
   if (!has_only_spaces(endptr)) {
     // Check first for invalid characters because we may
     // want to skip integer parsing if we find one.
@@ -1984,21 +1999,23 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
   }
 
   errno = 0;
-  if (tsep != '\0' && strchr(p_item, tsep) != NULL) {
-    char buffer[PROCESSED_WORD_CAPACITY];
-    memset(buffer, '\0', sizeof(buffer));
-    copy_string_without_char(buffer, p_item, tsep);
-    p_item = buffer;
-  }
-
-  if (errno == ERANGE) {
-    *error = ERROR_OVERFLOW;
-    return 0;
-  }
-
   char *endptr = NULL;
   uint64_t result = strtoull(p_item, &endptr, 10);
 
+  while (errno == 0 && tsep != '\0' && *endptr == tsep) {
+    // Skip multiple consecutive tsep
+    while (*endptr == tsep) {
+      endptr++;
+    }
+
+    char *new_end = NULL;
+    uint64_t next_part = strtoull(endptr, &new_end, 10);
+    int digits = new_end - endptr;
+    int mul_result = power_int(10, digits);
+    result = add_uint_check_overflow(result, next_part, mul_result);
+    endptr = new_end;
+  }
+
   if (!has_only_spaces(endptr)) {
     *error = ERROR_INVALID_CHARS;
     result = 0;

From 798c263f4e6bbb5158947c41a75261ed8ffd4a83 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Kothe?= <kothe65@gmail.com>
Date: Mon, 13 Oct 2025 20:23:24 -0300
Subject: [PATCH 09/65] cleanup: remove leftover PROCESSED_WORD_CAPACITY and widen mul_result

---
 pandas/_libs/src/parser/tokenizer.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
index 0475f1aec7938..f36607bffe5b8 100644
--- a/pandas/_libs/src/parser/tokenizer.c
+++ b/pandas/_libs/src/parser/tokenizer.c
@@ -29,8 +29,6 @@ GitHub. See Python Software Foundation License and BSD licenses for these.
 #include "pandas/portable.h"
 #include "pandas/vendored/klib/khash.h" // for kh_int64_t, kh_destroy_int64
 
-#define PROCESSED_WORD_CAPACITY 256
-
 void coliter_setup(coliter_t *self, parser_t *parser, int64_t i,
                    int64_t start) {
   // column i, starting at 0
@@ -1877,7 +1875,7 @@ static inline bool has_only_spaces(const char *str) {
   return *str == '\0';
 }
 
-static inline int power_int(int base, int exponent) {
+static int power_int(int base, int exponent) {
   // https://en.wikipedia.org/wiki/Exponentiation_by_squaring
   int result = 1;
 
@@ -1953,7 +1951,7 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
     char *new_end = NULL;
     int64_t next_part = strtoll(endptr, &new_end, 10);
     int digits = new_end - endptr;
-    int mul_result = power_int(10, digits);
+    int64_t mul_result = power_int(10, digits);
     result = add_int_check_overflow(result, next_part, mul_result);
     endptr = new_end;
   }
@@ -2011,7 +2009,7 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
     char *new_end = NULL;
     uint64_t next_part = strtoull(endptr, &new_end, 10);
     int digits = new_end - endptr;
-    int mul_result = power_int(10, digits);
+    uint64_t mul_result = power_int(10, digits);
     result = add_uint_check_overflow(result, next_part, mul_result);
     endptr = new_end;
   }

From 2f06f192e70e28200499160e4e58c242f9eb2476 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Kothe?= <kothe65@gmail.com>
Date: Mon, 13 Oct 2025 20:33:44 -0300
Subject: [PATCH 10/65] fix: use ptrdiff_t to fix MSVC build error

---
 pandas/_libs/src/parser/tokenizer.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
index f36607bffe5b8..52ade83592702 100644
--- a/pandas/_libs/src/parser/tokenizer.c
+++ b/pandas/_libs/src/parser/tokenizer.c
@@ -24,6 +24,7 @@ GitHub. See Python Software Foundation License and BSD licenses for these.
 #include <limits.h>
 #include <math.h>
 #include <stdbool.h>
+#include <stddef.h>
 #include <stdlib.h>
 
 #include "pandas/portable.h"
@@ -1950,8 +1951,8 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
 
     char *new_end = NULL;
     int64_t next_part = strtoll(endptr, &new_end, 10);
-    int digits = new_end - endptr;
-    int64_t mul_result = power_int(10, digits);
+    ptrdiff_t digits = new_end - endptr;
+    int64_t mul_result = power_int(10, (int)digits);
     result = add_int_check_overflow(result, next_part, mul_result);
     endptr = new_end;
   }
@@ -2008,8 +2009,8 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
 
     char *new_end = NULL;
     uint64_t next_part = strtoull(endptr, &new_end, 10);
-    int digits = new_end - endptr;
-    uint64_t mul_result = power_int(10, digits);
+    ptrdiff_t digits = new_end - endptr;
+    uint64_t mul_result = power_int(10, (int)digits);
     result = add_uint_check_overflow(result, next_part, mul_result);
     endptr = new_end;
   }

From 280b55e22b24b47109e1d90a2125c9dc22e81057 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Kothe?= <kothe65@gmail.com>
Date: Mon, 13 Oct 2025 22:15:18 -0300
Subject: [PATCH 11/65] add other exponent cases for completeness

---
 pandas/_libs/src/parser/tokenizer.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
index 52ade83592702..c4409bbf0803f 100644
--- a/pandas/_libs/src/parser/tokenizer.c
+++ b/pandas/_libs/src/parser/tokenizer.c
@@ -1878,6 +1878,12 @@ static inline bool has_only_spaces(const char *str) {
 
 static int power_int(int base, int exponent) {
   // https://en.wikipedia.org/wiki/Exponentiation_by_squaring
+  if (exponent == 0) {
+    return 1;
+  } else if (exponent < 0) {
+    return 0;
+  }
+
   int result = 1;
 
   while (exponent > 1) {

From d85aaf02bd85f12459ca7dd0039f4a9a1ee468b5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Kothe?= <kothe65@gmail.com>
Date: Tue, 14 Oct 2025 09:29:54 -0300
Subject: [PATCH 12/65] fix: use builtin overflow check verification

---
 pandas/_libs/src/parser/tokenizer.c | 59 +++++++++++++----------------
 1 file changed, 26 insertions(+), 33 deletions(-)

diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
index c4409bbf0803f..8d3608ee81afa 100644
--- a/pandas/_libs/src/parser/tokenizer.c
+++ b/pandas/_libs/src/parser/tokenizer.c
@@ -24,6 +24,7 @@ GitHub. See Python Software Foundation License and BSD licenses for these.
 #include <limits.h>
 #include <math.h>
 #include <stdbool.h>
+#include <stdckdint.h>
 #include <stddef.h>
 #include <stdlib.h>
 
@@ -1898,37 +1899,6 @@ static int power_int(int base, int exponent) {
   return result * base;
 }
 
-static inline int64_t add_int_check_overflow(int64_t lhs, int64_t rhs,
-                                             int64_t mul_lhs) {
-  // rhs will always be positive, because this function
-  // only executes after the first parse, hence the sign will always go to lhs.
-  // if lhs > 0:
-  // Will overflow if (mul_lhs * lhs) + rhs > INT_MAX
-  // iff lhs > (INT_MAX - rhs) / mul_lhs
-  // if lhs < 0:
-  // Will underflow if (mul_lhs * lhs) - rhs < INT_MIN
-  // iff lhs < (INT_MIN + rhs) / mul_lhs
-  if (lhs >= 0) {
-    if (lhs > (INT_MAX - rhs) / mul_lhs) {
-      errno = ERANGE;
-    }
-  } else {
-    if (lhs < (INT_MIN + rhs) / mul_lhs) {
-      errno = ERANGE;
-    }
-    rhs = -rhs;
-  }
-  return lhs * mul_lhs + rhs;
-}
-
-static inline uint64_t add_uint_check_overflow(uint64_t lhs, uint64_t rhs,
-                                               uint64_t mul_lhs) {
-  if (lhs > (UINT_MAX - rhs) / mul_lhs) {
-    errno = ERANGE;
-  }
-  return lhs * mul_lhs + rhs;
-}
-
 int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
                      int *error, char tsep) {
   if (!p_item || *p_item == '\0') {
@@ -1948,6 +1918,7 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
   errno = 0;
   char *endptr = NULL;
   int64_t result = strtoll(p_item, &endptr, 10);
+  bool is_negative = result < 0;
 
   while (errno == 0 && tsep != '\0' && *endptr == tsep) {
     // Skip multiple consecutive tsep
@@ -1957,9 +1928,22 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
 
     char *new_end = NULL;
     int64_t next_part = strtoll(endptr, &new_end, 10);
+    if (is_negative) {
+      next_part = -next_part;
+    }
+
     ptrdiff_t digits = new_end - endptr;
     int64_t mul_result = power_int(10, (int)digits);
-    result = add_int_check_overflow(result, next_part, mul_result);
+    // result * mul_result
+    if (ckd_mul(&result, result, mul_result)) {
+      // overflow
+      errno = ERANGE;
+    }
+    // result + next_part
+    if (ckd_add(&result, result, next_part)) {
+      // overflow
+      errno = ERANGE;
+    }
     endptr = new_end;
   }
 
@@ -2017,7 +2001,16 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
     uint64_t next_part = strtoull(endptr, &new_end, 10);
     ptrdiff_t digits = new_end - endptr;
     uint64_t mul_result = power_int(10, (int)digits);
-    result = add_uint_check_overflow(result, next_part, mul_result);
+    // result * mul_result
+    if (ckd_mul(&result, result, mul_result)) {
+      // overflow
+      errno = ERANGE;
+    }
+    // result + next_part
+    if (ckd_add(&result, result, next_part)) {
+      // overflow
+      errno = ERANGE;
+    }
     endptr = new_end;
   }
 

From 9046eccb8e49c945ac9f6184fa8a9a54944372e7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Kothe?= <kothe65@gmail.com>
Date: Tue, 14 Oct 2025 09:34:47 -0300
Subject: [PATCH 13/65] fix: change std to c2x

---
 meson.build | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/meson.build b/meson.build
index 6a00e52481108..c774be30d8562 100644
--- a/meson.build
+++ b/meson.build
@@ -7,7 +7,7 @@ project(
     version: run_command(['generate_version.py', '--print'], check: true).stdout().strip(),
     license: 'BSD-3',
     meson_version: '>=1.2.1',
-    default_options: ['buildtype=release', 'c_std=c11', 'warning_level=2'],
+    default_options: ['buildtype=release', 'c_std=c2x', 'warning_level=2'],
 )
 
 fs = import('fs')

From a4e2fb8ec720e8bb363a5832611cd02cd2272e2b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Kothe?= <kothe65@gmail.com>
Date: Tue, 14 Oct 2025 09:45:59 -0300
Subject: [PATCH 14/65] Revert previous commits

Revert "fix: change std to c2x"

This reverts commit 9046eccb8e49c945ac9f6184fa8a9a54944372e7.

Revert "fix: use builtin overflow check verification"

This reverts commit d85aaf02bd85f12459ca7dd0039f4a9a1ee468b5.
---
 meson.build                         |  2 +-
 pandas/_libs/src/parser/tokenizer.c | 59 ++++++++++++++++-------------
 2 files changed, 34 insertions(+), 27 deletions(-)

diff --git a/meson.build b/meson.build
index c774be30d8562..6a00e52481108 100644
--- a/meson.build
+++ b/meson.build
@@ -7,7 +7,7 @@ project(
     version: run_command(['generate_version.py', '--print'], check: true).stdout().strip(),
     license: 'BSD-3',
     meson_version: '>=1.2.1',
-    default_options: ['buildtype=release', 'c_std=c2x', 'warning_level=2'],
+    default_options: ['buildtype=release', 'c_std=c11', 'warning_level=2'],
 )
 
 fs = import('fs')
diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
index 8d3608ee81afa..c4409bbf0803f 100644
--- a/pandas/_libs/src/parser/tokenizer.c
+++ b/pandas/_libs/src/parser/tokenizer.c
@@ -24,7 +24,6 @@ GitHub. See Python Software Foundation License and BSD licenses for these.
 #include <limits.h>
 #include <math.h>
 #include <stdbool.h>
-#include <stdckdint.h>
 #include <stddef.h>
 #include <stdlib.h>
 
@@ -1899,6 +1898,37 @@ static int power_int(int base, int exponent) {
   return result * base;
 }
 
+static inline int64_t add_int_check_overflow(int64_t lhs, int64_t rhs,
+                                             int64_t mul_lhs) {
+  // rhs will always be positive, because this function
+  // only executes after the first parse, hence the sign will always go to lhs.
+  // if lhs > 0:
+  // Will overflow if (mul_lhs * lhs) + rhs > INT_MAX
+  // iff lhs > (INT_MAX - rhs) / mul_lhs
+  // if lhs < 0:
+  // Will underflow if (mul_lhs * lhs) - rhs < INT_MIN
+  // iff lhs < (INT_MIN + rhs) / mul_lhs
+  if (lhs >= 0) {
+    if (lhs > (INT_MAX - rhs) / mul_lhs) {
+      errno = ERANGE;
+    }
+  } else {
+    if (lhs < (INT_MIN + rhs) / mul_lhs) {
+      errno = ERANGE;
+    }
+    rhs = -rhs;
+  }
+  return lhs * mul_lhs + rhs;
+}
+
+static inline uint64_t add_uint_check_overflow(uint64_t lhs, uint64_t rhs,
+                                               uint64_t mul_lhs) {
+  if (lhs > (UINT_MAX - rhs) / mul_lhs) {
+    errno = ERANGE;
+  }
+  return lhs * mul_lhs + rhs;
+}
+
 int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
                      int *error, char tsep) {
   if (!p_item || *p_item == '\0') {
@@ -1918,7 +1948,6 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
   errno = 0;
   char *endptr = NULL;
   int64_t result = strtoll(p_item, &endptr, 10);
-  bool is_negative = result < 0;
 
   while (errno == 0 && tsep != '\0' && *endptr == tsep) {
     // Skip multiple consecutive tsep
@@ -1928,22 +1957,9 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
 
     char *new_end = NULL;
     int64_t next_part = strtoll(endptr, &new_end, 10);
-    if (is_negative) {
-      next_part = -next_part;
-    }
-
     ptrdiff_t digits = new_end - endptr;
     int64_t mul_result = power_int(10, (int)digits);
-    // result * mul_result
-    if (ckd_mul(&result, result, mul_result)) {
-      // overflow
-      errno = ERANGE;
-    }
-    // result + next_part
-    if (ckd_add(&result, result, next_part)) {
-      // overflow
-      errno = ERANGE;
-    }
+    result = add_int_check_overflow(result, next_part, mul_result);
     endptr = new_end;
   }
 
@@ -2001,16 +2017,7 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
     uint64_t next_part = strtoull(endptr, &new_end, 10);
     ptrdiff_t digits = new_end - endptr;
     uint64_t mul_result = power_int(10, (int)digits);
-    // result * mul_result
-    if (ckd_mul(&result, result, mul_result)) {
-      // overflow
-      errno = ERANGE;
-    }
-    // result + next_part
-    if (ckd_add(&result, result, next_part)) {
-      // overflow
-      errno = ERANGE;
-    }
+    result = add_uint_check_overflow(result, next_part, mul_result);
     endptr = new_end;
   }
 

From ef82cf4b218aa7df99ac277f88b14a94ad596a87 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Kothe?= <kothe65@gmail.com>
Date: Tue, 14 Oct 2025 09:58:39 -0300
Subject: [PATCH 15/65] refactor: move overflow check to header

---
 .../vendored/numpy/datetime/np_datetime.h     | 29 +++++++++++++++++++
 .../src/vendored/numpy/datetime/np_datetime.c | 29 -------------------
 2 files changed, 29 insertions(+), 29 deletions(-)

diff --git a/pandas/_libs/include/pandas/vendored/numpy/datetime/np_datetime.h b/pandas/_libs/include/pandas/vendored/numpy/datetime/np_datetime.h
index e4e90a7ea24cf..6ae24bf58cf24 100644
--- a/pandas/_libs/include/pandas/vendored/numpy/datetime/np_datetime.h
+++ b/pandas/_libs/include/pandas/vendored/numpy/datetime/np_datetime.h
@@ -20,6 +20,35 @@ This file is derived from NumPy 1.7. See NUMPY_LICENSE.txt
 #define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
 #endif // NPY_NO_DEPRECATED_API
 
+#if defined(_WIN32)
+#ifndef ENABLE_INTSAFE_SIGNED_FUNCTIONS
+#define ENABLE_INTSAFE_SIGNED_FUNCTIONS
+#endif
+#include <intsafe.h>
+#define checked_int64_add(a, b, res) LongLongAdd(a, b, res)
+#define checked_int64_sub(a, b, res) LongLongSub(a, b, res)
+#define checked_int64_mul(a, b, res) LongLongMult(a, b, res)
+#else
+#if defined __has_builtin
+#if __has_builtin(__builtin_add_overflow)
+#define checked_int64_add(a, b, res) __builtin_add_overflow(a, b, res)
+#define checked_int64_sub(a, b, res) __builtin_sub_overflow(a, b, res)
+#define checked_int64_mul(a, b, res) __builtin_mul_overflow(a, b, res)
+#else
+_Static_assert(0,
+               "Overflow checking not detected; please try a newer compiler");
+#endif
+// __has_builtin was added in gcc 10, but our muslinux_1_1 build environment
+// only has gcc-9.3, so fall back to __GNUC__ macro as long as we have that
+#elif __GNUC__ > 7
+#define checked_int64_add(a, b, res) __builtin_add_overflow(a, b, res)
+#define checked_int64_sub(a, b, res) __builtin_sub_overflow(a, b, res)
+#define checked_int64_mul(a, b, res) __builtin_mul_overflow(a, b, res)
+#else
+_Static_assert(0, "__has_builtin not detected; please try a newer compiler");
+#endif
+#endif
+
 #include <numpy/ndarraytypes.h>
 
 typedef struct {
diff --git a/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c b/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c
index 9a022095feee9..eab58e915e247 100644
--- a/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c
+++ b/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c
@@ -27,35 +27,6 @@ This file is derived from NumPy 1.7. See NUMPY_LICENSE.txt
 #include <numpy/npy_common.h>
 #include <stdbool.h>
 
-#if defined(_WIN32)
-#ifndef ENABLE_INTSAFE_SIGNED_FUNCTIONS
-#define ENABLE_INTSAFE_SIGNED_FUNCTIONS
-#endif
-#include <intsafe.h>
-#define checked_int64_add(a, b, res) LongLongAdd(a, b, res)
-#define checked_int64_sub(a, b, res) LongLongSub(a, b, res)
-#define checked_int64_mul(a, b, res) LongLongMult(a, b, res)
-#else
-#if defined __has_builtin
-#if __has_builtin(__builtin_add_overflow)
-#define checked_int64_add(a, b, res) __builtin_add_overflow(a, b, res)
-#define checked_int64_sub(a, b, res) __builtin_sub_overflow(a, b, res)
-#define checked_int64_mul(a, b, res) __builtin_mul_overflow(a, b, res)
-#else
-_Static_assert(0,
-               "Overflow checking not detected; please try a newer compiler");
-#endif
-// __has_builtin was added in gcc 10, but our muslinux_1_1 build environment
-// only has gcc-9.3, so fall back to __GNUC__ macro as long as we have that
-#elif __GNUC__ > 7
-#define checked_int64_add(a, b, res) __builtin_add_overflow(a, b, res)
-#define checked_int64_sub(a, b, res) __builtin_sub_overflow(a, b, res)
-#define checked_int64_mul(a, b, res) __builtin_mul_overflow(a, b, res)
-#else
-_Static_assert(0, "__has_builtin not detected; please try a newer compiler");
-#endif
-#endif
-
 #define XSTR(a) STR(a)
 #define STR(a) #a
 

From 5afeb1101709bdd5d66678d439bbde7ade7a70f9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Kothe?= <kothe65@gmail.com>
Date: Tue, 14 Oct 2025 09:59:02 -0300
Subject: [PATCH 16/65] refactor: use overflow check from numpy

---
 pandas/_libs/src/parser/tokenizer.c | 35 +++++++++--------------------
 1 file changed, 11 insertions(+), 24 deletions(-)

diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
index c4409bbf0803f..4dac197596af5 100644
--- a/pandas/_libs/src/parser/tokenizer.c
+++ b/pandas/_libs/src/parser/tokenizer.c
@@ -18,6 +18,7 @@ GitHub. See Python Software Foundation License and BSD licenses for these.
 */
 #include "pandas/parser/tokenizer.h"
 #include "pandas/portable.h"
+#include "pandas/vendored/numpy/datetime/np_datetime.h"
 
 #include <ctype.h>
 #include <float.h>
@@ -1898,29 +1899,6 @@ static int power_int(int base, int exponent) {
   return result * base;
 }
 
-static inline int64_t add_int_check_overflow(int64_t lhs, int64_t rhs,
-                                             int64_t mul_lhs) {
-  // rhs will always be positive, because this function
-  // only executes after the first parse, hence the sign will always go to lhs.
-  // if lhs > 0:
-  // Will overflow if (mul_lhs * lhs) + rhs > INT_MAX
-  // iff lhs > (INT_MAX - rhs) / mul_lhs
-  // if lhs < 0:
-  // Will underflow if (mul_lhs * lhs) - rhs < INT_MIN
-  // iff lhs < (INT_MIN + rhs) / mul_lhs
-  if (lhs >= 0) {
-    if (lhs > (INT_MAX - rhs) / mul_lhs) {
-      errno = ERANGE;
-    }
-  } else {
-    if (lhs < (INT_MIN + rhs) / mul_lhs) {
-      errno = ERANGE;
-    }
-    rhs = -rhs;
-  }
-  return lhs * mul_lhs + rhs;
-}
-
 static inline uint64_t add_uint_check_overflow(uint64_t lhs, uint64_t rhs,
                                                uint64_t mul_lhs) {
   if (lhs > (UINT_MAX - rhs) / mul_lhs) {
@@ -1959,7 +1937,16 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
     int64_t next_part = strtoll(endptr, &new_end, 10);
     ptrdiff_t digits = new_end - endptr;
     int64_t mul_result = power_int(10, (int)digits);
-    result = add_int_check_overflow(result, next_part, mul_result);
+    // result * mul_result
+    if (checked_int64_mul(result, mul_result, &result)) {
+      // overflow
+      errno = ERANGE;
+    }
+    // result + next_part
+    if (checked_int64_add(result, next_part, &result)) {
+      // overflow
+      errno = ERANGE;
+    }
     endptr = new_end;
   }
 

From 0ef47a7b3acdc582a1ee507cab7b035b5d625e5c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Kothe?= <kothe65@gmail.com>
Date: Tue, 14 Oct 2025 10:30:37 -0300
Subject: [PATCH 17/65] fix: handle negative check

---
 pandas/_libs/src/parser/tokenizer.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
index 4dac197596af5..2d7fb19aa8800 100644
--- a/pandas/_libs/src/parser/tokenizer.c
+++ b/pandas/_libs/src/parser/tokenizer.c
@@ -1926,6 +1926,7 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
   errno = 0;
   char *endptr = NULL;
   int64_t result = strtoll(p_item, &endptr, 10);
+  bool is_negative = result < 0;
 
   while (errno == 0 && tsep != '\0' && *endptr == tsep) {
     // Skip multiple consecutive tsep
@@ -1935,6 +1936,10 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
 
     char *new_end = NULL;
     int64_t next_part = strtoll(endptr, &new_end, 10);
+    if (is_negative) {
+      next_part = -next_part;
+    }
+
     ptrdiff_t digits = new_end - endptr;
     int64_t mul_result = power_int(10, (int)digits);
     // result * mul_result

From c840ef01037809aba092556e08a8211113c0ac53 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Kothe?= <kothe65@gmail.com>
Date: Tue, 14 Oct 2025 10:34:39 -0300
Subject: [PATCH 18/65] fix: add test for thousand separator with negative
 number

---
 pandas/tests/io/parser/common/test_common_basic.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py
index 3680273f5e98a..487520b7a9359 100644
--- a/pandas/tests/io/parser/common/test_common_basic.py
+++ b/pandas/tests/io/parser/common/test_common_basic.py
@@ -72,13 +72,16 @@ def test_read_csv_local(all_parsers, csv1):
     tm.assert_frame_equal(result, expected)
 
 
-def test_1000_sep(all_parsers):
+@pytest.mark.parametrize(
+    "number_csv, expected_number", [("2,334", 2334), ("-2,334", -2334)]
+)
+def test_1000_sep(all_parsers, number_csv, expected_number):
     parser = all_parsers
-    data = """A|B|C
-1|2,334|5
+    data = f"""A|B|C
+1|{number_csv}|5
 10|13|10.
 """
-    expected = DataFrame({"A": [1, 10], "B": [2334, 13], "C": [5, 10.0]})
+    expected = DataFrame({"A": [1, 10], "B": [expected_number, 13], "C": [5, 10.0]})
 
     if parser.engine == "pyarrow":
         msg = "The 'thousands' option is not supported with the 'pyarrow' engine"

From 132342b1f73b2956ffb9ab59826c78b436d693a8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Kothe?= <kothe65@gmail.com>
Date: Tue, 14 Oct 2025 10:37:32 -0300
Subject: [PATCH 19/65] move to portable

---
 pandas/_libs/include/pandas/portable.h        | 29 +++++++++++++++++++
 .../vendored/numpy/datetime/np_datetime.h     | 29 -------------------
 pandas/_libs/src/parser/tokenizer.c           |  1 -
 .../src/vendored/numpy/datetime/np_datetime.c |  1 +
 4 files changed, 30 insertions(+), 30 deletions(-)

diff --git a/pandas/_libs/include/pandas/portable.h b/pandas/_libs/include/pandas/portable.h
index 1d0509d9e9724..f9f1e96d7dc7a 100644
--- a/pandas/_libs/include/pandas/portable.h
+++ b/pandas/_libs/include/pandas/portable.h
@@ -35,3 +35,32 @@ The full license is in the LICENSE file, distributed with this software.
   do {                                                                         \
   } while (0) /* fallthrough */
 #endif
+
+#if defined(_WIN32)
+#ifndef ENABLE_INTSAFE_SIGNED_FUNCTIONS
+#define ENABLE_INTSAFE_SIGNED_FUNCTIONS
+#endif
+#include <intsafe.h>
+#define checked_int64_add(a, b, res) LongLongAdd(a, b, res)
+#define checked_int64_sub(a, b, res) LongLongSub(a, b, res)
+#define checked_int64_mul(a, b, res) LongLongMult(a, b, res)
+#else
+#if defined __has_builtin
+#if __has_builtin(__builtin_add_overflow)
+#define checked_int64_add(a, b, res) __builtin_add_overflow(a, b, res)
+#define checked_int64_sub(a, b, res) __builtin_sub_overflow(a, b, res)
+#define checked_int64_mul(a, b, res) __builtin_mul_overflow(a, b, res)
+#else
+_Static_assert(0,
+               "Overflow checking not detected; please try a newer compiler");
+#endif
+// __has_builtin was added in gcc 10, but our muslinux_1_1 build environment
+// only has gcc-9.3, so fall back to __GNUC__ macro as long as we have that
+#elif __GNUC__ > 7
+#define checked_int64_add(a, b, res) __builtin_add_overflow(a, b, res)
+#define checked_int64_sub(a, b, res) __builtin_sub_overflow(a, b, res)
+#define checked_int64_mul(a, b, res) __builtin_mul_overflow(a, b, res)
+#else
+_Static_assert(0, "__has_builtin not detected; please try a newer compiler");
+#endif
+#endif
diff --git a/pandas/_libs/include/pandas/vendored/numpy/datetime/np_datetime.h b/pandas/_libs/include/pandas/vendored/numpy/datetime/np_datetime.h
index 6ae24bf58cf24..e4e90a7ea24cf 100644
--- a/pandas/_libs/include/pandas/vendored/numpy/datetime/np_datetime.h
+++ b/pandas/_libs/include/pandas/vendored/numpy/datetime/np_datetime.h
@@ -20,35 +20,6 @@ This file is derived from NumPy 1.7. See NUMPY_LICENSE.txt
 #define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
 #endif // NPY_NO_DEPRECATED_API
 
-#if defined(_WIN32)
-#ifndef ENABLE_INTSAFE_SIGNED_FUNCTIONS
-#define ENABLE_INTSAFE_SIGNED_FUNCTIONS
-#endif
-#include <intsafe.h>
-#define checked_int64_add(a, b, res) LongLongAdd(a, b, res)
-#define checked_int64_sub(a, b, res) LongLongSub(a, b, res)
-#define checked_int64_mul(a, b, res) LongLongMult(a, b, res)
-#else
-#if defined __has_builtin
-#if __has_builtin(__builtin_add_overflow)
-#define checked_int64_add(a, b, res) __builtin_add_overflow(a, b, res)
-#define checked_int64_sub(a, b, res) __builtin_sub_overflow(a, b, res)
-#define checked_int64_mul(a, b, res) __builtin_mul_overflow(a, b, res)
-#else
-_Static_assert(0,
-               "Overflow checking not detected; please try a newer compiler");
-#endif
-// __has_builtin was added in gcc 10, but our muslinux_1_1 build environment
-// only has gcc-9.3, so fall back to __GNUC__ macro as long as we have that
-#elif __GNUC__ > 7
-#define checked_int64_add(a, b, res) __builtin_add_overflow(a, b, res)
-#define checked_int64_sub(a, b, res) __builtin_sub_overflow(a, b, res)
-#define checked_int64_mul(a, b, res) __builtin_mul_overflow(a, b, res)
-#else
-_Static_assert(0, "__has_builtin not detected; please try a newer compiler");
-#endif
-#endif
-
 #include <numpy/ndarraytypes.h>
 
 typedef struct {
diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
index 2d7fb19aa8800..4884964fef0f2 100644
--- a/pandas/_libs/src/parser/tokenizer.c
+++ b/pandas/_libs/src/parser/tokenizer.c
@@ -18,7 +18,6 @@ GitHub. See Python Software Foundation License and BSD licenses for these.
 */
 #include "pandas/parser/tokenizer.h"
 #include "pandas/portable.h"
-#include "pandas/vendored/numpy/datetime/np_datetime.h"
 
 #include <ctype.h>
 #include <float.h>
diff --git a/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c b/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c
index eab58e915e247..043fa033df272 100644
--- a/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c
+++ b/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c
@@ -21,6 +21,7 @@ This file is derived from NumPy 1.7. See NUMPY_LICENSE.txt
 #endif // NPY_NO_DEPRECATED_API
 
 #include "pandas/vendored/numpy/datetime/np_datetime.h"
+#include "pandas/portable.h"
 #define NO_IMPORT_ARRAY
 #define PY_ARRAY_UNIQUE_SYMBOL PANDAS_DATETIME_NUMPY
 #include <numpy/ndarrayobject.h>

From e6977cc64033629fb6389e59db417385e226a377 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Kothe?= <kothe65@gmail.com>
Date: Tue, 14 Oct 2025 10:46:45 -0300
Subject: [PATCH 20/65] perf: use builtin unsigned long overflow check

---
 pandas/_libs/include/pandas/portable.h |  6 ++++++
 pandas/_libs/src/parser/tokenizer.c    | 21 ++++++++++++---------
 2 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/pandas/_libs/include/pandas/portable.h b/pandas/_libs/include/pandas/portable.h
index f9f1e96d7dc7a..af5a821f890fd 100644
--- a/pandas/_libs/include/pandas/portable.h
+++ b/pandas/_libs/include/pandas/portable.h
@@ -44,12 +44,18 @@ The full license is in the LICENSE file, distributed with this software.
 #define checked_int64_add(a, b, res) LongLongAdd(a, b, res)
 #define checked_int64_sub(a, b, res) LongLongSub(a, b, res)
 #define checked_int64_mul(a, b, res) LongLongMult(a, b, res)
+#define checked_uint64_add(a, b, res) ULongLongAdd(a, b, res)
+#define checked_uint64_sub(a, b, res) ULongLongSub(a, b, res)
+#define checked_uint64_mul(a, b, res) ULongLongMult(a, b, res)
 #else
 #if defined __has_builtin
 #if __has_builtin(__builtin_add_overflow)
 #define checked_int64_add(a, b, res) __builtin_add_overflow(a, b, res)
 #define checked_int64_sub(a, b, res) __builtin_sub_overflow(a, b, res)
 #define checked_int64_mul(a, b, res) __builtin_mul_overflow(a, b, res)
+#define checked_uint64_add(a, b, res) __builtin_add_overflow(a, b, res)
+#define checked_uint64_sub(a, b, res) __builtin_sub_overflow(a, b, res)
+#define checked_uint64_mul(a, b, res) __builtin_mul_overflow(a, b, res)
 #else
 _Static_assert(0,
                "Overflow checking not detected; please try a newer compiler");
diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
index 4884964fef0f2..812b396f12344 100644
--- a/pandas/_libs/src/parser/tokenizer.c
+++ b/pandas/_libs/src/parser/tokenizer.c
@@ -1898,14 +1898,6 @@ static int power_int(int base, int exponent) {
   return result * base;
 }
 
-static inline uint64_t add_uint_check_overflow(uint64_t lhs, uint64_t rhs,
-                                               uint64_t mul_lhs) {
-  if (lhs > (UINT_MAX - rhs) / mul_lhs) {
-    errno = ERANGE;
-  }
-  return lhs * mul_lhs + rhs;
-}
-
 int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
                      int *error, char tsep) {
   if (!p_item || *p_item == '\0') {
@@ -2008,7 +2000,18 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
     uint64_t next_part = strtoull(endptr, &new_end, 10);
     ptrdiff_t digits = new_end - endptr;
     uint64_t mul_result = power_int(10, (int)digits);
-    result = add_uint_check_overflow(result, next_part, mul_result);
+
+    // result * mul_result
+    if (checked_uint64_mul(result, mul_result, &result)) {
+      // overflow
+      errno = ERANGE;
+    }
+    // result + next_part
+    if (checked_uint64_add(result, next_part, &result)) {
+      // overflow
+      errno = ERANGE;
+    }
+
     endptr = new_end;
   }
 

From 8120eea51c18ecef546bd1d58f6992cedd6dbd74 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Kothe?= <kothe65@gmail.com>
Date: Tue, 14 Oct 2025 11:04:16 -0300
Subject: [PATCH 21/65] refactor: combine builtin and gnuc branches

---
 pandas/_libs/include/pandas/portable.h | 13 ++-----------
 1 file changed, 2 insertions(+), 11 deletions(-)

diff --git a/pandas/_libs/include/pandas/portable.h b/pandas/_libs/include/pandas/portable.h
index af5a821f890fd..b462f1e8ca0e6 100644
--- a/pandas/_libs/include/pandas/portable.h
+++ b/pandas/_libs/include/pandas/portable.h
@@ -48,8 +48,8 @@ The full license is in the LICENSE file, distributed with this software.
 #define checked_uint64_sub(a, b, res) ULongLongSub(a, b, res)
 #define checked_uint64_mul(a, b, res) ULongLongMult(a, b, res)
 #else
-#if defined __has_builtin
-#if __has_builtin(__builtin_add_overflow)
+#if (defined __has_builtin && __has_builtin(__builtin_add_overflow)) ||        \
+    __GNUC__ > 7
 #define checked_int64_add(a, b, res) __builtin_add_overflow(a, b, res)
 #define checked_int64_sub(a, b, res) __builtin_sub_overflow(a, b, res)
 #define checked_int64_mul(a, b, res) __builtin_mul_overflow(a, b, res)
@@ -60,13 +60,4 @@ The full license is in the LICENSE file, distributed with this software.
 _Static_assert(0,
                "Overflow checking not detected; please try a newer compiler");
 #endif
-// __has_builtin was added in gcc 10, but our muslinux_1_1 build environment
-// only has gcc-9.3, so fall back to __GNUC__ macro as long as we have that
-#elif __GNUC__ > 7
-#define checked_int64_add(a, b, res) __builtin_add_overflow(a, b, res)
-#define checked_int64_sub(a, b, res) __builtin_sub_overflow(a, b, res)
-#define checked_int64_mul(a, b, res) __builtin_mul_overflow(a, b, res)
-#else
-_Static_assert(0, "__has_builtin not detected; please try a newer compiler");
-#endif
 #endif

From 1f5d506c9a1bb5baa1d685c951ab658c6504edfb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Kothe?= <kothe65@gmail.com>
Date: Tue, 14 Oct 2025 11:19:30 -0300
Subject: [PATCH 22/65] don't assign null

---
 pandas/_libs/src/parser/tokenizer.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
index 812b396f12344..2ff5fb3b4116c 100644
--- a/pandas/_libs/src/parser/tokenizer.c
+++ b/pandas/_libs/src/parser/tokenizer.c
@@ -1925,7 +1925,7 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
       endptr++;
     }
 
-    char *new_end = NULL;
+    char *new_end;
     int64_t next_part = strtoll(endptr, &new_end, 10);
     if (is_negative) {
       next_part = -next_part;
@@ -1996,7 +1996,7 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
       endptr++;
     }
 
-    char *new_end = NULL;
+    char *new_end;
     uint64_t next_part = strtoull(endptr, &new_end, 10);
     ptrdiff_t digits = new_end - endptr;
     uint64_t mul_result = power_int(10, (int)digits);

From 479a2abeccc0f7a870a539baa09caeff05c5879d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Kothe?= <kothe65@gmail.com>
Date: Tue, 14 Oct 2025 11:27:16 -0300
Subject: [PATCH 23/65] fix: perform bound check

---
 pandas/_libs/src/parser/tokenizer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
index 2ff5fb3b4116c..72b7fa8bb9154 100644
--- a/pandas/_libs/src/parser/tokenizer.c
+++ b/pandas/_libs/src/parser/tokenizer.c
@@ -1863,7 +1863,7 @@ static inline bool has_digit_int(const char *str) {
     return true;
   case '+':
   case '-':
-    return isdigit_ascii(str[1]);
+    return str[1] != '\0' && isdigit_ascii(str[1]);
   default:
     return false;
   }

From c37c35500213c539e5baeb4a77f14fe8b3f8acac Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Kothe?= <kothe65@gmail.com>
Date: Tue, 14 Oct 2025 11:44:41 -0300
Subject: [PATCH 24/65] fix: assign error if doesn't have a digit after tsep

It still permits ending in tsep
---
 pandas/_libs/src/parser/tokenizer.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
index 72b7fa8bb9154..5508429e78f05 100644
--- a/pandas/_libs/src/parser/tokenizer.c
+++ b/pandas/_libs/src/parser/tokenizer.c
@@ -1920,9 +1920,12 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
   bool is_negative = result < 0;
 
   while (errno == 0 && tsep != '\0' && *endptr == tsep) {
-    // Skip multiple consecutive tsep
-    while (*endptr == tsep) {
-      endptr++;
+    // move after tsep
+    endptr++;
+    if (*endptr == '\0' || !isdigit_ascii(*endptr)) {
+      // stop parsing and let the remaining of the function
+      // assign an error code
+      break;
     }
 
     char *new_end;
@@ -1991,9 +1994,12 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
   uint64_t result = strtoull(p_item, &endptr, 10);
 
   while (errno == 0 && tsep != '\0' && *endptr == tsep) {
-    // Skip multiple consecutive tsep
-    while (*endptr == tsep) {
-      endptr++;
+    // move after tsep
+    endptr++;
+    if (*endptr == '\0' || !isdigit_ascii(*endptr)) {
+      // stop parsing and let the remaining of the function
+      // assign an error code
+      break;
     }
 
     char *new_end;

From 7d552833f30e9c6026c44099b90bfd6c57653b9c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Kothe?= <kothe65@gmail.com>
Date: Tue, 14 Oct 2025 11:51:24 -0300
Subject: [PATCH 25/65] fix: go back to buffer solution

---
 pandas/_libs/src/parser/tokenizer.c | 134 ++++++++++++----------------
 1 file changed, 56 insertions(+), 78 deletions(-)

diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
index 5508429e78f05..d947cffe7e38c 100644
--- a/pandas/_libs/src/parser/tokenizer.c
+++ b/pandas/_libs/src/parser/tokenizer.c
@@ -21,15 +21,15 @@ GitHub. See Python Software Foundation License and BSD licenses for these.
 
 #include <ctype.h>
 #include <float.h>
-#include <limits.h>
 #include <math.h>
 #include <stdbool.h>
-#include <stddef.h>
 #include <stdlib.h>
 
 #include "pandas/portable.h"
 #include "pandas/vendored/klib/khash.h" // for kh_int64_t, kh_destroy_int64
 
+#define PROCESSED_WORD_CAPACITY 128
+
 void coliter_setup(coliter_t *self, parser_t *parser, int64_t i,
                    int64_t start) {
   // column i, starting at 0
@@ -1876,26 +1876,39 @@ static inline bool has_only_spaces(const char *str) {
   return *str == '\0';
 }
 
-static int power_int(int base, int exponent) {
-  // https://en.wikipedia.org/wiki/Exponentiation_by_squaring
-  if (exponent == 0) {
-    return 1;
-  } else if (exponent < 0) {
-    return 0;
-  }
-
-  int result = 1;
+/* Copy a string without `char_to_remove` into `output`,
+ * it assumes that output is filled with `\0`,
+ * so it won't null terminate the result.
+ */
+static void copy_string_without_char(char output[PROCESSED_WORD_CAPACITY],
+                                     const char *str, char char_to_remove) {
+  char *dst = output;
+  const char *src = str;
+  // last character is reserved for null terminator.
+  const char *end = output + PROCESSED_WORD_CAPACITY - 1;
+
+  while (*src != '\0' && dst < end) {
+    const char *next = src;
+    // find EOS or char_to_remove
+    while (*next != '\0' && *next != char_to_remove) {
+      next++;
+    }
 
-  while (exponent > 1) {
-    if (exponent % 2 == 1) {
-      result *= base;
-      exponent--;
+    size_t len = next - src;
+    if (dst + len > end) {
+      // Can't write here, str is too big
+      errno = ERANGE;
+      return;
     }
-    result *= result;
-    exponent /= 2;
-  }
 
-  return result * base;
+    // copy block
+    memcpy(dst, src, len);
+
+    // go to next available location to write
+    dst += len;
+    // Move past char to remove
+    src = *next == char_to_remove ? next + 1 : next;
+  }
 }
 
 int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
@@ -1915,40 +1928,21 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
   }
 
   errno = 0;
-  char *endptr = NULL;
-  int64_t result = strtoll(p_item, &endptr, 10);
-  bool is_negative = result < 0;
-
-  while (errno == 0 && tsep != '\0' && *endptr == tsep) {
-    // move after tsep
-    endptr++;
-    if (*endptr == '\0' || !isdigit_ascii(*endptr)) {
-      // stop parsing and let the remaining of the function
-      // assign an error code
-      break;
-    }
-
-    char *new_end;
-    int64_t next_part = strtoll(endptr, &new_end, 10);
-    if (is_negative) {
-      next_part = -next_part;
-    }
+  if (tsep != '\0' && strchr(p_item, tsep) != NULL) {
+    char buffer[PROCESSED_WORD_CAPACITY];
+    memset(buffer, '\0', sizeof(buffer));
+    copy_string_without_char(buffer, p_item, tsep);
+    p_item = buffer;
+  }
 
-    ptrdiff_t digits = new_end - endptr;
-    int64_t mul_result = power_int(10, (int)digits);
-    // result * mul_result
-    if (checked_int64_mul(result, mul_result, &result)) {
-      // overflow
-      errno = ERANGE;
-    }
-    // result + next_part
-    if (checked_int64_add(result, next_part, &result)) {
-      // overflow
-      errno = ERANGE;
-    }
-    endptr = new_end;
+  if (errno == ERANGE) {
+    *error = ERROR_OVERFLOW;
+    return 0;
   }
 
+  char *endptr = NULL;
+  int64_t result = strtoll(p_item, &endptr, 10);
+
   if (!has_only_spaces(endptr)) {
     // Check first for invalid characters because we may
     // want to skip integer parsing if we find one.
@@ -1990,37 +1984,21 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
   }
 
   errno = 0;
-  char *endptr = NULL;
-  uint64_t result = strtoull(p_item, &endptr, 10);
-
-  while (errno == 0 && tsep != '\0' && *endptr == tsep) {
-    // move after tsep
-    endptr++;
-    if (*endptr == '\0' || !isdigit_ascii(*endptr)) {
-      // stop parsing and let the remaining of the function
-      // assign an error code
-      break;
-    }
-
-    char *new_end;
-    uint64_t next_part = strtoull(endptr, &new_end, 10);
-    ptrdiff_t digits = new_end - endptr;
-    uint64_t mul_result = power_int(10, (int)digits);
-
-    // result * mul_result
-    if (checked_uint64_mul(result, mul_result, &result)) {
-      // overflow
-      errno = ERANGE;
-    }
-    // result + next_part
-    if (checked_uint64_add(result, next_part, &result)) {
-      // overflow
-      errno = ERANGE;
-    }
+  if (tsep != '\0' && strchr(p_item, tsep) != NULL) {
+    char buffer[PROCESSED_WORD_CAPACITY];
+    memset(buffer, '\0', sizeof(buffer));
+    copy_string_without_char(buffer, p_item, tsep);
+    p_item = buffer;
+  }
 
-    endptr = new_end;
+  if (errno == ERANGE) {
+    *error = ERROR_OVERFLOW;
+    return 0;
   }
 
+  char *endptr = NULL;
+  uint64_t result = strtoull(p_item, &endptr, 10);
+
   if (!has_only_spaces(endptr)) {
     *error = ERROR_INVALID_CHARS;
     result = 0;

From ffe50cee1028a87f2061135e506baabe2268b4e7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Kothe?= <kothe65@gmail.com>
Date: Tue, 14 Oct 2025 11:51:50 -0300
Subject: [PATCH 26/65] refactor: undo refactor in np_datetime.c

---
 pandas/_libs/include/pandas/portable.h        | 26 ----------------
 .../src/vendored/numpy/datetime/np_datetime.c | 30 ++++++++++++++++++-
 2 files changed, 29 insertions(+), 27 deletions(-)

diff --git a/pandas/_libs/include/pandas/portable.h b/pandas/_libs/include/pandas/portable.h
index b462f1e8ca0e6..1d0509d9e9724 100644
--- a/pandas/_libs/include/pandas/portable.h
+++ b/pandas/_libs/include/pandas/portable.h
@@ -35,29 +35,3 @@ The full license is in the LICENSE file, distributed with this software.
   do {                                                                         \
   } while (0) /* fallthrough */
 #endif
-
-#if defined(_WIN32)
-#ifndef ENABLE_INTSAFE_SIGNED_FUNCTIONS
-#define ENABLE_INTSAFE_SIGNED_FUNCTIONS
-#endif
-#include <intsafe.h>
-#define checked_int64_add(a, b, res) LongLongAdd(a, b, res)
-#define checked_int64_sub(a, b, res) LongLongSub(a, b, res)
-#define checked_int64_mul(a, b, res) LongLongMult(a, b, res)
-#define checked_uint64_add(a, b, res) ULongLongAdd(a, b, res)
-#define checked_uint64_sub(a, b, res) ULongLongSub(a, b, res)
-#define checked_uint64_mul(a, b, res) ULongLongMult(a, b, res)
-#else
-#if (defined __has_builtin && __has_builtin(__builtin_add_overflow)) ||        \
-    __GNUC__ > 7
-#define checked_int64_add(a, b, res) __builtin_add_overflow(a, b, res)
-#define checked_int64_sub(a, b, res) __builtin_sub_overflow(a, b, res)
-#define checked_int64_mul(a, b, res) __builtin_mul_overflow(a, b, res)
-#define checked_uint64_add(a, b, res) __builtin_add_overflow(a, b, res)
-#define checked_uint64_sub(a, b, res) __builtin_sub_overflow(a, b, res)
-#define checked_uint64_mul(a, b, res) __builtin_mul_overflow(a, b, res)
-#else
-_Static_assert(0,
-               "Overflow checking not detected; please try a newer compiler");
-#endif
-#endif
diff --git a/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c b/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c
index 043fa033df272..9a022095feee9 100644
--- a/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c
+++ b/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c
@@ -21,13 +21,41 @@ This file is derived from NumPy 1.7. See NUMPY_LICENSE.txt
 #endif // NPY_NO_DEPRECATED_API
 
 #include "pandas/vendored/numpy/datetime/np_datetime.h"
-#include "pandas/portable.h"
 #define NO_IMPORT_ARRAY
 #define PY_ARRAY_UNIQUE_SYMBOL PANDAS_DATETIME_NUMPY
 #include <numpy/ndarrayobject.h>
 #include <numpy/npy_common.h>
 #include <stdbool.h>
 
+#if defined(_WIN32)
+#ifndef ENABLE_INTSAFE_SIGNED_FUNCTIONS
+#define ENABLE_INTSAFE_SIGNED_FUNCTIONS
+#endif
+#include <intsafe.h>
+#define checked_int64_add(a, b, res) LongLongAdd(a, b, res)
+#define checked_int64_sub(a, b, res) LongLongSub(a, b, res)
+#define checked_int64_mul(a, b, res) LongLongMult(a, b, res)
+#else
+#if defined __has_builtin
+#if __has_builtin(__builtin_add_overflow)
+#define checked_int64_add(a, b, res) __builtin_add_overflow(a, b, res)
+#define checked_int64_sub(a, b, res) __builtin_sub_overflow(a, b, res)
+#define checked_int64_mul(a, b, res) __builtin_mul_overflow(a, b, res)
+#else
+_Static_assert(0,
+               "Overflow checking not detected; please try a newer compiler");
+#endif
+// __has_builtin was added in gcc 10, but our muslinux_1_1 build environment
+// only has gcc-9.3, so fall back to __GNUC__ macro as long as we have that
+#elif __GNUC__ > 7
+#define checked_int64_add(a, b, res) __builtin_add_overflow(a, b, res)
+#define checked_int64_sub(a, b, res) __builtin_sub_overflow(a, b, res)
+#define checked_int64_mul(a, b, res) __builtin_mul_overflow(a, b, res)
+#else
+_Static_assert(0, "__has_builtin not detected; please try a newer compiler");
+#endif
+#endif
+
 #define XSTR(a) STR(a)
 #define STR(a) #a
 

From af5ad7120c5271ed0e50d1e60d9ff8478341cc1d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Kothe?= <kothe65@gmail.com>
Date: Tue, 14 Oct 2025 12:08:39 -0300
Subject: [PATCH 27/65] fix: fix undefined behavior

---
 pandas/_libs/src/parser/tokenizer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
index d947cffe7e38c..28d901f5f5a6a 100644
--- a/pandas/_libs/src/parser/tokenizer.c
+++ b/pandas/_libs/src/parser/tokenizer.c
@@ -1928,8 +1928,8 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
   }
 
   errno = 0;
+  char buffer[PROCESSED_WORD_CAPACITY];
   if (tsep != '\0' && strchr(p_item, tsep) != NULL) {
-    char buffer[PROCESSED_WORD_CAPACITY];
     memset(buffer, '\0', sizeof(buffer));
     copy_string_without_char(buffer, p_item, tsep);
     p_item = buffer;

From 92117041c200fe33f446e7bbc77c63d034e0a757 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Kothe?= <kothe65@gmail.com>
Date: Tue, 14 Oct 2025 12:55:36 -0300
Subject: [PATCH 28/65] fix: fix leftover undefined behaviour

---
 pandas/_libs/src/parser/tokenizer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
index 28d901f5f5a6a..514482055c64d 100644
--- a/pandas/_libs/src/parser/tokenizer.c
+++ b/pandas/_libs/src/parser/tokenizer.c
@@ -1984,8 +1984,8 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
   }
 
   errno = 0;
+  char buffer[PROCESSED_WORD_CAPACITY];
   if (tsep != '\0' && strchr(p_item, tsep) != NULL) {
-    char buffer[PROCESSED_WORD_CAPACITY];
     memset(buffer, '\0', sizeof(buffer));
     copy_string_without_char(buffer, p_item, tsep);
     p_item = buffer;

From d026b016d4e550c70f378671212ce8f29d7e8a6d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Kothe?= <kothe65@gmail.com>
Date: Tue, 14 Oct 2025 13:26:22 -0300
Subject: [PATCH 29/65] rewrite `copy_string_without_char`

- Returns status code
- Simplify loop for a more common sliding window
- use string length
---
 .../_libs/include/pandas/parser/tokenizer.h   |  1 +
 pandas/_libs/src/parser/tokenizer.c           | 81 +++++++++++--------
 2 files changed, 48 insertions(+), 34 deletions(-)

diff --git a/pandas/_libs/include/pandas/parser/tokenizer.h b/pandas/_libs/include/pandas/parser/tokenizer.h
index 209f375a5bf6c..141d883220f1e 100644
--- a/pandas/_libs/include/pandas/parser/tokenizer.h
+++ b/pandas/_libs/include/pandas/parser/tokenizer.h
@@ -17,6 +17,7 @@ See LICENSE for the license
 #define ERROR_NO_DIGITS 1
 #define ERROR_OVERFLOW 2
 #define ERROR_INVALID_CHARS 3
+#define ERROR_WORD2BIG 4
 
 #include <stdint.h>
 
diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
index 514482055c64d..e416a334a244b 100644
--- a/pandas/_libs/src/parser/tokenizer.c
+++ b/pandas/_libs/src/parser/tokenizer.c
@@ -1876,39 +1876,48 @@ static inline bool has_only_spaces(const char *str) {
   return *str == '\0';
 }
 
-/* Copy a string without `char_to_remove` into `output`,
- * it assumes that output is filled with `\0`,
- * so it won't null terminate the result.
+/* Copy a string without `char_to_remove` into `output`.
  */
-static void copy_string_without_char(char output[PROCESSED_WORD_CAPACITY],
-                                     const char *str, char char_to_remove) {
-  char *dst = output;
-  const char *src = str;
+static int copy_string_without_char(char output[PROCESSED_WORD_CAPACITY],
+                                    const char *str, size_t str_len,
+                                    char char_to_remove) {
   // last character is reserved for null terminator.
-  const char *end = output + PROCESSED_WORD_CAPACITY - 1;
-
-  while (*src != '\0' && dst < end) {
-    const char *next = src;
-    // find EOS or char_to_remove
-    while (*next != '\0' && *next != char_to_remove) {
-      next++;
+  size_t max_str_size = PROCESSED_WORD_CAPACITY - 1;
+  if (str_len > max_str_size) {
+    // str_len is too big.
+    // Check if it's possible to write after removing all `char_to_remove`.
+    size_t count_char_to_remove = 0;
+    for (const char *src = str; *src != '\0'; src++) {
+      if (*src == char_to_remove) {
+        count_char_to_remove++;
+      }
     }
 
-    size_t len = next - src;
-    if (dst + len > end) {
-      // Can't write here, str is too big
-      errno = ERANGE;
-      return;
+    if (str_len - count_char_to_remove > max_str_size) {
+      return ERROR_WORD2BIG;
     }
+  }
+
+  char *dst = output;
+  const char *left = str;
+
+  // sliding window
+  for (const char *right = str; *left != '\0'; right++) {
+    if (*right == '\0' || *right == char_to_remove) {
+      size_t len = right - left;
 
-    // copy block
-    memcpy(dst, src, len);
+      // copy block
+      memcpy(dst, left, len);
 
-    // go to next available location to write
-    dst += len;
-    // Move past char to remove
-    src = *next == char_to_remove ? next + 1 : next;
+      // go to next available location to write
+      dst += len;
+      left = *right == '\0' ? right : right + 1;
+    }
   }
+
+  // null terminate
+  *dst = '\0';
+  return 0;
 }
 
 int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
@@ -1930,8 +1939,13 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
   errno = 0;
   char buffer[PROCESSED_WORD_CAPACITY];
   if (tsep != '\0' && strchr(p_item, tsep) != NULL) {
-    memset(buffer, '\0', sizeof(buffer));
-    copy_string_without_char(buffer, p_item, tsep);
+    int status = copy_string_without_char(buffer, p_item, strlen(p_item), tsep);
+
+    if (status != 0) {
+      *error = status;
+      return 0;
+    }
+
     p_item = buffer;
   }
 
@@ -1986,14 +2000,13 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
   errno = 0;
   char buffer[PROCESSED_WORD_CAPACITY];
   if (tsep != '\0' && strchr(p_item, tsep) != NULL) {
-    memset(buffer, '\0', sizeof(buffer));
-    copy_string_without_char(buffer, p_item, tsep);
-    p_item = buffer;
-  }
+    int status = copy_string_without_char(buffer, p_item, strlen(p_item), tsep);
 
-  if (errno == ERANGE) {
-    *error = ERROR_OVERFLOW;
-    return 0;
+    if (status != 0) {
+      *error = status;
+      return 0;
+    }
+    p_item = buffer;
   }
 
   char *endptr = NULL;

From d76ff5f01bcda2cc3f6949a52dda40fb17fe0255 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Kothe?= <kothe65@gmail.com>
Date: Tue, 14 Oct 2025 17:19:50 -0300
Subject: [PATCH 30/65] fix: change solution to safeguard against end_ptr

---
 pandas/_libs/src/parser/tokenizer.c | 53 ++++++++++++++---------------
 1 file changed, 26 insertions(+), 27 deletions(-)

diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
index e416a334a244b..a1457e2e385c7 100644
--- a/pandas/_libs/src/parser/tokenizer.c
+++ b/pandas/_libs/src/parser/tokenizer.c
@@ -1881,42 +1881,41 @@ static inline bool has_only_spaces(const char *str) {
 static int copy_string_without_char(char output[PROCESSED_WORD_CAPACITY],
                                     const char *str, size_t str_len,
                                     char char_to_remove) {
-  // last character is reserved for null terminator.
-  size_t max_str_size = PROCESSED_WORD_CAPACITY - 1;
-  if (str_len > max_str_size) {
-    // str_len is too big.
-    // Check if it's possible to write after removing all `char_to_remove`.
-    size_t count_char_to_remove = 0;
-    for (const char *src = str; *src != '\0'; src++) {
-      if (*src == char_to_remove) {
-        count_char_to_remove++;
-      }
-    }
+  const char *left = str;
+  const char *right;
+  const char *end_ptr = str + str_len;
+  size_t bytes_read = 0;
 
-    if (str_len - count_char_to_remove > max_str_size) {
+  while ((right = memchr(left, char_to_remove, end_ptr - left)) != NULL) {
+    size_t nbytes = right - left;
+
+    // check if we have enough space, including the null terminator.
+    if (nbytes + bytes_read >= PROCESSED_WORD_CAPACITY) {
       return ERROR_WORD2BIG;
     }
-  }
-
-  char *dst = output;
-  const char *left = str;
-
-  // sliding window
-  for (const char *right = str; *left != '\0'; right++) {
-    if (*right == '\0' || *right == char_to_remove) {
-      size_t len = right - left;
+    // copy block
+    memcpy(&output[bytes_read], left, nbytes);
+    bytes_read += nbytes;
+    left = right + 1;
 
-      // copy block
-      memcpy(dst, left, len);
+    // Exit after processing the entire string
+    if (left >= end_ptr) {
+      break;
+    }
+  }
 
-      // go to next available location to write
-      dst += len;
-      left = *right == '\0' ? right : right + 1;
+  // copy final chunk that doesn't contain char_to_remove
+  if (end_ptr > left) {
+    size_t nbytes = nbytes = end_ptr - left;
+    if (nbytes + bytes_read >= PROCESSED_WORD_CAPACITY) {
+      return ERROR_WORD2BIG;
     }
+    memcpy(&output[bytes_read], left, nbytes);
+    bytes_read += nbytes;
   }
 
   // null terminate
-  *dst = '\0';
+  output[bytes_read] = '\0';
   return 0;
 }
 

From b523a19a2c9758d4af3e95f904289fe2b9338096 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Kothe?= <kothe65@gmail.com>
Date: Tue, 14 Oct 2025 17:20:19 -0300
Subject: [PATCH 31/65] test: add some edge cases tests with thousand separator

---
 pandas/tests/io/parser/common/test_common_basic.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py
index 487520b7a9359..f38844e167222 100644
--- a/pandas/tests/io/parser/common/test_common_basic.py
+++ b/pandas/tests/io/parser/common/test_common_basic.py
@@ -73,7 +73,14 @@ def test_read_csv_local(all_parsers, csv1):
 
 
 @pytest.mark.parametrize(
-    "number_csv, expected_number", [("2,334", 2334), ("-2,334", -2334)]
+    "number_csv, expected_number",
+    [
+        ("2,334", 2334),
+        ("-2,334", -2334),
+        ("-2,334,", -2334),
+        ("2,,,,,,,,,,,,,,,5", 25),
+        ("2,,3,4,,,,,,,,,,,,5", 2345),
+    ],
 )
 def test_1000_sep(all_parsers, number_csv, expected_number):
     parser = all_parsers

From 6265172aa08ffed4b27e27794875f7e74bb8e87a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Kothe?= <kothe65@gmail.com>
Date: Tue, 14 Oct 2025 18:17:41 -0300
Subject: [PATCH 32/65] Update pandas/_libs/src/parser/tokenizer.c

Co-authored-by: William Ayd <william.ayd@icloud.com>
---
 pandas/_libs/src/parser/tokenizer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
index a1457e2e385c7..74e6a5cbc6d51 100644
--- a/pandas/_libs/src/parser/tokenizer.c
+++ b/pandas/_libs/src/parser/tokenizer.c
@@ -1886,7 +1886,7 @@ static int copy_string_without_char(char output[PROCESSED_WORD_CAPACITY],
   const char *end_ptr = str + str_len;
   size_t bytes_read = 0;
 
-  while ((right = memchr(left, char_to_remove, end_ptr - left)) != NULL) {
+  while ((right = memchr(left, char_to_remove, str_len - bytes_read)) != NULL) {
     size_t nbytes = right - left;
 
     // check if we have enough space, including the null terminator.

From abed6c1c7ec038cfd692b3c2cd2eba49643f82a7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Kothe?= <kothe65@gmail.com>
Date: Tue, 14 Oct 2025 18:19:00 -0300
Subject: [PATCH 33/65] fix: error to -1

---
 pandas/_libs/include/pandas/parser/tokenizer.h | 1 -
 pandas/_libs/src/parser/tokenizer.c            | 7 ++++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/pandas/_libs/include/pandas/parser/tokenizer.h b/pandas/_libs/include/pandas/parser/tokenizer.h
index 141d883220f1e..209f375a5bf6c 100644
--- a/pandas/_libs/include/pandas/parser/tokenizer.h
+++ b/pandas/_libs/include/pandas/parser/tokenizer.h
@@ -17,7 +17,6 @@ See LICENSE for the license
 #define ERROR_NO_DIGITS 1
 #define ERROR_OVERFLOW 2
 #define ERROR_INVALID_CHARS 3
-#define ERROR_WORD2BIG 4
 
 #include <stdint.h>
 
diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
index 74e6a5cbc6d51..8970fe1871248 100644
--- a/pandas/_libs/src/parser/tokenizer.c
+++ b/pandas/_libs/src/parser/tokenizer.c
@@ -1891,7 +1891,7 @@ static int copy_string_without_char(char output[PROCESSED_WORD_CAPACITY],
 
     // check if we have enough space, including the null terminator.
     if (nbytes + bytes_read >= PROCESSED_WORD_CAPACITY) {
-      return ERROR_WORD2BIG;
+      return -1;
     }
     // copy block
     memcpy(&output[bytes_read], left, nbytes);
@@ -1908,7 +1908,7 @@ static int copy_string_without_char(char output[PROCESSED_WORD_CAPACITY],
   if (end_ptr > left) {
     size_t nbytes = nbytes = end_ptr - left;
     if (nbytes + bytes_read >= PROCESSED_WORD_CAPACITY) {
-      return ERROR_WORD2BIG;
+      return -1;
     }
     memcpy(&output[bytes_read], left, nbytes);
     bytes_read += nbytes;
@@ -1941,7 +1941,8 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
     int status = copy_string_without_char(buffer, p_item, strlen(p_item), tsep);
 
     if (status != 0) {
-      *error = status;
+      // Word is too big, probably will cause an overflow
+      *error = ERROR_OVERFLOW;
       return 0;
     }
 

From 8616f9ffe1967a8f58a11ac295673aff3a8cf415 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Kothe?= <kothe65@gmail.com>
Date: Tue, 14 Oct 2025 19:12:46 -0300
Subject: [PATCH 34/65] fix: leftover status check

---
 pandas/_libs/src/parser/tokenizer.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
index 8970fe1871248..fa7a1c755e3b9 100644
--- a/pandas/_libs/src/parser/tokenizer.c
+++ b/pandas/_libs/src/parser/tokenizer.c
@@ -2003,7 +2003,8 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
     int status = copy_string_without_char(buffer, p_item, strlen(p_item), tsep);
 
     if (status != 0) {
-      *error = status;
+      // Word is too big, probably will cause an overflow
+      *error = ERROR_OVERFLOW;
       return 0;
     }
     p_item = buffer;

From c4e0e25758c45dd4393effdda6f6af39d8b36071 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Kothe?= <kothe65@gmail.com>
Date: Tue, 14 Oct 2025 19:45:27 -0300
Subject: [PATCH 35/65] fix: remove duplicate nbytes declaration

---
 pandas/_libs/src/parser/tokenizer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
index fa7a1c755e3b9..0fdde3a6ecc25 100644
--- a/pandas/_libs/src/parser/tokenizer.c
+++ b/pandas/_libs/src/parser/tokenizer.c
@@ -1906,7 +1906,7 @@ static int copy_string_without_char(char output[PROCESSED_WORD_CAPACITY],
 
   // copy final chunk that doesn't contain char_to_remove
   if (end_ptr > left) {
-    size_t nbytes = nbytes = end_ptr - left;
+    size_t nbytes = end_ptr - left;
     if (nbytes + bytes_read >= PROCESSED_WORD_CAPACITY) {
       return -1;
     }

From 0b192085fa46b49bd159cc7800e766cfca979b95 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Kothe?= <kothe65@gmail.com>
Date: Wed, 15 Oct 2025 08:58:18 -0300
Subject: [PATCH 36/65] fix: use memchr to find if need to process the word

---
 pandas/_libs/src/parser/tokenizer.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
index 0fdde3a6ecc25..70e0096f788b8 100644
--- a/pandas/_libs/src/parser/tokenizer.c
+++ b/pandas/_libs/src/parser/tokenizer.c
@@ -1937,8 +1937,9 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
 
   errno = 0;
   char buffer[PROCESSED_WORD_CAPACITY];
-  if (tsep != '\0' && strchr(p_item, tsep) != NULL) {
-    int status = copy_string_without_char(buffer, p_item, strlen(p_item), tsep);
+  size_t str_len = strlen(p_item);
+  if (tsep != '\0' && memchr(p_item, tsep, str_len) != NULL) {
+    int status = copy_string_without_char(buffer, p_item, str_len, tsep);
 
     if (status != 0) {
       // Word is too big, probably will cause an overflow
@@ -1999,7 +2000,8 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
 
   errno = 0;
   char buffer[PROCESSED_WORD_CAPACITY];
-  if (tsep != '\0' && strchr(p_item, tsep) != NULL) {
+  size_t str_len = strlen(p_item);
+  if (tsep != '\0' && memchr(p_item, tsep, str_len) != NULL) {
     int status = copy_string_without_char(buffer, p_item, strlen(p_item), tsep);
 
     if (status != 0) {

From 29d74f7396739735f4bc57794131d085ede53f51 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Kothe?= <kothe65@gmail.com>
Date: Wed, 15 Oct 2025 09:02:47 -0300
Subject: [PATCH 37/65] chore: add comment explaining why 128 bytes for
 capacity

---
 pandas/_libs/src/parser/tokenizer.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
index 70e0096f788b8..b0ac55768295f 100644
--- a/pandas/_libs/src/parser/tokenizer.c
+++ b/pandas/_libs/src/parser/tokenizer.c
@@ -28,6 +28,8 @@ GitHub. See Python Software Foundation License and BSD licenses for these.
 #include "pandas/portable.h"
 #include "pandas/vendored/klib/khash.h" // for kh_int64_t, kh_destroy_int64
 
+// Arrow256 allows up to 76 decimal digits.
+// We rounded up to the next power of 2.
 #define PROCESSED_WORD_CAPACITY 128
 
 void coliter_setup(coliter_t *self, parser_t *parser, int64_t i,

From b5b8f3be4fa9f7e681f604f36662c9f9a51a20dc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Kothe?= <kothe65@gmail.com>
Date: Wed, 15 Oct 2025 10:49:38 -0300
Subject: [PATCH 38/65] rename bytes_read to bytes_written

---
 pandas/_libs/src/parser/tokenizer.c | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
index b0ac55768295f..b3fadfa0a3e27 100644
--- a/pandas/_libs/src/parser/tokenizer.c
+++ b/pandas/_libs/src/parser/tokenizer.c
@@ -1886,18 +1886,19 @@ static int copy_string_without_char(char output[PROCESSED_WORD_CAPACITY],
   const char *left = str;
   const char *right;
   const char *end_ptr = str + str_len;
-  size_t bytes_read = 0;
+  size_t bytes_written = 0;
 
-  while ((right = memchr(left, char_to_remove, str_len - bytes_read)) != NULL) {
+  while ((right = memchr(left, char_to_remove, str_len - bytes_written)) !=
+         NULL) {
     size_t nbytes = right - left;
 
     // check if we have enough space, including the null terminator.
-    if (nbytes + bytes_read >= PROCESSED_WORD_CAPACITY) {
+    if (nbytes + bytes_written >= PROCESSED_WORD_CAPACITY) {
       return -1;
     }
     // copy block
-    memcpy(&output[bytes_read], left, nbytes);
-    bytes_read += nbytes;
+    memcpy(&output[bytes_written], left, nbytes);
+    bytes_written += nbytes;
     left = right + 1;
 
     // Exit after processing the entire string
@@ -1909,15 +1910,15 @@ static int copy_string_without_char(char output[PROCESSED_WORD_CAPACITY],
   // copy final chunk that doesn't contain char_to_remove
   if (end_ptr > left) {
     size_t nbytes = end_ptr - left;
-    if (nbytes + bytes_read >= PROCESSED_WORD_CAPACITY) {
+    if (nbytes + bytes_written >= PROCESSED_WORD_CAPACITY) {
       return -1;
     }
-    memcpy(&output[bytes_read], left, nbytes);
-    bytes_read += nbytes;
+    memcpy(&output[bytes_written], left, nbytes);
+    bytes_written += nbytes;
   }
 
   // null terminate
-  output[bytes_read] = '\0';
+  output[bytes_written] = '\0';
   return 0;
 }
 

From 803a8bfb7db3477f5e3226c0d05002ebb85401d9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Kothe?= <kothe65@gmail.com>
Date: Wed, 15 Oct 2025 11:17:25 -0300
Subject: [PATCH 39/65] chore: move errno assignment

---
 pandas/_libs/src/parser/tokenizer.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
index b3fadfa0a3e27..a0f339ef112c0 100644
--- a/pandas/_libs/src/parser/tokenizer.c
+++ b/pandas/_libs/src/parser/tokenizer.c
@@ -1938,7 +1938,6 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
     return 0;
   }
 
-  errno = 0;
   char buffer[PROCESSED_WORD_CAPACITY];
   size_t str_len = strlen(p_item);
   if (tsep != '\0' && memchr(p_item, tsep, str_len) != NULL) {
@@ -1959,6 +1958,9 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
   }
 
   char *endptr = NULL;
+  // strtoll sets errno if it finds an overflow.
+  // It's value is reset to don't pollute the verification below.
+  errno = 0;
   int64_t result = strtoll(p_item, &endptr, 10);
 
   if (!has_only_spaces(endptr)) {
@@ -2001,7 +2003,6 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
     return 0;
   }
 
-  errno = 0;
   char buffer[PROCESSED_WORD_CAPACITY];
   size_t str_len = strlen(p_item);
   if (tsep != '\0' && memchr(p_item, tsep, str_len) != NULL) {
@@ -2016,6 +2017,9 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
   }
 
   char *endptr = NULL;
+  // strtoull sets errno if it finds an overflow.
+  // It's value is reset to don't pollute the verification below.
+  errno = 0;
   uint64_t result = strtoull(p_item, &endptr, 10);
 
   if (!has_only_spaces(endptr)) {

From 31f26cf0d13b720b2a489442c17763e37da58d98 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Kothe?= <kothe65@gmail.com>
Date: Wed, 15 Oct 2025 11:52:49 -0300
Subject: [PATCH 40/65] fix: fix error logic by comparing pointers

---
 pandas/_libs/src/parser/tokenizer.c | 31 ++++++++++-------------------
 1 file changed, 10 insertions(+), 21 deletions(-)

diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
index a0f339ef112c0..52e1105138549 100644
--- a/pandas/_libs/src/parser/tokenizer.c
+++ b/pandas/_libs/src/parser/tokenizer.c
@@ -1888,33 +1888,22 @@ static int copy_string_without_char(char output[PROCESSED_WORD_CAPACITY],
   const char *end_ptr = str + str_len;
   size_t bytes_written = 0;
 
-  while ((right = memchr(left, char_to_remove, str_len - bytes_written)) !=
-         NULL) {
-    size_t nbytes = right - left;
+  while (left < end_ptr) {
+    right = memchr(left, char_to_remove, str_len - bytes_written);
+
+    // If it doesn't find the char to remove, just copy until EOS.
+    size_t chunk_size = right ? right - left : end_ptr - left;
 
     // check if we have enough space, including the null terminator.
-    if (nbytes + bytes_written >= PROCESSED_WORD_CAPACITY) {
+    if (chunk_size + bytes_written >= PROCESSED_WORD_CAPACITY) {
       return -1;
     }
     // copy block
-    memcpy(&output[bytes_written], left, nbytes);
-    bytes_written += nbytes;
-    left = right + 1;
+    memcpy(&output[bytes_written], left, chunk_size);
+    bytes_written += chunk_size;
 
-    // Exit after processing the entire string
-    if (left >= end_ptr) {
-      break;
-    }
-  }
-
-  // copy final chunk that doesn't contain char_to_remove
-  if (end_ptr > left) {
-    size_t nbytes = end_ptr - left;
-    if (nbytes + bytes_written >= PROCESSED_WORD_CAPACITY) {
-      return -1;
-    }
-    memcpy(&output[bytes_written], left, nbytes);
-    bytes_written += nbytes;
+    // Advance past the removed character if we found it.
+    left = right ? right + 1 : end_ptr;
   }
 
   // null terminate

From 171b553deb41f590cd219b32ae01e9417b809ccd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Kothe?= <kothe65@gmail.com>
Date: Wed, 15 Oct 2025 12:45:00 -0300
Subject: [PATCH 41/65] fix: use pointers

---
 pandas/_libs/src/parser/tokenizer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
index 52e1105138549..a2295489af109 100644
--- a/pandas/_libs/src/parser/tokenizer.c
+++ b/pandas/_libs/src/parser/tokenizer.c
@@ -1889,7 +1889,7 @@ static int copy_string_without_char(char output[PROCESSED_WORD_CAPACITY],
   size_t bytes_written = 0;
 
   while (left < end_ptr) {
-    right = memchr(left, char_to_remove, str_len - bytes_written);
+    right = memchr(left, char_to_remove, end_ptr - left);
 
     // If it doesn't find the char to remove, just copy until EOS.
     size_t chunk_size = right ? right - left : end_ptr - left;

From 30f6bdc4cab5319d8a2b2c0e86eaf99df3325b75 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Kothe?= <kothe65@gmail.com>
Date: Wed, 15 Oct 2025 13:26:10 -0300
Subject: [PATCH 42/65] fix: keep track on how many bytes to read

---
 pandas/_libs/src/parser/tokenizer.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
index a2295489af109..dbbed27c1ee82 100644
--- a/pandas/_libs/src/parser/tokenizer.c
+++ b/pandas/_libs/src/parser/tokenizer.c
@@ -1887,9 +1887,10 @@ static int copy_string_without_char(char output[PROCESSED_WORD_CAPACITY],
   const char *right;
   const char *end_ptr = str + str_len;
   size_t bytes_written = 0;
+  size_t remaining_bytes_to_read = str_len;
 
-  while (left < end_ptr) {
-    right = memchr(left, char_to_remove, end_ptr - left);
+  while (remaining_bytes_to_read > 0) {
+    right = memchr(left, char_to_remove, remaining_bytes_to_read);
 
     // If it doesn't find the char to remove, just copy until EOS.
     size_t chunk_size = right ? right - left : end_ptr - left;
@@ -1904,6 +1905,7 @@ static int copy_string_without_char(char output[PROCESSED_WORD_CAPACITY],
 
     // Advance past the removed character if we found it.
     left = right ? right + 1 : end_ptr;
+    remaining_bytes_to_read -= right ? chunk_size + 1 : chunk_size;
   }
 
   // null terminate

From 554675bc990418543aeaca80af797e1116b71e4e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Kothe?= <kothe65@gmail.com>
Date: Wed, 15 Oct 2025 13:28:35 -0300
Subject: [PATCH 43/65] chore: cast to size_t

---
 pandas/_libs/src/parser/tokenizer.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
index dbbed27c1ee82..373ab0b9e70fe 100644
--- a/pandas/_libs/src/parser/tokenizer.c
+++ b/pandas/_libs/src/parser/tokenizer.c
@@ -1893,7 +1893,10 @@ static int copy_string_without_char(char output[PROCESSED_WORD_CAPACITY],
     right = memchr(left, char_to_remove, remaining_bytes_to_read);
 
     // If it doesn't find the char to remove, just copy until EOS.
-    size_t chunk_size = right ? right - left : end_ptr - left;
+    // We are also casting directly to size_t because
+    // `left` never goes beyond `right` or `end_ptr`.
+    size_t chunk_size =
+        right ? (size_t)(right - left) : (size_t)(end_ptr - left);
 
     // check if we have enough space, including the null terminator.
     if (chunk_size + bytes_written >= PROCESSED_WORD_CAPACITY) {

From 7ea445425926931c61f8212ef04eb72a957d66b8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Kothe?= <kothe65@gmail.com>
Date: Wed, 15 Oct 2025 13:30:44 -0300
Subject: [PATCH 44/65] fix: cast pointer to fix Wc++-compat warning

---
 pandas/_libs/src/parser/tokenizer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
index 373ab0b9e70fe..17db4e3b433c6 100644
--- a/pandas/_libs/src/parser/tokenizer.c
+++ b/pandas/_libs/src/parser/tokenizer.c
@@ -1890,7 +1890,7 @@ static int copy_string_without_char(char output[PROCESSED_WORD_CAPACITY],
   size_t remaining_bytes_to_read = str_len;
 
   while (remaining_bytes_to_read > 0) {
-    right = memchr(left, char_to_remove, remaining_bytes_to_read);
+    right = (const char *)memchr(left, char_to_remove, remaining_bytes_to_read);
 
     // If it doesn't find the char to remove, just copy until EOS.
     // We are also casting directly to size_t because

From 82dc037e4921be4725e35c24653fe04247ed05d0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Kothe?= <kothe65@gmail.com>
Date: Wed, 15 Oct 2025 15:09:18 -0300
Subject: [PATCH 45/65] fix: remove casts for -Weverything

---
 pandas/_libs/src/parser/tokenizer.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
index 17db4e3b433c6..dbbed27c1ee82 100644
--- a/pandas/_libs/src/parser/tokenizer.c
+++ b/pandas/_libs/src/parser/tokenizer.c
@@ -1890,13 +1890,10 @@ static int copy_string_without_char(char output[PROCESSED_WORD_CAPACITY],
   size_t remaining_bytes_to_read = str_len;
 
   while (remaining_bytes_to_read > 0) {
-    right = (const char *)memchr(left, char_to_remove, remaining_bytes_to_read);
+    right = memchr(left, char_to_remove, remaining_bytes_to_read);
 
     // If it doesn't find the char to remove, just copy until EOS.
-    // We are also casting directly to size_t because
-    // `left` never goes beyond `right` or `end_ptr`.
-    size_t chunk_size =
-        right ? (size_t)(right - left) : (size_t)(end_ptr - left);
+    size_t chunk_size = right ? right - left : end_ptr - left;
 
     // check if we have enough space, including the null terminator.
     if (chunk_size + bytes_written >= PROCESSED_WORD_CAPACITY) {

From 627df4cc97c1ed352c5146be21179208bddbd222 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Kothe?= <kothe65@gmail.com>
Date: Wed, 15 Oct 2025 15:37:05 -0300
Subject: [PATCH 46/65] fix: move remaining_bytes_to_read to the start of loop

---
 pandas/_libs/src/parser/tokenizer.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
index dbbed27c1ee82..3efe4a5b3d865 100644
--- a/pandas/_libs/src/parser/tokenizer.c
+++ b/pandas/_libs/src/parser/tokenizer.c
@@ -1887,9 +1887,9 @@ static int copy_string_without_char(char output[PROCESSED_WORD_CAPACITY],
   const char *right;
   const char *end_ptr = str + str_len;
   size_t bytes_written = 0;
-  size_t remaining_bytes_to_read = str_len;
 
-  while (remaining_bytes_to_read > 0) {
+  while (left < end_ptr) {
+    size_t remaining_bytes_to_read = end_ptr - left;
     right = memchr(left, char_to_remove, remaining_bytes_to_read);
 
     // If it doesn't find the char to remove, just copy until EOS.
@@ -1905,7 +1905,6 @@ static int copy_string_without_char(char output[PROCESSED_WORD_CAPACITY],
 
     // Advance past the removed character if we found it.
     left = right ? right + 1 : end_ptr;
-    remaining_bytes_to_read -= right ? chunk_size + 1 : chunk_size;
   }
 
   // null terminate

From 1b3eba0fde5bc06937a4eb6766792216b5645212 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Kothe?= <kothe65@gmail.com>
Date: Wed, 15 Oct 2025 15:48:00 -0300
Subject: [PATCH 47/65] fix: consolidate it even further

---
 pandas/_libs/src/parser/tokenizer.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
index 3efe4a5b3d865..716154f3a06c7 100644
--- a/pandas/_libs/src/parser/tokenizer.c
+++ b/pandas/_libs/src/parser/tokenizer.c
@@ -1892,8 +1892,12 @@ static int copy_string_without_char(char output[PROCESSED_WORD_CAPACITY],
     size_t remaining_bytes_to_read = end_ptr - left;
     right = memchr(left, char_to_remove, remaining_bytes_to_read);
 
-    // If it doesn't find the char to remove, just copy until EOS.
-    size_t chunk_size = right ? right - left : end_ptr - left;
+    if (!right) {
+      // If it doesn't find the char to remove, just copy until EOS.
+      right = end_ptr;
+    }
+
+    size_t chunk_size = right - left;
 
     // check if we have enough space, including the null terminator.
     if (chunk_size + bytes_written >= PROCESSED_WORD_CAPACITY) {
@@ -1904,7 +1908,7 @@ static int copy_string_without_char(char output[PROCESSED_WORD_CAPACITY],
     bytes_written += chunk_size;
 
     // Advance past the removed character if we found it.
-    left = right ? right + 1 : end_ptr;
+    left = right + 1;
   }
 
   // null terminate

From e1667faab453bdc5538b65e1565113266da0349a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Kothe?= <kothe65@gmail.com>
Date: Wed, 15 Oct 2025 16:55:40 -0300
Subject: [PATCH 48/65] fix: move right definition

---
 pandas/_libs/src/parser/tokenizer.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
index 716154f3a06c7..db816f5001588 100644
--- a/pandas/_libs/src/parser/tokenizer.c
+++ b/pandas/_libs/src/parser/tokenizer.c
@@ -1884,13 +1884,12 @@ static int copy_string_without_char(char output[PROCESSED_WORD_CAPACITY],
                                     const char *str, size_t str_len,
                                     char char_to_remove) {
   const char *left = str;
-  const char *right;
   const char *end_ptr = str + str_len;
   size_t bytes_written = 0;
 
   while (left < end_ptr) {
     size_t remaining_bytes_to_read = end_ptr - left;
-    right = memchr(left, char_to_remove, remaining_bytes_to_read);
+    const char *right = memchr(left, char_to_remove, remaining_bytes_to_read);
 
     if (!right) {
       // If it doesn't find the char to remove, just copy until EOS.

From bee776a41056658a4c62a57258213521c584e671 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Kothe?= <kothe65@gmail.com>
Date: Wed, 15 Oct 2025 16:55:51 -0300
Subject: [PATCH 49/65] chore: remove superfluous comments

---
 pandas/_libs/src/parser/tokenizer.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
index db816f5001588..065139d22bda5 100644
--- a/pandas/_libs/src/parser/tokenizer.c
+++ b/pandas/_libs/src/parser/tokenizer.c
@@ -1898,15 +1898,12 @@ static int copy_string_without_char(char output[PROCESSED_WORD_CAPACITY],
 
     size_t chunk_size = right - left;
 
-    // check if we have enough space, including the null terminator.
     if (chunk_size + bytes_written >= PROCESSED_WORD_CAPACITY) {
       return -1;
     }
-    // copy block
     memcpy(&output[bytes_written], left, chunk_size);
     bytes_written += chunk_size;
 
-    // Advance past the removed character if we found it.
     left = right + 1;
   }
 

From d6933450a0d766ede4dd4297fec63bc4faf986c6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Kothe?= <kothe65@gmail.com>
Date: Wed, 15 Oct 2025 16:58:54 -0300
Subject: [PATCH 50/65] fix: add const qualifier

---
 pandas/_libs/src/parser/tokenizer.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
index 065139d22bda5..7c0df5abf144d 100644
--- a/pandas/_libs/src/parser/tokenizer.c
+++ b/pandas/_libs/src/parser/tokenizer.c
@@ -1888,7 +1888,7 @@ static int copy_string_without_char(char output[PROCESSED_WORD_CAPACITY],
   size_t bytes_written = 0;
 
   while (left < end_ptr) {
-    size_t remaining_bytes_to_read = end_ptr - left;
+    const size_t remaining_bytes_to_read = end_ptr - left;
     const char *right = memchr(left, char_to_remove, remaining_bytes_to_read);
 
     if (!right) {
@@ -1896,7 +1896,7 @@ static int copy_string_without_char(char output[PROCESSED_WORD_CAPACITY],
       right = end_ptr;
     }
 
-    size_t chunk_size = right - left;
+    const size_t chunk_size = right - left;
 
     if (chunk_size + bytes_written >= PROCESSED_WORD_CAPACITY) {
       return -1;
@@ -1929,9 +1929,9 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
   }
 
   char buffer[PROCESSED_WORD_CAPACITY];
-  size_t str_len = strlen(p_item);
+  const size_t str_len = strlen(p_item);
   if (tsep != '\0' && memchr(p_item, tsep, str_len) != NULL) {
-    int status = copy_string_without_char(buffer, p_item, str_len, tsep);
+    const int status = copy_string_without_char(buffer, p_item, str_len, tsep);
 
     if (status != 0) {
       // Word is too big, probably will cause an overflow
@@ -1994,9 +1994,10 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
   }
 
   char buffer[PROCESSED_WORD_CAPACITY];
-  size_t str_len = strlen(p_item);
+  const size_t str_len = strlen(p_item);
   if (tsep != '\0' && memchr(p_item, tsep, str_len) != NULL) {
-    int status = copy_string_without_char(buffer, p_item, strlen(p_item), tsep);
+    const int status =
+        copy_string_without_char(buffer, p_item, strlen(p_item), tsep);
 
     if (status != 0) {
       // Word is too big, probably will cause an overflow

From 593c614d7ac79c7cb989b83abc76e6fda6e7706e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Kothe?= <kothe65@gmail.com>
Date: Wed, 15 Oct 2025 17:06:15 -0300
Subject: [PATCH 51/65] fix: remove unnecessary NULL and null-byte checks

---
 pandas/_libs/src/parser/tokenizer.c | 14 --------------
 1 file changed, 14 deletions(-)

diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
index 7c0df5abf144d..32274c3a727ce 100644
--- a/pandas/_libs/src/parser/tokenizer.c
+++ b/pandas/_libs/src/parser/tokenizer.c
@@ -1847,10 +1847,6 @@ int uint64_conflict(uint_state *self) {
  * @return Non-zero integer indicating that has a digit 0 otherwise.
  */
 static inline bool has_digit_int(const char *str) {
-  if (!str || *str == '\0') {
-    return false;
-  }
-
   switch (*str) {
   case '0':
   case '1':
@@ -1914,11 +1910,6 @@ static int copy_string_without_char(char output[PROCESSED_WORD_CAPACITY],
 
 int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
                      int *error, char tsep) {
-  if (!p_item || *p_item == '\0') {
-    *error = ERROR_NO_DIGITS;
-    return 0;
-  }
-
   while (isspace_ascii(*p_item)) {
     ++p_item;
   }
@@ -1970,11 +1961,6 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
 
 uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
                        uint64_t uint_max, int *error, char tsep) {
-  if (!p_item || *p_item == '\0') {
-    *error = ERROR_NO_DIGITS;
-    return 0;
-  }
-
   while (isspace_ascii(*p_item)) {
     ++p_item;
   }

From ff4d48b672a8641961689466f6e58e337e5959f0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Kothe?= <kothe65@gmail.com>
Date: Wed, 15 Oct 2025 17:09:45 -0300
Subject: [PATCH 52/65] fix: remove unnecessary errno verification

---
 pandas/_libs/src/parser/tokenizer.c | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
index 32274c3a727ce..a05d8f98a6673 100644
--- a/pandas/_libs/src/parser/tokenizer.c
+++ b/pandas/_libs/src/parser/tokenizer.c
@@ -1933,11 +1933,6 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
     p_item = buffer;
   }
 
-  if (errno == ERANGE) {
-    *error = ERROR_OVERFLOW;
-    return 0;
-  }
-
   char *endptr = NULL;
   // strtoll sets errno if it finds an overflow.
   // It's value is reset to don't pollute the verification below.

From e3a88d31221d2b9e99bb4d34b912b36e1263bb3c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Kothe?= <kothe65@gmail.com>
Date: Wed, 15 Oct 2025 17:15:37 -0300
Subject: [PATCH 53/65] chore: remove NULL assignment

---
 pandas/_libs/src/parser/tokenizer.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
index a05d8f98a6673..65f6877357c14 100644
--- a/pandas/_libs/src/parser/tokenizer.c
+++ b/pandas/_libs/src/parser/tokenizer.c
@@ -1933,7 +1933,7 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
     p_item = buffer;
   }
 
-  char *endptr = NULL;
+  char *endptr;
   // strtoll sets errno if it finds an overflow.
   // It's value is reset to don't pollute the verification below.
   errno = 0;
@@ -1988,7 +1988,7 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
     p_item = buffer;
   }
 
-  char *endptr = NULL;
+  char *endptr;
   // strtoull sets errno if it finds an overflow.
   // It's value is reset to don't pollute the verification below.
   errno = 0;

From b135738d85825f2eff0cdb4cd805e1b50e879bb4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Kothe?= <kothe65@gmail.com>
Date: Wed, 15 Oct 2025 17:31:58 -0300
Subject: [PATCH 54/65] fix: don't recompute strlen

---
 pandas/_libs/src/parser/tokenizer.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
index 65f6877357c14..0609e07362850 100644
--- a/pandas/_libs/src/parser/tokenizer.c
+++ b/pandas/_libs/src/parser/tokenizer.c
@@ -1977,8 +1977,7 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
   char buffer[PROCESSED_WORD_CAPACITY];
   const size_t str_len = strlen(p_item);
   if (tsep != '\0' && memchr(p_item, tsep, str_len) != NULL) {
-    const int status =
-        copy_string_without_char(buffer, p_item, strlen(p_item), tsep);
+    const int status = copy_string_without_char(buffer, p_item, str_len, tsep);
 
     if (status != 0) {
       // Word is too big, probably will cause an overflow

From ba8c9b3b4f4ba68cfa23475326796c0f16e6533b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Kothe?= <kothe65@gmail.com>
Date: Wed, 15 Oct 2025 17:38:54 -0300
Subject: [PATCH 55/65] chore: add some comments back to simplify diff

---
 pandas/_libs/src/parser/tokenizer.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
index 0609e07362850..ca48aecf84fc7 100644
--- a/pandas/_libs/src/parser/tokenizer.c
+++ b/pandas/_libs/src/parser/tokenizer.c
@@ -1910,11 +1910,14 @@ static int copy_string_without_char(char output[PROCESSED_WORD_CAPACITY],
 
 int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
                      int *error, char tsep) {
+  // Skip leading spaces.
   while (isspace_ascii(*p_item)) {
     ++p_item;
   }
 
+  // Check that there is a first digit.
   if (!has_digit_int(p_item)) {
+    // Error...
     *error = ERROR_NO_DIGITS;
     return 0;
   }
@@ -1939,6 +1942,7 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
   errno = 0;
   int64_t result = strtoll(p_item, &endptr, 10);
 
+  // Did we use up all the characters?
   if (!has_only_spaces(endptr)) {
     // Check first for invalid characters because we may
     // want to skip integer parsing if we find one.
@@ -1956,10 +1960,12 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
 
 uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
                        uint64_t uint_max, int *error, char tsep) {
+  // Skip leading spaces.
   while (isspace_ascii(*p_item)) {
     ++p_item;
   }
 
+  // Handle sign.
   if (*p_item == '-') {
     state->seen_sint = 1;
     *error = 0;
@@ -1970,6 +1976,7 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
 
   // Check that there is a first digit.
   if (!isdigit_ascii(*p_item)) {
+    // Error...
     *error = ERROR_NO_DIGITS;
     return 0;
   }
@@ -1993,6 +2000,7 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
   errno = 0;
   uint64_t result = strtoull(p_item, &endptr, 10);
 
+  // Did we use up all the characters?
   if (!has_only_spaces(endptr)) {
     *error = ERROR_INVALID_CHARS;
     result = 0;

From 818921fd3ff49fd8e02ba25de15f4cb7055ad275 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Kothe?= <kothe65@gmail.com>
Date: Wed, 15 Oct 2025 18:14:21 -0300
Subject: [PATCH 56/65] fix: reset errno after handling it

---
 pandas/_libs/src/parser/tokenizer.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
index ca48aecf84fc7..a1b0444386fe3 100644
--- a/pandas/_libs/src/parser/tokenizer.c
+++ b/pandas/_libs/src/parser/tokenizer.c
@@ -1937,9 +1937,6 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
   }
 
   char *endptr;
-  // strtoll sets errno if it finds an overflow.
-  // It's value is reset to don't pollute the verification below.
-  errno = 0;
   int64_t result = strtoll(p_item, &endptr, 10);
 
   // Did we use up all the characters?
@@ -1950,6 +1947,7 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
     result = 0;
   } else if (errno == ERANGE || result > int_max || result < int_min) {
     *error = ERROR_OVERFLOW;
+    errno = 0;
     result = 0;
   } else {
     *error = 0;
@@ -1995,9 +1993,6 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
   }
 
   char *endptr;
-  // strtoull sets errno if it finds an overflow.
-  // It's value is reset to don't pollute the verification below.
-  errno = 0;
   uint64_t result = strtoull(p_item, &endptr, 10);
 
   // Did we use up all the characters?
@@ -2006,6 +2001,7 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
     result = 0;
   } else if (errno == ERANGE || result > uint_max) {
     *error = ERROR_OVERFLOW;
+    errno = 0;
     result = 0;
   } else {
     *error = 0;

From 3e067f76de1997cd1558ef2aa47d06e40c913d54 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Kothe?= <kothe65@gmail.com>
Date: Wed, 15 Oct 2025 18:27:59 -0300
Subject: [PATCH 57/65] fix: put back const char p

---
 pandas/_libs/src/parser/tokenizer.c | 40 +++++++++++++++--------------
 1 file changed, 21 insertions(+), 19 deletions(-)

diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
index a1b0444386fe3..45d73a2f7d39f 100644
--- a/pandas/_libs/src/parser/tokenizer.c
+++ b/pandas/_libs/src/parser/tokenizer.c
@@ -1910,22 +1910,23 @@ static int copy_string_without_char(char output[PROCESSED_WORD_CAPACITY],
 
 int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
                      int *error, char tsep) {
+  const char *p = p_item;
   // Skip leading spaces.
-  while (isspace_ascii(*p_item)) {
-    ++p_item;
+  while (isspace_ascii(*p)) {
+    ++p;
   }
 
   // Check that there is a first digit.
-  if (!has_digit_int(p_item)) {
+  if (!has_digit_int(p)) {
     // Error...
     *error = ERROR_NO_DIGITS;
     return 0;
   }
 
   char buffer[PROCESSED_WORD_CAPACITY];
-  const size_t str_len = strlen(p_item);
-  if (tsep != '\0' && memchr(p_item, tsep, str_len) != NULL) {
-    const int status = copy_string_without_char(buffer, p_item, str_len, tsep);
+  const size_t str_len = strlen(p);
+  if (tsep != '\0' && memchr(p, tsep, str_len) != NULL) {
+    const int status = copy_string_without_char(buffer, p, str_len, tsep);
 
     if (status != 0) {
       // Word is too big, probably will cause an overflow
@@ -1933,11 +1934,11 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
       return 0;
     }
 
-    p_item = buffer;
+    p = buffer;
   }
 
   char *endptr;
-  int64_t result = strtoll(p_item, &endptr, 10);
+  int64_t result = strtoll(p, &endptr, 10);
 
   // Did we use up all the characters?
   if (!has_only_spaces(endptr)) {
@@ -1958,42 +1959,43 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
 
 uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
                        uint64_t uint_max, int *error, char tsep) {
+  const char *p = p_item;
   // Skip leading spaces.
-  while (isspace_ascii(*p_item)) {
-    ++p_item;
+  while (isspace_ascii(*p)) {
+    ++p;
   }
 
   // Handle sign.
-  if (*p_item == '-') {
+  if (*p == '-') {
     state->seen_sint = 1;
     *error = 0;
     return 0;
-  } else if (*p_item == '+') {
-    p_item++;
+  } else if (*p == '+') {
+    p++;
   }
 
   // Check that there is a first digit.
-  if (!isdigit_ascii(*p_item)) {
+  if (!isdigit_ascii(*p)) {
     // Error...
     *error = ERROR_NO_DIGITS;
     return 0;
   }
 
   char buffer[PROCESSED_WORD_CAPACITY];
-  const size_t str_len = strlen(p_item);
-  if (tsep != '\0' && memchr(p_item, tsep, str_len) != NULL) {
-    const int status = copy_string_without_char(buffer, p_item, str_len, tsep);
+  const size_t str_len = strlen(p);
+  if (tsep != '\0' && memchr(p, tsep, str_len) != NULL) {
+    const int status = copy_string_without_char(buffer, p, str_len, tsep);
 
     if (status != 0) {
       // Word is too big, probably will cause an overflow
       *error = ERROR_OVERFLOW;
       return 0;
     }
-    p_item = buffer;
+    p = buffer;
   }
 
   char *endptr;
-  uint64_t result = strtoull(p_item, &endptr, 10);
+  uint64_t result = strtoull(p, &endptr, 10);
 
   // Did we use up all the characters?
   if (!has_only_spaces(endptr)) {

From c0ed83c886b774feacae39cbaab79991423305a1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Kothe?= <kothe65@gmail.com>
Date: Wed, 15 Oct 2025 19:04:03 -0300
Subject: [PATCH 58/65] fix: improve diff for sign handling

---
 pandas/_libs/src/parser/tokenizer.c | 35 +++++------------------------
 1 file changed, 6 insertions(+), 29 deletions(-)

diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
index 45d73a2f7d39f..7f3a632982f54 100644
--- a/pandas/_libs/src/parser/tokenizer.c
+++ b/pandas/_libs/src/parser/tokenizer.c
@@ -1839,34 +1839,6 @@ int uint64_conflict(uint_state *self) {
   return self->seen_uint && (self->seen_sint || self->seen_null);
 }
 
-/**
- * @brief Check if the character in the pointer indicates a number.
- * It expects that you consumed all leading whitespace.
- *
- * @param p_item Pointer to verify
- * @return Non-zero integer indicating that has a digit 0 otherwise.
- */
-static inline bool has_digit_int(const char *str) {
-  switch (*str) {
-  case '0':
-  case '1':
-  case '2':
-  case '3':
-  case '4':
-  case '5':
-  case '6':
-  case '7':
-  case '8':
-  case '9':
-    return true;
-  case '+':
-  case '-':
-    return str[1] != '\0' && isdigit_ascii(str[1]);
-  default:
-    return false;
-  }
-}
-
 static inline bool has_only_spaces(const char *str) {
   while (*str != '\0' && isspace_ascii(*str)) {
     str++;
@@ -1916,8 +1888,13 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
     ++p;
   }
 
+  // Handle sign.
+  const bool has_sign = *p == '-' || *p == '+';
+  // Handle sign.
+  const char *digit_start = has_sign ? p + 1 : p;
+
   // Check that there is a first digit.
-  if (!has_digit_int(p)) {
+  if (!isdigit_ascii(*digit_start)) {
     // Error...
     *error = ERROR_NO_DIGITS;
     return 0;

From cb60adb10ab9f6d1cb9bc5f6f6f0180d047281b2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Kothe?= <kothe65@gmail.com>
Date: Wed, 15 Oct 2025 19:07:56 -0300
Subject: [PATCH 59/65] fix: improve diff for trailing whitespace

---
 pandas/_libs/src/parser/tokenizer.c | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
index 7f3a632982f54..5a9f3bfedb8a6 100644
--- a/pandas/_libs/src/parser/tokenizer.c
+++ b/pandas/_libs/src/parser/tokenizer.c
@@ -1839,13 +1839,6 @@ int uint64_conflict(uint_state *self) {
   return self->seen_uint && (self->seen_sint || self->seen_null);
 }
 
-static inline bool has_only_spaces(const char *str) {
-  while (*str != '\0' && isspace_ascii(*str)) {
-    str++;
-  }
-  return *str == '\0';
-}
-
 /* Copy a string without `char_to_remove` into `output`.
  */
 static int copy_string_without_char(char output[PROCESSED_WORD_CAPACITY],
@@ -1917,10 +1910,13 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
   char *endptr;
   int64_t result = strtoll(p, &endptr, 10);
 
+  // Skip trailing spaces.
+  while (isspace_ascii(*endptr)) {
+    ++endptr;
+  }
+
   // Did we use up all the characters?
-  if (!has_only_spaces(endptr)) {
-    // Check first for invalid characters because we may
-    // want to skip integer parsing if we find one.
+  if (*endptr) {
     *error = ERROR_INVALID_CHARS;
     result = 0;
   } else if (errno == ERANGE || result > int_max || result < int_min) {
@@ -1974,8 +1970,13 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
   char *endptr;
   uint64_t result = strtoull(p, &endptr, 10);
 
+  // Skip trailing spaces.
+  while (isspace_ascii(*endptr)) {
+    ++endptr;
+  }
+
   // Did we use up all the characters?
-  if (!has_only_spaces(endptr)) {
+  if (*endptr) {
     *error = ERROR_INVALID_CHARS;
     result = 0;
   } else if (errno == ERANGE || result > uint_max) {

From 5117e89658f53b720ac1ed3544d670127378b18f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Kothe?= <kothe65@gmail.com>
Date: Wed, 15 Oct 2025 19:24:56 -0300
Subject: [PATCH 60/65] chore: remove newline to simplify diff even more

---
 pandas/_libs/src/parser/tokenizer.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
index 5a9f3bfedb8a6..37d77de06fba6 100644
--- a/pandas/_libs/src/parser/tokenizer.c
+++ b/pandas/_libs/src/parser/tokenizer.c
@@ -1897,13 +1897,11 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
   const size_t str_len = strlen(p);
   if (tsep != '\0' && memchr(p, tsep, str_len) != NULL) {
     const int status = copy_string_without_char(buffer, p, str_len, tsep);
-
     if (status != 0) {
       // Word is too big, probably will cause an overflow
       *error = ERROR_OVERFLOW;
       return 0;
     }
-
     p = buffer;
   }
 
@@ -1958,7 +1956,6 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
   const size_t str_len = strlen(p);
   if (tsep != '\0' && memchr(p, tsep, str_len) != NULL) {
     const int status = copy_string_without_char(buffer, p, str_len, tsep);
-
     if (status != 0) {
       // Word is too big, probably will cause an overflow
       *error = ERROR_OVERFLOW;

From e1e327a614c151c93e494c9f345c59571c66bac3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Kothe?= <kothe65@gmail.com>
Date: Wed, 15 Oct 2025 19:37:28 -0300
Subject: [PATCH 61/65] chore: drop another superfluous comment

---
 pandas/_libs/src/parser/tokenizer.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
index 37d77de06fba6..d9a754522251d 100644
--- a/pandas/_libs/src/parser/tokenizer.c
+++ b/pandas/_libs/src/parser/tokenizer.c
@@ -1868,7 +1868,6 @@ static int copy_string_without_char(char output[PROCESSED_WORD_CAPACITY],
     left = right + 1;
   }
 
-  // null terminate
   output[bytes_written] = '\0';
   return 0;
 }

From ffcb7c21f2bee19edabebd0775f919e5165886d0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Kothe?= <kothe65@gmail.com>
Date: Wed, 15 Oct 2025 20:00:11 -0300
Subject: [PATCH 62/65] test: xfail python engine with consecutive thousand
 separators

---
 pandas/tests/io/parser/common/test_common_basic.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py
index f38844e167222..766dabba851e0 100644
--- a/pandas/tests/io/parser/common/test_common_basic.py
+++ b/pandas/tests/io/parser/common/test_common_basic.py
@@ -82,7 +82,7 @@ def test_read_csv_local(all_parsers, csv1):
         ("2,,3,4,,,,,,,,,,,,5", 2345),
     ],
 )
-def test_1000_sep(all_parsers, number_csv, expected_number):
+def test_1000_sep(all_parsers, number_csv, expected_number, request):
     parser = all_parsers
     data = f"""A|B|C
 1|{number_csv}|5
@@ -95,6 +95,11 @@ def test_1000_sep(all_parsers, number_csv, expected_number):
         with pytest.raises(ValueError, match=msg):
             parser.read_csv(StringIO(data), sep="|", thousands=",")
         return
+    elif parser.engine == "python" and ",," in number_csv:
+        mark = pytest.mark.xfail(
+            reason="Python engine doesn't allow consecutive thousands separators"
+        )
+        request.applymarker(mark)
 
     result = parser.read_csv(StringIO(data), sep="|", thousands=",")
     tm.assert_frame_equal(result, expected)

From 47b87f9aeed4c0bb836505173a4bef7a661e23ca Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Kothe?= <kothe65@gmail.com>
Date: Wed, 15 Oct 2025 21:52:01 -0300
Subject: [PATCH 63/65] fix: move errno handling to avoid pollution and early
 return

---
 pandas/_libs/src/parser/tokenizer.c | 30 +++++++++++++++--------------
 1 file changed, 16 insertions(+), 14 deletions(-)

diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
index d9a754522251d..4c2a52d6198da 100644
--- a/pandas/_libs/src/parser/tokenizer.c
+++ b/pandas/_libs/src/parser/tokenizer.c
@@ -1907,6 +1907,12 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
   char *endptr;
   int64_t result = strtoll(p, &endptr, 10);
 
+  if (errno == ERANGE || result > int_max || result < int_min) {
+    *error = ERROR_OVERFLOW;
+    errno = 0;
+    return 0;
+  }
+
   // Skip trailing spaces.
   while (isspace_ascii(*endptr)) {
     ++endptr;
@@ -1915,15 +1921,10 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
   // Did we use up all the characters?
   if (*endptr) {
     *error = ERROR_INVALID_CHARS;
-    result = 0;
-  } else if (errno == ERANGE || result > int_max || result < int_min) {
-    *error = ERROR_OVERFLOW;
-    errno = 0;
-    result = 0;
-  } else {
-    *error = 0;
+    return 0;
   }
 
+  *error = 0;
   return result;
 }
 
@@ -1966,6 +1967,12 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
   char *endptr;
   uint64_t result = strtoull(p, &endptr, 10);
 
+  if (errno == ERANGE || result > uint_max) {
+    *error = ERROR_OVERFLOW;
+    errno = 0;
+    return 0;
+  }
+
   // Skip trailing spaces.
   while (isspace_ascii(*endptr)) {
     ++endptr;
@@ -1974,18 +1981,13 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
   // Did we use up all the characters?
   if (*endptr) {
     *error = ERROR_INVALID_CHARS;
-    result = 0;
-  } else if (errno == ERANGE || result > uint_max) {
-    *error = ERROR_OVERFLOW;
-    errno = 0;
-    result = 0;
-  } else {
-    *error = 0;
+    return 0;
   }
 
   if (result > (uint64_t)int_max) {
     state->seen_uint = 1;
   }
 
+  *error = 0;
   return result;
 }

From cd536fbd4893cfdafaef8a07cf60c91da3d52aa4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Kothe?= <kothe65@gmail.com>
Date: Wed, 15 Oct 2025 22:04:08 -0300
Subject: [PATCH 64/65] chore: rename to number for diff

---
 pandas/_libs/src/parser/tokenizer.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
index 4c2a52d6198da..b77e8ab2254a3 100644
--- a/pandas/_libs/src/parser/tokenizer.c
+++ b/pandas/_libs/src/parser/tokenizer.c
@@ -1905,9 +1905,9 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
   }
 
   char *endptr;
-  int64_t result = strtoll(p, &endptr, 10);
+  int64_t number = strtoll(p, &endptr, 10);
 
-  if (errno == ERANGE || result > int_max || result < int_min) {
+  if (errno == ERANGE || number > int_max || number < int_min) {
     *error = ERROR_OVERFLOW;
     errno = 0;
     return 0;
@@ -1925,7 +1925,7 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
   }
 
   *error = 0;
-  return result;
+  return number;
 }
 
 uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
@@ -1965,9 +1965,9 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
   }
 
   char *endptr;
-  uint64_t result = strtoull(p, &endptr, 10);
+  uint64_t number = strtoull(p, &endptr, 10);
 
-  if (errno == ERANGE || result > uint_max) {
+  if (errno == ERANGE || number > uint_max) {
     *error = ERROR_OVERFLOW;
     errno = 0;
     return 0;
@@ -1984,10 +1984,10 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
     return 0;
   }
 
-  if (result > (uint64_t)int_max) {
+  if (number > (uint64_t)int_max) {
     state->seen_uint = 1;
   }
 
   *error = 0;
-  return result;
+  return number;
 }

From 1ef92593e3891859982426d67daf728c8621b362 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Kothe?= <kothe65@gmail.com>
Date: Thu, 16 Oct 2025 10:56:43 -0300
Subject: [PATCH 65/65] chore: add comment explaining consecutive thousand
 separators in C engine

---
 pandas/tests/io/parser/common/test_common_basic.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py
index 766dabba851e0..88dd0543b9020 100644
--- a/pandas/tests/io/parser/common/test_common_basic.py
+++ b/pandas/tests/io/parser/common/test_common_basic.py
@@ -78,6 +78,8 @@ def test_read_csv_local(all_parsers, csv1):
         ("2,334", 2334),
         ("-2,334", -2334),
         ("-2,334,", -2334),
+        # Multiple consecutive thousand separators are allowed in C engine,
+        # but it's not necessarily intended behavior and may change in the future.
         ("2,,,,,,,,,,,,,,,5", 25),
         ("2,,3,4,,,,,,,,,,,,5", 2345),
     ],