diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 8d8691ada1d38..27a7d2b4c99ec 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -1716,63 +1716,148 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, char sci, return number; } +/* Helper functions and macros for string consumption and buffer management */ + +static inline int str_consume_nspan(char **dst, size_t dst_sz, const char **src, + size_t src_sz, const char *charset) { + size_t span_sz = strspn(*src, charset); + if (span_sz > src_sz) { + span_sz = src_sz; + } + // Assuming the span size is within the expected range. + if (span_sz > INT_MAX) { + return -1; + } + if (dst) { + if (span_sz > dst_sz) { + return -1; + } + memcpy(*dst, *src, span_sz); + *dst += span_sz; + } + *src += span_sz; + return (int)span_sz; +} + +static inline int str_consume_span(char **dst, size_t dst_sz, const char **src, + const char *charset) { + return str_consume_nspan(dst, dst_sz, src, SIZE_MAX, charset); +} + +#define SKIP_SPAN(s, charset) str_consume_span(NULL, 0, &(s), (charset)) + +#define SKIP_NSPAN(s, n, charset) \ + str_consume_nspan(NULL, 0, &(s), (n), (charset)) + +#define SAFE_CONSUME_SPAN(d, de, s, charset) \ + do { \ + size_t _remaining = (de) - (d); \ + int _ret = str_consume_span(&(d), _remaining, &(s), (charset)); \ + if (_ret < 0) \ + return ERROR_OVERFLOW; \ + } while (0) + +#define SAFE_CONSUME_NSPAN(d, de, s, n, charset) \ + do { \ + size_t _remaining = (de) - (d); \ + if (str_consume_nspan(&(d), _remaining, &(s), (n), (charset)) < 0) \ + return ERROR_OVERFLOW; \ + } while (0) + +#define CHECK_BUFFER_SPACE(d, de) \ + do { \ + if ((d) >= (de)) \ + return ERROR_OVERFLOW; \ + } while (0) + /* copy a decimal number string with `decimal`, `tsep` as decimal point and thousands separator to an equivalent c-locale decimal string (striping - `tsep`, replacing `decimal` with '.'). The returned memory should be free-d - with a call to `free`. -*/ + `tsep`, replacing `decimal` with '.'). The result is written into `dst` + and null-terminated. */ + +static int _str_copy_decimal_str_c(char *dst, size_t dst_sz, const char *src, + char **endpos, char decimal, char tsep) { + static const char *digits = "0123456789"; + static const char *exponents = "Ee"; + static const char *signs = "+-"; + static const char *whitespaces = " \t\n\v\f\r"; + + const char decimals[] = {decimal, '\0'}; + const char tseps[] = {tsep, '\0'}; + + const char *s = src; + char *d = dst; + const char *de = dst + dst_sz; + bool seen_digit = false; + int ret; + + if (endpos != NULL) + *endpos = (char *)s; + + // Skip leading whitespace + SKIP_SPAN(s, whitespaces); + + // Copy leading sign (optional) + SAFE_CONSUME_NSPAN(d, de, s, 1, signs); -static char *_str_copy_decimal_str_c(const char *s, char **endpos, char decimal, - char tsep) { - const char *p = s; - const size_t length = strlen(s); - char *s_copy = malloc(length + 1); - char *dst = s_copy; - // Skip leading whitespace. - while (isspace_ascii(*p)) - p++; - // Copy Leading sign - if (*p == '+' || *p == '-') { - *dst++ = *p++; - } // Copy integer part dropping `tsep` - while (isdigit_ascii(*p)) { - *dst++ = *p++; - p += (tsep != '\0' && *p == tsep); + while ((ret = str_consume_span(&d, de - d, &s, digits))) { + if (ret < 0) + return ERROR_OVERFLOW; + seen_digit = true; + SKIP_SPAN(s, tseps); } + // Replace `decimal` with '.' - if (*p == decimal) { - *dst++ = '.'; - p++; + if (SKIP_NSPAN(s, 1, decimals)) { + CHECK_BUFFER_SPACE(d, de); + *d++ = '.'; } + // Copy fractional part after decimal (if any) - while (isdigit_ascii(*p)) { - *dst++ = *p++; + if ((ret = str_consume_span(&d, de - d, &s, digits)) > 0) { + seen_digit = true; + } else if (ret < 0) { + return ERROR_OVERFLOW; + } + + if (!seen_digit) { + // No digits found in integer or fractional part + return ERROR_NO_DIGITS; } + // Copy exponent if any - if (toupper_ascii(*p) == toupper_ascii('E')) { - *dst++ = *p++; - // Copy leading exponent sign (if any) - if (*p == '+' || *p == '-') { - *dst++ = *p++; - } - // Copy exponent digits - while (isdigit_ascii(*p)) { - *dst++ = *p++; - } + if (decimal && (ret = str_consume_nspan(&d, de - d, &s, 1, exponents)) > 0) { + SAFE_CONSUME_NSPAN(d, de, s, 1, signs); + SAFE_CONSUME_SPAN(d, de, s, digits); + } else if (ret < 0) { + return ERROR_OVERFLOW; } - *dst++ = '\0'; // terminate + + // Skip trailing whitespace + SKIP_SPAN(s, whitespaces); + + // Terminate string + CHECK_BUFFER_SPACE(d, de); + *d++ = '\0'; + + // Did we use up all the characters? + if (*s) + return ERROR_INVALID_CHARS; + if (endpos != NULL) - *endpos = (char *)p; - return s_copy; + *endpos = (char *)s; + + return 0; } double round_trip(const char *p, char **q, char decimal, char Py_UNUSED(sci), char tsep, int skip_trailing, int *error, int *maybe_int) { // 'normalize' representation to C-locale; replace decimal with '.' and // remove thousands separator. + char pc[PROCESSED_WORD_CAPACITY]; char *endptr; - char *pc = _str_copy_decimal_str_c(p, &endptr, decimal, tsep); + _str_copy_decimal_str_c(pc, sizeof(pc), p, &endptr, decimal, tsep); // This is called from a nogil block in parsers.pyx // so need to explicitly get GIL before Python calls PyGILState_STATE gstate = PyGILState_Ensure(); @@ -1799,7 +1884,6 @@ double round_trip(const char *p, char **q, char decimal, char Py_UNUSED(sci), PyErr_Clear(); PyGILState_Release(gstate); - free(pc); if (skip_trailing && q != NULL && *q != p) { while (isspace_ascii(**q)) { (*q)++; @@ -1821,72 +1905,17 @@ int uint64_conflict(uint_state *self) { return self->seen_uint && (self->seen_sint || self->seen_null); } -/* Copy a string without `char_to_remove` into `output`. - */ -static int copy_string_without_char(char output[PROCESSED_WORD_CAPACITY], - const char *str, size_t str_len, - char char_to_remove) { - const char *left = str; - const char *end_ptr = str + str_len; - size_t bytes_written = 0; - - while (left < end_ptr) { - const size_t remaining_bytes_to_read = end_ptr - left; - const char *right = memchr(left, char_to_remove, remaining_bytes_to_read); - - if (!right) { - // If it doesn't find the char to remove, just copy until EOS. - right = end_ptr; - } - - const size_t chunk_size = right - left; - - if (chunk_size + bytes_written >= PROCESSED_WORD_CAPACITY) { - return -1; - } - memcpy(&output[bytes_written], left, chunk_size); - bytes_written += chunk_size; - - left = right + 1; - } - - output[bytes_written] = '\0'; - return 0; -} - int64_t str_to_int64(const char *p_item, int *error, char tsep) { - const char *p = p_item; - // Skip leading spaces. - while (isspace_ascii(*p)) { - ++p; - } - - // Handle sign. - const bool has_sign = *p == '-' || *p == '+'; - // Handle sign. - const char *digit_start = has_sign ? p + 1 : p; - - // Check that there is a first digit. - if (!isdigit_ascii(*digit_start)) { - // Error... - *error = ERROR_NO_DIGITS; - return 0; - } - char buffer[PROCESSED_WORD_CAPACITY]; - const size_t str_len = strlen(p); - if (tsep != '\0' && memchr(p, tsep, str_len) != NULL) { - const int status = copy_string_without_char(buffer, p, str_len, tsep); - if (status != 0) { - // Word is too big, probably will cause an overflow - *error = ERROR_OVERFLOW; - return 0; - } - p = buffer; + char *endptr; + int status = _str_copy_decimal_str_c(buffer, PROCESSED_WORD_CAPACITY, p_item, + &endptr, '\0', tsep); + if (status != 0) { + *error = status; + return 0; } - char *endptr; - int64_t number = strtoll(p, &endptr, 10); + int64_t number = strtoll(buffer, &endptr, 10); if (errno == ERANGE) { *error = *endptr ? ERROR_INVALID_CHARS : ERROR_OVERFLOW; @@ -1894,59 +1923,28 @@ int64_t str_to_int64(const char *p_item, int *error, char tsep) { return 0; } - // Skip trailing spaces. - while (isspace_ascii(*endptr)) { - ++endptr; - } - - // Did we use up all the characters? - if (*endptr) { - *error = ERROR_INVALID_CHARS; - return 0; - } - *error = 0; return number; } uint64_t str_to_uint64(uint_state *state, const char *p_item, int *error, char tsep) { - const char *p = p_item; - // Skip leading spaces. - while (isspace_ascii(*p)) { - ++p; + char buffer[PROCESSED_WORD_CAPACITY]; + char *endptr; + int status = _str_copy_decimal_str_c(buffer, PROCESSED_WORD_CAPACITY, p_item, + &endptr, '\0', tsep); + if (status != 0) { + *error = status; + return 0; } - // Handle sign. - if (*p == '-') { + if (buffer[0] == '-') { state->seen_sint = 1; *error = 0; return 0; - } else if (*p == '+') { - p++; - } - - // Check that there is a first digit. - if (!isdigit_ascii(*p)) { - // Error... - *error = ERROR_NO_DIGITS; - return 0; - } - - char buffer[PROCESSED_WORD_CAPACITY]; - const size_t str_len = strlen(p); - if (tsep != '\0' && memchr(p, tsep, str_len) != NULL) { - const int status = copy_string_without_char(buffer, p, str_len, tsep); - if (status != 0) { - // Word is too big, probably will cause an overflow - *error = ERROR_OVERFLOW; - return 0; - } - p = buffer; } - char *endptr; - uint64_t number = strtoull(p, &endptr, 10); + uint64_t number = strtoull(buffer, &endptr, 10); if (errno == ERANGE) { *error = *endptr ? ERROR_INVALID_CHARS : ERROR_OVERFLOW; @@ -1954,17 +1952,6 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int *error, return 0; } - // Skip trailing spaces. - while (isspace_ascii(*endptr)) { - ++endptr; - } - - // Did we use up all the characters? - if (*endptr) { - *error = ERROR_INVALID_CHARS; - return 0; - } - if (number > (uint64_t)INT64_MAX) { state->seen_uint = 1; }