diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 61e96fc835e4d..b77e8ab2254a3 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -23,10 +23,15 @@ GitHub. See Python Software Foundation License and BSD licenses for these. #include #include #include +#include #include "pandas/portable.h" #include "pandas/vendored/klib/khash.h" // for kh_int64_t, kh_destroy_int64 +// Arrow256 allows up to 76 decimal digits. +// We rounded up to the next power of 2. +#define PROCESSED_WORD_CAPACITY 128 + void coliter_setup(coliter_t *self, parser_t *parser, int64_t i, int64_t start) { // column i, starting at 0 @@ -1834,6 +1839,39 @@ int uint64_conflict(uint_state *self) { return self->seen_uint && (self->seen_sint || self->seen_null); } +/* Copy a string without `char_to_remove` into `output`. + */ +static int copy_string_without_char(char output[PROCESSED_WORD_CAPACITY], + const char *str, size_t str_len, + char char_to_remove) { + const char *left = str; + const char *end_ptr = str + str_len; + size_t bytes_written = 0; + + while (left < end_ptr) { + const size_t remaining_bytes_to_read = end_ptr - left; + const char *right = memchr(left, char_to_remove, remaining_bytes_to_read); + + if (!right) { + // If it doesn't find the char to remove, just copy until EOS. + right = end_ptr; + } + + const size_t chunk_size = right - left; + + if (chunk_size + bytes_written >= PROCESSED_WORD_CAPACITY) { + return -1; + } + memcpy(&output[bytes_written], left, chunk_size); + bytes_written += chunk_size; + + left = right + 1; + } + + output[bytes_written] = '\0'; + return 0; +} + int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, int *error, char tsep) { const char *p = p_item; @@ -1843,105 +1881,45 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, } // Handle sign. - const bool isneg = *p == '-' ? true : false; + const bool has_sign = *p == '-' || *p == '+'; // Handle sign. - if (isneg || (*p == '+')) { - p++; - } + const char *digit_start = has_sign ? p + 1 : p; // Check that there is a first digit. - if (!isdigit_ascii(*p)) { + if (!isdigit_ascii(*digit_start)) { // Error... *error = ERROR_NO_DIGITS; return 0; } - int64_t number = 0; - if (isneg) { - // If number is greater than pre_min, at least one more digit - // can be processed without overflowing. - int dig_pre_min = -(int_min % 10); - int64_t pre_min = int_min / 10; - - // Process the digits. - char d = *p; - if (tsep != '\0') { - while (1) { - if (d == tsep) { - d = *++p; - continue; - } else if (!isdigit_ascii(d)) { - break; - } - if ((number > pre_min) || - ((number == pre_min) && (d - '0' <= dig_pre_min))) { - number = number * 10 - (d - '0'); - d = *++p; - } else { - *error = ERROR_OVERFLOW; - return 0; - } - } - } else { - while (isdigit_ascii(d)) { - if ((number > pre_min) || - ((number == pre_min) && (d - '0' <= dig_pre_min))) { - number = number * 10 - (d - '0'); - d = *++p; - } else { - *error = ERROR_OVERFLOW; - return 0; - } - } + char buffer[PROCESSED_WORD_CAPACITY]; + const size_t str_len = strlen(p); + if (tsep != '\0' && memchr(p, tsep, str_len) != NULL) { + const int status = copy_string_without_char(buffer, p, str_len, tsep); + if (status != 0) { + // Word is too big, probably will cause an overflow + *error = ERROR_OVERFLOW; + return 0; } - } else { - // If number is less than pre_max, at least one more digit - // can be processed without overflowing. - int64_t pre_max = int_max / 10; - int dig_pre_max = int_max % 10; - - // Process the digits. - char d = *p; - if (tsep != '\0') { - while (1) { - if (d == tsep) { - d = *++p; - continue; - } else if (!isdigit_ascii(d)) { - break; - } - if ((number < pre_max) || - ((number == pre_max) && (d - '0' <= dig_pre_max))) { - number = number * 10 + (d - '0'); - d = *++p; + p = buffer; + } - } else { - *error = ERROR_OVERFLOW; - return 0; - } - } - } else { - while (isdigit_ascii(d)) { - if ((number < pre_max) || - ((number == pre_max) && (d - '0' <= dig_pre_max))) { - number = number * 10 + (d - '0'); - d = *++p; + char *endptr; + int64_t number = strtoll(p, &endptr, 10); - } else { - *error = ERROR_OVERFLOW; - return 0; - } - } - } + if (errno == ERANGE || number > int_max || number < int_min) { + *error = ERROR_OVERFLOW; + errno = 0; + return 0; } // Skip trailing spaces. - while (isspace_ascii(*p)) { - ++p; + while (isspace_ascii(*endptr)) { + ++endptr; } // Did we use up all the characters? - if (*p) { + if (*endptr) { *error = ERROR_INVALID_CHARS; return 0; } @@ -1974,53 +1952,34 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max, return 0; } - // If number is less than pre_max, at least one more digit - // can be processed without overflowing. - // - // Process the digits. - uint64_t number = 0; - const uint64_t pre_max = uint_max / 10; - const uint64_t dig_pre_max = uint_max % 10; - char d = *p; - if (tsep != '\0') { - while (1) { - if (d == tsep) { - d = *++p; - continue; - } else if (!isdigit_ascii(d)) { - break; - } - if ((number < pre_max) || - ((number == pre_max) && ((uint64_t)(d - '0') <= dig_pre_max))) { - number = number * 10 + (d - '0'); - d = *++p; - - } else { - *error = ERROR_OVERFLOW; - return 0; - } + char buffer[PROCESSED_WORD_CAPACITY]; + const size_t str_len = strlen(p); + if (tsep != '\0' && memchr(p, tsep, str_len) != NULL) { + const int status = copy_string_without_char(buffer, p, str_len, tsep); + if (status != 0) { + // Word is too big, probably will cause an overflow + *error = ERROR_OVERFLOW; + return 0; } - } else { - while (isdigit_ascii(d)) { - if ((number < pre_max) || - ((number == pre_max) && ((uint64_t)(d - '0') <= dig_pre_max))) { - number = number * 10 + (d - '0'); - d = *++p; + p = buffer; + } - } else { - *error = ERROR_OVERFLOW; - return 0; - } - } + char *endptr; + uint64_t number = strtoull(p, &endptr, 10); + + if (errno == ERANGE || number > uint_max) { + *error = ERROR_OVERFLOW; + errno = 0; + return 0; } // Skip trailing spaces. - while (isspace_ascii(*p)) { - ++p; + while (isspace_ascii(*endptr)) { + ++endptr; } // Did we use up all the characters? - if (*p) { + if (*endptr) { *error = ERROR_INVALID_CHARS; return 0; } diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py index 3680273f5e98a..766dabba851e0 100644 --- a/pandas/tests/io/parser/common/test_common_basic.py +++ b/pandas/tests/io/parser/common/test_common_basic.py @@ -72,19 +72,34 @@ def test_read_csv_local(all_parsers, csv1): tm.assert_frame_equal(result, expected) -def test_1000_sep(all_parsers): +@pytest.mark.parametrize( + "number_csv, expected_number", + [ + ("2,334", 2334), + ("-2,334", -2334), + ("-2,334,", -2334), + ("2,,,,,,,,,,,,,,,5", 25), + ("2,,3,4,,,,,,,,,,,,5", 2345), + ], +) +def test_1000_sep(all_parsers, number_csv, expected_number, request): parser = all_parsers - data = """A|B|C -1|2,334|5 + data = f"""A|B|C +1|{number_csv}|5 10|13|10. """ - expected = DataFrame({"A": [1, 10], "B": [2334, 13], "C": [5, 10.0]}) + expected = DataFrame({"A": [1, 10], "B": [expected_number, 13], "C": [5, 10.0]}) if parser.engine == "pyarrow": msg = "The 'thousands' option is not supported with the 'pyarrow' engine" with pytest.raises(ValueError, match=msg): parser.read_csv(StringIO(data), sep="|", thousands=",") return + elif parser.engine == "python" and ",," in number_csv: + mark = pytest.mark.xfail( + reason="Python engine doesn't allow consecutive thousands separators" + ) + request.applymarker(mark) result = parser.read_csv(StringIO(data), sep="|", thousands=",") tm.assert_frame_equal(result, expected)