Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
c3cc4a1
refactor(parser): use integer parsing functions from stdlib
Alvaro-Kothe Oct 11, 2025
2459313
perf: use a local buffer to store the processed string
Alvaro-Kothe Oct 13, 2025
d8a454e
fix: use macro to fix MSVC build error
Alvaro-Kothe Oct 13, 2025
87789e6
fix: use `bool`
Alvaro-Kothe Oct 13, 2025
2287944
refactor: don't pass PROCESSED_WORD_CAPACITY as a separate argument
Alvaro-Kothe Oct 13, 2025
2bea3c2
perf: write in chunks
Alvaro-Kothe Oct 13, 2025
fb38679
hack: try bigger buffer size for arm error
Alvaro-Kothe Oct 13, 2025
f9ede5c
fix: solution without manipulating the string
Alvaro-Kothe Oct 13, 2025
798c263
some cleanup
Alvaro-Kothe Oct 13, 2025
2f06f19
fix: use ptrdiff_t to fix MSVC build error
Alvaro-Kothe Oct 13, 2025
280b55e
add other exponent cases for completion
Alvaro-Kothe Oct 14, 2025
d85aaf0
fix: use builtin overflow check verification
Alvaro-Kothe Oct 14, 2025
9046ecc
fix: change std to c2x
Alvaro-Kothe Oct 14, 2025
a4e2fb8
Revert previous commits
Alvaro-Kothe Oct 14, 2025
ef82cf4
refactor: move overflow check to header
Alvaro-Kothe Oct 14, 2025
5afeb11
refactor: use overflow check from numpy
Alvaro-Kothe Oct 14, 2025
0ef47a7
fix: handle negative check
Alvaro-Kothe Oct 14, 2025
c840ef0
fix: add test for thousand separator with negative number
Alvaro-Kothe Oct 14, 2025
132342b
move to portable
Alvaro-Kothe Oct 14, 2025
e6977cc
perf: use builtin unsigned long overflow check
Alvaro-Kothe Oct 14, 2025
8120eea
refactor: combine builting and gnuc branches
Alvaro-Kothe Oct 14, 2025
1f5d506
don't assign null
Alvaro-Kothe Oct 14, 2025
479a2ab
fix: perform bound check
Alvaro-Kothe Oct 14, 2025
c37c355
fix: assign error if doesn't have a digit after tsep
Alvaro-Kothe Oct 14, 2025
7d55283
fix: go back to buffer solution
Alvaro-Kothe Oct 14, 2025
ffe50ce
refactor: undo refactor in np_datetime.c
Alvaro-Kothe Oct 14, 2025
af5ad71
fix: fix undefined behavior
Alvaro-Kothe Oct 14, 2025
9211704
fix: fix leftover undefined behaviour
Alvaro-Kothe Oct 14, 2025
d026b01
rewrite `copy_string_without_char`
Alvaro-Kothe Oct 14, 2025
d76ff5f
fix: change solution to safe guard against end_ptr
Alvaro-Kothe Oct 14, 2025
b523a19
test: add some edge cases tests with thousand separator
Alvaro-Kothe Oct 14, 2025
6265172
Update pandas/_libs/src/parser/tokenizer.c
Alvaro-Kothe Oct 14, 2025
abed6c1
fix: error to -1
Alvaro-Kothe Oct 14, 2025
8616f9f
fix: leftover status check
Alvaro-Kothe Oct 14, 2025
c4e0e25
fix: remove duplicate nbytes declaration
Alvaro-Kothe Oct 14, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
299 changes: 148 additions & 151 deletions pandas/_libs/src/parser/tokenizer.c
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,13 @@ GitHub. See Python Software Foundation License and BSD licenses for these.
#include <float.h>
#include <math.h>
#include <stdbool.h>
#include <stdlib.h>

#include "pandas/portable.h"
#include "pandas/vendored/klib/khash.h" // for kh_int64_t, kh_destroy_int64

#define PROCESSED_WORD_CAPACITY 128
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add a comment here that we choose 128 because the Arrow256 allows up to 76 decimal digits, and this is the next highest power of 2?


void coliter_setup(coliter_t *self, parser_t *parser, int64_t i,
int64_t start) {
// column i, starting at 0
Expand Down Expand Up @@ -1834,201 +1837,195 @@ int uint64_conflict(uint_state *self) {
return self->seen_uint && (self->seen_sint || self->seen_null);
}

/**
 * @brief Check whether a string starts with an integer literal.
 *
 * The caller is expected to have consumed all leading whitespace already.
 * A single leading '+' or '-' is accepted only when the character right
 * after it is a digit.
 *
 * @param str String to inspect; may be NULL.
 * @return true if `str` begins with a digit, or with a sign immediately
 *         followed by a digit; false otherwise (including NULL and the
 *         empty string).
 */
static inline bool has_digit_int(const char *str) {
  if (!str) {
    return false;
  }

  const char first = *str;
  if (first == '+' || first == '-') {
    // A lone sign is not a number: the next character must be a digit.
    const char second = str[1];
    return second != '\0' && isdigit_ascii(second);
  }

  // The empty string ends up here as well: '\0' lies outside '0'..'9'.
  return first >= '0' && first <= '9';
}

/**
 * @brief Check that a string holds nothing but whitespace.
 *
 * @param str NUL-terminated string to scan; it is dereferenced without a
 *            NULL check.
 * @return true if every character before the terminator satisfies
 *         `isspace_ascii` (the empty string qualifies); false as soon as
 *         any other character is found.
 */
static inline bool has_only_spaces(const char *str) {
  for (const char *cursor = str; *cursor != '\0'; ++cursor) {
    if (!isspace_ascii(*cursor)) {
      return false;
    }
  }
  return true;
}

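/* Copy `str` into `output`, omitting every occurrence of `char_to_remove`,
 * and null-terminate the result.
 *
 * Returns 0 on success, or -1 if the stripped string plus its null
 * terminator does not fit in PROCESSED_WORD_CAPACITY bytes.
 *
 * NOTE(review): the length passed to memchr is `str_len - bytes_read`, but
 * `bytes_read` does not count the removed characters, so once a separator
 * has been skipped the search can extend past `str + str_len`; the
 * remaining length is `end_ptr - left`.
 */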
static int copy_string_without_char(char output[PROCESSED_WORD_CAPACITY],
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think for now just return -1 for error. It's a nice idea to use the define, but it's inconsistent with all of the other error handling we have in our code base (and CPython standards). We don't want every function/module to implement its own custom error handling.

const char *str, size_t str_len,
char char_to_remove) {
const char *left = str;
const char *right;
const char *end_ptr = str + str_len;
size_t bytes_read = 0;

while ((right = memchr(left, char_to_remove, str_len - bytes_read)) != NULL) {
size_t nbytes = right - left;

// check if we have enough space, including the null terminator.
if (nbytes + bytes_read >= PROCESSED_WORD_CAPACITY) {
return -1;
}
// copy block
memcpy(&output[bytes_read], left, nbytes);
bytes_read += nbytes;
left = right + 1;

// Exit after processing the entire string
if (left >= end_ptr) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need this with the main loop invariant?

Copy link
Member Author

@Alvaro-Kothe Alvaro-Kothe Oct 14, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It reaches there if the word ends with char_to_remove. I put it there to avoid uint underflow. I can put left < end_ptr in the loop invariant if you prefer.

break;
}
}

// copy final chunk that doesn't contain char_to_remove
if (end_ptr > left) {
size_t nbytes = end_ptr - left;
if (nbytes + bytes_read >= PROCESSED_WORD_CAPACITY) {
return -1;
}
memcpy(&output[bytes_read], left, nbytes);
bytes_read += nbytes;
}

// null terminate
output[bytes_read] = '\0';
return 0;
}

int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
int *error, char tsep) {
const char *p = p_item;
// Skip leading spaces.
while (isspace_ascii(*p)) {
++p;
if (!p_item || *p_item == '\0') {
*error = ERROR_NO_DIGITS;
return 0;
}

// Handle sign.
const bool isneg = *p == '-' ? true : false;
// Handle sign.
if (isneg || (*p == '+')) {
p++;
while (isspace_ascii(*p_item)) {
++p_item;
}

// Check that there is a first digit.
if (!isdigit_ascii(*p)) {
// Error...
if (!has_digit_int(p_item)) {
*error = ERROR_NO_DIGITS;
return 0;
}

int64_t number = 0;
if (isneg) {
// If number is greater than pre_min, at least one more digit
// can be processed without overflowing.
int dig_pre_min = -(int_min % 10);
int64_t pre_min = int_min / 10;

// Process the digits.
char d = *p;
if (tsep != '\0') {
while (1) {
if (d == tsep) {
d = *++p;
continue;
} else if (!isdigit_ascii(d)) {
break;
}
if ((number > pre_min) ||
((number == pre_min) && (d - '0' <= dig_pre_min))) {
number = number * 10 - (d - '0');
d = *++p;
} else {
*error = ERROR_OVERFLOW;
return 0;
}
}
} else {
while (isdigit_ascii(d)) {
if ((number > pre_min) ||
((number == pre_min) && (d - '0' <= dig_pre_min))) {
number = number * 10 - (d - '0');
d = *++p;
} else {
*error = ERROR_OVERFLOW;
return 0;
}
}
}
} else {
// If number is less than pre_max, at least one more digit
// can be processed without overflowing.
int64_t pre_max = int_max / 10;
int dig_pre_max = int_max % 10;

// Process the digits.
char d = *p;
if (tsep != '\0') {
while (1) {
if (d == tsep) {
d = *++p;
continue;
} else if (!isdigit_ascii(d)) {
break;
}
if ((number < pre_max) ||
((number == pre_max) && (d - '0' <= dig_pre_max))) {
number = number * 10 + (d - '0');
d = *++p;

} else {
*error = ERROR_OVERFLOW;
return 0;
}
}
} else {
while (isdigit_ascii(d)) {
if ((number < pre_max) ||
((number == pre_max) && (d - '0' <= dig_pre_max))) {
number = number * 10 + (d - '0');
d = *++p;
errno = 0;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need to check that the errno isn't set already before just clearing?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It doesn't seem necessary, this is just parsing a new number and error handling is done in parsers.pyx

char buffer[PROCESSED_WORD_CAPACITY];
if (tsep != '\0' && strchr(p_item, tsep) != NULL) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you use memchr here for consistency?

int status = copy_string_without_char(buffer, p_item, strlen(p_item), tsep);

} else {
*error = ERROR_OVERFLOW;
return 0;
}
}
if (status != 0) {
// Word is too big, probably will cause an overflow
*error = ERROR_OVERFLOW;
return 0;
}

p_item = buffer;
}

// Skip trailing spaces.
while (isspace_ascii(*p)) {
++p;
if (errno == ERANGE) {
*error = ERROR_OVERFLOW;
return 0;
}

// Did we use up all the characters?
if (*p) {
char *endptr = NULL;
int64_t result = strtoll(p_item, &endptr, 10);

if (!has_only_spaces(endptr)) {
// Check first for invalid characters because we may
// want to skip integer parsing if we find one.
*error = ERROR_INVALID_CHARS;
return 0;
result = 0;
} else if (errno == ERANGE || result > int_max || result < int_min) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I suppose this is unrelated to your PR but why does this function accept int_max and int_min as arguments? Assuming those are actually set to INT64_MAX and INT64_MIN this is a no-op, and potentially waste of cycles if the compiler can't optimize it away

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Indeed, it's unnecessary. It's set to INT64_MAX and INT64_MIN.

data[i] = str_to_int64(word, INT64_MIN, INT64_MAX,
&error, parser.thousands)

Should I just check errno or I should also change the function signature?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Changing the function signature is fine (let's do as a separate PR)

*error = ERROR_OVERFLOW;
result = 0;
} else {
*error = 0;
}

*error = 0;
return number;
return result;
}

uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
uint64_t uint_max, int *error, char tsep) {
const char *p = p_item;
// Skip leading spaces.
while (isspace_ascii(*p)) {
++p;
if (!p_item || *p_item == '\0') {
*error = ERROR_NO_DIGITS;
return 0;
}

// Handle sign.
if (*p == '-') {
while (isspace_ascii(*p_item)) {
++p_item;
}

if (*p_item == '-') {
state->seen_sint = 1;
*error = 0;
return 0;
} else if (*p == '+') {
p++;
} else if (*p_item == '+') {
p_item++;
}

// Check that there is a first digit.
if (!isdigit_ascii(*p)) {
// Error...
if (!isdigit_ascii(*p_item)) {
*error = ERROR_NO_DIGITS;
return 0;
}

// If number is less than pre_max, at least one more digit
// can be processed without overflowing.
//
// Process the digits.
uint64_t number = 0;
const uint64_t pre_max = uint_max / 10;
const uint64_t dig_pre_max = uint_max % 10;
char d = *p;
if (tsep != '\0') {
while (1) {
if (d == tsep) {
d = *++p;
continue;
} else if (!isdigit_ascii(d)) {
break;
}
if ((number < pre_max) ||
((number == pre_max) && ((uint64_t)(d - '0') <= dig_pre_max))) {
number = number * 10 + (d - '0');
d = *++p;

} else {
*error = ERROR_OVERFLOW;
return 0;
}
}
} else {
while (isdigit_ascii(d)) {
if ((number < pre_max) ||
((number == pre_max) && ((uint64_t)(d - '0') <= dig_pre_max))) {
number = number * 10 + (d - '0');
d = *++p;
errno = 0;
char buffer[PROCESSED_WORD_CAPACITY];
if (tsep != '\0' && strchr(p_item, tsep) != NULL) {
int status = copy_string_without_char(buffer, p_item, strlen(p_item), tsep);

} else {
*error = ERROR_OVERFLOW;
return 0;
}
if (status != 0) {
// Word is too big, probably will cause an overflow
*error = ERROR_OVERFLOW;
return 0;
}
p_item = buffer;
}

// Skip trailing spaces.
while (isspace_ascii(*p)) {
++p;
}
char *endptr = NULL;
uint64_t result = strtoull(p_item, &endptr, 10);

// Did we use up all the characters?
if (*p) {
if (!has_only_spaces(endptr)) {
*error = ERROR_INVALID_CHARS;
return 0;
result = 0;
} else if (errno == ERANGE || result > uint_max) {
*error = ERROR_OVERFLOW;
result = 0;
} else {
*error = 0;
}

if (number > (uint64_t)int_max) {
if (result > (uint64_t)int_max) {
state->seen_uint = 1;
}

*error = 0;
return number;
return result;
}
18 changes: 14 additions & 4 deletions pandas/tests/io/parser/common/test_common_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,13 +72,23 @@ def test_read_csv_local(all_parsers, csv1):
tm.assert_frame_equal(result, expected)


def test_1000_sep(all_parsers):
@pytest.mark.parametrize(
"number_csv, expected_number",
[
("2,334", 2334),
("-2,334", -2334),
("-2,334,", -2334),
("2,,,,,,,,,,,,,,,5", 25),
("2,,3,4,,,,,,,,,,,,5", 2345),
],
)
def test_1000_sep(all_parsers, number_csv, expected_number):
parser = all_parsers
data = """A|B|C
1|2,334|5
data = f"""A|B|C
1|{number_csv}|5
10|13|10.
"""
expected = DataFrame({"A": [1, 10], "B": [2334, 13], "C": [5, 10.0]})
expected = DataFrame({"A": [1, 10], "B": [expected_number, 13], "C": [5, 10.0]})

if parser.engine == "pyarrow":
msg = "The 'thousands' option is not supported with the 'pyarrow' engine"
Expand Down
Loading