Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
291 changes: 139 additions & 152 deletions pandas/_libs/src/parser/tokenizer.c
Original file line number Diff line number Diff line change
Expand Up @@ -1716,63 +1716,148 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, char sci,
return number;
}

/* Helper functions and macros for string consumption and buffer management */

static inline int str_consume_nspan(char **dst, size_t dst_sz, const char **src,
size_t src_sz, const char *charset) {
size_t span_sz = strspn(*src, charset);
if (span_sz > src_sz) {
span_sz = src_sz;
}
// Assuming the span size is within the expected range.
if (span_sz > INT_MAX) {
return -1;
}
if (dst) {
if (span_sz > dst_sz) {
return -1;
}
memcpy(*dst, *src, span_sz);
*dst += span_sz;
}
*src += span_sz;
return (int)span_sz;
}

static inline int str_consume_span(char **dst, size_t dst_sz, const char **src,
const char *charset) {
return str_consume_nspan(dst, dst_sz, src, SIZE_MAX, charset);
}

#define SKIP_SPAN(s, charset) str_consume_span(NULL, 0, &(s), (charset))

#define SKIP_NSPAN(s, n, charset) \
str_consume_nspan(NULL, 0, &(s), (n), (charset))

#define SAFE_CONSUME_SPAN(d, de, s, charset) \
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This overall seems pretty complicated - what didn't work with strtok?

do { \
size_t _remaining = (de) - (d); \
int _ret = str_consume_span(&(d), _remaining, &(s), (charset)); \
if (_ret < 0) \
return ERROR_OVERFLOW; \
} while (0)

#define SAFE_CONSUME_NSPAN(d, de, s, n, charset) \
do { \
size_t _remaining = (de) - (d); \
if (str_consume_nspan(&(d), _remaining, &(s), (n), (charset)) < 0) \
return ERROR_OVERFLOW; \
} while (0)

#define CHECK_BUFFER_SPACE(d, de) \
do { \
if ((d) >= (de)) \
return ERROR_OVERFLOW; \
} while (0)

/* copy a decimal number string with `decimal`, `tsep` as decimal point
and thousands separator to an equivalent c-locale decimal string (striping
`tsep`, replacing `decimal` with '.'). The returned memory should be free-d
with a call to `free`.
*/
`tsep`, replacing `decimal` with '.'). The result is written into `dst`
and null-terminated. */

static int _str_copy_decimal_str_c(char *dst, size_t dst_sz, const char *src,
char **endpos, char decimal, char tsep) {
static const char *digits = "0123456789";
static const char *exponents = "Ee";
static const char *signs = "+-";
static const char *whitespaces = " \t\n\v\f\r";

const char decimals[] = {decimal, '\0'};
const char tseps[] = {tsep, '\0'};

const char *s = src;
char *d = dst;
const char *de = dst + dst_sz;
bool seen_digit = false;
int ret;

if (endpos != NULL)
*endpos = (char *)s;

// Skip leading whitespace
SKIP_SPAN(s, whitespaces);

// Copy leading sign (optional)
SAFE_CONSUME_NSPAN(d, de, s, 1, signs);

static char *_str_copy_decimal_str_c(const char *s, char **endpos, char decimal,
char tsep) {
const char *p = s;
const size_t length = strlen(s);
char *s_copy = malloc(length + 1);
char *dst = s_copy;
// Skip leading whitespace.
while (isspace_ascii(*p))
p++;
// Copy Leading sign
if (*p == '+' || *p == '-') {
*dst++ = *p++;
}
// Copy integer part dropping `tsep`
while (isdigit_ascii(*p)) {
*dst++ = *p++;
p += (tsep != '\0' && *p == tsep);
while ((ret = str_consume_span(&d, de - d, &s, digits))) {
if (ret < 0)
return ERROR_OVERFLOW;
seen_digit = true;
SKIP_SPAN(s, tseps);
}

// Replace `decimal` with '.'
if (*p == decimal) {
*dst++ = '.';
p++;
if (SKIP_NSPAN(s, 1, decimals)) {
CHECK_BUFFER_SPACE(d, de);
*d++ = '.';
}

// Copy fractional part after decimal (if any)
while (isdigit_ascii(*p)) {
*dst++ = *p++;
if ((ret = str_consume_span(&d, de - d, &s, digits)) > 0) {
seen_digit = true;
} else if (ret < 0) {
return ERROR_OVERFLOW;
}

if (!seen_digit) {
// No digits found in integer or fractional part
return ERROR_NO_DIGITS;
}

// Copy exponent if any
if (toupper_ascii(*p) == toupper_ascii('E')) {
*dst++ = *p++;
// Copy leading exponent sign (if any)
if (*p == '+' || *p == '-') {
*dst++ = *p++;
}
// Copy exponent digits
while (isdigit_ascii(*p)) {
*dst++ = *p++;
}
if (decimal && (ret = str_consume_nspan(&d, de - d, &s, 1, exponents)) > 0) {
SAFE_CONSUME_NSPAN(d, de, s, 1, signs);
SAFE_CONSUME_SPAN(d, de, s, digits);
} else if (ret < 0) {
return ERROR_OVERFLOW;
}
*dst++ = '\0'; // terminate

// Skip trailing whitespace
SKIP_SPAN(s, whitespaces);

// Terminate string
CHECK_BUFFER_SPACE(d, de);
*d++ = '\0';

// Did we use up all the characters?
if (*s)
return ERROR_INVALID_CHARS;

if (endpos != NULL)
*endpos = (char *)p;
return s_copy;
*endpos = (char *)s;

return 0;
}

double round_trip(const char *p, char **q, char decimal, char Py_UNUSED(sci),
char tsep, int skip_trailing, int *error, int *maybe_int) {
// 'normalize' representation to C-locale; replace decimal with '.' and
// remove thousands separator.
char pc[PROCESSED_WORD_CAPACITY];
char *endptr;
char *pc = _str_copy_decimal_str_c(p, &endptr, decimal, tsep);
_str_copy_decimal_str_c(pc, sizeof(pc), p, &endptr, decimal, tsep);
// This is called from a nogil block in parsers.pyx
// so need to explicitly get GIL before Python calls
PyGILState_STATE gstate = PyGILState_Ensure();
Expand All @@ -1799,7 +1884,6 @@ double round_trip(const char *p, char **q, char decimal, char Py_UNUSED(sci),
PyErr_Clear();

PyGILState_Release(gstate);
free(pc);
if (skip_trailing && q != NULL && *q != p) {
while (isspace_ascii(**q)) {
(*q)++;
Expand All @@ -1821,150 +1905,53 @@ int uint64_conflict(uint_state *self) {
return self->seen_uint && (self->seen_sint || self->seen_null);
}

/* Copy a string without `char_to_remove` into `output`.
*/
static int copy_string_without_char(char output[PROCESSED_WORD_CAPACITY],
const char *str, size_t str_len,
char char_to_remove) {
const char *left = str;
const char *end_ptr = str + str_len;
size_t bytes_written = 0;

while (left < end_ptr) {
const size_t remaining_bytes_to_read = end_ptr - left;
const char *right = memchr(left, char_to_remove, remaining_bytes_to_read);

if (!right) {
// If it doesn't find the char to remove, just copy until EOS.
right = end_ptr;
}

const size_t chunk_size = right - left;

if (chunk_size + bytes_written >= PROCESSED_WORD_CAPACITY) {
return -1;
}
memcpy(&output[bytes_written], left, chunk_size);
bytes_written += chunk_size;

left = right + 1;
}

output[bytes_written] = '\0';
return 0;
}

int64_t str_to_int64(const char *p_item, int *error, char tsep) {
const char *p = p_item;
// Skip leading spaces.
while (isspace_ascii(*p)) {
++p;
}

// Handle sign.
const bool has_sign = *p == '-' || *p == '+';
// Handle sign.
const char *digit_start = has_sign ? p + 1 : p;

// Check that there is a first digit.
if (!isdigit_ascii(*digit_start)) {
// Error...
*error = ERROR_NO_DIGITS;
return 0;
}

char buffer[PROCESSED_WORD_CAPACITY];
const size_t str_len = strlen(p);
if (tsep != '\0' && memchr(p, tsep, str_len) != NULL) {
const int status = copy_string_without_char(buffer, p, str_len, tsep);
if (status != 0) {
// Word is too big, probably will cause an overflow
*error = ERROR_OVERFLOW;
return 0;
}
p = buffer;
char *endptr;
int status = _str_copy_decimal_str_c(buffer, PROCESSED_WORD_CAPACITY, p_item,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Previously, in the integer parsing functions, it only processed the word if it identified the necessity for it by checking for the presence of tsep. Now you are always doing it. For me, this doesn't seem necessary.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the performance of code you said would be better.

However, even if there is no tsep, processing such as skipping whitespaces is necessary, so various string processing logic before and after must be reverted as before.

If I have to, I also think about that maybe it would be better to use copy_string_without_char like before.

So instead of sacrificing a little bit of performance, I think it is reasonable the current version of consolidating and simplifying logic.

Copy link
Contributor Author

@heoh heoh Oct 30, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

However, I'm not an expert, so I’d appreciate your advice. If you still think it’s better to make the change even after considering what I said, I’ll follow your recommendation.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

However, even if there is no tsep, processing such as skipping whitespaces is necessary,

I honestly don't see much problem in having the repeated logic of dealing with leading and trailing whitespace as it was before.

If I have to, I also think about that maybe it would be better to use copy_string_without_char like before.

Considering the goal of the issue was to create a superset of this function, I think it's better to use the function that you are rewriting.

So instead of sacrificing a little bit of performance, I think it is reasonable the current version of consolidating and simplifying logic.

This is a valid point. I am ambivalent about this. I would wait for @WillAyd opinion on this topic.

&endptr, '\0', tsep);
if (status != 0) {
*error = status;
return 0;
}

char *endptr;
int64_t number = strtoll(p, &endptr, 10);
int64_t number = strtoll(buffer, &endptr, 10);

if (errno == ERANGE) {
*error = *endptr ? ERROR_INVALID_CHARS : ERROR_OVERFLOW;
errno = 0;
return 0;
}

// Skip trailing spaces.
while (isspace_ascii(*endptr)) {
++endptr;
}

// Did we use up all the characters?
if (*endptr) {
*error = ERROR_INVALID_CHARS;
return 0;
}

*error = 0;
return number;
}

uint64_t str_to_uint64(uint_state *state, const char *p_item, int *error,
char tsep) {
const char *p = p_item;
// Skip leading spaces.
while (isspace_ascii(*p)) {
++p;
char buffer[PROCESSED_WORD_CAPACITY];
char *endptr;
int status = _str_copy_decimal_str_c(buffer, PROCESSED_WORD_CAPACITY, p_item,
&endptr, '\0', tsep);
if (status != 0) {
*error = status;
return 0;
}

// Handle sign.
if (*p == '-') {
if (buffer[0] == '-') {
state->seen_sint = 1;
*error = 0;
return 0;
} else if (*p == '+') {
p++;
}

// Check that there is a first digit.
if (!isdigit_ascii(*p)) {
// Error...
*error = ERROR_NO_DIGITS;
return 0;
}

char buffer[PROCESSED_WORD_CAPACITY];
const size_t str_len = strlen(p);
if (tsep != '\0' && memchr(p, tsep, str_len) != NULL) {
const int status = copy_string_without_char(buffer, p, str_len, tsep);
if (status != 0) {
// Word is too big, probably will cause an overflow
*error = ERROR_OVERFLOW;
return 0;
}
p = buffer;
}

char *endptr;
uint64_t number = strtoull(p, &endptr, 10);
uint64_t number = strtoull(buffer, &endptr, 10);

if (errno == ERANGE) {
*error = *endptr ? ERROR_INVALID_CHARS : ERROR_OVERFLOW;
errno = 0;
return 0;
}

// Skip trailing spaces.
while (isspace_ascii(*endptr)) {
++endptr;
}

// Did we use up all the characters?
if (*endptr) {
*error = ERROR_INVALID_CHARS;
return 0;
}

if (number > (uint64_t)INT64_MAX) {
state->seen_uint = 1;
}
Expand Down
Loading