-
-
Notifications
You must be signed in to change notification settings - Fork 19.1k
CLN: use integer parsing functions from stdlib #62658
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
c3cc4a1
2459313
d8a454e
87789e6
2287944
2bea3c2
fb38679
f9ede5c
798c263
2f06f19
280b55e
d85aaf0
9046ecc
a4e2fb8
ef82cf4
5afeb11
0ef47a7
c840ef0
132342b
e6977cc
8120eea
1f5d506
479a2ab
c37c355
7d55283
ffe50ce
af5ad71
9211704
d026b01
d76ff5f
b523a19
6265172
abed6c1
8616f9f
c4e0e25
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -23,10 +23,13 @@ GitHub. See Python Software Foundation License and BSD licenses for these. | |||||
#include <float.h> | ||||||
#include <math.h> | ||||||
#include <stdbool.h> | ||||||
#include <stdlib.h> | ||||||
|
||||||
#include "pandas/portable.h" | ||||||
#include "pandas/vendored/klib/khash.h" // for kh_int64_t, kh_destroy_int64 | ||||||
|
||||||
#define PROCESSED_WORD_CAPACITY 128 | ||||||
|
||||||
void coliter_setup(coliter_t *self, parser_t *parser, int64_t i, | ||||||
int64_t start) { | ||||||
// column i, starting at 0 | ||||||
|
@@ -1834,201 +1837,195 @@ int uint64_conflict(uint_state *self) { | |||||
return self->seen_uint && (self->seen_sint || self->seen_null); | ||||||
} | ||||||
|
||||||
/** | ||||||
* @brief Check if the character in the pointer indicates a number. | ||||||
* It expects that you consumed all leading whitespace. | ||||||
* | ||||||
* @param p_item Pointer to verify | ||||||
* @return Non-zero integer indicating that has a digit 0 otherwise. | ||||||
*/ | ||||||
static inline bool has_digit_int(const char *str) { | ||||||
if (!str || *str == '\0') { | ||||||
return false; | ||||||
} | ||||||
|
||||||
switch (*str) { | ||||||
case '0': | ||||||
case '1': | ||||||
case '2': | ||||||
case '3': | ||||||
case '4': | ||||||
case '5': | ||||||
case '6': | ||||||
case '7': | ||||||
case '8': | ||||||
case '9': | ||||||
return true; | ||||||
case '+': | ||||||
case '-': | ||||||
return str[1] != '\0' && isdigit_ascii(str[1]); | ||||||
WillAyd marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
default: | ||||||
return false; | ||||||
} | ||||||
} | ||||||
|
||||||
static inline bool has_only_spaces(const char *str) { | ||||||
while (*str != '\0' && isspace_ascii(*str)) { | ||||||
str++; | ||||||
} | ||||||
return *str == '\0'; | ||||||
} | ||||||
|
||||||
/* Copy a string without `char_to_remove` into `output`. | ||||||
*/ | ||||||
static int copy_string_without_char(char output[PROCESSED_WORD_CAPACITY], | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think for now just return -1 for error. Its a nice idea to use the define, but its inconsistent with all of the other error handling we have in our code base (and CPython standards). We don't want every function/module to implement its own custom error handling |
||||||
const char *str, size_t str_len, | ||||||
char char_to_remove) { | ||||||
const char *left = str; | ||||||
const char *right; | ||||||
const char *end_ptr = str + str_len; | ||||||
size_t bytes_read = 0; | ||||||
|
||||||
while ((right = memchr(left, char_to_remove, str_len - bytes_read)) != NULL) { | ||||||
size_t nbytes = right - left; | ||||||
|
||||||
// check if we have enough space, including the null terminator. | ||||||
if (nbytes + bytes_read >= PROCESSED_WORD_CAPACITY) { | ||||||
return -1; | ||||||
} | ||||||
// copy block | ||||||
memcpy(&output[bytes_read], left, nbytes); | ||||||
bytes_read += nbytes; | ||||||
left = right + 1; | ||||||
|
||||||
// Exit after processing the entire string | ||||||
if (left >= end_ptr) { | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we need this with the main loop invariant? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It reaches there if the word ends with |
||||||
break; | ||||||
} | ||||||
} | ||||||
|
||||||
// copy final chunk that doesn't contain char_to_remove | ||||||
if (end_ptr > left) { | ||||||
size_t nbytes = end_ptr - left; | ||||||
if (nbytes + bytes_read >= PROCESSED_WORD_CAPACITY) { | ||||||
return -1; | ||||||
} | ||||||
memcpy(&output[bytes_read], left, nbytes); | ||||||
bytes_read += nbytes; | ||||||
} | ||||||
|
||||||
// null terminate | ||||||
output[bytes_read] = '\0'; | ||||||
return 0; | ||||||
} | ||||||
|
||||||
int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, | ||||||
int *error, char tsep) { | ||||||
const char *p = p_item; | ||||||
// Skip leading spaces. | ||||||
while (isspace_ascii(*p)) { | ||||||
++p; | ||||||
if (!p_item || *p_item == '\0') { | ||||||
*error = ERROR_NO_DIGITS; | ||||||
return 0; | ||||||
} | ||||||
|
||||||
// Handle sign. | ||||||
const bool isneg = *p == '-' ? true : false; | ||||||
// Handle sign. | ||||||
if (isneg || (*p == '+')) { | ||||||
p++; | ||||||
while (isspace_ascii(*p_item)) { | ||||||
++p_item; | ||||||
} | ||||||
|
||||||
// Check that there is a first digit. | ||||||
if (!isdigit_ascii(*p)) { | ||||||
// Error... | ||||||
if (!has_digit_int(p_item)) { | ||||||
*error = ERROR_NO_DIGITS; | ||||||
return 0; | ||||||
} | ||||||
|
||||||
int64_t number = 0; | ||||||
if (isneg) { | ||||||
// If number is greater than pre_min, at least one more digit | ||||||
// can be processed without overflowing. | ||||||
int dig_pre_min = -(int_min % 10); | ||||||
int64_t pre_min = int_min / 10; | ||||||
|
||||||
// Process the digits. | ||||||
char d = *p; | ||||||
if (tsep != '\0') { | ||||||
while (1) { | ||||||
if (d == tsep) { | ||||||
d = *++p; | ||||||
continue; | ||||||
} else if (!isdigit_ascii(d)) { | ||||||
break; | ||||||
} | ||||||
if ((number > pre_min) || | ||||||
((number == pre_min) && (d - '0' <= dig_pre_min))) { | ||||||
number = number * 10 - (d - '0'); | ||||||
d = *++p; | ||||||
} else { | ||||||
*error = ERROR_OVERFLOW; | ||||||
return 0; | ||||||
} | ||||||
} | ||||||
} else { | ||||||
while (isdigit_ascii(d)) { | ||||||
if ((number > pre_min) || | ||||||
((number == pre_min) && (d - '0' <= dig_pre_min))) { | ||||||
number = number * 10 - (d - '0'); | ||||||
d = *++p; | ||||||
} else { | ||||||
*error = ERROR_OVERFLOW; | ||||||
return 0; | ||||||
} | ||||||
} | ||||||
} | ||||||
} else { | ||||||
// If number is less than pre_max, at least one more digit | ||||||
// can be processed without overflowing. | ||||||
int64_t pre_max = int_max / 10; | ||||||
int dig_pre_max = int_max % 10; | ||||||
|
||||||
// Process the digits. | ||||||
char d = *p; | ||||||
if (tsep != '\0') { | ||||||
while (1) { | ||||||
if (d == tsep) { | ||||||
d = *++p; | ||||||
continue; | ||||||
} else if (!isdigit_ascii(d)) { | ||||||
break; | ||||||
} | ||||||
if ((number < pre_max) || | ||||||
((number == pre_max) && (d - '0' <= dig_pre_max))) { | ||||||
number = number * 10 + (d - '0'); | ||||||
d = *++p; | ||||||
|
||||||
} else { | ||||||
*error = ERROR_OVERFLOW; | ||||||
return 0; | ||||||
} | ||||||
} | ||||||
} else { | ||||||
while (isdigit_ascii(d)) { | ||||||
if ((number < pre_max) || | ||||||
((number == pre_max) && (d - '0' <= dig_pre_max))) { | ||||||
number = number * 10 + (d - '0'); | ||||||
d = *++p; | ||||||
errno = 0; | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we need to check that the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It doesn't seem necessary, this is just parsing a new number and error handling is done in |
||||||
char buffer[PROCESSED_WORD_CAPACITY]; | ||||||
if (tsep != '\0' && strchr(p_item, tsep) != NULL) { | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you use memchr here for consistency? |
||||||
int status = copy_string_without_char(buffer, p_item, strlen(p_item), tsep); | ||||||
|
||||||
} else { | ||||||
*error = ERROR_OVERFLOW; | ||||||
return 0; | ||||||
} | ||||||
} | ||||||
if (status != 0) { | ||||||
// Word is too big, probably will cause an overflow | ||||||
*error = ERROR_OVERFLOW; | ||||||
return 0; | ||||||
} | ||||||
|
||||||
p_item = buffer; | ||||||
} | ||||||
|
||||||
// Skip trailing spaces. | ||||||
while (isspace_ascii(*p)) { | ||||||
++p; | ||||||
if (errno == ERANGE) { | ||||||
*error = ERROR_OVERFLOW; | ||||||
return 0; | ||||||
} | ||||||
|
||||||
// Did we use up all the characters? | ||||||
if (*p) { | ||||||
char *endptr = NULL; | ||||||
int64_t result = strtoll(p_item, &endptr, 10); | ||||||
|
||||||
if (!has_only_spaces(endptr)) { | ||||||
// Check first for invalid characters because we may | ||||||
// want to skip integer parsing if we find one. | ||||||
*error = ERROR_INVALID_CHARS; | ||||||
return 0; | ||||||
result = 0; | ||||||
} else if (errno == ERANGE || result > int_max || result < int_min) { | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I suppose this is unrelated to your PR but why does this function accept There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Indeed, it's unnecessary. It's set to pandas/pandas/_libs/parsers.pyx Lines 1930 to 1931 in 066a4f7
Should I just check There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Changing the function signature is fine (let's do as a separate PR) |
||||||
*error = ERROR_OVERFLOW; | ||||||
result = 0; | ||||||
} else { | ||||||
*error = 0; | ||||||
} | ||||||
|
||||||
*error = 0; | ||||||
return number; | ||||||
return result; | ||||||
} | ||||||
|
||||||
uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max, | ||||||
uint64_t uint_max, int *error, char tsep) { | ||||||
const char *p = p_item; | ||||||
// Skip leading spaces. | ||||||
while (isspace_ascii(*p)) { | ||||||
++p; | ||||||
if (!p_item || *p_item == '\0') { | ||||||
*error = ERROR_NO_DIGITS; | ||||||
return 0; | ||||||
} | ||||||
|
||||||
// Handle sign. | ||||||
if (*p == '-') { | ||||||
while (isspace_ascii(*p_item)) { | ||||||
++p_item; | ||||||
} | ||||||
|
||||||
if (*p_item == '-') { | ||||||
state->seen_sint = 1; | ||||||
*error = 0; | ||||||
return 0; | ||||||
} else if (*p == '+') { | ||||||
p++; | ||||||
} else if (*p_item == '+') { | ||||||
p_item++; | ||||||
} | ||||||
|
||||||
// Check that there is a first digit. | ||||||
if (!isdigit_ascii(*p)) { | ||||||
// Error... | ||||||
if (!isdigit_ascii(*p_item)) { | ||||||
*error = ERROR_NO_DIGITS; | ||||||
return 0; | ||||||
} | ||||||
|
||||||
// If number is less than pre_max, at least one more digit | ||||||
// can be processed without overflowing. | ||||||
// | ||||||
// Process the digits. | ||||||
uint64_t number = 0; | ||||||
const uint64_t pre_max = uint_max / 10; | ||||||
const uint64_t dig_pre_max = uint_max % 10; | ||||||
char d = *p; | ||||||
if (tsep != '\0') { | ||||||
while (1) { | ||||||
if (d == tsep) { | ||||||
d = *++p; | ||||||
continue; | ||||||
} else if (!isdigit_ascii(d)) { | ||||||
break; | ||||||
} | ||||||
if ((number < pre_max) || | ||||||
((number == pre_max) && ((uint64_t)(d - '0') <= dig_pre_max))) { | ||||||
number = number * 10 + (d - '0'); | ||||||
d = *++p; | ||||||
|
||||||
} else { | ||||||
*error = ERROR_OVERFLOW; | ||||||
return 0; | ||||||
} | ||||||
} | ||||||
} else { | ||||||
while (isdigit_ascii(d)) { | ||||||
if ((number < pre_max) || | ||||||
((number == pre_max) && ((uint64_t)(d - '0') <= dig_pre_max))) { | ||||||
number = number * 10 + (d - '0'); | ||||||
d = *++p; | ||||||
errno = 0; | ||||||
char buffer[PROCESSED_WORD_CAPACITY]; | ||||||
if (tsep != '\0' && strchr(p_item, tsep) != NULL) { | ||||||
int status = copy_string_without_char(buffer, p_item, strlen(p_item), tsep); | ||||||
|
||||||
} else { | ||||||
*error = ERROR_OVERFLOW; | ||||||
return 0; | ||||||
} | ||||||
if (status != 0) { | ||||||
// Word is too big, probably will cause an overflow | ||||||
*error = ERROR_OVERFLOW; | ||||||
return 0; | ||||||
} | ||||||
p_item = buffer; | ||||||
} | ||||||
|
||||||
// Skip trailing spaces. | ||||||
while (isspace_ascii(*p)) { | ||||||
++p; | ||||||
} | ||||||
char *endptr = NULL; | ||||||
uint64_t result = strtoull(p_item, &endptr, 10); | ||||||
|
||||||
// Did we use up all the characters? | ||||||
if (*p) { | ||||||
if (!has_only_spaces(endptr)) { | ||||||
*error = ERROR_INVALID_CHARS; | ||||||
return 0; | ||||||
result = 0; | ||||||
} else if (errno == ERANGE || result > uint_max) { | ||||||
*error = ERROR_OVERFLOW; | ||||||
result = 0; | ||||||
} else { | ||||||
*error = 0; | ||||||
} | ||||||
|
||||||
if (number > (uint64_t)int_max) { | ||||||
if (result > (uint64_t)int_max) { | ||||||
state->seen_uint = 1; | ||||||
} | ||||||
|
||||||
*error = 0; | ||||||
return number; | ||||||
return result; | ||||||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you add a comment here that we choose 128 because the Arrow256 allows up to 76 decimal digits, and this is the next highest power of 2?