-
-
Notifications
You must be signed in to change notification settings - Fork 19.2k
CLN: Consolidate decimal string parsing functions in tokenizer.c #62823
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
7a2882b
4a6b66b
d2ce67d
7b0be60
4cfc5eb
4fbac54
05504cf
9de4b2a
9ff0af1
ea2c206
ee92b7b
b48f81b
e7fcb65
d32322f
1d97b3a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1716,63 +1716,148 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, char sci, | |
| return number; | ||
| } | ||
|
|
||
| /* Helper functions and macros for string consumption and buffer management */ | ||
|
|
||
| static inline int str_consume_nspan(char **dst, size_t dst_sz, const char **src, | ||
| size_t src_sz, const char *charset) { | ||
| size_t span_sz = strspn(*src, charset); | ||
| if (span_sz > src_sz) { | ||
| span_sz = src_sz; | ||
| } | ||
| // Assuming the span size is within the expected range. | ||
| if (span_sz > INT_MAX) { | ||
| return -1; | ||
| } | ||
| if (dst) { | ||
| if (span_sz > dst_sz) { | ||
| return -1; | ||
| } | ||
| memcpy(*dst, *src, span_sz); | ||
| *dst += span_sz; | ||
| } | ||
| *src += span_sz; | ||
| return (int)span_sz; | ||
| } | ||
|
|
||
| static inline int str_consume_span(char **dst, size_t dst_sz, const char **src, | ||
| const char *charset) { | ||
| return str_consume_nspan(dst, dst_sz, src, SIZE_MAX, charset); | ||
| } | ||
|
|
||
| #define SKIP_SPAN(s, charset) str_consume_span(NULL, 0, &(s), (charset)) | ||
|
|
||
| #define SKIP_NSPAN(s, n, charset) \ | ||
| str_consume_nspan(NULL, 0, &(s), (n), (charset)) | ||
|
|
||
| #define SAFE_CONSUME_SPAN(d, de, s, charset) \ | ||
| do { \ | ||
| size_t _remaining = (de) - (d); \ | ||
| int _ret = str_consume_span(&(d), _remaining, &(s), (charset)); \ | ||
| if (_ret < 0) \ | ||
| return ERROR_OVERFLOW; \ | ||
| } while (0) | ||
|
|
||
| #define SAFE_CONSUME_NSPAN(d, de, s, n, charset) \ | ||
| do { \ | ||
| size_t _remaining = (de) - (d); \ | ||
| if (str_consume_nspan(&(d), _remaining, &(s), (n), (charset)) < 0) \ | ||
| return ERROR_OVERFLOW; \ | ||
| } while (0) | ||
|
|
||
| #define CHECK_BUFFER_SPACE(d, de) \ | ||
| do { \ | ||
| if ((d) >= (de)) \ | ||
| return ERROR_OVERFLOW; \ | ||
| } while (0) | ||
|
|
||
| /* copy a decimal number string with `decimal`, `tsep` as decimal point | ||
| and thousands separator to an equivalent c-locale decimal string (striping | ||
| `tsep`, replacing `decimal` with '.'). The returned memory should be free-d | ||
| with a call to `free`. | ||
| */ | ||
| `tsep`, replacing `decimal` with '.'). The result is written into `dst` | ||
| and null-terminated. */ | ||
|
|
||
| static int _str_copy_decimal_str_c(char *dst, size_t dst_sz, const char *src, | ||
| char **endpos, char decimal, char tsep) { | ||
| static const char *digits = "0123456789"; | ||
| static const char *exponents = "Ee"; | ||
| static const char *signs = "+-"; | ||
| static const char *whitespaces = " \t\n\v\f\r"; | ||
|
|
||
| const char decimals[] = {decimal, '\0'}; | ||
| const char tseps[] = {tsep, '\0'}; | ||
|
|
||
| const char *s = src; | ||
| char *d = dst; | ||
| const char *de = dst + dst_sz; | ||
| bool seen_digit = false; | ||
| int ret; | ||
|
|
||
| if (endpos != NULL) | ||
| *endpos = (char *)s; | ||
|
|
||
| // Skip leading whitespace | ||
| SKIP_SPAN(s, whitespaces); | ||
|
|
||
| // Copy leading sign (optional) | ||
| SAFE_CONSUME_NSPAN(d, de, s, 1, signs); | ||
|
|
||
| static char *_str_copy_decimal_str_c(const char *s, char **endpos, char decimal, | ||
| char tsep) { | ||
| const char *p = s; | ||
| const size_t length = strlen(s); | ||
| char *s_copy = malloc(length + 1); | ||
| char *dst = s_copy; | ||
| // Skip leading whitespace. | ||
| while (isspace_ascii(*p)) | ||
| p++; | ||
| // Copy Leading sign | ||
| if (*p == '+' || *p == '-') { | ||
| *dst++ = *p++; | ||
| } | ||
| // Copy integer part dropping `tsep` | ||
| while (isdigit_ascii(*p)) { | ||
| *dst++ = *p++; | ||
| p += (tsep != '\0' && *p == tsep); | ||
| while ((ret = str_consume_span(&d, de - d, &s, digits))) { | ||
| if (ret < 0) | ||
| return ERROR_OVERFLOW; | ||
| seen_digit = true; | ||
| SKIP_SPAN(s, tseps); | ||
| } | ||
|
|
||
| // Replace `decimal` with '.' | ||
| if (*p == decimal) { | ||
| *dst++ = '.'; | ||
| p++; | ||
| if (SKIP_NSPAN(s, 1, decimals)) { | ||
| CHECK_BUFFER_SPACE(d, de); | ||
| *d++ = '.'; | ||
| } | ||
|
|
||
| // Copy fractional part after decimal (if any) | ||
| while (isdigit_ascii(*p)) { | ||
| *dst++ = *p++; | ||
| if ((ret = str_consume_span(&d, de - d, &s, digits)) > 0) { | ||
| seen_digit = true; | ||
| } else if (ret < 0) { | ||
| return ERROR_OVERFLOW; | ||
| } | ||
|
|
||
| if (!seen_digit) { | ||
| // No digits found in integer or fractional part | ||
| return ERROR_NO_DIGITS; | ||
| } | ||
|
|
||
| // Copy exponent if any | ||
| if (toupper_ascii(*p) == toupper_ascii('E')) { | ||
| *dst++ = *p++; | ||
| // Copy leading exponent sign (if any) | ||
| if (*p == '+' || *p == '-') { | ||
| *dst++ = *p++; | ||
| } | ||
| // Copy exponent digits | ||
| while (isdigit_ascii(*p)) { | ||
| *dst++ = *p++; | ||
| } | ||
| if (decimal && (ret = str_consume_nspan(&d, de - d, &s, 1, exponents)) > 0) { | ||
| SAFE_CONSUME_NSPAN(d, de, s, 1, signs); | ||
| SAFE_CONSUME_SPAN(d, de, s, digits); | ||
| } else if (ret < 0) { | ||
| return ERROR_OVERFLOW; | ||
| } | ||
| *dst++ = '\0'; // terminate | ||
|
|
||
| // Skip trailing whitespace | ||
| SKIP_SPAN(s, whitespaces); | ||
|
|
||
| // Terminate string | ||
| CHECK_BUFFER_SPACE(d, de); | ||
| *d++ = '\0'; | ||
|
|
||
| // Did we use up all the characters? | ||
| if (*s) | ||
| return ERROR_INVALID_CHARS; | ||
|
|
||
| if (endpos != NULL) | ||
| *endpos = (char *)p; | ||
| return s_copy; | ||
| *endpos = (char *)s; | ||
|
|
||
| return 0; | ||
| } | ||
|
|
||
| double round_trip(const char *p, char **q, char decimal, char Py_UNUSED(sci), | ||
| char tsep, int skip_trailing, int *error, int *maybe_int) { | ||
| // 'normalize' representation to C-locale; replace decimal with '.' and | ||
| // remove thousands separator. | ||
| char pc[PROCESSED_WORD_CAPACITY]; | ||
| char *endptr; | ||
| char *pc = _str_copy_decimal_str_c(p, &endptr, decimal, tsep); | ||
| _str_copy_decimal_str_c(pc, sizeof(pc), p, &endptr, decimal, tsep); | ||
| // This is called from a nogil block in parsers.pyx | ||
| // so need to explicitly get GIL before Python calls | ||
| PyGILState_STATE gstate = PyGILState_Ensure(); | ||
|
|
@@ -1799,7 +1884,6 @@ double round_trip(const char *p, char **q, char decimal, char Py_UNUSED(sci), | |
| PyErr_Clear(); | ||
|
|
||
| PyGILState_Release(gstate); | ||
| free(pc); | ||
| if (skip_trailing && q != NULL && *q != p) { | ||
| while (isspace_ascii(**q)) { | ||
| (*q)++; | ||
|
|
@@ -1821,150 +1905,53 @@ int uint64_conflict(uint_state *self) { | |
| return self->seen_uint && (self->seen_sint || self->seen_null); | ||
| } | ||
|
|
||
| /* Copy a string without `char_to_remove` into `output`. | ||
| */ | ||
| static int copy_string_without_char(char output[PROCESSED_WORD_CAPACITY], | ||
| const char *str, size_t str_len, | ||
| char char_to_remove) { | ||
| const char *left = str; | ||
| const char *end_ptr = str + str_len; | ||
| size_t bytes_written = 0; | ||
|
|
||
| while (left < end_ptr) { | ||
| const size_t remaining_bytes_to_read = end_ptr - left; | ||
| const char *right = memchr(left, char_to_remove, remaining_bytes_to_read); | ||
|
|
||
| if (!right) { | ||
| // If it doesn't find the char to remove, just copy until EOS. | ||
| right = end_ptr; | ||
| } | ||
|
|
||
| const size_t chunk_size = right - left; | ||
|
|
||
| if (chunk_size + bytes_written >= PROCESSED_WORD_CAPACITY) { | ||
| return -1; | ||
| } | ||
| memcpy(&output[bytes_written], left, chunk_size); | ||
| bytes_written += chunk_size; | ||
|
|
||
| left = right + 1; | ||
| } | ||
|
|
||
| output[bytes_written] = '\0'; | ||
| return 0; | ||
| } | ||
|
|
||
| int64_t str_to_int64(const char *p_item, int *error, char tsep) { | ||
| const char *p = p_item; | ||
| // Skip leading spaces. | ||
| while (isspace_ascii(*p)) { | ||
| ++p; | ||
| } | ||
|
|
||
| // Handle sign. | ||
| const bool has_sign = *p == '-' || *p == '+'; | ||
| // Handle sign. | ||
| const char *digit_start = has_sign ? p + 1 : p; | ||
|
|
||
| // Check that there is a first digit. | ||
| if (!isdigit_ascii(*digit_start)) { | ||
| // Error... | ||
| *error = ERROR_NO_DIGITS; | ||
| return 0; | ||
| } | ||
|
|
||
| char buffer[PROCESSED_WORD_CAPACITY]; | ||
| const size_t str_len = strlen(p); | ||
| if (tsep != '\0' && memchr(p, tsep, str_len) != NULL) { | ||
| const int status = copy_string_without_char(buffer, p, str_len, tsep); | ||
| if (status != 0) { | ||
| // Word is too big, probably will cause an overflow | ||
| *error = ERROR_OVERFLOW; | ||
| return 0; | ||
| } | ||
| p = buffer; | ||
| char *endptr; | ||
| int status = _str_copy_decimal_str_c(buffer, PROCESSED_WORD_CAPACITY, p_item, | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Previously, in the integer parsing functions, it only processed the word if it identified the necessity for it by checking for the presence of There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think the performance of code you said would be better. However, even if there is no If I have to, I also think about that maybe it would be better to use So instead of sacrificing a little bit of performance, I think it is reasonable the current version of consolidating and simplifying logic. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. However, I'm not an expert, so I’d appreciate your advice. If you still think it’s better to make the change even after considering what I said, I’ll follow your recommendation. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
I honestly don't see much problem in having the repeated logic of dealing with leading and trailing whitespace as it was before.
Considering the goal of the issue was to create a superset of this function, I think it's better to use the function that you are rewriting.
This is a valid point. I am ambivalent about this. I would wait for @WillAyd opinion on this topic. |
||
| &endptr, '\0', tsep); | ||
| if (status != 0) { | ||
| *error = status; | ||
| return 0; | ||
| } | ||
|
|
||
| char *endptr; | ||
| int64_t number = strtoll(p, &endptr, 10); | ||
| int64_t number = strtoll(buffer, &endptr, 10); | ||
|
|
||
| if (errno == ERANGE) { | ||
| *error = *endptr ? ERROR_INVALID_CHARS : ERROR_OVERFLOW; | ||
| errno = 0; | ||
| return 0; | ||
| } | ||
|
|
||
| // Skip trailing spaces. | ||
| while (isspace_ascii(*endptr)) { | ||
| ++endptr; | ||
| } | ||
|
|
||
| // Did we use up all the characters? | ||
| if (*endptr) { | ||
| *error = ERROR_INVALID_CHARS; | ||
| return 0; | ||
| } | ||
|
|
||
| *error = 0; | ||
| return number; | ||
| } | ||
|
|
||
| uint64_t str_to_uint64(uint_state *state, const char *p_item, int *error, | ||
| char tsep) { | ||
| const char *p = p_item; | ||
| // Skip leading spaces. | ||
| while (isspace_ascii(*p)) { | ||
| ++p; | ||
| char buffer[PROCESSED_WORD_CAPACITY]; | ||
| char *endptr; | ||
| int status = _str_copy_decimal_str_c(buffer, PROCESSED_WORD_CAPACITY, p_item, | ||
| &endptr, '\0', tsep); | ||
| if (status != 0) { | ||
| *error = status; | ||
| return 0; | ||
| } | ||
|
|
||
| // Handle sign. | ||
| if (*p == '-') { | ||
| if (buffer[0] == '-') { | ||
| state->seen_sint = 1; | ||
| *error = 0; | ||
| return 0; | ||
| } else if (*p == '+') { | ||
| p++; | ||
| } | ||
|
|
||
| // Check that there is a first digit. | ||
| if (!isdigit_ascii(*p)) { | ||
| // Error... | ||
| *error = ERROR_NO_DIGITS; | ||
| return 0; | ||
| } | ||
|
|
||
| char buffer[PROCESSED_WORD_CAPACITY]; | ||
| const size_t str_len = strlen(p); | ||
| if (tsep != '\0' && memchr(p, tsep, str_len) != NULL) { | ||
| const int status = copy_string_without_char(buffer, p, str_len, tsep); | ||
| if (status != 0) { | ||
| // Word is too big, probably will cause an overflow | ||
| *error = ERROR_OVERFLOW; | ||
| return 0; | ||
| } | ||
| p = buffer; | ||
| } | ||
|
|
||
| char *endptr; | ||
| uint64_t number = strtoull(p, &endptr, 10); | ||
| uint64_t number = strtoull(buffer, &endptr, 10); | ||
|
|
||
| if (errno == ERANGE) { | ||
| *error = *endptr ? ERROR_INVALID_CHARS : ERROR_OVERFLOW; | ||
| errno = 0; | ||
| return 0; | ||
| } | ||
|
|
||
| // Skip trailing spaces. | ||
| while (isspace_ascii(*endptr)) { | ||
| ++endptr; | ||
| } | ||
|
|
||
| // Did we use up all the characters? | ||
| if (*endptr) { | ||
| *error = ERROR_INVALID_CHARS; | ||
| return 0; | ||
| } | ||
|
|
||
| if (number > (uint64_t)INT64_MAX) { | ||
| state->seen_uint = 1; | ||
| } | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This overall seems pretty complicated - what didn't work with
strtok?