From 5912e823873ec83f48e2215b91a2d81a4b369b91 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Sat, 18 Oct 2025 15:34:41 -0300 Subject: [PATCH] fix: handle overflow during exponent computation --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/_libs/src/parser/tokenizer.c | 48 +++++---------- pandas/tests/io/parser/common/test_float.py | 66 +++++++++++---------- 3 files changed, 51 insertions(+), 64 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 42eeff68712ec..c8c6bd799f5e7 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -1087,6 +1087,7 @@ I/O - Bug in :meth:`HDFStore.select` causing queries on categorical string columns to return unexpected results (:issue:`57608`) - Bug in :meth:`MultiIndex.factorize` incorrectly raising on length-0 indexes (:issue:`57517`) - Bug in :meth:`read_csv` causing segmentation fault when ``encoding_errors`` is not a string. (:issue:`59059`) +- Bug in :meth:`read_csv` for the ``c`` and ``python`` engines where parsing numbers with large exponents caused overflows. Now, numbers with large positive exponents are parsed as ``inf`` or ``-inf`` depending on the sign of the mantissa, while those with large negative exponents are parsed as ``0.0`` (:issue:`62617`, :issue:`38794`, :issue:`62740`) - Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`) - Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`) - Bug in :meth:`read_csv` where the order of the ``na_values`` makes an inconsistency when ``na_values`` is a list non-string values. (:issue:`59303`) diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index a5cfd0e13ceec..de1fc3c295f34 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -1620,9 +1620,9 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, char sci, } double number = 0.; - int exponent = 0; - int num_digits = 0; - int num_decimals = 0; + long int exponent = 0; + long int num_digits = 0; + long int num_decimals = 0; // Process string of digits. while (isdigit_ascii(*p)) { @@ -1671,39 +1671,26 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, char sci, if (maybe_int != NULL) *maybe_int = 0; - // Handle optional sign - negative = 0; - switch (*++p) { - case '-': - negative = 1; - PD_FALLTHROUGH; // Fall through to increment position. - case '+': - p++; - break; - } + // move past scientific notation + p++; - // Process string of digits. - num_digits = 0; - int n = 0; - while (num_digits < max_digits && isdigit_ascii(*p)) { - n = n * 10 + (*p - '0'); - num_digits++; - p++; - } + char *tmp_ptr; + long int n = strtol(p, &tmp_ptr, 10); - if (negative) - exponent -= n; - else - exponent += n; + if (errno == ERANGE || checked_add(exponent, n, &exponent)) { + errno = 0; + exponent = n; + } // If no digits after the 'e'/'E', un-consume it. - if (num_digits == 0) + if (tmp_ptr == p) p--; + else + p = tmp_ptr; } if (exponent > 308) { - *error = ERANGE; - return HUGE_VAL; + number = number == 0 ? 0 : number < 0 ? -HUGE_VAL : HUGE_VAL; } else if (exponent > 0) { number *= e[exponent]; } else if (exponent < -308) { // Subnormal @@ -1718,9 +1705,6 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, char sci, number /= e[-exponent]; } - if (number == HUGE_VAL || number == -HUGE_VAL) - *error = ERANGE; - if (skip_trailing) { // Skip trailing whitespace. while (isspace_ascii(*p)) @@ -1812,8 +1796,6 @@ double round_trip(const char *p, char **q, char decimal, char Py_UNUSED(sci), *maybe_int = 0; if (PyErr_Occurred() != NULL) *error = -1; - else if (r == Py_HUGE_VAL) - *error = (int)Py_HUGE_VAL; PyErr_Clear(); PyGILState_Release(gstate); diff --git a/pandas/tests/io/parser/common/test_float.py b/pandas/tests/io/parser/common/test_float.py index 072294d34fb75..d711ee6053b41 100644 --- a/pandas/tests/io/parser/common/test_float.py +++ b/pandas/tests/io/parser/common/test_float.py @@ -8,15 +8,12 @@ import numpy as np import pytest -from pandas.compat import is_platform_linux - from pandas import DataFrame import pandas._testing as tm pytestmark = pytest.mark.filterwarnings( "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" ) -xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") @@ -42,40 +39,34 @@ def test_scientific_no_exponent(all_parsers_all_precisions): @pytest.mark.parametrize( - "neg_exp", + "value, expected_value", [ - -617, - -100000, - -99999999999999999, + ("0E-617", 0.0), + ("0E99999999", 0.0), + ("-0E99999999", 0.0), + ("-0E-99999999", 0.0), + ("10E-617", 0.0), + ("10E-100000", 0.0), + ("-10E-100000", 0.0), + ("10e-99999999999", 0.0), + ("10e-999999999999", 0.0), + ("10e-9999999999999", 0.0), + ("10E999", np.inf), + ("-10e99999999999", -np.inf), + ("10e99999999999", np.inf), + ("10e999999999999", np.inf), + ("10e9999999999999", np.inf), + ("50060e8007123400", np.inf), + ("-50060e8007123400", -np.inf), ], ) -def test_very_negative_exponent(all_parsers_all_precisions, neg_exp): - # GH#38753 +def test_large_exponent(all_parsers_all_precisions, value, expected_value): + # GH#38753; GH#38794; GH#62740 parser, precision = all_parsers_all_precisions - data = f"data\n10E{neg_exp}" + data = f"data\n{value}" result = parser.read_csv(StringIO(data), float_precision=precision) - expected = DataFrame({"data": [0.0]}) - tm.assert_frame_equal(result, expected) - - -@xfail_pyarrow # AssertionError: Attributes of DataFrame.iloc[:, 0] are different -@pytest.mark.parametrize("exp", [999999999999999999, -999999999999999999]) -def test_too_many_exponent_digits(all_parsers_all_precisions, exp, request): - # GH#38753 - parser, precision = all_parsers_all_precisions - data = f"data\n10E{exp}" - result = parser.read_csv(StringIO(data), float_precision=precision) - if precision == "round_trip": - if exp == 999999999999999999 and is_platform_linux(): - mark = pytest.mark.xfail(reason="GH38794, on Linux gives object result") - request.applymarker(mark) - - value = np.inf if exp > 0 else 0.0 - expected = DataFrame({"data": [value]}) - else: - expected = DataFrame({"data": [f"10E{exp}"]}) - + expected = DataFrame({"data": [expected_value]}) tm.assert_frame_equal(result, expected) @@ -104,3 +95,16 @@ def test_small_int_followed_by_float( expected = DataFrame({"data": [42.0, expected_value]}) tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "value", ["81e31d04049863b72", "d81e31d04049863b72", "81e3104049863b72"] +) +def test_invalid_float_number(all_parsers_all_precisions, value): + # GH#62617 + parser, precision = all_parsers_all_precisions + data = f"h1,h2,h3\ndata1,{value},data3" + + result = parser.read_csv(StringIO(data), float_precision=precision) + expected = DataFrame({"h1": ["data1"], "h2": [value], "h3": "data3"}) + tm.assert_frame_equal(result, expected)