Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1087,6 +1087,7 @@ I/O
- Bug in :meth:`HDFStore.select` causing queries on categorical string columns to return unexpected results (:issue:`57608`)
- Bug in :meth:`MultiIndex.factorize` incorrectly raising on length-0 indexes (:issue:`57517`)
- Bug in :meth:`read_csv` causing segmentation fault when ``encoding_errors`` is not a string. (:issue:`59059`)
- Bug in :meth:`read_csv` for the ``c`` and ``python`` engines where parsing numbers with large exponents caused overflows. Now, numbers with large positive exponents are parsed as ``inf`` or ``-inf`` depending on the sign of the mantissa, while those with large negative exponents are parsed as ``0.0`` (:issue:`62617`, :issue:`38794`, :issue:`62740`)
- Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`)
- Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`)
- Bug in :meth:`read_csv` where the order of the ``na_values`` makes an inconsistency when ``na_values`` is a list of non-string values. (:issue:`59303`)
Expand Down
48 changes: 15 additions & 33 deletions pandas/_libs/src/parser/tokenizer.c
Original file line number Diff line number Diff line change
Expand Up @@ -1620,9 +1620,9 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, char sci,
}

double number = 0.;
int exponent = 0;
int num_digits = 0;
int num_decimals = 0;
long int exponent = 0;
long int num_digits = 0;
long int num_decimals = 0;

// Process string of digits.
while (isdigit_ascii(*p)) {
Expand Down Expand Up @@ -1671,39 +1671,26 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, char sci,
if (maybe_int != NULL)
*maybe_int = 0;

// Handle optional sign
negative = 0;
switch (*++p) {
case '-':
negative = 1;
PD_FALLTHROUGH; // Fall through to increment position.
case '+':
p++;
break;
}
// move past scientific notation
p++;

// Process string of digits.
num_digits = 0;
int n = 0;
while (num_digits < max_digits && isdigit_ascii(*p)) {
n = n * 10 + (*p - '0');
num_digits++;
p++;
}
char *tmp_ptr;
long int n = strtol(p, &tmp_ptr, 10);

if (negative)
exponent -= n;
else
exponent += n;
if (errno == ERANGE || checked_add(exponent, n, &exponent)) {
errno = 0;
exponent = n;
}

// If no digits after the 'e'/'E', un-consume it.
if (num_digits == 0)
if (tmp_ptr == p)
p--;
else
p = tmp_ptr;
}

if (exponent > 308) {
*error = ERANGE;
return HUGE_VAL;
number = number == 0 ? 0 : number < 0 ? -HUGE_VAL : HUGE_VAL;
} else if (exponent > 0) {
number *= e[exponent];
} else if (exponent < -308) { // Subnormal
Expand All @@ -1718,9 +1705,6 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, char sci,
number /= e[-exponent];
}

if (number == HUGE_VAL || number == -HUGE_VAL)
*error = ERANGE;

if (skip_trailing) {
// Skip trailing whitespace.
while (isspace_ascii(*p))
Expand Down Expand Up @@ -1812,8 +1796,6 @@ double round_trip(const char *p, char **q, char decimal, char Py_UNUSED(sci),
*maybe_int = 0;
if (PyErr_Occurred() != NULL)
*error = -1;
else if (r == Py_HUGE_VAL)
*error = (int)Py_HUGE_VAL;
PyErr_Clear();

PyGILState_Release(gstate);
Expand Down
66 changes: 35 additions & 31 deletions pandas/tests/io/parser/common/test_float.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,12 @@
import numpy as np
import pytest

from pandas.compat import is_platform_linux

from pandas import DataFrame
import pandas._testing as tm

pytestmark = pytest.mark.filterwarnings(
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)
xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")


Expand All @@ -42,40 +39,34 @@ def test_scientific_no_exponent(all_parsers_all_precisions):


@pytest.mark.parametrize(
"neg_exp",
"value, expected_value",
[
-617,
-100000,
-99999999999999999,
("0E-617", 0.0),
("0E99999999", 0.0),
("-0E99999999", 0.0),
("-0E-99999999", 0.0),
("10E-617", 0.0),
("10E-100000", 0.0),
("-10E-100000", 0.0),
("10e-99999999999", 0.0),
("10e-999999999999", 0.0),
("10e-9999999999999", 0.0),
("10E999", np.inf),
("-10e99999999999", -np.inf),
("10e99999999999", np.inf),
("10e999999999999", np.inf),
("10e9999999999999", np.inf),
("50060e8007123400", np.inf),
("-50060e8007123400", -np.inf),
],
)
def test_very_negative_exponent(all_parsers_all_precisions, neg_exp):
# GH#38753
def test_large_exponent(all_parsers_all_precisions, value, expected_value):
# GH#38753; GH#38794; GH#62740
parser, precision = all_parsers_all_precisions

data = f"data\n10E{neg_exp}"
data = f"data\n{value}"
result = parser.read_csv(StringIO(data), float_precision=precision)
expected = DataFrame({"data": [0.0]})
tm.assert_frame_equal(result, expected)


@xfail_pyarrow # AssertionError: Attributes of DataFrame.iloc[:, 0] are different
@pytest.mark.parametrize("exp", [999999999999999999, -999999999999999999])
def test_too_many_exponent_digits(all_parsers_all_precisions, exp, request):
# GH#38753
parser, precision = all_parsers_all_precisions
data = f"data\n10E{exp}"
result = parser.read_csv(StringIO(data), float_precision=precision)
if precision == "round_trip":
if exp == 999999999999999999 and is_platform_linux():
mark = pytest.mark.xfail(reason="GH38794, on Linux gives object result")
request.applymarker(mark)

value = np.inf if exp > 0 else 0.0
expected = DataFrame({"data": [value]})
else:
expected = DataFrame({"data": [f"10E{exp}"]})

expected = DataFrame({"data": [expected_value]})
tm.assert_frame_equal(result, expected)


Expand Down Expand Up @@ -104,3 +95,16 @@ def test_small_int_followed_by_float(
expected = DataFrame({"data": [42.0, expected_value]})

tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "value",
    [
        "81e31d04049863b72",
        "d81e31d04049863b72",
        "81e3104049863b72",
    ],
)
def test_invalid_float_number(all_parsers_all_precisions, value):
    # GH#62617
    # Tokens that resemble scientific notation but contain non-numeric
    # characters are expected to come back unchanged, as strings.
    parser, precision = all_parsers_all_precisions
    buffer = StringIO(f"h1,h2,h3\ndata1,{value},data3")

    expected = DataFrame({"h1": ["data1"], "h2": [value], "h3": "data3"})
    result = parser.read_csv(buffer, float_precision=precision)
    tm.assert_frame_equal(result, expected)
Loading