From 0d61b8dbb0e5de44b410e3a94a29e9bc8ea53779 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Sat, 4 Oct 2025 18:24:35 -0300 Subject: [PATCH 1/6] fix(parser): integer overflow reads as PyLongObject --- pandas/_libs/parsers.pyx | 36 +++++++++++++++++++++++++++++++++--- 1 file changed, 33 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 91eddc3261164..20cc39f2760b6 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -29,6 +29,7 @@ from cpython.exc cimport ( PyErr_Fetch, PyErr_Occurred, ) +from cpython.long cimport PyLong_FromString from cpython.object cimport PyObject from cpython.ref cimport ( Py_INCREF, @@ -1081,9 +1082,8 @@ cdef class TextReader: np.dtype("object"), i, start, end, 0, 0, na_hashset, na_fset) except OverflowError: - col_res, na_count = self._convert_with_dtype( - np.dtype("object"), i, start, end, na_filter, - 0, na_hashset, na_fset) + col_res, na_count = _try_pylong(self.parser, i, start, + end, na_filter, na_hashset) if col_res is not None: break @@ -1873,6 +1873,36 @@ cdef int _try_int64_nogil(parser_t *parser, int64_t col, return 0 +cdef _try_pylong(parser_t *parser, Py_ssize_t col, + int64_t line_start, int64_t line_end, + bint na_filter, kh_str_starts_t *na_hashset): + cdef: + int na_count = 0 + Py_ssize_t lines + coliter_t it + const char *word = NULL + ndarray[object] result + object NA = na_values[np.object_] + + lines = line_end - line_start + result = np.empty(lines, dtype=object) + coliter_setup(&it, parser, col, line_start) + + for i in range(lines): + COLITER_NEXT(it, word) + if na_filter and kh_get_str_starts_item(na_hashset, word): + # in the hash table + na_count += 1 + result[i] = NA + continue + + py_int = PyLong_FromString(word, NULL, 10) + if py_int is None: + raise ValueError("Invalid integer ", word) + result[i] = py_int + + return result, na_count + # -> tuple[ndarray[bool], int] cdef _try_bool_flex(parser_t *parser, int64_t col, From 
1ff5245a6016bc15e247c93c539f9f5f413c5b84 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Sat, 4 Oct 2025 18:55:32 -0300 Subject: [PATCH 2/6] test(parser): update expected results for overflow tests --- pandas/tests/io/parser/common/test_ints.py | 31 ++++++++++++++-------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/pandas/tests/io/parser/common/test_ints.py b/pandas/tests/io/parser/common/test_ints.py index 9322e8d54f5b8..73b8536f8964e 100644 --- a/pandas/tests/io/parser/common/test_ints.py +++ b/pandas/tests/io/parser/common/test_ints.py @@ -144,17 +144,22 @@ def test_int64_overflow(all_parsers, conv, request): if parser.engine == "pyarrow": mark = pytest.mark.xfail(reason="parses to float64") request.applymarker(mark) + elif parser.engine == "python": + mark = pytest.mark.xfail( + reason="TODO: Python engine reads bigint as string" + ) + request.applymarker(mark) result = parser.read_csv(StringIO(data)) expected = DataFrame( [ - "00013007854817840016671868", - "00013007854817840016749251", - "00013007854817840016754630", - "00013007854817840016781876", - "00013007854817840017028824", - "00013007854817840017963235", - "00013007854817840018860166", + 13007854817840016671868, + 13007854817840016749251, + 13007854817840016754630, + 13007854817840016781876, + 13007854817840017028824, + 13007854817840017963235, + 13007854817840018860166, ], columns=["ID"], ) @@ -185,7 +190,7 @@ def test_int64_overflow(all_parsers, conv, request): ) def test_int64_uint64_range(all_parsers, val): # These numbers fall right inside the int64-uint64 - # range, so they should be parsed as string. + # range, so they should be parsed as integer. 
parser = all_parsers result = parser.read_csv(StringIO(str(val)), header=None) @@ -197,13 +202,17 @@ def test_int64_uint64_range(all_parsers, val): @pytest.mark.parametrize( "val", [np.iinfo(np.uint64).max + 1, np.iinfo(np.int64).min - 1] ) -def test_outside_int64_uint64_range(all_parsers, val): +def test_outside_int64_uint64_range(all_parsers, val, request): # These numbers fall just outside the int64-uint64 - # range, so they should be parsed as string. + # range, so they should be parsed as object. parser = all_parsers + if parser.engine == "python": + mark = pytest.mark.xfail(reason="TODO: Python engine reads bigint as string") + request.applymarker(mark) + result = parser.read_csv(StringIO(str(val)), header=None) - expected = DataFrame([str(val)]) + expected = DataFrame([val]) tm.assert_frame_equal(result, expected) From 6cfc8b84e535e89bd8ed4758a519b5802aa3699f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Sat, 4 Oct 2025 19:01:18 -0300 Subject: [PATCH 3/6] docs(io): add entry in whatsnew --- doc/source/whatsnew/v3.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 8c8a16af6bd34..ae4be412c056f 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -1079,6 +1079,7 @@ I/O - Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`) - Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`) - Bug in :meth:`read_csv` where the order of the ``na_values`` makes an inconsistency when ``na_values`` is a list non-string values. (:issue:`59303`) +- Bug in :meth:`read_csv` with ``engine="c"`` reading big integers as strings. Now reads them as Python integers.
(:issue:`51295`) - Bug in :meth:`read_csv` with ``engine="pyarrow"`` and ``dtype="Int64"`` losing precision (:issue:`56136`) - Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. (:issue:`58159`) - Bug in :meth:`read_html` where ``rowspan`` in header row causes incorrect conversion to ``DataFrame``. (:issue:`60210`) From 3e1a991963568bf9bdedaba3577bef4c6cb66398 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Sun, 5 Oct 2025 09:27:28 -0300 Subject: [PATCH 4/6] test(parser): add test with overflow followed by str --- pandas/tests/io/parser/common/test_ints.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/pandas/tests/io/parser/common/test_ints.py b/pandas/tests/io/parser/common/test_ints.py index 73b8536f8964e..7380026cd5d96 100644 --- a/pandas/tests/io/parser/common/test_ints.py +++ b/pandas/tests/io/parser/common/test_ints.py @@ -216,6 +216,19 @@ def test_outside_int64_uint64_range(all_parsers, val, request): tm.assert_frame_equal(result, expected) +@skip_pyarrow # CSV parse error: Empty CSV file or block +@pytest.mark.parametrize( + "val", [np.iinfo(np.uint64).max + 1, np.iinfo(np.int64).min - 1] +) +def test_outside_int64_uint64_range_follow_str(all_parsers, val, request): + parser = all_parsers + + result = parser.read_csv(StringIO(f"{val}\nabc"), header=None) + + expected = DataFrame([str(val), "abc"]) + tm.assert_frame_equal(result, expected) + + @xfail_pyarrow # gets float64 dtype instead of object @pytest.mark.parametrize("exp_data", [[str(-1), str(2**63)], [str(2**63), str(-1)]]) def test_numeric_range_too_wide(all_parsers, exp_data): From 7366e644170f4269c907b66451f7ed5e68af555d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Sun, 5 Oct 2025 10:05:29 -0300 Subject: [PATCH 5/6] fix: handle string after overflow --- pandas/_libs/parsers.pyx | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/parsers.pyx 
b/pandas/_libs/parsers.pyx index 20cc39f2760b6..ca87fce555f75 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -1082,8 +1082,13 @@ cdef class TextReader: np.dtype("object"), i, start, end, 0, 0, na_hashset, na_fset) except OverflowError: - col_res, na_count = _try_pylong(self.parser, i, start, - end, na_filter, na_hashset) + try: + col_res, na_count = _try_pylong(self.parser, i, start, + end, na_filter, na_hashset) + except ValueError: + col_res, na_count = self._convert_with_dtype( + np.dtype("object"), i, start, end, 0, + 0, na_hashset, na_fset) if col_res is not None: break From e2083bdee4ab650d209986e929a98fd18e5f7aa7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Sun, 5 Oct 2025 10:08:38 -0300 Subject: [PATCH 6/6] chore: remove unused request --- pandas/tests/io/parser/common/test_ints.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/parser/common/test_ints.py b/pandas/tests/io/parser/common/test_ints.py index 7380026cd5d96..87db540f6293a 100644 --- a/pandas/tests/io/parser/common/test_ints.py +++ b/pandas/tests/io/parser/common/test_ints.py @@ -220,7 +220,7 @@ def test_outside_int64_uint64_range(all_parsers, val, request): @pytest.mark.parametrize( "val", [np.iinfo(np.uint64).max + 1, np.iinfo(np.int64).min - 1] ) -def test_outside_int64_uint64_range_follow_str(all_parsers, val, request): +def test_outside_int64_uint64_range_follow_str(all_parsers, val): parser = all_parsers result = parser.read_csv(StringIO(f"{val}\nabc"), header=None)