diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 8c8a16af6bd34..ae4be412c056f 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -1079,6 +1079,7 @@ I/O - Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`) - Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`) - Bug in :meth:`read_csv` where the order of the ``na_values`` makes an inconsistency when ``na_values`` is a list non-string values. (:issue:`59303`) +- Bug in :meth:`read_csv` with ``engine="c"`` reading big integers as strings. Now reads them as Python integers. (:issue:`51295`) - Bug in :meth:`read_csv` with ``engine="pyarrow"`` and ``dtype="Int64"`` losing precision (:issue:`56136`) - Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. (:issue:`58159`) - Bug in :meth:`read_html` where ``rowspan`` in header row causes incorrect conversion to ``DataFrame``. 
# Parse every row of one column as an arbitrary-precision Python int.
#
# parser / col           : tokenized data and the column to read from
# line_start / line_end  : half-open row range to convert
# na_filter / na_hashset : when na_filter is set, any word found in
#                          na_hashset is stored as the object-dtype NA
#                          sentinel and counted instead of being parsed
#
# Raises ValueError (propagated from PyLong_FromString) as soon as a
# non-NA word is not a valid base-10 integer literal.
#
# -> tuple[ndarray[object], int]
cdef _try_pylong(parser_t *parser, Py_ssize_t col,
                 int64_t line_start, int64_t line_end,
                 bint na_filter, kh_str_starts_t *na_hashset):
    cdef:
        int na_count = 0
        Py_ssize_t i, lines = line_end - line_start
        coliter_t it
        const char *word = NULL
        ndarray[object] result
        object NA = na_values[np.object_]

    result = np.empty(lines, dtype=object)
    coliter_setup(&it, parser, col, line_start)

    for i in range(lines):
        COLITER_NEXT(it, word)
        if na_filter and kh_get_str_starts_item(na_hashset, word):
            # in the hash table
            na_count += 1
            result[i] = NA
            continue

        # PyLong_FromString is cimported as returning ``object``: a NULL
        # result is turned by Cython into the ValueError CPython already
        # set, so the value can never be None and needs no explicit check.
        # pend=NULL makes trailing non-numeric characters an error too.
        result[i] = PyLong_FromString(word, NULL, 10)

    return result, na_count
a/pandas/tests/io/parser/common/test_ints.py +++ b/pandas/tests/io/parser/common/test_ints.py @@ -144,17 +144,22 @@ def test_int64_overflow(all_parsers, conv, request): if parser.engine == "pyarrow": mark = pytest.mark.xfail(reason="parses to float64") request.applymarker(mark) + elif parser.engine == "python": + mark = pytest.mark.xfail( + reason="TODO: Python engine reads bigint as string" + ) + request.applymarker(mark) result = parser.read_csv(StringIO(data)) expected = DataFrame( [ - "00013007854817840016671868", - "00013007854817840016749251", - "00013007854817840016754630", - "00013007854817840016781876", - "00013007854817840017028824", - "00013007854817840017963235", - "00013007854817840018860166", + 13007854817840016671868, + 13007854817840016749251, + 13007854817840016754630, + 13007854817840016781876, + 13007854817840017028824, + 13007854817840017963235, + 13007854817840018860166, ], columns=["ID"], ) @@ -185,7 +190,7 @@ def test_int64_overflow(all_parsers, conv, request): ) def test_int64_uint64_range(all_parsers, val): # These numbers fall right inside the int64-uint64 - # range, so they should be parsed as string. + # range, so they should be parsed as integer. parser = all_parsers result = parser.read_csv(StringIO(str(val)), header=None) @@ -197,13 +202,30 @@ def test_int64_uint64_range(all_parsers, val): @pytest.mark.parametrize( "val", [np.iinfo(np.uint64).max + 1, np.iinfo(np.int64).min - 1] ) -def test_outside_int64_uint64_range(all_parsers, val): +def test_outside_int64_uint64_range(all_parsers, val, request): # These numbers fall just outside the int64-uint64 - # range, so they should be parsed as string. + # range, so they should be parsed as object. 
@skip_pyarrow  # CSV parse error: Empty CSV file or block
@pytest.mark.parametrize(
    "val", [np.iinfo(np.uint64).max + 1, np.iinfo(np.int64).min - 1]
)
def test_outside_int64_uint64_range_follow_str(all_parsers, val):
    # An integer just outside the int64-uint64 range followed by a
    # non-numeric row: both rows are expected to come back as strings.
    csv_text = f"{val}\nabc"
    expected = DataFrame([str(val), "abc"])

    result = all_parsers.read_csv(StringIO(csv_text), header=None)

    tm.assert_frame_equal(result, expected)