From c8c6e76b6ecd62ef0f1c44c68e60808284005ef6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Thu, 23 Oct 2025 10:29:00 -0300 Subject: [PATCH 1/5] fix: allow read_csv with python engine read large integers as int --- pandas/_libs/lib.pyx | 16 +++++++++++++--- pandas/io/parsers/base_parser.py | 6 +++++- pandas/tests/io/parser/common/test_ints.py | 8 -------- 3 files changed, 18 insertions(+), 12 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 8f124eed35a9c..72ea040b8360d 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1386,6 +1386,7 @@ cdef class Seen: bint nan_ # seen_np.nan bint uint_ # seen_uint (unsigned integer) bint sint_ # seen_sint (signed integer) + bint overflow_ # seen_overflow bint float_ # seen_float bint object_ # seen_object bint complex_ # seen_complex @@ -1414,6 +1415,7 @@ cdef class Seen: self.nan_ = False self.uint_ = False self.sint_ = False + self.overflow_ = False self.float_ = False self.object_ = False self.complex_ = False @@ -2379,6 +2381,9 @@ def maybe_convert_numeric( ndarray[uint64_t, ndim=1] uints = cnp.PyArray_EMPTY( 1, values.shape, cnp.NPY_UINT64, 0 ) + ndarray[object, ndim=1] pyints = cnp.PyArray_EMPTY( + 1, values.shape, cnp.NPY_OBJECT, 0 + ) ndarray[uint8_t, ndim=1] bools = cnp.PyArray_EMPTY( 1, values.shape, cnp.NPY_UINT8, 0 ) @@ -2476,6 +2481,7 @@ def maybe_convert_numeric( if maybe_int: as_int = int(val) + pyints[i] = as_int if as_int in na_values: mask[i] = 1 @@ -2490,7 +2496,7 @@ def maybe_convert_numeric( if seen.coerce_numeric: seen.float_ = True else: - raise ValueError("Integer out of range.") + seen.overflow_ = True else: if as_int >= 0: uints[i] = as_int @@ -2529,11 +2535,15 @@ def maybe_convert_numeric( return (floats, None) elif seen.int_: if seen.null_ and convert_to_masked_nullable: - if seen.uint_: + if seen.overflow_: + return (pyints, mask.view(np.bool_)) + elif seen.uint_: return (uints, mask.view(np.bool_)) else: return (ints, mask.view(np.bool_)) - if seen.uint_: + if seen.overflow_: + return (pyints, None) + elif seen.uint_: return (uints, None) else: return (ints, None) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 1d96385a92cd3..a6a5a7c23b506 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -521,7 +521,11 @@ def _infer_types( if values.dtype == np.object_: na_count = parsers.sanitize_objects(values, na_values) - if result.dtype == np.object_ and try_num_bool: + if ( + result.dtype == np.object_ + and try_num_bool + and (len(result) == 0 or not isinstance(result[0], int)) + ): result, bool_mask = libops.maybe_convert_bool( np.asarray(values), true_values=self.true_values, diff --git a/pandas/tests/io/parser/common/test_ints.py b/pandas/tests/io/parser/common/test_ints.py index 87db540f6293a..9a98f99f80adc 100644 --- a/pandas/tests/io/parser/common/test_ints.py +++ b/pandas/tests/io/parser/common/test_ints.py @@ -144,11 +144,6 @@ def test_int64_overflow(all_parsers, conv, request): if parser.engine == "pyarrow": mark = pytest.mark.xfail(reason="parses to float64") request.applymarker(mark) - elif parser.engine == "python": - mark = pytest.mark.xfail( - reason="TODO: Python engine reads bigint as string" - ) - request.applymarker(mark) result = parser.read_csv(StringIO(data)) expected = DataFrame( @@ -206,9 +201,6 @@ def test_outside_int64_uint64_range(all_parsers, val, request): # These numbers fall just outside the int64-uint64 # range, so they should be parsed as object. 
parser = all_parsers - if parser.engine == "python": - mark = pytest.mark.xfail(reason="TODO: Python engine reads bigint as string") - request.applymarker(mark) result = parser.read_csv(StringIO(str(val)), header=None) From b78877afc68cf19bfc481e298099f2a3de613241 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Thu, 23 Oct 2025 21:42:31 -0300 Subject: [PATCH 2/5] fix(convert_numeric): allow direct conversion to python integer --- pandas/_libs/lib.pyx | 10 ++++++++-- pandas/tests/dtypes/test_inference.py | 20 ++++++++++++++++++++ 2 files changed, 28 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 72ea040b8360d..f0da99c795ebf 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2426,18 +2426,24 @@ def maybe_convert_numeric( val = int(val) seen.saw_int(val) + pyints[i] = val if val >= 0: if val <= oUINT64_MAX: uints[i] = val - else: + elif seen.coerce_numeric: seen.float_ = True + else: + seen.overflow_ = True if oINT64_MIN <= val <= oINT64_MAX: ints[i] = val if val < oINT64_MIN or (seen.sint_ and seen.uint_): - seen.float_ = True + if seen.coerce_numeric: + seen.float_ = True + else: + seen.overflow_ = True elif util.is_bool_object(val): floats[i] = uints[i] = ints[i] = bools[i] = val diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index 0c9770c572823..1efa60d1fbc4c 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -729,6 +729,26 @@ def test_convert_int_overflow(self, value): result = lib.maybe_convert_objects(arr) tm.assert_numpy_array_equal(arr, result) + @pytest.mark.parametrize( + "value, expected_value", + [ + (-(1 << 65), -(1 << 65)), + (1 << 65, 1 << 65), + (str(1 << 65), 1 << 65), + (f"-{1 << 65}", -(1 << 65)), + ], + ) + @pytest.mark.parametrize("coerce_numeric", [False, True]) + def test_convert_numeric_overflow(self, value, expected_value, coerce_numeric): + arr = np.array([value], dtype=object) + expected = np.array([expected_value], dtype=float if coerce_numeric else object) + result, _ = lib.maybe_convert_numeric( + arr, + set(), + coerce_numeric=coerce_numeric, + ) + tm.assert_numpy_array_equal(result, expected) + @pytest.mark.parametrize("val", [None, np.nan, float("nan")]) @pytest.mark.parametrize("dtype", ["M8[ns]", "m8[ns]"]) def test_maybe_convert_objects_nat_inference(self, val, dtype): From 655eba241e433851c96abc09ee40480107e93e74 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Thu, 23 Oct 2025 21:47:18 -0300 Subject: [PATCH 3/5] docs(whatsnew): update csv entry to include python bigint --- doc/source/whatsnew/v3.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 259470a4f1513..dc81874967dde 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -1091,7 +1091,7 @@ I/O - Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`) - Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`) - Bug in :meth:`read_csv` where the order of the ``na_values`` makes an inconsistency when ``na_values`` is a list non-string values. (:issue:`59303`) -- Bug in :meth:`read_csv` with ``engine="c"`` reading big integers as strings. Now reads them as python integers. 
(:issue:`51295`) +- Bug in :meth:`read_csv` with ``c`` and ``python`` engines reading big integers as strings. Now reads them as python integers. (:issue:`51295`) - Bug in :meth:`read_csv` with ``engine="c"`` reading large float numbers with preceding integers as strings. Now reads them as floats. (:issue:`51295`) - Bug in :meth:`read_csv` with ``engine="pyarrow"`` and ``dtype="Int64"`` losing precision (:issue:`56136`) - Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. (:issue:`58159`) From 47add339d9222d2ff60778ad49937424b264e6c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Thu, 23 Oct 2025 22:33:22 -0300 Subject: [PATCH 4/5] test(to_numeric): change tested behavior for overflow --- pandas/tests/tools/test_to_numeric.py | 37 ++++++++------------------- 1 file changed, 10 insertions(+), 27 deletions(-) diff --git a/pandas/tests/tools/test_to_numeric.py b/pandas/tests/tools/test_to_numeric.py index 12e6be18244e1..fcbc91d4c632f 100644 --- a/pandas/tests/tools/test_to_numeric.py +++ b/pandas/tests/tools/test_to_numeric.py @@ -250,15 +250,9 @@ def test_really_large_scalar(large_val, signed, transform, errors): val = -large_val if signed else large_val val = transform(val) - val_is_string = isinstance(val, str) - if val_is_string and errors in (None, "raise"): - msg = "Integer out of range. at position 0" - with pytest.raises(ValueError, match=msg): - to_numeric(val, **kwargs) - else: - expected = float(val) if (errors == "coerce" and val_is_string) else val - tm.assert_almost_equal(to_numeric(val, **kwargs), expected) + expected = float(val) if errors == "coerce" else int(val) + tm.assert_almost_equal(to_numeric(val, **kwargs), expected) def test_really_large_in_arr(large_val, signed, transform, multiple_elts, errors): @@ -270,21 +264,17 @@ def test_really_large_in_arr(large_val, signed, transform, multiple_elts, errors extra_elt = "string" arr = [val] + multiple_elts * [extra_elt] - val_is_string = isinstance(val, str) coercing = errors == "coerce" - if errors in (None, "raise") and (val_is_string or multiple_elts): - if val_is_string: - msg = "Integer out of range. at position 0" - else: - msg = 'Unable to parse string "string" at position 1' + if errors in (None, "raise") and multiple_elts: + msg = 'Unable to parse string "string" at position 1' with pytest.raises(ValueError, match=msg): to_numeric(arr, **kwargs) else: result = to_numeric(arr, **kwargs) - exp_val = float(val) if (coercing and val_is_string) else val + exp_val = float(val) if (coercing) else int(val) expected = [exp_val] if multiple_elts: @@ -295,7 +285,7 @@ def test_really_large_in_arr(large_val, signed, transform, multiple_elts, errors expected.append(extra_elt) exp_dtype = object else: - exp_dtype = float if isinstance(exp_val, (int, float)) else object + exp_dtype = float if isinstance(exp_val, float) else object tm.assert_almost_equal(result, np.array(expected, dtype=exp_dtype)) @@ -311,18 +301,11 @@ def test_really_large_in_arr_consistent(large_val, signed, multiple_elts, errors if multiple_elts: arr.insert(0, large_val) - if errors in (None, "raise"): - index = int(multiple_elts) - msg = f"Integer out of range. 
at position {index}" + result = to_numeric(arr, **kwargs) + expected = [float(i) if errors == "coerce" else int(i) for i in arr] + exp_dtype = float if errors == "coerce" else object - with pytest.raises(ValueError, match=msg): - to_numeric(arr, **kwargs) - else: - result = to_numeric(arr, **kwargs) - expected = [float(i) for i in arr] - exp_dtype = float - - tm.assert_almost_equal(result, np.array(expected, dtype=exp_dtype)) + tm.assert_almost_equal(result, np.array(expected, dtype=exp_dtype)) @pytest.mark.parametrize( From 597d471da86c60a8de64b7db944137795402908b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Thu, 23 Oct 2025 22:58:20 -0300 Subject: [PATCH 5/5] docs(whatsnew): add entry about change in `to_numeric` --- doc/source/whatsnew/v3.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index dc81874967dde..7d80bb23c5fda 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -201,6 +201,7 @@ Other enhancements - :class:`Rolling` and :class:`Expanding` now support ``nunique`` (:issue:`26958`) - :class:`Rolling` and :class:`Expanding` now support aggregations ``first`` and ``last`` (:issue:`33155`) - :func:`read_parquet` accepts ``to_pandas_kwargs`` which are forwarded to :meth:`pyarrow.Table.to_pandas` which enables passing additional keywords to customize the conversion to pandas, such as ``maps_as_pydicts`` to read the Parquet map data type as python dictionaries (:issue:`56842`) +- :func:`to_numeric` on big integers converts to ``object`` datatype with python integers when not coercing. (:issue:`51295`) - :meth:`.DataFrameGroupBy.transform`, :meth:`.SeriesGroupBy.transform`, :meth:`.DataFrameGroupBy.agg`, :meth:`.SeriesGroupBy.agg`, :meth:`.SeriesGroupBy.apply`, :meth:`.DataFrameGroupBy.apply` now support ``kurt`` (:issue:`40139`) - :meth:`DataFrame.apply` supports using third-party execution engines like the Bodo.ai JIT compiler (:issue:`60668`) - :meth:`DataFrame.iloc` and :meth:`Series.iloc` now support boolean masks in ``__getitem__`` for more consistent indexing behavior (:issue:`60994`)