Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1079,6 +1079,7 @@ I/O
- Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`)
- Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`)
- Bug in :meth:`read_csv` where the order of the ``na_values`` causes inconsistent results when ``na_values`` is a list of non-string values. (:issue:`59303`)
- Bug in :meth:`read_csv` with ``engine="c"`` reading big integers as strings. Now reads them as Python integers. (:issue:`51295`)
- Bug in :meth:`read_csv` with ``engine="pyarrow"`` and ``dtype="Int64"`` losing precision (:issue:`56136`)
- Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. (:issue:`58159`)
- Bug in :meth:`read_html` where ``rowspan`` in header row causes incorrect conversion to ``DataFrame``. (:issue:`60210`)
Expand Down
41 changes: 38 additions & 3 deletions pandas/_libs/parsers.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ from cpython.exc cimport (
PyErr_Fetch,
PyErr_Occurred,
)
from cpython.long cimport PyLong_FromString
from cpython.object cimport PyObject
from cpython.ref cimport (
Py_INCREF,
Expand Down Expand Up @@ -1081,9 +1082,13 @@ cdef class TextReader:
np.dtype("object"), i, start, end, 0,
0, na_hashset, na_fset)
except OverflowError:
col_res, na_count = self._convert_with_dtype(
np.dtype("object"), i, start, end, na_filter,
0, na_hashset, na_fset)
try:
col_res, na_count = _try_pylong(self.parser, i, start,
end, na_filter, na_hashset)
except ValueError:
col_res, na_count = self._convert_with_dtype(
np.dtype("object"), i, start, end, 0,
0, na_hashset, na_fset)

if col_res is not None:
break
Expand Down Expand Up @@ -1873,6 +1878,36 @@ cdef int _try_int64_nogil(parser_t *parser, int64_t col,

return 0

cdef _try_pylong(parser_t *parser, Py_ssize_t col,
                 int64_t line_start, int64_t line_end,
                 bint na_filter, kh_str_starts_t *na_hashset):
    """
    Parse rows ``[line_start, line_end)`` of column ``col`` as Python ints.

    Each word of the column is converted with ``PyLong_FromString`` in
    base 10, so the values are arbitrary-precision Python integers rather
    than fixed-width C integers.

    Parameters
    ----------
    parser : parser_t*
        Tokenizer state holding the column words.
    col : Py_ssize_t
        Index of the column to convert.
    line_start, line_end : int64_t
        Half-open range of rows to convert.
    na_filter : bint
        If true, words found in ``na_hashset`` are stored as the object NA
        sentinel (``na_values[np.object_]``) instead of being parsed.
    na_hashset : kh_str_starts_t*
        Set of strings to treat as missing; only consulted when
        ``na_filter`` is true.

    Returns
    -------
    tuple[ndarray[object], int]
        The converted object array and the number of NA entries in it.

    Raises
    ------
    ValueError
        Propagated from ``PyLong_FromString`` when a word is not a valid
        integer literal.
    """
    cdef:
        int na_count = 0
        Py_ssize_t i, lines
        coliter_t it
        const char *word = NULL
        ndarray[object] result
        object NA = na_values[np.object_]
        object py_int

    lines = line_end - line_start
    result = np.empty(lines, dtype=object)
    coliter_setup(&it, parser, col, line_start)

    for i in range(lines):
        COLITER_NEXT(it, word)
        if na_filter and kh_get_str_starts_item(na_hashset, word):
            # in the hash table
            na_count += 1
            result[i] = NA
            continue

        # PyLong_FromString is declared as returning ``object``, so a NULL
        # return (invalid literal) is raised here as the pending ValueError;
        # no explicit ``is None`` check is needed or reachable.
        py_int = PyLong_FromString(word, NULL, 10)
        result[i] = py_int

    return result, na_count


# -> tuple[ndarray[bool], int]
cdef _try_bool_flex(parser_t *parser, int64_t col,
Expand Down
44 changes: 33 additions & 11 deletions pandas/tests/io/parser/common/test_ints.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,17 +144,22 @@ def test_int64_overflow(all_parsers, conv, request):
if parser.engine == "pyarrow":
mark = pytest.mark.xfail(reason="parses to float64")
request.applymarker(mark)
elif parser.engine == "python":
mark = pytest.mark.xfail(
reason="TODO: Python engine reads bigint as string"
)
request.applymarker(mark)

result = parser.read_csv(StringIO(data))
expected = DataFrame(
[
"00013007854817840016671868",
"00013007854817840016749251",
"00013007854817840016754630",
"00013007854817840016781876",
"00013007854817840017028824",
"00013007854817840017963235",
"00013007854817840018860166",
13007854817840016671868,
13007854817840016749251,
13007854817840016754630,
13007854817840016781876,
13007854817840017028824,
13007854817840017963235,
13007854817840018860166,
],
columns=["ID"],
)
Expand Down Expand Up @@ -185,7 +190,7 @@ def test_int64_overflow(all_parsers, conv, request):
)
def test_int64_uint64_range(all_parsers, val):
# These numbers fall right inside the int64-uint64
# range, so they should be parsed as string.
# range, so they should be parsed as integer.
parser = all_parsers
result = parser.read_csv(StringIO(str(val)), header=None)

Expand All @@ -197,13 +202,30 @@ def test_int64_uint64_range(all_parsers, val):
@pytest.mark.parametrize(
"val", [np.iinfo(np.uint64).max + 1, np.iinfo(np.int64).min - 1]
)
def test_outside_int64_uint64_range(all_parsers, val):
def test_outside_int64_uint64_range(all_parsers, val, request):
# These numbers fall just outside the int64-uint64
# range, so they should be parsed as string.
# range, so they should be parsed as object.
parser = all_parsers
if parser.engine == "python":
mark = pytest.mark.xfail(reason="TODO: Python engine reads bigint as string")
request.applymarker(mark)

result = parser.read_csv(StringIO(str(val)), header=None)

expected = DataFrame([str(val)])
expected = DataFrame([val])
tm.assert_frame_equal(result, expected)


@skip_pyarrow  # CSV parse error: Empty CSV file or block
@pytest.mark.parametrize(
    "val", [np.iinfo(np.uint64).max + 1, np.iinfo(np.int64).min - 1]
)
def test_outside_int64_uint64_range_follow_str(all_parsers, val):
    # An integer just outside the int64-uint64 range, followed by a
    # non-numeric row: both rows are expected to come back as strings.
    csv_text = f"{val}\nabc"
    expected = DataFrame([str(val), "abc"])

    result = all_parsers.read_csv(StringIO(csv_text), header=None)

    tm.assert_frame_equal(result, expected)


Expand Down
Loading