Skip to content

Commit 986b4e5

Browse files
authored
BUG: always warn when on_bad_lines callable returns extra fields with index_col in read_csv (Python engine) (GH#61837) (#62297)
1 parent 3c1d868 commit 986b4e5

File tree

3 files changed

+55
-9
lines changed

3 files changed

+55
-9
lines changed

doc/source/whatsnew/v3.0.0.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1054,6 +1054,8 @@ MultiIndex
10541054
I/O
10551055
^^^
10561056
- Bug in :class:`DataFrame` and :class:`Series` ``repr`` of :py:class:`collections.abc.Mapping` elements. (:issue:`57915`)
1057+
- Bug in :func:`read_csv` with the Python engine when an ``on_bad_lines`` callable returns too many fields: now emits
1058+
``ParserWarning`` and truncates extra fields regardless of ``index_col`` (:issue:`61837`)
10571059
- Bug in :meth:`.DataFrame.to_json` when ``"index"`` was a value in the :attr:`DataFrame.column` and :attr:`Index.name` was ``None``. Now, this will fail with a ``ValueError`` (:issue:`58925`)
10581060
- Bug in :meth:`.io.common.is_fsspec_url` not recognizing chained fsspec URLs (:issue:`48978`)
10591061
- Bug in :meth:`DataFrame._repr_html_` which ignored the ``"display.float_format"`` option (:issue:`59876`)

pandas/io/parsers/python_parser.py

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
import numpy as np
2222

2323
from pandas._libs import lib
24+
from pandas._typing import Scalar
2425
from pandas.errors import (
2526
EmptyDataError,
2627
ParserError,
@@ -77,7 +78,6 @@
7778
ArrayLike,
7879
DtypeObj,
7980
ReadCsvBuffer,
80-
Scalar,
8181
T,
8282
)
8383

@@ -954,7 +954,9 @@ def _alert_malformed(self, msg: str, row_num: int) -> None:
954954
"""
955955
if self.on_bad_lines == self.BadLineHandleMethod.ERROR:
956956
raise ParserError(msg)
957-
if self.on_bad_lines == self.BadLineHandleMethod.WARN:
957+
if self.on_bad_lines == self.BadLineHandleMethod.WARN or callable(
958+
self.on_bad_lines
959+
):
958960
warnings.warn(
959961
f"Skipping line {row_num}: {msg}\n",
960962
ParserWarning,
@@ -1189,29 +1191,35 @@ def _rows_to_cols(self, content: list[list[Scalar]]) -> list[np.ndarray]:
11891191

11901192
for i, _content in iter_content:
11911193
actual_len = len(_content)
1192-
11931194
if actual_len > col_len:
11941195
if callable(self.on_bad_lines):
11951196
new_l = self.on_bad_lines(_content)
11961197
if new_l is not None:
1197-
content.append(new_l) # pyright: ignore[reportArgumentType]
1198+
new_l = cast(list[Scalar], new_l)
1199+
if len(new_l) > col_len:
1200+
row_num = self.pos - (content_len - i + footers)
1201+
bad_lines.append((row_num, len(new_l), "callable"))
1202+
new_l = new_l[:col_len]
1203+
content.append(new_l)
1204+
11981205
elif self.on_bad_lines in (
11991206
self.BadLineHandleMethod.ERROR,
12001207
self.BadLineHandleMethod.WARN,
12011208
):
12021209
row_num = self.pos - (content_len - i + footers)
1203-
bad_lines.append((row_num, actual_len))
1204-
1210+
bad_lines.append((row_num, actual_len, "normal"))
12051211
if self.on_bad_lines == self.BadLineHandleMethod.ERROR:
12061212
break
12071213
else:
12081214
content.append(_content)
12091215

1210-
for row_num, actual_len in bad_lines:
1216+
for row_num, actual_len, source in bad_lines:
12111217
msg = (
12121218
f"Expected {col_len} fields in line {row_num + 1}, saw {actual_len}"
12131219
)
1214-
if (
1220+
if source == "callable":
1221+
msg += " from bad_lines callable"
1222+
elif (
12151223
self.delimiter
12161224
and len(self.delimiter) > 1
12171225
and self.quoting != csv.QUOTE_NONE

pandas/tests/io/parser/test_python_parser_only.py

Lines changed: 37 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -432,7 +432,7 @@ def test_on_bad_lines_callable_not_expected_length(python_parser_only):
432432
bad_sio = StringIO(data)
433433

434434
result = parser.read_csv_check_warnings(
435-
ParserWarning, "Length of header or names", bad_sio, on_bad_lines=lambda x: x
435+
ParserWarning, "from bad_lines callable", bad_sio, on_bad_lines=lambda x: x
436436
)
437437
expected = DataFrame({"a": [1, 2, 3], "b": [2, 3, 4]})
438438
tm.assert_frame_equal(result, expected)
@@ -562,3 +562,39 @@ def test_no_thousand_convert_for_non_numeric_cols(python_parser_only, dtype, exp
562562
expected = DataFrame(expected)
563563
expected.insert(0, "a", ["0000,7995", "3,03,001,00514", "4923,600,041"])
564564
tm.assert_frame_equal(result, expected)
565+
566+
567+
@pytest.mark.parametrize("index_col", [None, 0])
568+
def test_on_bad_lines_callable_warns_and_truncates_with_index_col(
569+
python_parser_only, index_col
570+
):
571+
# GH#61837
572+
parser = python_parser_only
573+
data = "id,field_1,field_2\n101,A,B\n102,C,D,E\n103,F,G\n"
574+
575+
def fixer(bad_line):
576+
return list(bad_line) + ["EXTRA1", "EXTRA2"]
577+
578+
result = parser.read_csv_check_warnings(
579+
ParserWarning,
580+
"from bad_lines callable",
581+
StringIO(data),
582+
on_bad_lines=fixer,
583+
index_col=index_col,
584+
)
585+
586+
if index_col is None:
587+
expected = DataFrame(
588+
{
589+
"id": [101, 102, 103],
590+
"field_1": ["A", "C", "F"],
591+
"field_2": ["B", "D", "G"],
592+
}
593+
)
594+
else:
595+
expected = DataFrame(
596+
{"field_1": ["A", "C", "F"], "field_2": ["B", "D", "G"]},
597+
index=Index([101, 102, 103], name="id"),
598+
)
599+
600+
tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)