BUG: always warn when on_bad_lines callable returns extra fields with index_col in read_csv (Python engine) (GH#61837) (#62297)

skalwaghe-56 · web-flow · commit 986b4e5d2ff4 · 2025-09-29T16:40:07.000-04:00
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -1054,6 +1054,8 @@ MultiIndex
 I/O
 ^^^
 - Bug in :class:`DataFrame` and :class:`Series` ``repr`` of :py:class:`collections.abc.Mapping` elements. (:issue:`57915`)
+- Bug in :func:`read_csv` with ``engine="python"`` where an ``on_bad_lines`` callable returning too many fields did not always warn; it now always emits
+  ``ParserWarning`` and truncates the extra fields regardless of ``index_col`` (:issue:`61837`)
 - Bug in :meth:`.DataFrame.to_json` when ``"index"`` was a value in the :attr:`DataFrame.column` and :attr:`Index.name` was ``None``. Now, this will fail with a ``ValueError`` (:issue:`58925`)
 - Bug in :meth:`.io.common.is_fsspec_url` not recognizing chained fsspec URLs (:issue:`48978`)
 - Bug in :meth:`DataFrame._repr_html_` which ignored the ``"display.float_format"`` option (:issue:`59876`)
diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py
@@ -21,6 +21,7 @@
 import numpy as np
 
 from pandas._libs import lib
+from pandas._typing import Scalar
 from pandas.errors import (
     EmptyDataError,
     ParserError,
@@ -77,7 +78,6 @@
         ArrayLike,
         DtypeObj,
         ReadCsvBuffer,
-        Scalar,
         T,
     )
 
@@ -954,7 +954,9 @@ def _alert_malformed(self, msg: str, row_num: int) -> None:
         """
         if self.on_bad_lines == self.BadLineHandleMethod.ERROR:
             raise ParserError(msg)
-        if self.on_bad_lines == self.BadLineHandleMethod.WARN:
+        if self.on_bad_lines == self.BadLineHandleMethod.WARN or callable(
+            self.on_bad_lines
+        ):
             warnings.warn(
                 f"Skipping line {row_num}: {msg}\n",
                 ParserWarning,
@@ -1189,29 +1191,35 @@ def _rows_to_cols(self, content: list[list[Scalar]]) -> list[np.ndarray]:
 
             for i, _content in iter_content:
                 actual_len = len(_content)
-
                 if actual_len > col_len:
                     if callable(self.on_bad_lines):
                         new_l = self.on_bad_lines(_content)
                         if new_l is not None:
-                            content.append(new_l)  # pyright: ignore[reportArgumentType]
+                            new_l = cast(list[Scalar], new_l)
+                            if len(new_l) > col_len:
+                                row_num = self.pos - (content_len - i + footers)
+                                bad_lines.append((row_num, len(new_l), "callable"))
+                                new_l = new_l[:col_len]
+                            content.append(new_l)
+
                     elif self.on_bad_lines in (
                         self.BadLineHandleMethod.ERROR,
                         self.BadLineHandleMethod.WARN,
                     ):
                         row_num = self.pos - (content_len - i + footers)
-                        bad_lines.append((row_num, actual_len))
-
+                        bad_lines.append((row_num, actual_len, "normal"))
                         if self.on_bad_lines == self.BadLineHandleMethod.ERROR:
                             break
                 else:
                     content.append(_content)
 
-            for row_num, actual_len in bad_lines:
+            for row_num, actual_len, source in bad_lines:
                 msg = (
                     f"Expected {col_len} fields in line {row_num + 1}, saw {actual_len}"
                 )
-                if (
+                if source == "callable":
+                    msg += " from bad_lines callable"
+                elif (
                     self.delimiter
                     and len(self.delimiter) > 1
                     and self.quoting != csv.QUOTE_NONE
diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py
@@ -432,7 +432,7 @@ def test_on_bad_lines_callable_not_expected_length(python_parser_only):
     bad_sio = StringIO(data)
 
     result = parser.read_csv_check_warnings(
-        ParserWarning, "Length of header or names", bad_sio, on_bad_lines=lambda x: x
+        ParserWarning, "from bad_lines callable", bad_sio, on_bad_lines=lambda x: x
     )
     expected = DataFrame({"a": [1, 2, 3], "b": [2, 3, 4]})
     tm.assert_frame_equal(result, expected)
@@ -562,3 +562,39 @@ def test_no_thousand_convert_for_non_numeric_cols(python_parser_only, dtype, exp
     expected = DataFrame(expected)
     expected.insert(0, "a", ["0000,7995", "3,03,001,00514", "4923,600,041"])
     tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("index_col", [None, 0])
+def test_on_bad_lines_callable_warns_and_truncates_with_index_col(
+    python_parser_only, index_col
+):
+    # GH#61837: extra fields from the callable are truncated with a warning
+    parser = python_parser_only
+    data = "id,field_1,field_2\n101,A,B\n102,C,D,E\n103,F,G\n"
+
+    def fixer(bad_line):
+        return list(bad_line) + ["EXTRA1", "EXTRA2"]
+
+    result = parser.read_csv_check_warnings(
+        ParserWarning,
+        "from bad_lines callable",
+        StringIO(data),
+        on_bad_lines=fixer,
+        index_col=index_col,
+    )
+
+    if index_col is None:
+        expected = DataFrame(
+            {
+                "id": [101, 102, 103],
+                "field_1": ["A", "C", "F"],
+                "field_2": ["B", "D", "G"],
+            }
+        )
+    else:
+        expected = DataFrame(
+            {"field_1": ["A", "C", "F"], "field_2": ["B", "D", "G"]},
+            index=Index([101, 102, 103], name="id"),
+        )
+
+    tm.assert_frame_equal(result, expected)