From 2e3b0b3822ec3bc93f7f8cff82f03c859d9c302e Mon Sep 17 00:00:00 2001 From: u7397058 Date: Sun, 26 Oct 2025 14:35:28 +1100 Subject: [PATCH 01/11] Fixing make_reader so that it skip rows at file level --- pandas/io/parsers/python_parser.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index dc7a21c859a33..e5b9375214516 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -218,6 +218,15 @@ class MyDialect(csv.Dialect): if sep is not None: dia.delimiter = sep + # Skip rows at file level before csv.reader sees them + # prevents CSV parsing errors on lines that will be discarded + if self.skiprows is not None: + while self.skipfunc(self.pos): + self.pos += 1 + try: + f.readline() + except (StopIteration, AttributeError): + break else: # attempt to sniff the delimiter from the first valid line, # i.e. no comment line and not in skiprows @@ -907,7 +916,12 @@ def _next_line(self) -> list[Scalar]: else: while self.skipfunc(self.pos): self.pos += 1 - next(self.data) + try: + next(self.data) + except csv.Error: + # CSV parsing error on a skipped line is acceptable + # The line is being discarded without using its content + pass while True: orig_line = self._next_iter_line(row_num=self.pos + 1) @@ -926,7 +940,7 @@ def _next_line(self) -> list[Scalar]: break # This was the first line of the file, - # which could contain the BOM at the + # which could contain the BOM at theo # beginning of it. if self.pos == 1: line = self._check_for_bom(line) @@ -1494,7 +1508,7 @@ def __init__(self, f: ReadCsvBuffer[str], **kwds) -> None: self.infer_nrows = kwds.pop("infer_nrows") PythonParser.__init__(self, f, **kwds) - def _make_reader(self, f: IO[str] | ReadCsvBuffer[str]) -> FixedWidthReader: + def _emake_rader(self, f: IO[str] | ReadCsvBuffer[str]) -> FixedWidthReader: return FixedWidthReader( f, self.colspecs, From ec55b63d823549744772328d42e678cb26124bf2 Mon Sep 17 00:00:00 2001 From: u7397058 Date: Sun, 26 Oct 2025 14:35:45 +1100 Subject: [PATCH 02/11] Added tests --- .../io/parser/test_python_parser_only.py | 43 +++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py index 0de65ab889be8..55c950552512d 100644 --- a/pandas/tests/io/parser/test_python_parser_only.py +++ b/pandas/tests/io/parser/test_python_parser_only.py @@ -599,3 +599,46 @@ def fixer(bad_line): ) tm.assert_frame_equal(result, expected) + + +def test_read_csv_leading_quote_skip(python_parser_only): + # GH 62739 + tbl = """\ + " +a b +1 3 +""" + parser = python_parser_only + result = parser.read_csv( + StringIO(tbl), + delimiter=" ", + skiprows=1, + ) + expected = DataFrame({"a": [1], "b": [3]}) + tm.assert_frame_equal(result, expected) + + +def test_read_csv_unclosed_double_quote_in_data_still_errors(python_parser_only): + # GH 62739 + tbl = """\ +comment line +a b +" +1 3 +""" + parser = python_parser_only + with pytest.raises(ParserError, match="unexpected end of data"): + parser.read_csv(StringIO(tbl), delimiter=" ", skiprows=1) + + +def test_read_csv_skiprows_zero(python_parser_only): + # GH 62739 + tbl = """\ +" +a b +1 3 +""" + parser = python_parser_only + # don't skip anything + with pytest.raises(ParserError, match="unexpected end of data"): + parser.read_csv(StringIO(tbl), delimiter=" ", skiprows=0, engine="python") From 82beb0d5c8157b66fe40294dfafb8f58809f5020 Mon Sep 17 00:00:00 2001 From: Georgina Scott <154103190+georginas05@users.noreply.github.com> Date: Sun, 26 Oct 2025 15:21:10 +1100 Subject: [PATCH 03/11] Update v3.0.0.rst --- doc/source/whatsnew/v3.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 44bc82008e718..cf6fd727c1317 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -1104,6 +1104,7 @@ I/O - Bug in :meth:`set_option` where setting the pandas option ``display.html.use_mathjax`` to ``False`` has no effect (:issue:`59884`) - Bug in :meth:`to_csv` where ``quotechar``` is not escaped when ``escapechar`` is not None (:issue:`61407`) - Bug in :meth:`to_excel` where :class:`MultiIndex` columns would be merged to a single row when ``merge_cells=False`` is passed (:issue:`60274`) +- Bug in :meth: `python_parser` where :class: `MyDialect` did not appropriately skip a line when instructed, causing `EmptyDataError` (:issue: `62739`) Period ^^^^^^ From c6cf7f6b81cf0d4460fd78092194225ac90e0b66 Mon Sep 17 00:00:00 2001 From: Georgina Scott <154103190+georginas05@users.noreply.github.com> Date: Sun, 26 Oct 2025 15:31:21 +1100 Subject: [PATCH 04/11] Update v3.0.0.rst minor changes --- doc/source/whatsnew/v3.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index cf6fd727c1317..b9db3b6f1d831 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -1104,7 +1104,7 @@ I/O - Bug in :meth:`set_option` where setting the pandas option ``display.html.use_mathjax`` to ``False`` has no effect (:issue:`59884`) - Bug in :meth:`to_csv` where ``quotechar``` is not escaped when ``escapechar`` is not None (:issue:`61407`) - Bug in :meth:`to_excel` where :class:`MultiIndex` columns would be merged to a single row when ``merge_cells=False`` is passed (:issue:`60274`) -- Bug in :meth: `python_parser` where :class: `MyDialect` did not appropriately skip a line when instructed, causing `EmptyDataError` (:issue: `62739`) +- Bug in :meth:`python_parser` where :class:`MyDialect` did not appropriately skip a line when instructed, causing Empty Data Error (:issue:`62739`) Period ^^^^^^ From 6a3e5ad7b066d4a4306afd998610ac5ef3c3b886 Mon Sep 17 00:00:00 2001 From: Georgina Scott <154103190+georginas05@users.noreply.github.com> Date: Sun, 26 Oct 2025 15:39:26 +1100 Subject: [PATCH 05/11] Update v3.0.0.rst - alphabetical order --- doc/source/whatsnew/v3.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index b9db3b6f1d831..32633fbc2159e 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -1086,6 +1086,7 @@ I/O - Bug in :meth:`HDFStore.get` was failing to save data of dtype datetime64[s] correctly (:issue:`59004`) - Bug in :meth:`HDFStore.select` causing queries on categorical string columns to return unexpected results (:issue:`57608`) - Bug in :meth:`MultiIndex.factorize` incorrectly raising on length-0 indexes (:issue:`57517`) +- Bug in :meth:`python_parser` where :class:`MyDialect` did not appropriately skip a line when instructed, causing Empty Data Error (:issue:`62739`) - Bug in :meth:`read_csv` causing segmentation fault when ``encoding_errors`` is not a string. (:issue:`59059`) - Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`) - Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`) @@ -1104,7 +1105,6 @@ I/O - Bug in :meth:`set_option` where setting the pandas option ``display.html.use_mathjax`` to ``False`` has no effect (:issue:`59884`) - Bug in :meth:`to_csv` where ``quotechar``` is not escaped when ``escapechar`` is not None (:issue:`61407`) - Bug in :meth:`to_excel` where :class:`MultiIndex` columns would be merged to a single row when ``merge_cells=False`` is passed (:issue:`60274`) -- Bug in :meth:`python_parser` where :class:`MyDialect` did not appropriately skip a line when instructed, causing Empty Data Error (:issue:`62739`) Period ^^^^^^ From 048bebaef3584ae58730394b10f52ef0ac811611 Mon Sep 17 00:00:00 2001 From: zephyrieal Date: Sun, 26 Oct 2025 18:24:31 +1100 Subject: [PATCH 06/11] Fixed accidental typo in code --- pandas/io/parsers/python_parser.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index e5b9375214516..1791b8a3d221d 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -940,7 +940,7 @@ def _next_line(self) -> list[Scalar]: break # This was the first line of the file, - # which could contain the BOM at theo + # which could contain the BOM at the # beginning of it. if self.pos == 1: line = self._check_for_bom(line) @@ -1508,7 +1508,7 @@ def __init__(self, f: ReadCsvBuffer[str], **kwds) -> None: self.infer_nrows = kwds.pop("infer_nrows") PythonParser.__init__(self, f, **kwds) - def _emake_rader(self, f: IO[str] | ReadCsvBuffer[str]) -> FixedWidthReader: + def _make_reader(self, f: IO[str] | ReadCsvBuffer[str]) -> FixedWidthReader: return FixedWidthReader( f, self.colspecs, From 9f5046c0541f6233a75c4712e277de38aec79dc7 Mon Sep 17 00:00:00 2001 From: zephyrieal Date: Sun, 26 Oct 2025 18:26:09 +1100 Subject: [PATCH 07/11] removed residue code --- pandas/io/parsers/python_parser.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 1791b8a3d221d..0a95f5e9bc9f6 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -916,12 +916,7 @@ def _next_line(self) -> list[Scalar]: else: while self.skipfunc(self.pos): self.pos += 1 - try: - next(self.data) - except csv.Error: - # CSV parsing error on a skipped line is acceptable - # The line is being discarded without using its content - pass + next(self.data) while True: orig_line = self._next_iter_line(row_num=self.pos + 1) From 2d222cff5530f7ed5575078069bfe56f3b42de45 Mon Sep 17 00:00:00 2001 From: zephyrieal Date: Mon, 27 Oct 2025 02:57:21 +1100 Subject: [PATCH 08/11] break if end of line is reached --- pandas/io/parsers/python_parser.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 0a95f5e9bc9f6..b08e013d7acb8 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -222,11 +222,10 @@ class MyDialect(csv.Dialect): # prevents CSV parsing errors on lines that will be discarded if self.skiprows is not None: while self.skipfunc(self.pos): - self.pos += 1 - try: - f.readline() - except (StopIteration, AttributeError): + line = f.readline() + if not line: break + self.pos += 1 else: # attempt to sniff the delimiter from the first valid line, # i.e. no comment line and not in skiprows From 820fa85845de25dc5859e2c19be9793bd4c5f388 Mon Sep 17 00:00:00 2001 From: zephyrieal Date: Mon, 27 Oct 2025 02:57:59 +1100 Subject: [PATCH 09/11] removed typo --- pandas/tests/io/parser/test_python_parser_only.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py index 55c950552512d..ee6484709be63 100644 --- a/pandas/tests/io/parser/test_python_parser_only.py +++ b/pandas/tests/io/parser/test_python_parser_only.py @@ -621,7 +621,6 @@ def test_read_csv_leading_quote_skip(python_parser_only): def test_read_csv_unclosed_double_quote_in_data_still_errors(python_parser_only): # GH 62739 tbl = """\ -comment line a b " 1 3 From 79c0737df1e17cdfedd9eaef7a279b2b2b3a8131 Mon Sep 17 00:00:00 2001 From: Zephy <74450862+zephyrieal@users.noreply.github.com> Date: Sun, 2 Nov 2025 15:41:22 +1100 Subject: [PATCH 10/11] Update v3.0.0.rst --- doc/source/whatsnew/v3.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index f44b63bd55563..f37a7c217ef7a 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -1106,7 +1106,7 @@ I/O - Bug in :meth:`HDFStore.get` was failing to save data of dtype datetime64[s] correctly (:issue:`59004`) - Bug in :meth:`HDFStore.select` causing queries on categorical string columns to return unexpected results (:issue:`57608`) - Bug in :meth:`MultiIndex.factorize` incorrectly raising on length-0 indexes (:issue:`57517`) -- Bug in :meth:`python_parser` where :class:`MyDialect` did not appropriately skip a line when instructed, causing Empty Data Error (:issue:`62739`) +- Bug in :func:`read_csv` where it did not appropriately skip a line when instructed, causing Empty Data Error (:issue:`62739`) - Bug in :meth:`read_csv` causing segmentation fault when ``encoding_errors`` is not a string. (:issue:`59059`) - Bug in :meth:`read_csv` for the ``c`` and ``python`` engines where parsing numbers with large exponents caused overflows. Now, numbers with large positive exponents are parsed as ``inf`` or ``-inf`` depending on the sign of the mantissa, while those with large negative exponents are parsed as ``0.0`` (:issue:`62617`, :issue:`38794`, :issue:`62740`) - Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`) From 9deae3035dbf9b32231b3e682cecb3152651bce8 Mon Sep 17 00:00:00 2001 From: Zephy <74450862+zephyrieal@users.noreply.github.com> Date: Sun, 2 Nov 2025 15:53:09 +1100 Subject: [PATCH 11/11] Update v3.0.0.rst - fix ordering --- doc/source/whatsnew/v3.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 3e8c9e5c88ae2..d3b67af4b3061 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -1116,11 +1116,11 @@ I/O - Bug in :meth:`HDFStore.get` was failing to save data of dtype datetime64[s] correctly (:issue:`59004`) - Bug in :meth:`HDFStore.select` causing queries on categorical string columns to return unexpected results (:issue:`57608`) - Bug in :meth:`MultiIndex.factorize` incorrectly raising on length-0 indexes (:issue:`57517`) -- Bug in :func:`read_csv` where it did not appropriately skip a line when instructed, causing Empty Data Error (:issue:`62739`) - Bug in :meth:`read_csv` causing segmentation fault when ``encoding_errors`` is not a string. (:issue:`59059`) - Bug in :meth:`read_csv` for the ``c`` and ``python`` engines where parsing numbers with large exponents caused overflows. Now, numbers with large positive exponents are parsed as ``inf`` or ``-inf`` depending on the sign of the mantissa, while those with large negative exponents are parsed as ``0.0`` (:issue:`62617`, :issue:`38794`, :issue:`62740`) - Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`) - Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`) +- Bug in :meth:`read_csv` where it did not appropriately skip a line when instructed, causing Empty Data Error (:issue:`62739`) - Bug in :meth:`read_csv` where the order of the ``na_values`` makes an inconsistency when ``na_values`` is a list non-string values. (:issue:`59303`) - Bug in :meth:`read_csv` with ``c`` and ``python`` engines reading big integers as strings. Now reads them as python integers. (:issue:`51295`) - Bug in :meth:`read_csv` with ``engine="c"`` reading large float numbers with preceding integers as strings. Now reads them as floats. (:issue:`51295`)