From 321aaa56f24d2b8ce6193b387f4f08d8dd6275cf Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Sun, 4 Dec 2022 11:22:28 -0500 Subject: [PATCH 1/7] BUG: read_csv with engine pyarrow parsing multiple date columns --- doc/source/whatsnew/v2.0.0.rst | 2 +- pandas/io/parsers/arrow_parser_wrapper.py | 2 +- pandas/io/parsers/base_parser.py | 15 ++++++----- pandas/tests/io/parser/test_parse_dates.py | 31 +++++++++++++--------- 4 files changed, 30 insertions(+), 20 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index d8609737b8c7a..bc10df93716e6 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -745,7 +745,7 @@ I/O - Bug in :func:`DataFrame.to_string` with ``header=False`` that printed the index name on the same line as the first row of the data (:issue:`49230`) - Fixed memory leak which stemmed from the initialization of the internal JSON module (:issue:`49222`) - Fixed issue where :func:`json_normalize` would incorrectly remove leading characters from column names that matched the ``sep`` argument (:issue:`49861`) -- +- Fixed issue where :func:`read_csv` would error when ``parse_dates`` was set to a list or dictionary with ``engine="pyarrow"`` (:issue:`47961`) Period ^^^^^^ diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index 68158a30f7fdf..184902e39a2a9 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -107,7 +107,7 @@ def _finalize_pandas_output(self, frame: DataFrame) -> DataFrame: multi_index_named = False frame.columns = self.names # we only need the frame not the names - frame.columns, frame = self._do_date_conversions(frame.columns, frame) + _, frame = self._do_date_conversions(frame.columns, frame) if self.index_col is not None: for i, item in enumerate(self.index_col): if is_integer(item): diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index c5fc054952b1f..56b700a7dfce6 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -7,7 +7,6 @@ from enum import Enum import itertools from typing import ( - TYPE_CHECKING, Any, Callable, DefaultDict, @@ -71,7 +70,11 @@ ) from pandas.core.dtypes.missing import isna -from pandas import StringDtype +from pandas import ( + DataFrame, + StringDtype, + concat, +) from pandas.core import algorithms from pandas.core.arrays import ( ArrowExtensionArray, @@ -89,9 +92,6 @@ from pandas.core.series import Series from pandas.core.tools import datetimes as tools -if TYPE_CHECKING: - from pandas import DataFrame - class ParserBase: class BadLineHandleMethod(Enum): @@ -1264,7 +1264,10 @@ def _isindex(colspec): new_cols.append(new_name) date_cols.update(old_names) - data_dict.update(new_data) + if isinstance(data_dict, DataFrame): + data_dict = concat([DataFrame(new_data), data_dict], axis=1) + else: + data_dict.update(new_data) new_cols.extend(columns) if not keep_date_col: diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index 1a8149ae41fcb..afd02630c6bcc 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -138,9 +138,8 @@ def test_separator_date_conflict(all_parsers): tm.assert_frame_equal(df, expected) -@xfail_pyarrow @pytest.mark.parametrize("keep_date_col", [True, False]) -def test_multiple_date_col_custom(all_parsers, keep_date_col): +def test_multiple_date_col_custom(all_parsers, keep_date_col, request): data = """\ KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 @@ -151,6 +150,14 @@ def test_multiple_date_col_custom(all_parsers, keep_date_col): """ parser = all_parsers + if keep_date_col and parser.engine == "pyarrow": + # For this to pass, we need to disable auto-inference on the date columns + # in parse_dates. We have no way of doing this though + mark = pytest.mark.xfail( + reason="pyarrow doesn't support disabling auto-inference on column numbers." + ) + request.node.add_marker(mark) + def date_parser(*date_cols): """ Test date parser. @@ -293,9 +300,8 @@ def test_concat_date_col_fail(container, dim): parsing.concat_date_cols(date_cols) -@xfail_pyarrow @pytest.mark.parametrize("keep_date_col", [True, False]) -def test_multiple_date_col(all_parsers, keep_date_col): +def test_multiple_date_col(all_parsers, keep_date_col, request): data = """\ KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 @@ -305,6 +311,15 @@ def test_multiple_date_col(all_parsers, keep_date_col): KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000 """ parser = all_parsers + + if keep_date_col and parser.engine == "pyarrow": + # For this to pass, we need to disable auto-inference on the date columns + # in parse_dates. We have no way of doing this though + mark = pytest.mark.xfail( + reason="pyarrow doesn't support disabling auto-inference on column numbers." + ) + request.node.add_marker(mark) + kwds = { "header": None, "parse_dates": [[1, 2], [1, 3]], @@ -461,7 +476,6 @@ def test_date_col_as_index_col(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow def test_multiple_date_cols_int_cast(all_parsers): data = ( "KORD,19990127, 19:00:00, 18:56:00, 0.8100\n" @@ -1132,7 +1146,6 @@ def test_multiple_date_cols_chunked(all_parsers): tm.assert_frame_equal(chunks[2], expected[4:]) -@xfail_pyarrow def test_multiple_date_col_named_index_compat(all_parsers): parser = all_parsers data = """\ @@ -1156,7 +1169,6 @@ def test_multiple_date_col_named_index_compat(all_parsers): tm.assert_frame_equal(with_indices, with_names) -@xfail_pyarrow def test_multiple_date_col_multiple_index_compat(all_parsers): parser = all_parsers data = """\ @@ -1301,7 +1313,6 @@ def test_parse_date_time_multi_level_column_name(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow @pytest.mark.parametrize( "data,kwargs,expected", [ @@ -1385,7 +1396,6 @@ def test_parse_date_time(all_parsers, data, kwargs, expected): tm.assert_frame_equal(result, expected) -@xfail_pyarrow def test_parse_date_fields(all_parsers): parser = all_parsers data = "year,month,day,a\n2001,01,10,10.\n2001,02,1,11." @@ -1403,7 +1413,6 @@ def test_parse_date_fields(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow def test_parse_date_all_fields(all_parsers): parser = all_parsers data = """\ @@ -1427,7 +1436,6 @@ def test_parse_date_all_fields(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow def test_datetime_fractional_seconds(all_parsers): parser = all_parsers data = """\ @@ -1451,7 +1459,6 @@ def test_datetime_fractional_seconds(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow def test_generic(all_parsers): parser = all_parsers data = "year,month,day,a\n2001,01,10,10.\n2001,02,1,11." From 062426bf829f1a388d89590968af356b14209f31 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Sun, 9 Apr 2023 22:07:34 -0400 Subject: [PATCH 2/7] Update v2.1.0.rst --- doc/source/whatsnew/v2.1.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 9f5d6011a7780..0d3f6b2140d06 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -276,7 +276,7 @@ I/O - Bug in :func:`read_html`, tail texts were removed together with elements containing ``display:none`` style (:issue:`51629`) - :meth:`DataFrame.to_orc` now raising ``ValueError`` when non-default :class:`Index` is given (:issue:`51828`) - Bug in :func:`read_html`, style elements were read into DataFrames (:issue:`52197`) -- +- Bug in :func:`read_csv` where it would error when ``parse_dates`` was set to a list or dictionary with ``engine="pyarrow"`` (:issue:`47961`) Period ^^^^^^ From aad1c552965bdbfbc50f719676f286e4eb6ab6f8 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Mon, 17 Apr 2023 21:44:45 -0400 Subject: [PATCH 3/7] bad merge --- pandas/io/parsers/base_parser.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 444c920365426..14ad8562e75e9 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -7,6 +7,7 @@ from enum import Enum import itertools from typing import ( + TYPE_CHECKING, Any, Callable, Hashable, From 3e41179d2f6cfe813f3dd2234e7b8c95af517c93 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Sun, 23 Apr 2023 08:27:19 -0400 Subject: [PATCH 4/7] fix? --- pandas/io/parsers/arrow_parser_wrapper.py | 16 ++++++++++++++++ pandas/io/parsers/base_parser.py | 4 ++-- pandas/tests/io/parser/test_parse_dates.py | 1 - 3 files changed, 18 insertions(+), 3 deletions(-) diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index 04f414ab8ce24..7802fe6fd63c6 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -61,6 +61,21 @@ def _get_pyarrow_options(self) -> None: if pandas_name in self.kwds and self.kwds.get(pandas_name) is not None: self.kwds[pyarrow_name] = self.kwds.pop(pandas_name) + # Date format handling + # If we get a string, we need to convert it into a list for pyarrow + # If we get a dict, we want to parse those separately + date_format = self.date_format + if isinstance(date_format, str): + date_format = [date_format] + else: + # In case of dict, we don't want to propagate through, so + # just set to pyarrow default of None + + # Ideally, in future we disable pyarrow dtype inference (read in as string) + # to prevent misreads. + date_format = None + self.kwds["timestamp_parsers"] = date_format + self.parse_options = { option_name: option_value for option_name, option_value in self.kwds.items() @@ -79,6 +94,7 @@ def _get_pyarrow_options(self) -> None: "true_values", "false_values", "decimal_point", + "timestamp_parsers", ) } self.convert_options["strings_can_be_null"] = "" in self.kwds["null_values"] diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 9e31858291f01..e4add53983629 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -62,8 +62,10 @@ from pandas import ( ArrowDtype, + DataFrame, DatetimeIndex, StringDtype, + concat, ) from pandas.core import algorithms from pandas.core.arrays import ( @@ -93,8 +95,6 @@ Scalar, ) - from pandas import DataFrame - class ParserBase: class BadLineHandleMethod(Enum): diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index 70caa0f59fc88..b217b531373a6 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -544,7 +544,6 @@ def test_multiple_date_cols_int_cast(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow def test_multiple_date_col_timestamp_parse(all_parsers): parser = all_parsers data = """05/31/2012,15:30:00.029,1306.25,1,E,0,,1306.25 From 077abfdd1b8680181ab1bd6e167e020a06c5e846 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Sun, 23 Apr 2023 21:43:44 -0400 Subject: [PATCH 5/7] adjust test --- pandas/tests/io/parser/test_parse_dates.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index b217b531373a6..c8637962b6ab3 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -1519,7 +1519,7 @@ def test_parse_date_fields(all_parsers): StringIO(data), header=0, parse_dates={"ymd": [0, 1, 2]}, - date_parser=pd.to_datetime, + date_parser=lambda x: x, ) expected = DataFrame( From 3347dc9259fda26bd7b6ac326478b7a41a17cec3 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Mon, 24 Apr 2023 07:48:41 -0400 Subject: [PATCH 6/7] Update v2.1.0.rst --- doc/source/whatsnew/v2.1.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 2a34564a021dd..ce01bf65ec6c9 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -354,9 +354,9 @@ I/O ^^^ - :meth:`DataFrame.to_orc` now raising ``ValueError`` when non-default :class:`Index` is given (:issue:`51828`) - :meth:`DataFrame.to_sql` now raising ``ValueError`` when the name param is left empty while using SQLAlchemy to connect (:issue:`52675`) +- Bug in :func:`read_csv` where it would error when ``parse_dates`` was set to a list or dictionary with ``engine="pyarrow"`` (:issue:`47961`) - Bug in :func:`read_hdf` not properly closing store after a ``IndexError`` is raised (:issue:`52781`) - Bug in :func:`read_html`, style elements were read into DataFrames (:issue:`52197`) -- Bug in :func:`read_csv` where it would error when ``parse_dates`` was set to a list or dictionary with ``engine="pyarrow"`` (:issue:`47961`) - Bug in :func:`read_html`, tail texts were removed together with elements containing ``display:none`` style (:issue:`51629`) - From 5cbe608673cd0f2175da98e6dc61f98445619b76 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Sat, 29 Apr 2023 13:03:05 -0400 Subject: [PATCH 7/7] lazy copy in concat --- pandas/io/parsers/base_parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 4c4655a551201..c9c839047cc19 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -1284,7 +1284,7 @@ def _isindex(colspec): date_cols.update(old_names) if isinstance(data_dict, DataFrame): - data_dict = concat([DataFrame(new_data), data_dict], axis=1) + data_dict = concat([DataFrame(new_data), data_dict], axis=1, copy=False) else: data_dict.update(new_data) new_cols.extend(columns)