From 321aaa56f24d2b8ce6193b387f4f08d8dd6275cf Mon Sep 17 00:00:00 2001
From: Thomas Li <47963215+lithomas1@users.noreply.github.com>
Date: Sun, 4 Dec 2022 11:22:28 -0500
Subject: [PATCH 1/7] BUG: read_csv with engine pyarrow parsing multiple date
 columns

---
 doc/source/whatsnew/v2.0.0.rst             |  2 +-
 pandas/io/parsers/arrow_parser_wrapper.py  |  2 +-
 pandas/io/parsers/base_parser.py           | 15 ++++++-----
 pandas/tests/io/parser/test_parse_dates.py | 31 +++++++++++++---------
 4 files changed, 30 insertions(+), 20 deletions(-)

diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
index d8609737b8c7a..bc10df93716e6 100644
--- a/doc/source/whatsnew/v2.0.0.rst
+++ b/doc/source/whatsnew/v2.0.0.rst
@@ -745,7 +745,7 @@ I/O
 - Bug in :func:`DataFrame.to_string` with ``header=False`` that printed the index name on the same line as the first row of the data (:issue:`49230`)
 - Fixed memory leak which stemmed from the initialization of the internal JSON module (:issue:`49222`)
 - Fixed issue where :func:`json_normalize` would incorrectly remove leading characters from column names that matched the ``sep`` argument (:issue:`49861`)
--
+- Fixed issue where :func:`read_csv` would error when ``parse_dates`` was set to a list or dictionary with ``engine="pyarrow"`` (:issue:`47961`)
 
 Period
 ^^^^^^
diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py
index 68158a30f7fdf..184902e39a2a9 100644
--- a/pandas/io/parsers/arrow_parser_wrapper.py
+++ b/pandas/io/parsers/arrow_parser_wrapper.py
@@ -107,7 +107,7 @@ def _finalize_pandas_output(self, frame: DataFrame) -> DataFrame:
                 multi_index_named = False
             frame.columns = self.names
         # we only need the frame not the names
-        frame.columns, frame = self._do_date_conversions(frame.columns, frame)
+        _, frame = self._do_date_conversions(frame.columns, frame)
         if self.index_col is not None:
             for i, item in enumerate(self.index_col):
                 if is_integer(item):
diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py
index c5fc054952b1f..56b700a7dfce6 100644
--- a/pandas/io/parsers/base_parser.py
+++ b/pandas/io/parsers/base_parser.py
@@ -7,7 +7,6 @@
 from enum import Enum
 import itertools
 from typing import (
-    TYPE_CHECKING,
     Any,
     Callable,
     DefaultDict,
@@ -71,7 +70,11 @@
 )
 from pandas.core.dtypes.missing import isna
 
-from pandas import StringDtype
+from pandas import (
+    DataFrame,
+    StringDtype,
+    concat,
+)
 from pandas.core import algorithms
 from pandas.core.arrays import (
     ArrowExtensionArray,
@@ -89,9 +92,6 @@
 from pandas.core.series import Series
 from pandas.core.tools import datetimes as tools
 
-if TYPE_CHECKING:
-    from pandas import DataFrame
-
 
 class ParserBase:
     class BadLineHandleMethod(Enum):
@@ -1264,7 +1264,10 @@ def _isindex(colspec):
             new_cols.append(new_name)
             date_cols.update(old_names)
 
-    data_dict.update(new_data)
+    if isinstance(data_dict, DataFrame):
+        data_dict = concat([DataFrame(new_data), data_dict], axis=1)
+    else:
+        data_dict.update(new_data)
     new_cols.extend(columns)
 
     if not keep_date_col:
diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py
index 1a8149ae41fcb..afd02630c6bcc 100644
--- a/pandas/tests/io/parser/test_parse_dates.py
+++ b/pandas/tests/io/parser/test_parse_dates.py
@@ -138,9 +138,8 @@ def test_separator_date_conflict(all_parsers):
     tm.assert_frame_equal(df, expected)
 
 
-@xfail_pyarrow
 @pytest.mark.parametrize("keep_date_col", [True, False])
-def test_multiple_date_col_custom(all_parsers, keep_date_col):
+def test_multiple_date_col_custom(all_parsers, keep_date_col, request):
     data = """\
 KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
 KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
@@ -151,6 +150,14 @@ def test_multiple_date_col_custom(all_parsers, keep_date_col):
 """
     parser = all_parsers
 
+    if keep_date_col and parser.engine == "pyarrow":
+        # For this to pass, we need to disable auto-inference on the date columns
+        # in parse_dates. We have no way of doing this though
+        mark = pytest.mark.xfail(
+            reason="pyarrow doesn't support disabling auto-inference on column numbers."
+        )
+        request.node.add_marker(mark)
+
     def date_parser(*date_cols):
         """
         Test date parser.
@@ -293,9 +300,8 @@ def test_concat_date_col_fail(container, dim):
         parsing.concat_date_cols(date_cols)
 
 
-@xfail_pyarrow
 @pytest.mark.parametrize("keep_date_col", [True, False])
-def test_multiple_date_col(all_parsers, keep_date_col):
+def test_multiple_date_col(all_parsers, keep_date_col, request):
     data = """\
 KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
 KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
@@ -305,6 +311,15 @@ def test_multiple_date_col(all_parsers, keep_date_col):
 KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000
 """
     parser = all_parsers
+
+    if keep_date_col and parser.engine == "pyarrow":
+        # For this to pass, we need to disable auto-inference on the date columns
+        # in parse_dates. We have no way of doing this though
+        mark = pytest.mark.xfail(
+            reason="pyarrow doesn't support disabling auto-inference on column numbers."
+        )
+        request.node.add_marker(mark)
+
     kwds = {
         "header": None,
         "parse_dates": [[1, 2], [1, 3]],
@@ -461,7 +476,6 @@ def test_date_col_as_index_col(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
-@xfail_pyarrow
 def test_multiple_date_cols_int_cast(all_parsers):
     data = (
         "KORD,19990127, 19:00:00, 18:56:00, 0.8100\n"
@@ -1132,7 +1146,6 @@ def test_multiple_date_cols_chunked(all_parsers):
     tm.assert_frame_equal(chunks[2], expected[4:])
 
 
-@xfail_pyarrow
 def test_multiple_date_col_named_index_compat(all_parsers):
     parser = all_parsers
     data = """\
@@ -1156,7 +1169,6 @@ def test_multiple_date_col_named_index_compat(all_parsers):
     tm.assert_frame_equal(with_indices, with_names)
 
 
-@xfail_pyarrow
 def test_multiple_date_col_multiple_index_compat(all_parsers):
     parser = all_parsers
     data = """\
@@ -1301,7 +1313,6 @@ def test_parse_date_time_multi_level_column_name(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
-@xfail_pyarrow
 @pytest.mark.parametrize(
     "data,kwargs,expected",
     [
@@ -1385,7 +1396,6 @@ def test_parse_date_time(all_parsers, data, kwargs, expected):
     tm.assert_frame_equal(result, expected)
 
 
-@xfail_pyarrow
 def test_parse_date_fields(all_parsers):
     parser = all_parsers
     data = "year,month,day,a\n2001,01,10,10.\n2001,02,1,11."
@@ -1403,7 +1413,6 @@ def test_parse_date_fields(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
-@xfail_pyarrow
 def test_parse_date_all_fields(all_parsers):
     parser = all_parsers
     data = """\
@@ -1427,7 +1436,6 @@ def test_parse_date_all_fields(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
-@xfail_pyarrow
 def test_datetime_fractional_seconds(all_parsers):
     parser = all_parsers
     data = """\
@@ -1451,7 +1459,6 @@ def test_datetime_fractional_seconds(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
-@xfail_pyarrow
 def test_generic(all_parsers):
     parser = all_parsers
     data = "year,month,day,a\n2001,01,10,10.\n2001,02,1,11."

From 062426bf829f1a388d89590968af356b14209f31 Mon Sep 17 00:00:00 2001
From: Thomas Li <47963215+lithomas1@users.noreply.github.com>
Date: Sun, 9 Apr 2023 22:07:34 -0400
Subject: [PATCH 2/7] Update v2.1.0.rst

---
 doc/source/whatsnew/v2.1.0.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst
index 9f5d6011a7780..0d3f6b2140d06 100644
--- a/doc/source/whatsnew/v2.1.0.rst
+++ b/doc/source/whatsnew/v2.1.0.rst
@@ -276,7 +276,7 @@ I/O
 - Bug in :func:`read_html`, tail texts were removed together with elements containing ``display:none`` style (:issue:`51629`)
 - :meth:`DataFrame.to_orc` now raising ``ValueError`` when non-default :class:`Index` is given (:issue:`51828`)
 - Bug in :func:`read_html`, style elements were read into DataFrames (:issue:`52197`)
--
+- Bug in :func:`read_csv` where it would error when ``parse_dates`` was set to a list or dictionary with ``engine="pyarrow"`` (:issue:`47961`)
 
 Period
 ^^^^^^

From aad1c552965bdbfbc50f719676f286e4eb6ab6f8 Mon Sep 17 00:00:00 2001
From: Thomas Li <47963215+lithomas1@users.noreply.github.com>
Date: Mon, 17 Apr 2023 21:44:45 -0400
Subject: [PATCH 3/7] bad merge

---
 pandas/io/parsers/base_parser.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py
index 444c920365426..14ad8562e75e9 100644
--- a/pandas/io/parsers/base_parser.py
+++ b/pandas/io/parsers/base_parser.py
@@ -7,6 +7,7 @@
 from enum import Enum
 import itertools
 from typing import (
+    TYPE_CHECKING,
     Any,
     Callable,
     Hashable,

From 3e41179d2f6cfe813f3dd2234e7b8c95af517c93 Mon Sep 17 00:00:00 2001
From: Thomas Li <47963215+lithomas1@users.noreply.github.com>
Date: Sun, 23 Apr 2023 08:27:19 -0400
Subject: [PATCH 4/7] fix?

---
 pandas/io/parsers/arrow_parser_wrapper.py  | 16 ++++++++++++++++
 pandas/io/parsers/base_parser.py           |  4 ++--
 pandas/tests/io/parser/test_parse_dates.py |  1 -
 3 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py
index 04f414ab8ce24..7802fe6fd63c6 100644
--- a/pandas/io/parsers/arrow_parser_wrapper.py
+++ b/pandas/io/parsers/arrow_parser_wrapper.py
@@ -61,6 +61,21 @@ def _get_pyarrow_options(self) -> None:
             if pandas_name in self.kwds and self.kwds.get(pandas_name) is not None:
                 self.kwds[pyarrow_name] = self.kwds.pop(pandas_name)
 
+        # Date format handling
+        # If we get a string, we need to convert it into a list for pyarrow
+        # If we get a dict, we want to parse those separately
+        date_format = self.date_format
+        if isinstance(date_format, str):
+            date_format = [date_format]
+        else:
+            # In case of dict, we don't want to propagate through, so
+            # just set to pyarrow default of None
+
+            # Ideally, in future we disable pyarrow dtype inference (read in as string)
+            # to prevent misreads.
+            date_format = None
+        self.kwds["timestamp_parsers"] = date_format
+
         self.parse_options = {
             option_name: option_value
             for option_name, option_value in self.kwds.items()
@@ -79,6 +94,7 @@ def _get_pyarrow_options(self) -> None:
                 "true_values",
                 "false_values",
                 "decimal_point",
+                "timestamp_parsers",
             )
         }
         self.convert_options["strings_can_be_null"] = "" in self.kwds["null_values"]
diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py
index 9e31858291f01..e4add53983629 100644
--- a/pandas/io/parsers/base_parser.py
+++ b/pandas/io/parsers/base_parser.py
@@ -62,8 +62,10 @@
 
 from pandas import (
     ArrowDtype,
+    DataFrame,
     DatetimeIndex,
     StringDtype,
+    concat,
 )
 from pandas.core import algorithms
 from pandas.core.arrays import (
@@ -93,8 +95,6 @@
         Scalar,
     )
 
-    from pandas import DataFrame
-
 
 class ParserBase:
     class BadLineHandleMethod(Enum):
diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py
index 70caa0f59fc88..b217b531373a6 100644
--- a/pandas/tests/io/parser/test_parse_dates.py
+++ b/pandas/tests/io/parser/test_parse_dates.py
@@ -544,7 +544,6 @@ def test_multiple_date_cols_int_cast(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
-@xfail_pyarrow
 def test_multiple_date_col_timestamp_parse(all_parsers):
     parser = all_parsers
     data = """05/31/2012,15:30:00.029,1306.25,1,E,0,,1306.25

From 077abfdd1b8680181ab1bd6e167e020a06c5e846 Mon Sep 17 00:00:00 2001
From: Thomas Li <47963215+lithomas1@users.noreply.github.com>
Date: Sun, 23 Apr 2023 21:43:44 -0400
Subject: [PATCH 5/7] adjust test

---
 pandas/tests/io/parser/test_parse_dates.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py
index b217b531373a6..c8637962b6ab3 100644
--- a/pandas/tests/io/parser/test_parse_dates.py
+++ b/pandas/tests/io/parser/test_parse_dates.py
@@ -1519,7 +1519,7 @@ def test_parse_date_fields(all_parsers):
         StringIO(data),
         header=0,
         parse_dates={"ymd": [0, 1, 2]},
-        date_parser=pd.to_datetime,
+        date_parser=lambda x: x,
     )
 
     expected = DataFrame(

From 3347dc9259fda26bd7b6ac326478b7a41a17cec3 Mon Sep 17 00:00:00 2001
From: Thomas Li <47963215+lithomas1@users.noreply.github.com>
Date: Mon, 24 Apr 2023 07:48:41 -0400
Subject: [PATCH 6/7] Update v2.1.0.rst

---
 doc/source/whatsnew/v2.1.0.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst
index 2a34564a021dd..ce01bf65ec6c9 100644
--- a/doc/source/whatsnew/v2.1.0.rst
+++ b/doc/source/whatsnew/v2.1.0.rst
@@ -354,9 +354,9 @@ I/O
 ^^^
 - :meth:`DataFrame.to_orc` now raising ``ValueError`` when non-default :class:`Index` is given (:issue:`51828`)
 - :meth:`DataFrame.to_sql` now raising ``ValueError`` when the name param is left empty while using SQLAlchemy to connect (:issue:`52675`)
+- Bug in :func:`read_csv` where it would error when ``parse_dates`` was set to a list or dictionary with ``engine="pyarrow"`` (:issue:`47961`)
 - Bug in :func:`read_hdf` not properly closing store after a ``IndexError`` is raised (:issue:`52781`)
 - Bug in :func:`read_html`, style elements were read into DataFrames (:issue:`52197`)
-- Bug in :func:`read_csv` where it would error when ``parse_dates`` was set to a list or dictionary with ``engine="pyarrow"`` (:issue:`47961`)
 - Bug in :func:`read_html`, tail texts were removed together with elements containing ``display:none`` style (:issue:`51629`)
 -
 

From 5cbe608673cd0f2175da98e6dc61f98445619b76 Mon Sep 17 00:00:00 2001
From: Thomas Li <47963215+lithomas1@users.noreply.github.com>
Date: Sat, 29 Apr 2023 13:03:05 -0400
Subject: [PATCH 7/7] lazy copy in concat

---
 pandas/io/parsers/base_parser.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py
index 4c4655a551201..c9c839047cc19 100644
--- a/pandas/io/parsers/base_parser.py
+++ b/pandas/io/parsers/base_parser.py
@@ -1284,7 +1284,7 @@ def _isindex(colspec):
             date_cols.update(old_names)
 
     if isinstance(data_dict, DataFrame):
-        data_dict = concat([DataFrame(new_data), data_dict], axis=1)
+        data_dict = concat([DataFrame(new_data), data_dict], axis=1, copy=False)
     else:
         data_dict.update(new_data)
     new_cols.extend(columns)