Commit 9f88ab3: resolve conflicts

natmokval committed May 11, 2024
2 parents: 6bbba80 + 4d9ffcf
Showing 26 changed files with 285 additions and 1,636 deletions.
10 changes: 0 additions & 10 deletions asv_bench/benchmarks/io/csv.py
@@ -445,16 +445,6 @@ def setup(self, engine):
         data = data.format(*two_cols)
         self.StringIO_input = StringIO(data)

-    def time_multiple_date(self, engine):
-        read_csv(
-            self.data(self.StringIO_input),
-            engine=engine,
-            sep=",",
-            header=None,
-            names=list(string.digits[:9]),
-            parse_dates=[[1, 2], [1, 3]],
-        )
-
     def time_baseline(self, engine):
         read_csv(
             self.data(self.StringIO_input),
25 changes: 1 addition & 24 deletions asv_bench/benchmarks/io/parsers.py
@@ -1,10 +1,5 @@
-import numpy as np
-
 try:
-    from pandas._libs.tslibs.parsing import (
-        _does_string_look_like_datetime,
-        concat_date_cols,
-    )
+    from pandas._libs.tslibs.parsing import _does_string_look_like_datetime
 except ImportError:
     # Avoid whole benchmark suite import failure on asv (currently 0.4)
     pass
@@ -20,21 +15,3 @@ def setup(self, value):
     def time_check_datetimes(self, value):
         for obj in self.objects:
             _does_string_look_like_datetime(obj)
-
-
-class ConcatDateCols:
-    params = ([1234567890, "AAAA"], [1, 2])
-    param_names = ["value", "dim"]
-
-    def setup(self, value, dim):
-        count_elem = 10000
-        if dim == 1:
-            self.object = (np.array([value] * count_elem),)
-        if dim == 2:
-            self.object = (
-                np.array([value] * count_elem),
-                np.array([value] * count_elem),
-            )
-
-    def time_check_concat(self, value, dim):
-        concat_date_cols(self.object)
16 changes: 10 additions & 6 deletions asv_bench/benchmarks/series_methods.py
@@ -148,10 +148,14 @@ def time_searchsorted(self, dtype):


 class Map:
-    params = (["dict", "Series", "lambda"], ["object", "category", "int"])
-    param_names = "mapper"
-
-    def setup(self, mapper, dtype):
+    params = (
+        ["dict", "Series", "lambda"],
+        ["object", "category", "int"],
+        [None, "ignore"],
+    )
+    param_names = ["mapper", "dtype", "na_action"]
+
+    def setup(self, mapper, dtype, na_action):
         map_size = 1000
         map_data = Series(map_size - np.arange(map_size), dtype=dtype)

@@ -168,8 +172,8 @@ def setup(self, mapper, dtype):

         self.s = Series(np.random.randint(0, map_size, 10000), dtype=dtype)

-    def time_map(self, mapper, *args, **kwargs):
-        self.s.map(self.map_data)
+    def time_map(self, mapper, dtype, na_action):
+        self.s.map(self.map_data, na_action=na_action)


 class Clip:
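For context, ``na_action`` controls whether missing values are passed to the mapper or propagated untouched; a minimal sketch of the behaviour this benchmark now covers (not part of the diff):

    import numpy as np
    import pandas as pd

    s = pd.Series([1.0, np.nan, 3.0])
    s.map(lambda x: x * 10)                      # the mapper also receives NaN
    s.map(lambda x: x * 10, na_action="ignore")  # NaN is propagated, mapper skipped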
81 changes: 1 addition & 80 deletions doc/source/user_guide/io.rst
@@ -262,15 +262,9 @@ parse_dates : boolean or list of ints or names or list of lists or dict, default
   * If ``True`` -> try parsing the index.
   * If ``[1, 2, 3]`` -> try parsing columns 1, 2, 3 each as a separate date
     column.
-  * If ``[[1, 3]]`` -> combine columns 1 and 3 and parse as a single date
-    column.
-  * If ``{'foo': [1, 3]}`` -> parse columns 1, 3 as date and call result 'foo'.

   .. note::
      A fast-path exists for iso8601-formatted dates.
-keep_date_col : boolean, default ``False``
-  If ``True`` and parse_dates specifies combining multiple columns then keep the
-  original columns.
 date_format : str or dict of column -> format, default ``None``
   If used in conjunction with ``parse_dates``, will parse dates according to this
   format. For anything more complex,
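For reference, the ``parse_dates`` forms that remain supported look like this (a sketch; ``data.csv`` is a hypothetical file with datetime-like values in columns 1 and 2):

    import pandas as pd

    # parse the index as dates
    pd.read_csv("data.csv", index_col=0, parse_dates=True)

    # parse columns 1 and 2, each as its own datetime column
    pd.read_csv("data.csv", parse_dates=[1, 2])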
@@ -802,71 +796,8 @@ The simplest case is to just pass in ``parse_dates=True``:

 It is often the case that we may want to store date and time data separately,
 or store various date fields separately. the ``parse_dates`` keyword can be
-used to specify a combination of columns to parse the dates and/or times from.
-
-You can specify a list of column lists to ``parse_dates``, the resulting date
-columns will be prepended to the output (so as to not affect the existing column
-order) and the new column names will be the concatenation of the component
-column names:
-
-.. ipython:: python
-   :okwarning:
-
-   data = (
-       "KORD,19990127, 19:00:00, 18:56:00, 0.8100\n"
-       "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n"
-       "KORD,19990127, 21:00:00, 20:56:00, -0.5900\n"
-       "KORD,19990127, 21:00:00, 21:18:00, -0.9900\n"
-       "KORD,19990127, 22:00:00, 21:56:00, -0.5900\n"
-       "KORD,19990127, 23:00:00, 22:56:00, -0.5900"
-   )
-
-   with open("tmp.csv", "w") as fh:
-       fh.write(data)
-
-   df = pd.read_csv("tmp.csv", header=None, parse_dates=[[1, 2], [1, 3]])
-   df
-
-By default the parser removes the component date columns, but you can choose
-to retain them via the ``keep_date_col`` keyword:
-
-.. ipython:: python
-   :okwarning:
-
-   df = pd.read_csv(
-       "tmp.csv", header=None, parse_dates=[[1, 2], [1, 3]], keep_date_col=True
-   )
-   df
+used to specify columns to parse the dates and/or times.

-Note that if you wish to combine multiple columns into a single date column, a
-nested list must be used. In other words, ``parse_dates=[1, 2]`` indicates that
-the second and third columns should each be parsed as separate date columns
-while ``parse_dates=[[1, 2]]`` means the two columns should be parsed into a
-single column.
-
-You can also use a dict to specify custom name columns:
-
-.. ipython:: python
-   :okwarning:
-
-   date_spec = {"nominal": [1, 2], "actual": [1, 3]}
-   df = pd.read_csv("tmp.csv", header=None, parse_dates=date_spec)
-   df
-
-It is important to remember that if multiple text columns are to be parsed into
-a single date column, then a new column is prepended to the data. The ``index_col``
-specification is based off of this new set of columns rather than the original
-data columns:
-
-.. ipython:: python
-   :okwarning:
-
-   date_spec = {"nominal": [1, 2], "actual": [1, 3]}
-   df = pd.read_csv(
-       "tmp.csv", header=None, parse_dates=date_spec, index_col=0
-   )  # index is the nominal column
-   df

 .. note::
    If a column or index contains an unparsable date, the entire column or
@@ -880,10 +811,6 @@ data columns:
    for your data to store datetimes in this format, load times will be
    significantly faster, ~20x has been observed.

-.. deprecated:: 2.2.0
-   Combining date columns inside read_csv is deprecated. Use ``pd.to_datetime``
-   on the relevant result columns instead.
-

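The replacement the removed deprecation note points to looks roughly like this (a sketch reusing the ``tmp.csv`` layout from the deleted example above; column 1 holds the date, column 2 the time):

    import pandas as pd

    # read the component columns as-is, then combine them explicitly
    df = pd.read_csv("tmp.csv", header=None, skipinitialspace=True)
    df["nominal"] = pd.to_datetime(
        df[1].astype(str) + " " + df[2], format="%Y%m%d %H:%M:%S"
    )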
Date parsing functions
++++++++++++++++++++++
@@ -899,12 +826,6 @@ Performance-wise, you should try these methods of parsing dates in order:
    then use ``to_datetime``.


-.. ipython:: python
-   :suppress:
-
-   os.remove("tmp.csv")
-
 .. _io.csv.mixed_timezones:

Parsing a CSV with mixed timezones
5 changes: 4 additions & 1 deletion doc/source/whatsnew/v3.0.0.rst
@@ -260,8 +260,10 @@ Removal of prior version deprecations/changes
 - Enforced deprecation of ``axis=None`` acting the same as ``axis=0`` in the DataFrame reductions ``sum``, ``prod``, ``std``, ``var``, and ``sem``, passing ``axis=None`` will now reduce over both axes; this is particularly the case when doing e.g. ``numpy.sum(df)`` (:issue:`21597`)
 - Enforced deprecation of ``core.internals`` members ``Block``, ``ExtensionBlock``, and ``DatetimeTZBlock`` (:issue:`58467`)
 - Enforced deprecation of ``date_parser`` in :func:`read_csv`, :func:`read_table`, :func:`read_fwf`, and :func:`read_excel` in favour of ``date_format`` (:issue:`50601`)
+- Enforced deprecation of ``keep_date_col`` keyword in :func:`read_csv` (:issue:`55569`)
 - Enforced deprecation of ``quantile`` keyword in :meth:`.Rolling.quantile` and :meth:`.Expanding.quantile`, renamed to ``q`` instead. (:issue:`52550`)
 - Enforced deprecation of argument ``infer_datetime_format`` in :func:`read_csv`, as a strict version of it is now the default (:issue:`48621`)
+- Enforced deprecation of combining parsed datetime columns in :func:`read_csv` via the ``parse_dates`` keyword (:issue:`55569`)
 - Enforced deprecation of non-standard (``np.ndarray``, :class:`ExtensionArray`, :class:`Index`, or :class:`Series`) argument to :func:`api.extensions.take` (:issue:`52981`)
 - Enforced deprecation of parsing system timezone strings to ``tzlocal``, which depended on system timezone, pass the 'tz' keyword instead (:issue:`50791`)
 - Enforced deprecation of passing a dictionary to :meth:`SeriesGroupBy.agg` (:issue:`52268`)
@@ -382,6 +384,8 @@ Datetimelike
 - Bug in :func:`date_range` where the last valid timestamp would sometimes not be produced (:issue:`56134`)
 - Bug in :func:`date_range` where using a negative frequency value would not include all points between the start and end values (:issue:`56382`)
 - Bug in :func:`tseries.api.guess_datetime_format` would fail to infer time format when "%Y" == "%H%M" (:issue:`57452`)
+- Bug in :meth:`DatetimeIndex.is_year_start` and :meth:`DatetimeIndex.is_quarter_start` not raising on custom business day frequencies bigger than "1C" (:issue:`58664`)
+- Bug in :meth:`DatetimeIndex.is_year_start` and :meth:`DatetimeIndex.is_quarter_start` returning ``False`` on double-digit frequencies (:issue:`58523`)
 - Bug in setting scalar values with mismatched resolution into arrays with non-nanosecond ``datetime64``, ``timedelta64`` or :class:`DatetimeTZDtype` incorrectly truncating those scalars (:issue:`56410`)

 Timedelta
@@ -419,7 +423,6 @@ Interval
 Indexing
 ^^^^^^^^
 - Bug in :meth:`DataFrame.__getitem__` returning modified columns when called with ``slice`` in Python 3.12 (:issue:`57500`)
-- Bug in :meth:`DatetimeIndex.is_year_start` and :meth:`DatetimeIndex.is_quarter_start` not raising on custom business day frequencies bigger than "1C" (:issue:`58664`)
 -

 Missing
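A rough illustration of the ``is_year_start`` fix described above (a sketch with hypothetical data; the exact reproducer in :issue:`58523` may differ):

    import pandas as pd

    # the second point lands on January 1st; before the fix, is_year_start
    # could report False for it on a multi-digit frequency such as "10D"
    dr = pd.date_range("2016-12-22", periods=3, freq="10D")
    dr.is_year_start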
5 changes: 0 additions & 5 deletions pandas/_libs/dtypes.pxd
@@ -34,8 +34,3 @@ ctypedef fused numeric_t:
 ctypedef fused numeric_object_t:
     numeric_t
     object
-
-ctypedef fused uint8_int64_object_t:
-    uint8_t
-    int64_t
-    object
70 changes: 29 additions & 41 deletions pandas/_libs/lib.pyx
@@ -53,6 +53,7 @@ from numpy cimport (
     PyArray_ITER_DATA,
     PyArray_ITER_NEXT,
     PyArray_IterNew,
+    PyArray_SETITEM,
     complex128_t,
     flatiter,
     float64_t,
@@ -75,7 +76,6 @@ cdef extern from "pandas/parser/pd_parser.h":
     PandasParser_IMPORT

 from pandas._libs cimport util
-from pandas._libs.dtypes cimport uint8_int64_object_t
 from pandas._libs.util cimport (
     INT64_MAX,
     INT64_MIN,
@@ -2845,14 +2845,16 @@ no_default = _NoDefault.no_default # Sentinel indicating the default value.
 NoDefault = Literal[_NoDefault.no_default]


+@cython.boundscheck(False)
+@cython.wraparound(False)
 def map_infer_mask(
-    ndarray[object] arr,
-    object f,
-    const uint8_t[:] mask,
-    *,
-    bint convert=True,
-    object na_value=no_default,
-    cnp.dtype dtype=np.dtype(object)
+    ndarray arr,
+    object f,
+    const uint8_t[:] mask,
+    *,
+    bint convert=True,
+    object na_value=no_default,
+    cnp.dtype dtype=np.dtype(object)
 ) -> "ArrayLike":
     """
     Substitute for np.vectorize with pandas-friendly dtype inference.
Expand All @@ -2875,53 +2877,39 @@ def map_infer_mask(
     -------
     np.ndarray or an ExtensionArray
     """
-    cdef Py_ssize_t n = len(arr)
-    result = np.empty(n, dtype=dtype)
-
-    _map_infer_mask(
-        result,
-        arr,
-        f,
-        mask,
-        na_value,
-    )
-    if convert:
-        return maybe_convert_objects(result)
-    else:
-        return result
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def _map_infer_mask(
-    ndarray[uint8_int64_object_t] out,
-    ndarray[object] arr,
-    object f,
-    const uint8_t[:] mask,
-    object na_value=no_default,
-) -> None:
-    """
-    Helper for map_infer_mask, split off to use fused types based on the result.
-    """
     cdef:
-        Py_ssize_t i, n
+        Py_ssize_t i
+        Py_ssize_t n = len(arr)
         object val
-
-    n = len(arr)
+        ndarray result = np.empty(n, dtype=dtype)
+
+        flatiter arr_it = PyArray_IterNew(arr)
+        flatiter result_it = PyArray_IterNew(result)

     for i in range(n):
         if mask[i]:
             if na_value is no_default:
-                val = arr[i]
+                val = PyArray_GETITEM(arr, PyArray_ITER_DATA(arr_it))
             else:
                 val = na_value
         else:
-            val = f(arr[i])
+            val = PyArray_GETITEM(arr, PyArray_ITER_DATA(arr_it))
+            val = f(val)

         if cnp.PyArray_IsZeroDim(val):
             # unbox 0-dim arrays, GH#690
             val = val.item()

-        out[i] = val
+        PyArray_SETITEM(result, PyArray_ITER_DATA(result_it), val)
+
+        PyArray_ITER_NEXT(arr_it)
+        PyArray_ITER_NEXT(result_it)
+
+    if convert:
+        return maybe_convert_objects(result)
+    else:
+        return result


@cython.boundscheck(False)
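For readers skimming the Cython, a pure-Python sketch of what ``map_infer_mask`` computes (the name and simplifications here are ours, not the pandas implementation):

    import numpy as np

    def map_infer_mask_sketch(arr, f, mask, na_value=None):
        # apply f element-wise; masked positions bypass f and either keep
        # the original value or are replaced by na_value
        result = np.empty(len(arr), dtype=object)
        for i in range(len(arr)):
            if mask[i]:
                result[i] = arr[i] if na_value is None else na_value
            else:
                result[i] = f(arr[i])
        return result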
3 changes: 0 additions & 3 deletions pandas/_libs/tslibs/parsing.pyi
@@ -27,7 +27,4 @@ def guess_datetime_format(
     dt_str: str,
     dayfirst: bool | None = ...,
 ) -> str | None: ...
-def concat_date_cols(
-    date_cols: tuple,
-) -> npt.NDArray[np.object_]: ...
 def get_rule_month(source: str) -> str: ...