DEPR: stop inferring dt64/td64 from strings in Series constructor (#49319)

* DEPR: stop inferring dt64/td64 from strings in Series constructor
* update pyi
pandas-dev · Oct 26, 2022 · 6ee0acb · 6ee0acb
1 parent 218ab09
commit 6ee0acb
Show file tree

Hide file tree

Showing 12 changed files with 72 additions and 116 deletions.
diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
@@ -238,6 +238,7 @@ Removal of prior version deprecations/changes
 - Removed the ``display.column_space`` option in favor of ``df.to_string(col_space=...)`` (:issue:`47280`)
 - Removed the deprecated method ``mad`` from pandas classes (:issue:`11787`)
 - Removed the deprecated method ``tshift`` from pandas classes (:issue:`11631`)
+- Changed the behavior of the :class:`Series` constructor: it no longer infers a datetime64 or timedelta64 dtype from string entries (:issue:`41731`)
 - Changed behavior of :class:`Index` constructor when passed a ``SparseArray`` or ``SparseDtype`` to retain that dtype instead of casting to ``numpy.ndarray`` (:issue:`43930`)
 
 .. ---------------------------------------------------------------------------

diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi
@@ -158,7 +158,7 @@ def ensure_string_array(
 ) -> npt.NDArray[np.object_]: ...
 def infer_datetimelike_array(
     arr: npt.NDArray[np.object_],
-) -> tuple[str, bool]: ...
+) -> str: ...
 def convert_nans_to_NA(
     arr: npt.NDArray[np.object_],
 ) -> npt.NDArray[np.object_]: ...

diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
@@ -95,7 +95,6 @@ from pandas._libs.util cimport (
     is_nan,
 )
 
-from pandas._libs.tslib import array_to_datetime
 from pandas._libs.tslibs import (
     OutOfBoundsDatetime,
     OutOfBoundsTimedelta,
@@ -1583,25 +1582,19 @@ def infer_datetimelike_array(arr: ndarray[object]) -> tuple[str, bool]:
     Returns
     -------
     str: {datetime, timedelta, date, nat, mixed}
-    bool
     """
     cdef:
         Py_ssize_t i, n = len(arr)
         bint seen_timedelta = False, seen_date = False, seen_datetime = False
         bint seen_tz_aware = False, seen_tz_naive = False
-        bint seen_nat = False, seen_str = False
+        bint seen_nat = False
         bint seen_period = False, seen_interval = False
-        list objs = []
         object v
 
     for i in range(n):
         v = arr[i]
         if isinstance(v, str):
-            objs.append(v)
-            seen_str = True
-
-            if len(objs) == 3:
-                break
+            return "mixed"
 
         elif v is None or util.is_nan(v):
             # nan or None
@@ -1619,7 +1612,7 @@ def infer_datetimelike_array(arr: ndarray[object]) -> tuple[str, bool]:
                 seen_tz_aware = True
 
             if seen_tz_naive and seen_tz_aware:
-                return "mixed", seen_str
+                return "mixed"
         elif util.is_datetime64_object(v):
             # np.datetime64
             seen_datetime = True
@@ -1635,43 +1628,30 @@ def infer_datetimelike_array(arr: ndarray[object]) -> tuple[str, bool]:
             seen_interval = True
             break
         else:
-            return "mixed", seen_str
+            return "mixed"
 
     if seen_period:
         if is_period_array(arr):
-            return "period", seen_str
-        return "mixed", seen_str
+            return "period"
+        return "mixed"
 
     if seen_interval:
         if is_interval_array(arr):
-            return "interval", seen_str
-        return "mixed", seen_str
+            return "interval"
+        return "mixed"
 
     if seen_date and not (seen_datetime or seen_timedelta):
-        return "date", seen_str
+        return "date"
     elif seen_datetime and not seen_timedelta:
-        return "datetime", seen_str
+        return "datetime"
     elif seen_timedelta and not seen_datetime:
-        return "timedelta", seen_str
+        return "timedelta"
+    elif seen_datetime and seen_timedelta:
+        return "mixed"
     elif seen_nat:
-        return "nat", seen_str
+        return "nat"
 
-    # short-circuit by trying to
-    # actually convert these strings
-    # this is for performance as we don't need to try
-    # convert *every* string array
-    if len(objs):
-        try:
-            # require_iso8601 as in maybe_infer_to_datetimelike
-            array_to_datetime(objs, errors="raise", require_iso8601=True)
-            return "datetime", seen_str
-        except (ValueError, TypeError):
-            pass
-
-        # we are *not* going to infer from strings
-        # for timedelta as too much ambiguity
-
-    return "mixed", seen_str
+    return "mixed"
 
 
 cdef inline bint is_timedelta(object o):

diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
@@ -1264,7 +1264,9 @@ def try_timedelta(v: np.ndarray) -> np.ndarray:
         else:
             return td_values.reshape(shape)
 
-    inferred_type, seen_str = lib.infer_datetimelike_array(ensure_object(v))
+    # TODO: can we just do lib.maybe_convert_objects for this entire function?
+    inferred_type = lib.infer_datetimelike_array(ensure_object(v))
+
     if inferred_type in ["period", "interval"]:
         # Incompatible return value type (got "Union[ExtensionArray, ndarray]",
         # expected "Union[ndarray, DatetimeArray, TimedeltaArray, PeriodArray,
@@ -1280,14 +1282,14 @@ def try_timedelta(v: np.ndarray) -> np.ndarray:
     elif inferred_type == "timedelta":
         value = try_timedelta(v)
     elif inferred_type == "nat":
+        # only reached if we have at least 1 NaT and the rest are NaT, None, or np.nan
 
         # if all NaT, return as datetime
         if isna(v).all():
             # error: Incompatible types in assignment (expression has type
             # "ExtensionArray", variable has type "Union[ndarray, List[Any]]")
             value = try_datetime(v)  # type: ignore[assignment]
         else:
-
             # We have at least a NaT and a string
             # try timedelta first to avoid spurious datetime conversions
             # e.g. '00:00:01' is a timedelta but technically is also a datetime
@@ -1300,15 +1302,6 @@ def try_timedelta(v: np.ndarray) -> np.ndarray:
                 # "ExtensionArray", variable has type "Union[ndarray, List[Any]]")
                 value = try_datetime(v)  # type: ignore[assignment]
 
-    if value.dtype.kind in ["m", "M"] and seen_str:
-        # TODO(2.0): enforcing this deprecation should close GH#40111
-        warnings.warn(
-            f"Inferring {value.dtype} from data containing strings is deprecated "
-            "and will be removed in a future version. To retain the old behavior "
-            f"explicitly pass Series(data, dtype={value.dtype})",
-            FutureWarning,
-            stacklevel=find_stack_level(),
-        )
     return value
 
 

diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py
@@ -775,7 +775,7 @@ def _infer_types(
                 result = BooleanArray(result, bool_mask)
             elif result.dtype == np.object_ and use_nullable_dtypes:
                 # read_excel sends array of datetime objects
-                inferred_type, _ = lib.infer_datetimelike_array(result)
+                inferred_type = lib.infer_datetimelike_array(result)
                 if inferred_type != "datetime":
                     result = StringDtype().construct_array_type()._from_sequence(values)
 

diff --git a/pandas/tests/apply/test_series_apply.py b/pandas/tests/apply/test_series_apply.py
@@ -859,8 +859,7 @@ def test_apply_to_timedelta():
     list_of_strings = ["00:00:01", np.nan, pd.NaT, pd.NaT]
 
     a = pd.to_timedelta(list_of_strings)
-    with tm.assert_produces_warning(FutureWarning, match="Inferring timedelta64"):
-        ser = Series(list_of_strings)
+    ser = Series(list_of_strings)
     b = ser.apply(pd.to_timedelta)
     tm.assert_series_equal(Series(a), b)
 

diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py
@@ -1346,7 +1346,7 @@ def test_infer_dtype_period_with_na(self, na_value):
         ],
     )
     def test_infer_datetimelike_array_datetime(self, data):
-        assert lib.infer_datetimelike_array(data) == ("datetime", False)
+        assert lib.infer_datetimelike_array(data) == "datetime"
 
     @pytest.mark.parametrize(
         "data",
@@ -1358,11 +1358,11 @@ def test_infer_datetimelike_array_datetime(self, data):
         ],
     )
     def test_infer_datetimelike_array_timedelta(self, data):
-        assert lib.infer_datetimelike_array(data) == ("timedelta", False)
+        assert lib.infer_datetimelike_array(data) == "timedelta"
 
     def test_infer_datetimelike_array_date(self):
         arr = [date(2017, 6, 12), date(2017, 3, 11)]
-        assert lib.infer_datetimelike_array(arr) == ("date", False)
+        assert lib.infer_datetimelike_array(arr) == "date"
 
     @pytest.mark.parametrize(
         "data",
@@ -1377,7 +1377,7 @@ def test_infer_datetimelike_array_date(self):
         ],
     )
     def test_infer_datetimelike_array_mixed(self, data):
-        assert lib.infer_datetimelike_array(data)[0] == "mixed"
+        assert lib.infer_datetimelike_array(data) == "mixed"
 
     @pytest.mark.parametrize(
         "first, expected",
@@ -1395,7 +1395,7 @@ def test_infer_datetimelike_array_mixed(self, data):
     @pytest.mark.parametrize("second", [None, np.nan])
     def test_infer_datetimelike_array_nan_nat_like(self, first, second, expected):
         first.append(second)
-        assert lib.infer_datetimelike_array(first) == (expected, False)
+        assert lib.infer_datetimelike_array(first) == expected
 
     def test_infer_dtype_all_nan_nat_like(self):
         arr = np.array([np.nan, np.nan])

diff --git a/pandas/tests/resample/test_time_grouper.py b/pandas/tests/resample/test_time_grouper.py
@@ -321,29 +321,27 @@ def test_groupby_resample_interpolate():
         .interpolate(method="linear")
     )
 
-    msg = "containing strings is deprecated"
-    with tm.assert_produces_warning(FutureWarning, match=msg):
-        expected_ind = pd.MultiIndex.from_tuples(
-            [
-                (50, "2018-01-07"),
-                (50, Timestamp("2018-01-08")),
-                (50, Timestamp("2018-01-09")),
-                (50, Timestamp("2018-01-10")),
-                (50, Timestamp("2018-01-11")),
-                (50, Timestamp("2018-01-12")),
-                (50, Timestamp("2018-01-13")),
-                (50, Timestamp("2018-01-14")),
-                (50, Timestamp("2018-01-15")),
-                (50, Timestamp("2018-01-16")),
-                (50, Timestamp("2018-01-17")),
-                (50, Timestamp("2018-01-18")),
-                (50, Timestamp("2018-01-19")),
-                (50, Timestamp("2018-01-20")),
-                (50, Timestamp("2018-01-21")),
-                (60, Timestamp("2018-01-14")),
-            ],
-            names=["volume", "week_starting"],
-        )
+    expected_ind = pd.MultiIndex.from_tuples(
+        [
+            (50, Timestamp("2018-01-07")),
+            (50, Timestamp("2018-01-08")),
+            (50, Timestamp("2018-01-09")),
+            (50, Timestamp("2018-01-10")),
+            (50, Timestamp("2018-01-11")),
+            (50, Timestamp("2018-01-12")),
+            (50, Timestamp("2018-01-13")),
+            (50, Timestamp("2018-01-14")),
+            (50, Timestamp("2018-01-15")),
+            (50, Timestamp("2018-01-16")),
+            (50, Timestamp("2018-01-17")),
+            (50, Timestamp("2018-01-18")),
+            (50, Timestamp("2018-01-19")),
+            (50, Timestamp("2018-01-20")),
+            (50, Timestamp("2018-01-21")),
+            (60, Timestamp("2018-01-14")),
+        ],
+        names=["volume", "week_starting"],
+    )
 
     expected = DataFrame(
         data={

diff --git a/pandas/tests/series/methods/test_combine_first.py b/pandas/tests/series/methods/test_combine_first.py
@@ -79,9 +79,7 @@ def test_combine_first_dt64(self):
         s1 = Series([np.NaN, "2011"])
         rs = s0.combine_first(s1)
 
-        msg = "containing strings is deprecated"
-        with tm.assert_produces_warning(FutureWarning, match=msg):
-            xp = Series([datetime(2010, 1, 1), "2011"])
+        xp = Series([datetime(2010, 1, 1), "2011"], dtype="datetime64[ns]")
 
         tm.assert_series_equal(rs, xp)
 

diff --git a/pandas/tests/series/methods/test_fillna.py b/pandas/tests/series/methods/test_fillna.py
@@ -365,10 +365,7 @@ def test_datetime64_fillna(self):
     def test_datetime64_fillna_backfill(self):
         # GH#6587
         # make sure that we are treating as integer when filling
-        msg = "containing strings is deprecated"
-        with tm.assert_produces_warning(FutureWarning, match=msg):
-            # this also tests inference of a datetime-like with NaT's
-            ser = Series([NaT, NaT, "2013-08-05 15:30:00.000001"])
+        ser = Series([NaT, NaT, "2013-08-05 15:30:00.000001"], dtype="M8[ns]")
 
         expected = Series(
             [

diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py
@@ -1018,24 +1018,20 @@ def test_constructor_dtype_datetime64_7(self):
         assert series1.dtype == object
 
     def test_constructor_dtype_datetime64_6(self):
-        # these will correctly infer a datetime
-        msg = "containing strings is deprecated"
+        # as of 2.0, these no longer infer datetime64 based on the strings,
+        #  matching the Index behavior
 
-        with tm.assert_produces_warning(FutureWarning, match=msg):
-            ser = Series([None, NaT, "2013-08-05 15:30:00.000001"])
-        assert ser.dtype == "datetime64[ns]"
+        ser = Series([None, NaT, "2013-08-05 15:30:00.000001"])
+        assert ser.dtype == object
 
-        with tm.assert_produces_warning(FutureWarning, match=msg):
-            ser = Series([np.nan, NaT, "2013-08-05 15:30:00.000001"])
-        assert ser.dtype == "datetime64[ns]"
+        ser = Series([np.nan, NaT, "2013-08-05 15:30:00.000001"])
+        assert ser.dtype == object
 
-        with tm.assert_produces_warning(FutureWarning, match=msg):
-            ser = Series([NaT, None, "2013-08-05 15:30:00.000001"])
-        assert ser.dtype == "datetime64[ns]"
+        ser = Series([NaT, None, "2013-08-05 15:30:00.000001"])
+        assert ser.dtype == object
 
-        with tm.assert_produces_warning(FutureWarning, match=msg):
-            ser = Series([NaT, np.nan, "2013-08-05 15:30:00.000001"])
-        assert ser.dtype == "datetime64[ns]"
+        ser = Series([NaT, np.nan, "2013-08-05 15:30:00.000001"])
+        assert ser.dtype == object
 
     def test_constructor_dtype_datetime64_5(self):
         # tz-aware (UTC and other tz's)
@@ -1517,23 +1513,19 @@ def test_constructor_dtype_timedelta64(self):
         td = Series([timedelta(days=i) for i in range(3)] + ["foo"])
         assert td.dtype == "object"
 
-        # these will correctly infer a timedelta
-        msg = "containing strings is deprecated"
-        with tm.assert_produces_warning(FutureWarning, match=msg):
-            ser = Series([None, NaT, "1 Day"])
-        assert ser.dtype == "timedelta64[ns]"
+        # as of 2.0, these no longer infer timedelta64 based on the strings,
+        #  matching Index behavior
+        ser = Series([None, NaT, "1 Day"])
+        assert ser.dtype == object
 
-        with tm.assert_produces_warning(FutureWarning, match=msg):
-            ser = Series([np.nan, NaT, "1 Day"])
-        assert ser.dtype == "timedelta64[ns]"
+        ser = Series([np.nan, NaT, "1 Day"])
+        assert ser.dtype == object
 
-        with tm.assert_produces_warning(FutureWarning, match=msg):
-            ser = Series([NaT, None, "1 Day"])
-        assert ser.dtype == "timedelta64[ns]"
+        ser = Series([NaT, None, "1 Day"])
+        assert ser.dtype == object
 
-        with tm.assert_produces_warning(FutureWarning, match=msg):
-            ser = Series([NaT, np.nan, "1 Day"])
-        assert ser.dtype == "timedelta64[ns]"
+        ser = Series([NaT, np.nan, "1 Day"])
+        assert ser.dtype == object
 
     # GH 16406
     def test_constructor_mixed_tz(self):

diff --git a/pandas/tests/tools/test_to_timedelta.py b/pandas/tests/tools/test_to_timedelta.py
@@ -207,9 +207,7 @@ def test_to_timedelta_on_missing_values(self):
         )
         tm.assert_series_equal(actual, expected)
 
-        with tm.assert_produces_warning(FutureWarning, match="Inferring timedelta64"):
-            ser = Series(["00:00:01", pd.NaT])
-        assert ser.dtype == "m8[ns]"
+        ser = Series(["00:00:01", pd.NaT], dtype="m8[ns]")
         actual = to_timedelta(ser)
         tm.assert_series_equal(actual, expected)