API/BUG: infer_dtype_from_scalar with non-nano (#52212)

* API/BUG: infer_dtype_from_scalar with non-nano
* update test
* xfail on 32bit
* fix xfail condition
* whatsnew
* xfail on windows
pandas-dev · May 18, 2023 · a2bb939 · a2bb939
1 parent b2bb68a
commit a2bb939
Show file tree

Hide file tree

Showing 23 changed files with 166 additions and 91 deletions.
diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst
@@ -317,6 +317,7 @@ Datetimelike
 - Bug in :func:`date_range` when ``freq`` was a :class:`DateOffset` with ``nanoseconds`` (:issue:`46877`)
 - Bug in :meth:`Timestamp.round` with values close to the implementation bounds returning incorrect results instead of raising ``OutOfBoundsDatetime`` (:issue:`51494`)
 - Bug in :meth:`arrays.DatetimeArray.map` and :meth:`DatetimeIndex.map`, where the supplied callable operated array-wise instead of element-wise (:issue:`51977`)
+- Bug in constructing a :class:`Series` or :class:`DataFrame` from a datetime or timedelta scalar always inferring nanosecond resolution instead of inferring from the input (:issue:`52212`)
 - Bug in parsing datetime strings with weekday but no day e.g. "2023 Sept Thu" incorrectly raising ``AttributeError`` instead of ``ValueError`` (:issue:`52659`)
 -
 

diff --git a/pandas/conftest.py b/pandas/conftest.py
@@ -931,7 +931,7 @@ def rand_series_with_duplicate_datetimeindex() -> Series:
         (Period("2012-02-01", freq="D"), "period[D]"),
         (
             Timestamp("2011-01-01", tz="US/Eastern"),
-            DatetimeTZDtype(tz="US/Eastern"),
+            DatetimeTZDtype(unit="s", tz="US/Eastern"),
         ),
         (Timedelta(seconds=500), "timedelta64[ns]"),
     ]

diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
@@ -645,7 +645,18 @@ def _maybe_promote(dtype: np.dtype, fill_value=np.nan):
         if inferred == dtype:
             return dtype, fv
 
-        return np.dtype("object"), fill_value
+        elif inferred.kind == "m":
+            # different unit, e.g. passed np.timedelta64(24, "h") with dtype=m8[ns]
+            # see if we can losslessly cast it to our dtype
+            unit = np.datetime_data(dtype)[0]
+            try:
+                td = Timedelta(fill_value).as_unit(unit, round_ok=False)
+            except OutOfBoundsTimedelta:
+                return _dtype_obj, fill_value
+            else:
+                return dtype, td.asm8
+
+        return _dtype_obj, fill_value
 
     elif is_float(fill_value):
         if issubclass(dtype.type, np.bool_):
@@ -775,8 +786,6 @@ def infer_dtype_from_scalar(val) -> tuple[DtypeObj, Any]:
     elif isinstance(val, (np.datetime64, dt.datetime)):
         try:
             val = Timestamp(val)
-            if val is not NaT:
-                val = val.as_unit("ns")
         except OutOfBoundsDatetime:
             return _dtype_obj, val
 
@@ -785,16 +794,19 @@ def infer_dtype_from_scalar(val) -> tuple[DtypeObj, Any]:
             dtype = val.dtype
             # TODO: test with datetime(2920, 10, 1) based on test_replace_dtypes
         else:
-            dtype = DatetimeTZDtype(unit="ns", tz=val.tz)
+            dtype = DatetimeTZDtype(unit=val.unit, tz=val.tz)
 
     elif isinstance(val, (np.timedelta64, dt.timedelta)):
         try:
             val = Timedelta(val)
         except (OutOfBoundsTimedelta, OverflowError):
             dtype = _dtype_obj
         else:
-            dtype = np.dtype("m8[ns]")
-            val = np.timedelta64(val.value, "ns")
+            if val is NaT:
+                val = np.timedelta64("NaT", "ns")
+            else:
+                val = val.asm8
+            dtype = val.dtype
 
     elif is_bool(val):
         dtype = np.dtype(np.bool_)

diff --git a/pandas/tests/dtypes/cast/test_infer_dtype.py b/pandas/tests/dtypes/cast/test_infer_dtype.py
@@ -61,17 +61,31 @@ def test_infer_dtype_from_complex(complex_dtype):
     assert dtype == np.complex_
 
 
-@pytest.mark.parametrize(
-    "data", [np.datetime64(1, "ns"), Timestamp(1), datetime(2000, 1, 1, 0, 0)]
-)
-def test_infer_dtype_from_datetime(data):
-    dtype, val = infer_dtype_from_scalar(data)
+def test_infer_dtype_from_datetime():
+    dt64 = np.datetime64(1, "ns")
+    dtype, val = infer_dtype_from_scalar(dt64)
     assert dtype == "M8[ns]"
 
+    ts = Timestamp(1)
+    dtype, val = infer_dtype_from_scalar(ts)
+    assert dtype == "M8[ns]"
 
-@pytest.mark.parametrize("data", [np.timedelta64(1, "ns"), Timedelta(1), timedelta(1)])
-def test_infer_dtype_from_timedelta(data):
-    dtype, val = infer_dtype_from_scalar(data)
+    dt = datetime(2000, 1, 1, 0, 0)
+    dtype, val = infer_dtype_from_scalar(dt)
+    assert dtype == "M8[us]"
+
+
+def test_infer_dtype_from_timedelta():
+    td64 = np.timedelta64(1, "ns")
+    dtype, val = infer_dtype_from_scalar(td64)
+    assert dtype == "m8[ns]"
+
+    pytd = timedelta(1)
+    dtype, val = infer_dtype_from_scalar(pytd)
+    assert dtype == "m8[us]"
+
+    td = Timedelta(1)
+    dtype, val = infer_dtype_from_scalar(td)
     assert dtype == "m8[ns]"
 
 
@@ -140,9 +154,9 @@ def test_infer_dtype_from_scalar_errors():
         (b"foo", np.object_),
         (1, np.int64),
         (1.5, np.float_),
-        (np.datetime64("2016-01-01"), np.dtype("M8[ns]")),
-        (Timestamp("20160101"), np.dtype("M8[ns]")),
-        (Timestamp("20160101", tz="UTC"), "datetime64[ns, UTC]"),
+        (np.datetime64("2016-01-01"), np.dtype("M8[s]")),
+        (Timestamp("20160101"), np.dtype("M8[s]")),
+        (Timestamp("20160101", tz="UTC"), "datetime64[s, UTC]"),
     ],
 )
 def test_infer_dtype_from_scalar(value, expected):

diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py
@@ -819,7 +819,7 @@ def test_setitem_single_column_mixed_datetime(self):
         # check our dtypes
         result = df.dtypes
         expected = Series(
-            [np.dtype("float64")] * 3 + [np.dtype("datetime64[ns]")],
+            [np.dtype("float64")] * 3 + [np.dtype("datetime64[s]")],
             index=["foo", "bar", "baz", "timestamp"],
         )
         tm.assert_series_equal(result, expected)

diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py
@@ -154,7 +154,7 @@ def test_setitem_dt64_index_empty_columns(self):
     def test_setitem_timestamp_empty_columns(self):
         # GH#19843
         df = DataFrame(index=range(3))
-        df["now"] = Timestamp("20130101", tz="UTC")
+        df["now"] = Timestamp("20130101", tz="UTC").as_unit("ns")
 
         expected = DataFrame(
             [[Timestamp("20130101", tz="UTC")]] * 3, index=[0, 1, 2], columns=["now"]
@@ -234,7 +234,7 @@ def test_setitem_dict_preserves_dtypes(self):
             (Interval(left=0, right=5), IntervalDtype("int64", "right")),
             (
                 Timestamp("2011-01-01", tz="US/Eastern"),
-                DatetimeTZDtype(tz="US/Eastern"),
+                DatetimeTZDtype(unit="s", tz="US/Eastern"),
             ),
         ],
     )

diff --git a/pandas/tests/frame/methods/test_get_numeric_data.py b/pandas/tests/frame/methods/test_get_numeric_data.py
@@ -21,7 +21,7 @@ def test_get_numeric_data_preserve_dtype(self):
         tm.assert_frame_equal(result, expected)
 
     def test_get_numeric_data(self):
-        datetime64name = np.dtype("M8[ns]").name
+        datetime64name = np.dtype("M8[s]").name
         objectname = np.dtype(np.object_).name
 
         df = DataFrame(

diff --git a/pandas/tests/frame/methods/test_reindex.py b/pandas/tests/frame/methods/test_reindex.py
@@ -8,6 +8,10 @@
 import pytest
 
 from pandas._libs.tslibs.timezones import dateutil_gettz as gettz
+from pandas.compat import (
+    IS64,
+    is_platform_windows,
+)
 import pandas.util._test_decorators as td
 
 import pandas as pd
@@ -118,15 +122,21 @@ class TestDataFrameSelectReindex:
     # These are specific reindex-based tests; other indexing tests should go in
     # test_indexing
 
+    @pytest.mark.xfail(
+        not IS64 or is_platform_windows(),
+        reason="Passes int32 values to DatetimeArray in make_na_array on "
+        "windows, 32bit linux builds",
+    )
     @td.skip_array_manager_not_yet_implemented
     def test_reindex_tzaware_fill_value(self):
         # GH#52586
         df = DataFrame([[1]])
 
         ts = pd.Timestamp("2023-04-10 17:32", tz="US/Pacific")
         res = df.reindex([0, 1], axis=1, fill_value=ts)
-        assert res.dtypes[1] == pd.DatetimeTZDtype(tz="US/Pacific")
+        assert res.dtypes[1] == pd.DatetimeTZDtype(unit="s", tz="US/Pacific")
         expected = DataFrame({0: [1], 1: [ts]})
+        expected[1] = expected[1].astype(res.dtypes[1])
         tm.assert_frame_equal(res, expected)
 
         per = ts.tz_localize(None).to_period("s")
@@ -137,8 +147,9 @@ def test_reindex_tzaware_fill_value(self):
 
         interval = pd.Interval(ts, ts + pd.Timedelta(seconds=1))
         res = df.reindex([0, 1], axis=1, fill_value=interval)
-        assert res.dtypes[1] == pd.IntervalDtype("datetime64[ns, US/Pacific]", "right")
+        assert res.dtypes[1] == pd.IntervalDtype("datetime64[s, US/Pacific]", "right")
         expected = DataFrame({0: [1], 1: [interval]})
+        expected[1] = expected[1].astype(res.dtypes[1])
         tm.assert_frame_equal(res, expected)
 
     def test_reindex_copies(self):

diff --git a/pandas/tests/frame/methods/test_to_csv.py b/pandas/tests/frame/methods/test_to_csv.py
@@ -656,14 +656,17 @@ def create_cols(name):
             "foo", index=df_float.index, columns=create_cols("object")
         )
         df_dt = DataFrame(
-            Timestamp("20010101"), index=df_float.index, columns=create_cols("date")
+            Timestamp("20010101").as_unit("ns"),
+            index=df_float.index,
+            columns=create_cols("date"),
         )
 
         # add in some nans
         df_float.iloc[30:50, 1:3] = np.nan
 
         # ## this is a bug in read_csv right now ####
         # df_dt.loc[30:50,1:3] = np.nan
+        # FIXME: don't leave commented-out
 
         df = pd.concat([df_float, df_int, df_bool, df_object, df_dt], axis=1)
 
@@ -702,7 +705,9 @@ def test_to_csv_dups_cols(self):
         df_int = DataFrame(np.random.randn(1000, 3)).astype("int64")
         df_bool = DataFrame(True, index=df_float.index, columns=range(3))
         df_object = DataFrame("foo", index=df_float.index, columns=range(3))
-        df_dt = DataFrame(Timestamp("20010101"), index=df_float.index, columns=range(3))
+        df_dt = DataFrame(
+            Timestamp("20010101").as_unit("ns"), index=df_float.index, columns=range(3)
+        )
         df = pd.concat(
             [df_float, df_int, df_bool, df_object, df_dt], axis=1, ignore_index=True
         )

diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py
@@ -191,20 +191,20 @@ def test_construction_with_mixed(self, float_string_frame):
 
         # check dtypes
         result = df.dtypes
-        expected = Series({"datetime64[ns]": 3})
+        expected = Series({"datetime64[us]": 3})
 
         # mixed-type frames
         float_string_frame["datetime"] = datetime.now()
         float_string_frame["timedelta"] = timedelta(days=1, seconds=1)
-        assert float_string_frame["datetime"].dtype == "M8[ns]"
-        assert float_string_frame["timedelta"].dtype == "m8[ns]"
+        assert float_string_frame["datetime"].dtype == "M8[us]"
+        assert float_string_frame["timedelta"].dtype == "m8[us]"
         result = float_string_frame.dtypes
         expected = Series(
             [np.dtype("float64")] * 4
             + [
                 np.dtype("object"),
-                np.dtype("datetime64[ns]"),
-                np.dtype("timedelta64[ns]"),
+                np.dtype("datetime64[us]"),
+                np.dtype("timedelta64[us]"),
             ],
             index=list("ABCD") + ["foo", "datetime", "timedelta"],
         )
@@ -230,7 +230,7 @@ def test_construction_with_conversions(self):
             },
             index=range(3),
         )
-        assert expected.dtypes["dt1"] == "M8[ns]"
+        assert expected.dtypes["dt1"] == "M8[s]"
         assert expected.dtypes["dt2"] == "M8[s]"
 
         df = DataFrame(index=range(3))

diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py
@@ -97,6 +97,7 @@ def test_constructor_from_2d_datetimearray(self, using_array_manager):
     def test_constructor_dict_with_tzaware_scalar(self):
         # GH#42505
         dt = Timestamp("2019-11-03 01:00:00-0700").tz_convert("America/Los_Angeles")
+        dt = dt.as_unit("ns")
 
         df = DataFrame({"dt": dt}, index=[0])
         expected = DataFrame({"dt": [dt]})
@@ -926,7 +927,7 @@ def test_constructor_dict_extension_scalar(self, ea_scalar_and_dtype):
             (Interval(left=0, right=5), IntervalDtype("int64", "right")),
             (
                 Timestamp("2011-01-01", tz="US/Eastern"),
-                DatetimeTZDtype(tz="US/Eastern"),
+                DatetimeTZDtype(unit="s", tz="US/Eastern"),
             ),
         ],
     )
@@ -1323,7 +1324,7 @@ def test_constructor_unequal_length_nested_list_column(self):
             [[Timestamp("2021-01-01")]],
             [{"x": Timestamp("2021-01-01")}],
             {"x": [Timestamp("2021-01-01")]},
-            {"x": Timestamp("2021-01-01")},
+            {"x": Timestamp("2021-01-01").as_unit("ns")},
         ],
     )
     def test_constructor_one_element_data_list(self, data):
@@ -1814,7 +1815,6 @@ def test_constructor_single_value(self):
     def test_constructor_with_datetimes(self):
         intname = np.dtype(np.int_).name
         floatname = np.dtype(np.float_).name
-        datetime64name = np.dtype("M8[ns]").name
         objectname = np.dtype(np.object_).name
 
         # single item
@@ -1832,7 +1832,7 @@ def test_constructor_with_datetimes(self):
         expected = Series(
             [np.dtype("int64")]
             + [np.dtype(objectname)] * 2
-            + [np.dtype(datetime64name)] * 2,
+            + [np.dtype("M8[s]"), np.dtype("M8[us]")],
             index=list("ABCDE"),
         )
         tm.assert_series_equal(result, expected)
@@ -1912,7 +1912,7 @@ def test_constructor_with_datetimes3(self):
         df = DataFrame({"End Date": dt}, index=[0])
         assert df.iat[0, 0] == dt
         tm.assert_series_equal(
-            df.dtypes, Series({"End Date": "datetime64[ns, US/Eastern]"})
+            df.dtypes, Series({"End Date": "datetime64[us, US/Eastern]"})
         )
 
         df = DataFrame([{"End Date": dt}])
@@ -3047,15 +3047,22 @@ def test_from_scalar_datetimelike_mismatched(self, constructor, cls):
         with pytest.raises(TypeError, match=msg):
             constructor(scalar, dtype=dtype)
 
-    @pytest.mark.xfail(
-        reason="Timestamp constructor has been updated to cast dt64 to non-nano, "
-        "but DatetimeArray._from_sequence has not"
-    )
     @pytest.mark.parametrize("cls", [datetime, np.datetime64])
-    def test_from_out_of_bounds_ns_datetime(self, constructor, cls):
+    def test_from_out_of_bounds_ns_datetime(
+        self, constructor, cls, request, box, frame_or_series
+    ):
         # scalar that won't fit in nanosecond dt64, but will fit in microsecond
+        if box is list or (frame_or_series is Series and box is dict):
+            mark = pytest.mark.xfail(
+                reason="Timestamp constructor has been updated to cast dt64 to "
+                "non-nano, but DatetimeArray._from_sequence has not",
+                strict=True,
+            )
+            request.node.add_marker(mark)
+
         scalar = datetime(9999, 1, 1)
         exp_dtype = "M8[us]"  # pydatetime objects default to this reso
+
         if cls is np.datetime64:
             scalar = np.datetime64(scalar, "D")
             exp_dtype = "M8[s]"  # closest reso to input
@@ -3076,13 +3083,19 @@ def test_out_of_s_bounds_datetime64(self, constructor):
         dtype = tm.get_dtype(result)
         assert dtype == object
 
-    @pytest.mark.xfail(
-        reason="TimedeltaArray constructor has been updated to cast td64 to non-nano, "
-        "but TimedeltaArray._from_sequence has not"
-    )
     @pytest.mark.parametrize("cls", [timedelta, np.timedelta64])
-    def test_from_out_of_bounds_ns_timedelta(self, constructor, cls):
+    def test_from_out_of_bounds_ns_timedelta(
+        self, constructor, cls, request, box, frame_or_series
+    ):
         # scalar that won't fit in nanosecond td64, but will fit in microsecond
+        if box is list or (frame_or_series is Series and box is dict):
+            mark = pytest.mark.xfail(
+                reason="TimedeltaArray constructor has been updated to cast td64 "
+                "to non-nano, but TimedeltaArray._from_sequence has not",
+                strict=True,
+            )
+            request.node.add_marker(mark)
+
         scalar = datetime(9999, 1, 1) - datetime(1970, 1, 1)
         exp_dtype = "m8[us]"  # smallest reso that fits
         if cls is np.timedelta64:

diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py
@@ -721,7 +721,9 @@ def func_with_date(batch):
     dfg_no_conversion_expected.index.name = "a"
 
     dfg_conversion = df.groupby(by=["a"]).apply(func_with_date)
-    dfg_conversion_expected = DataFrame({"b": datetime(2015, 1, 1), "c": 2}, index=[1])
+    dfg_conversion_expected = DataFrame(
+        {"b": pd.Timestamp(2015, 1, 1).as_unit("ns"), "c": 2}, index=[1]
+    )
     dfg_conversion_expected.index.name = "a"
 
     tm.assert_frame_equal(dfg_no_conversion, dfg_no_conversion_expected)

diff --git a/pandas/tests/groupby/test_groupby_shift_diff.py b/pandas/tests/groupby/test_groupby_shift_diff.py
@@ -62,7 +62,7 @@ def test_group_shift_with_fill_value():
 
 def test_group_shift_lose_timezone():
     # GH 30134
-    now_dt = Timestamp.utcnow()
+    now_dt = Timestamp.utcnow().as_unit("ns")
     df = DataFrame({"a": [1, 1], "date": now_dt})
     result = df.groupby("a").shift(0).iloc[0]
     expected = Series({"date": now_dt}, name=result.name)