ENH: date_range support reso keyword (pandas-dev#49106)

* ENH: date_range support reso keyword * GH ref * pyright ignore * reso->unit * raise if endpoints cant cast losslessly * add assertions * mypy fixup * example with unit * typo fixup
mliu08 · Nov 27, 2022 · 5433bf9 · 5433bf9
1 parent 9639f2b
commit 5433bf9
Show file tree

Hide file tree

Showing 8 changed files with 127 additions and 8 deletions.
diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
@@ -61,8 +61,10 @@ Other enhancements
 - :class:`.CategoricalConversionWarning`, :class:`.InvalidComparison`, :class:`.InvalidVersion`, :class:`.LossySetitemError`, and :class:`.NoBufferPresent` are now exposed in ``pandas.errors`` (:issue:`27656`)
 - Fix ``test`` optional_extra by adding missing test package ``pytest-asyncio`` (:issue:`48361`)
 - :func:`DataFrame.astype` exception message thrown improved to include column name when type conversion is not possible. (:issue:`47571`)
+- :func:`date_range` now supports a ``unit`` keyword ("s", "ms", "us", or "ns") to specify the desired resolution of the output index (:issue:`49106`)
 - :meth:`DataFrame.to_json` now supports a ``mode`` keyword with supported inputs 'w' and 'a'. Defaulting to 'w', 'a' can be used when lines=True and orient='records' to append record oriented json lines to an existing json file. (:issue:`35849`)
 - Added ``name`` parameter to :meth:`IntervalIndex.from_breaks`, :meth:`IntervalIndex.from_arrays` and :meth:`IntervalIndex.from_tuples` (:issue:`48911`)
+-
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_200.notable_bug_fixes:

diff --git a/pandas/_libs/tslibs/dtypes.pxd b/pandas/_libs/tslibs/dtypes.pxd
@@ -4,7 +4,7 @@ from pandas._libs.tslibs.np_datetime cimport NPY_DATETIMEUNIT
 
 
 cpdef str npy_unit_to_abbrev(NPY_DATETIMEUNIT unit)
-cdef NPY_DATETIMEUNIT abbrev_to_npy_unit(str abbrev)
+cpdef NPY_DATETIMEUNIT abbrev_to_npy_unit(str abbrev)
 cdef NPY_DATETIMEUNIT freq_group_code_to_npy_unit(int freq) nogil
 cpdef int64_t periods_per_day(NPY_DATETIMEUNIT reso=*) except? -1
 cpdef int64_t periods_per_second(NPY_DATETIMEUNIT reso) except? -1

diff --git a/pandas/_libs/tslibs/dtypes.pyi b/pandas/_libs/tslibs/dtypes.pyi
@@ -10,6 +10,7 @@ def periods_per_second(reso: int) -> int: ...
 def is_supported_unit(reso: int) -> bool: ...
 def npy_unit_to_abbrev(reso: int) -> str: ...
 def get_supported_reso(reso: int) -> int: ...
+def abbrev_to_npy_unit(abbrev: str) -> int: ...
 
 class PeriodDtypeBase:
     _dtype_code: int  # PeriodDtypeCode

diff --git a/pandas/_libs/tslibs/dtypes.pyx b/pandas/_libs/tslibs/dtypes.pyx
@@ -336,7 +336,7 @@ cpdef str npy_unit_to_abbrev(NPY_DATETIMEUNIT unit):
         raise NotImplementedError(unit)
 
 
-cdef NPY_DATETIMEUNIT abbrev_to_npy_unit(str abbrev):
+cpdef NPY_DATETIMEUNIT abbrev_to_npy_unit(str abbrev):
     if abbrev == "Y":
         return NPY_DATETIMEUNIT.NPY_FR_Y
     elif abbrev == "M":

diff --git a/pandas/core/arrays/_ranges.py b/pandas/core/arrays/_ranges.py
@@ -22,6 +22,7 @@ def generate_regular_range(
     end: Timestamp | Timedelta | None,
     periods: int | None,
     freq: BaseOffset,
+    unit: str = "ns",
 ) -> npt.NDArray[np.intp]:
     """
     Generate a range of dates or timestamps with the spans between dates
@@ -37,14 +38,28 @@ def generate_regular_range(
         Number of periods in produced date range.
     freq : Tick
         Describes space between dates in produced date range.
+    unit : str, default "ns"
+        The resolution the output is meant to represent.
 
     Returns
     -------
-    ndarray[np.int64] Representing nanoseconds.
+    ndarray[np.int64]
+        Representing the given resolution.
     """
     istart = start.value if start is not None else None
     iend = end.value if end is not None else None
-    stride = freq.nanos
+    freq.nanos  # raises if non-fixed frequency
+    td = Timedelta(freq)
+    try:
+        td = td.as_unit(  # pyright: ignore[reportGeneralTypeIssues]
+            unit, round_ok=False
+        )
+    except ValueError as err:
+        raise ValueError(
+            f"freq={freq} is incompatible with unit={unit}. "
+            "Use a lower freq or a higher unit instead."
+        ) from err
+    stride = int(td.value)
 
     if periods is None and istart is not None and iend is not None:
         b = istart

diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py
@@ -42,6 +42,7 @@
     tz_convert_from_utc,
     tzconversion,
 )
+from pandas._libs.tslibs.dtypes import abbrev_to_npy_unit
 from pandas._typing import (
     DateTimeErrorChoices,
     IntervalClosedType,
@@ -380,6 +381,8 @@ def _generate_range(  # type: ignore[override]
         ambiguous: TimeAmbiguous = "raise",
         nonexistent: TimeNonexistent = "raise",
         inclusive: IntervalClosedType = "both",
+        *,
+        unit: str | None = None,
     ) -> DatetimeArray:
 
         periods = dtl.validate_periods(periods)
@@ -402,6 +405,17 @@ def _generate_range(  # type: ignore[override]
         if start is NaT or end is NaT:
             raise ValueError("Neither `start` nor `end` can be NaT")
 
+        if unit is not None:
+            if unit not in ["s", "ms", "us", "ns"]:
+                raise ValueError("'unit' must be one of 's', 'ms', 'us', 'ns'")
+        else:
+            unit = "ns"
+
+        if start is not None and unit is not None:
+            start = start.as_unit(unit, round_ok=False)
+        if end is not None and unit is not None:
+            end = end.as_unit(unit, round_ok=False)
+
         left_inclusive, right_inclusive = validate_inclusive(inclusive)
         start, end = _maybe_normalize_endpoints(start, end, normalize)
         tz = _infer_tz_from_endpoints(start, end, tz)
@@ -416,6 +430,7 @@ def _generate_range(  # type: ignore[override]
             end = _maybe_localize_point(
                 end, end_tz, end, freq, tz, ambiguous, nonexistent
             )
+
         if freq is not None:
             # We break Day arithmetic (fixed 24 hour) here and opt for
             # Day to mean calendar day (23/24/25 hour). Therefore, strip
@@ -427,7 +442,7 @@ def _generate_range(  # type: ignore[override]
                     end = end.tz_localize(None)
 
             if isinstance(freq, Tick):
-                i8values = generate_regular_range(start, end, periods, freq)
+                i8values = generate_regular_range(start, end, periods, freq, unit=unit)
             else:
                 xdr = _generate_range(
                     start=start, end=end, periods=periods, offset=freq
@@ -441,8 +456,13 @@ def _generate_range(  # type: ignore[override]
                 if not timezones.is_utc(tz):
                     # short-circuit tz_localize_to_utc which would make
                     #  an unnecessary copy with UTC but be a no-op.
+                    creso = abbrev_to_npy_unit(unit)
                     i8values = tzconversion.tz_localize_to_utc(
-                        i8values, tz, ambiguous=ambiguous, nonexistent=nonexistent
+                        i8values,
+                        tz,
+                        ambiguous=ambiguous,
+                        nonexistent=nonexistent,
+                        creso=creso,
                     )
 
                 # i8values is localized datetime64 array -> have to convert
@@ -477,8 +497,8 @@ def _generate_range(  # type: ignore[override]
                 if not right_inclusive and len(i8values) and i8values[-1] == end_i8:
                     i8values = i8values[:-1]
 
-        dt64_values = i8values.view("datetime64[ns]")
-        dtype = tz_to_dtype(tz)
+        dt64_values = i8values.view(f"datetime64[{unit}]")
+        dtype = tz_to_dtype(tz, unit=unit)
         return cls._simple_new(dt64_values, freq=freq, dtype=dtype)
 
     # -----------------------------------------------------------------

diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py
@@ -818,6 +818,8 @@ def date_range(
     normalize: bool = False,
     name: Hashable = None,
     inclusive: IntervalClosedType = "both",
+    *,
+    unit: str | None = None,
     **kwargs,
 ) -> DatetimeIndex:
     """
@@ -856,6 +858,10 @@ def date_range(
         Include boundaries; Whether to set each bound as closed or open.
 
         .. versionadded:: 1.4.0
+    unit : str, default None
+        Specify the desired resolution of the result.
+
+        .. versionadded:: 2.0.0
     **kwargs
         For compatibility. Has no effect on the result.
 
@@ -966,6 +972,14 @@ def date_range(
     >>> pd.date_range(start='2017-01-01', end='2017-01-04', inclusive='right')
     DatetimeIndex(['2017-01-02', '2017-01-03', '2017-01-04'],
                   dtype='datetime64[ns]', freq='D')
+
+    **Specify a unit**
+
+    >>> pd.date_range(start="2017-01-01", periods=10, freq="100AS", unit="s")
+    DatetimeIndex(['2017-01-01', '2117-01-01', '2217-01-01', '2317-01-01',
+                   '2417-01-01', '2517-01-01', '2617-01-01', '2717-01-01',
+                   '2817-01-01', '2917-01-01'],
+                  dtype='datetime64[s]', freq='100AS-JAN')
     """
     if freq is None and com.any_none(periods, start, end):
         freq = "D"
@@ -978,6 +992,7 @@ def date_range(
         tz=tz,
         normalize=normalize,
         inclusive=inclusive,
+        unit=unit,
         **kwargs,
     )
     return DatetimeIndex._simple_new(dtarr, name=name)

diff --git a/pandas/tests/indexes/datetimes/test_date_range.py b/pandas/tests/indexes/datetimes/test_date_range.py
@@ -1184,3 +1184,69 @@ def test_date_range_with_custom_holidays():
         freq=freq,
     )
     tm.assert_index_equal(result, expected)
+
+
+class TestDateRangeNonNano:
+    def test_date_range_reso_validation(self):
+        msg = "'unit' must be one of 's', 'ms', 'us', 'ns'"
+        with pytest.raises(ValueError, match=msg):
+            date_range("2016-01-01", "2016-03-04", periods=3, unit="h")
+
+    def test_date_range_freq_higher_than_reso(self):
+        # freq being higher-resolution than reso is a problem
+        msg = "Use a lower freq or a higher unit instead"
+        with pytest.raises(ValueError, match=msg):
+            #    # TODO give a more useful or informative message?
+            date_range("2016-01-01", "2016-01-02", freq="ns", unit="ms")
+
+    def test_date_range_freq_matches_reso(self):
+        # GH#49106 matching reso is OK
+        dti = date_range("2016-01-01", "2016-01-01 00:00:01", freq="ms", unit="ms")
+        rng = np.arange(1_451_606_400_000, 1_451_606_401_001, dtype=np.int64)
+        expected = DatetimeIndex(rng.view("M8[ms]"), freq="ms")
+        tm.assert_index_equal(dti, expected)
+
+        dti = date_range("2016-01-01", "2016-01-01 00:00:01", freq="us", unit="us")
+        rng = np.arange(1_451_606_400_000_000, 1_451_606_401_000_001, dtype=np.int64)
+        expected = DatetimeIndex(rng.view("M8[us]"), freq="us")
+        tm.assert_index_equal(dti, expected)
+
+        dti = date_range("2016-01-01", "2016-01-01 00:00:00.001", freq="ns", unit="ns")
+        rng = np.arange(
+            1_451_606_400_000_000_000, 1_451_606_400_001_000_001, dtype=np.int64
+        )
+        expected = DatetimeIndex(rng.view("M8[ns]"), freq="ns")
+        tm.assert_index_equal(dti, expected)
+
+    def test_date_range_freq_lower_than_endpoints(self):
+        start = Timestamp("2022-10-19 11:50:44.719781")
+        end = Timestamp("2022-10-19 11:50:47.066458")
+
+        # start and end cannot be cast to "s" unit without lossy rounding,
+        #  so we do not allow this in date_range
+        with pytest.raises(ValueError, match="Cannot losslessly convert units"):
+            date_range(start, end, periods=3, unit="s")
+
+        # but we can losslessly cast to "us"
+        dti = date_range(start, end, periods=2, unit="us")
+        rng = np.array(
+            [start.as_unit("us").value, end.as_unit("us").value], dtype=np.int64
+        )
+        expected = DatetimeIndex(rng.view("M8[us]"))
+        tm.assert_index_equal(dti, expected)
+
+    def test_date_range_non_nano(self):
+        start = np.datetime64("1066-10-14")  # Battle of Hastings
+        end = np.datetime64("2305-07-13")  # Jean-Luc Picard's birthday
+
+        dti = date_range(start, end, freq="D", unit="s")
+        assert dti.freq == "D"
+        assert dti.dtype == "M8[s]"
+
+        exp = np.arange(
+            start.astype("M8[s]").view("i8"),
+            (end + 1).astype("M8[s]").view("i8"),
+            24 * 3600,
+        ).view("M8[s]")
+
+        tm.assert_numpy_array_equal(dti.to_numpy(), exp)