From 5433bf9c3fdddb11c08711b60bb681ca6b1689f9 Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Fri, 18 Nov 2022 15:32:14 -0800
Subject: [PATCH] ENH: date_range support reso keyword (#49106)

* ENH: date_range support reso keyword

* GH ref

* pyright ignore

* reso->unit

* raise if endpoints cant cast losslessly

* add assertions

* mypy fixup

* example with unit

* typo fixup
---
 doc/source/whatsnew/v2.0.0.rst                |  2 +
 pandas/_libs/tslibs/dtypes.pxd                |  2 +-
 pandas/_libs/tslibs/dtypes.pyi                |  1 +
 pandas/_libs/tslibs/dtypes.pyx                |  2 +-
 pandas/core/arrays/_ranges.py                 | 19 +++++-
 pandas/core/arrays/datetimes.py               | 28 ++++++--
 pandas/core/indexes/datetimes.py              | 15 +++++
 .../indexes/datetimes/test_date_range.py      | 66 +++++++++++++++++++
 8 files changed, 127 insertions(+), 8 deletions(-)

diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
index dd0609d3b1f134..331c2421a31ef4 100644
--- a/doc/source/whatsnew/v2.0.0.rst
+++ b/doc/source/whatsnew/v2.0.0.rst
@@ -61,8 +61,10 @@ Other enhancements
 - :class:`.CategoricalConversionWarning`, :class:`.InvalidComparison`, :class:`.InvalidVersion`, :class:`.LossySetitemError`, and :class:`.NoBufferPresent` are now exposed in ``pandas.errors`` (:issue:`27656`)
 - Fix ``test`` optional_extra by adding missing test package ``pytest-asyncio`` (:issue:`48361`)
 - :func:`DataFrame.astype` exception message thrown improved to include column name when type conversion is not possible. (:issue:`47571`)
+- :func:`date_range` now supports a ``unit`` keyword ("s", "ms", "us", or "ns") to specify the desired resolution of the output index (:issue:`49106`)
 - :meth:`DataFrame.to_json` now supports a ``mode`` keyword with supported inputs 'w' and 'a'. Defaulting to 'w', 'a' can be used when lines=True and orient='records' to append record oriented json lines to an existing json file. (:issue:`35849`)
 - Added ``name`` parameter to :meth:`IntervalIndex.from_breaks`, :meth:`IntervalIndex.from_arrays` and :meth:`IntervalIndex.from_tuples` (:issue:`48911`)
+-
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_200.notable_bug_fixes:
diff --git a/pandas/_libs/tslibs/dtypes.pxd b/pandas/_libs/tslibs/dtypes.pxd
index 11b92447f5011b..3e3f206685d377 100644
--- a/pandas/_libs/tslibs/dtypes.pxd
+++ b/pandas/_libs/tslibs/dtypes.pxd
@@ -4,7 +4,7 @@ from pandas._libs.tslibs.np_datetime cimport NPY_DATETIMEUNIT
 
 
 cpdef str npy_unit_to_abbrev(NPY_DATETIMEUNIT unit)
-cdef NPY_DATETIMEUNIT abbrev_to_npy_unit(str abbrev)
+cpdef NPY_DATETIMEUNIT abbrev_to_npy_unit(str abbrev)
 cdef NPY_DATETIMEUNIT freq_group_code_to_npy_unit(int freq) nogil
 cpdef int64_t periods_per_day(NPY_DATETIMEUNIT reso=*) except? -1
 cpdef int64_t periods_per_second(NPY_DATETIMEUNIT reso) except? -1
diff --git a/pandas/_libs/tslibs/dtypes.pyi b/pandas/_libs/tslibs/dtypes.pyi
index a54db51136d077..b872241d79a544 100644
--- a/pandas/_libs/tslibs/dtypes.pyi
+++ b/pandas/_libs/tslibs/dtypes.pyi
@@ -10,6 +10,7 @@ def periods_per_second(reso: int) -> int: ...
 def is_supported_unit(reso: int) -> bool: ...
 def npy_unit_to_abbrev(reso: int) -> str: ...
 def get_supported_reso(reso: int) -> int: ...
+def abbrev_to_npy_unit(abbrev: str) -> int: ...
 
 class PeriodDtypeBase:
     _dtype_code: int  # PeriodDtypeCode
diff --git a/pandas/_libs/tslibs/dtypes.pyx b/pandas/_libs/tslibs/dtypes.pyx
index 357227de2fc2c8..2df5349f452728 100644
--- a/pandas/_libs/tslibs/dtypes.pyx
+++ b/pandas/_libs/tslibs/dtypes.pyx
@@ -336,7 +336,7 @@ cpdef str npy_unit_to_abbrev(NPY_DATETIMEUNIT unit):
         raise NotImplementedError(unit)
 
 
-cdef NPY_DATETIMEUNIT abbrev_to_npy_unit(str abbrev):
+cpdef NPY_DATETIMEUNIT abbrev_to_npy_unit(str abbrev):
     if abbrev == "Y":
         return NPY_DATETIMEUNIT.NPY_FR_Y
     elif abbrev == "M":
diff --git a/pandas/core/arrays/_ranges.py b/pandas/core/arrays/_ranges.py
index f50ce8a9ea8951..baf8470a866ffe 100644
--- a/pandas/core/arrays/_ranges.py
+++ b/pandas/core/arrays/_ranges.py
@@ -22,6 +22,7 @@ def generate_regular_range(
     end: Timestamp | Timedelta | None,
     periods: int | None,
     freq: BaseOffset,
+    unit: str = "ns",
 ) -> npt.NDArray[np.intp]:
     """
     Generate a range of dates or timestamps with the spans between dates
@@ -37,14 +38,28 @@ def generate_regular_range(
         Number of periods in produced date range.
     freq : Tick
         Describes space between dates in produced date range.
+    unit : str, default "ns"
+        The resolution the output is meant to represent.
 
     Returns
     -------
-    ndarray[np.int64] Representing nanoseconds.
+    ndarray[np.int64]
+        Representing the given resolution.
     """
     istart = start.value if start is not None else None
     iend = end.value if end is not None else None
-    stride = freq.nanos
+    freq.nanos  # raises if non-fixed frequency
+    td = Timedelta(freq)
+    try:
+        td = td.as_unit(  # pyright: ignore[reportGeneralTypeIssues]
+            unit, round_ok=False
+        )
+    except ValueError as err:
+        raise ValueError(
+            f"freq={freq} is incompatible with unit={unit}. "
+            "Use a lower freq or a higher unit instead."
+        ) from err
+    stride = int(td.value)
 
     if periods is None and istart is not None and iend is not None:
         b = istart
diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py
index ed0a7df41c28d5..4ea6b9ceee833c 100644
--- a/pandas/core/arrays/datetimes.py
+++ b/pandas/core/arrays/datetimes.py
@@ -42,6 +42,7 @@
     tz_convert_from_utc,
     tzconversion,
 )
+from pandas._libs.tslibs.dtypes import abbrev_to_npy_unit
 from pandas._typing import (
     DateTimeErrorChoices,
     IntervalClosedType,
@@ -380,6 +381,8 @@ def _generate_range(  # type: ignore[override]
         ambiguous: TimeAmbiguous = "raise",
         nonexistent: TimeNonexistent = "raise",
         inclusive: IntervalClosedType = "both",
+        *,
+        unit: str | None = None,
     ) -> DatetimeArray:
 
         periods = dtl.validate_periods(periods)
@@ -402,6 +405,17 @@ def _generate_range(  # type: ignore[override]
         if start is NaT or end is NaT:
             raise ValueError("Neither `start` nor `end` can be NaT")
 
+        if unit is not None:
+            if unit not in ["s", "ms", "us", "ns"]:
+                raise ValueError("'unit' must be one of 's', 'ms', 'us', 'ns'")
+        else:
+            unit = "ns"
+
+        if start is not None and unit is not None:
+            start = start.as_unit(unit, round_ok=False)
+        if end is not None and unit is not None:
+            end = end.as_unit(unit, round_ok=False)
+
         left_inclusive, right_inclusive = validate_inclusive(inclusive)
         start, end = _maybe_normalize_endpoints(start, end, normalize)
         tz = _infer_tz_from_endpoints(start, end, tz)
@@ -416,6 +430,7 @@ def _generate_range(  # type: ignore[override]
             end = _maybe_localize_point(
                 end, end_tz, end, freq, tz, ambiguous, nonexistent
             )
+
         if freq is not None:
             # We break Day arithmetic (fixed 24 hour) here and opt for
             # Day to mean calendar day (23/24/25 hour). Therefore, strip
@@ -427,7 +442,7 @@ def _generate_range(  # type: ignore[override]
                     end = end.tz_localize(None)
 
             if isinstance(freq, Tick):
-                i8values = generate_regular_range(start, end, periods, freq)
+                i8values = generate_regular_range(start, end, periods, freq, unit=unit)
             else:
                 xdr = _generate_range(
                     start=start, end=end, periods=periods, offset=freq
@@ -441,8 +456,13 @@ def _generate_range(  # type: ignore[override]
             if not timezones.is_utc(tz):
                 # short-circuit tz_localize_to_utc which would make
                 # an unnecessary copy with UTC but be a no-op.
+                creso = abbrev_to_npy_unit(unit)
                 i8values = tzconversion.tz_localize_to_utc(
-                    i8values, tz, ambiguous=ambiguous, nonexistent=nonexistent
+                    i8values,
+                    tz,
+                    ambiguous=ambiguous,
+                    nonexistent=nonexistent,
+                    creso=creso,
                 )
 
             # i8values is localized datetime64 array -> have to convert
@@ -477,8 +497,8 @@ def _generate_range(  # type: ignore[override]
         if not right_inclusive and len(i8values) and i8values[-1] == end_i8:
             i8values = i8values[:-1]
 
-        dt64_values = i8values.view("datetime64[ns]")
-        dtype = tz_to_dtype(tz)
+        dt64_values = i8values.view(f"datetime64[{unit}]")
+        dtype = tz_to_dtype(tz, unit=unit)
         return cls._simple_new(dt64_values, freq=freq, dtype=dtype)
 
     # -----------------------------------------------------------------
diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py
index ddfcafae3d8c13..57cbc76d1c2fa9 100644
--- a/pandas/core/indexes/datetimes.py
+++ b/pandas/core/indexes/datetimes.py
@@ -818,6 +818,8 @@ def date_range(
     normalize: bool = False,
     name: Hashable = None,
     inclusive: IntervalClosedType = "both",
+    *,
+    unit: str | None = None,
     **kwargs,
 ) -> DatetimeIndex:
     """
@@ -856,6 +858,10 @@ def date_range(
         Include boundaries; Whether to set each bound as closed or open.
 
         .. versionadded:: 1.4.0
+    unit : str, default None
+        Specify the desired resolution of the result.
+
+        .. versionadded:: 2.0.0
     **kwargs
         For compatibility. Has no effect on the result.
 
@@ -966,6 +972,14 @@ def date_range(
     >>> pd.date_range(start='2017-01-01', end='2017-01-04', inclusive='right')
     DatetimeIndex(['2017-01-02', '2017-01-03', '2017-01-04'],
                   dtype='datetime64[ns]', freq='D')
+
+    **Specify a unit**
+
+    >>> pd.date_range(start="2017-01-01", periods=10, freq="100AS", unit="s")
+    DatetimeIndex(['2017-01-01', '2117-01-01', '2217-01-01', '2317-01-01',
+                   '2417-01-01', '2517-01-01', '2617-01-01', '2717-01-01',
+                   '2817-01-01', '2917-01-01'],
+                  dtype='datetime64[s]', freq='100AS-JAN')
     """
     if freq is None and com.any_none(periods, start, end):
         freq = "D"
@@ -978,6 +992,7 @@ def date_range(
         tz=tz,
         normalize=normalize,
         inclusive=inclusive,
+        unit=unit,
         **kwargs,
     )
     return DatetimeIndex._simple_new(dtarr, name=name)
diff --git a/pandas/tests/indexes/datetimes/test_date_range.py b/pandas/tests/indexes/datetimes/test_date_range.py
index adbf6c715fef65..e90f9fb2b5e363 100644
--- a/pandas/tests/indexes/datetimes/test_date_range.py
+++ b/pandas/tests/indexes/datetimes/test_date_range.py
@@ -1184,3 +1184,69 @@ def test_date_range_with_custom_holidays():
         freq=freq,
     )
     tm.assert_index_equal(result, expected)
+
+
+class TestDateRangeNonNano:
+    def test_date_range_reso_validation(self):
+        msg = "'unit' must be one of 's', 'ms', 'us', 'ns'"
+        with pytest.raises(ValueError, match=msg):
+            date_range("2016-01-01", "2016-03-04", periods=3, unit="h")
+
+    def test_date_range_freq_higher_than_reso(self):
+        # freq being higher-resolution than reso is a problem
+        msg = "Use a lower freq or a higher unit instead"
+        with pytest.raises(ValueError, match=msg):
+            # # TODO give a more useful or informative message?
+            date_range("2016-01-01", "2016-01-02", freq="ns", unit="ms")
+
+    def test_date_range_freq_matches_reso(self):
+        # GH#49106 matching reso is OK
+        dti = date_range("2016-01-01", "2016-01-01 00:00:01", freq="ms", unit="ms")
+        rng = np.arange(1_451_606_400_000, 1_451_606_401_001, dtype=np.int64)
+        expected = DatetimeIndex(rng.view("M8[ms]"), freq="ms")
+        tm.assert_index_equal(dti, expected)
+
+        dti = date_range("2016-01-01", "2016-01-01 00:00:01", freq="us", unit="us")
+        rng = np.arange(1_451_606_400_000_000, 1_451_606_401_000_001, dtype=np.int64)
+        expected = DatetimeIndex(rng.view("M8[us]"), freq="us")
+        tm.assert_index_equal(dti, expected)
+
+        dti = date_range("2016-01-01", "2016-01-01 00:00:00.001", freq="ns", unit="ns")
+        rng = np.arange(
+            1_451_606_400_000_000_000, 1_451_606_400_001_000_001, dtype=np.int64
+        )
+        expected = DatetimeIndex(rng.view("M8[ns]"), freq="ns")
+        tm.assert_index_equal(dti, expected)
+
+    def test_date_range_freq_lower_than_endpoints(self):
+        start = Timestamp("2022-10-19 11:50:44.719781")
+        end = Timestamp("2022-10-19 11:50:47.066458")
+
+        # start and end cannot be cast to "s" unit without lossy rounding,
+        # so we do not allow this in date_range
+        with pytest.raises(ValueError, match="Cannot losslessly convert units"):
+            date_range(start, end, periods=3, unit="s")
+
+        # but we can losslessly cast to "us"
+        dti = date_range(start, end, periods=2, unit="us")
+        rng = np.array(
+            [start.as_unit("us").value, end.as_unit("us").value], dtype=np.int64
+        )
+        expected = DatetimeIndex(rng.view("M8[us]"))
+        tm.assert_index_equal(dti, expected)
+
+    def test_date_range_non_nano(self):
+        start = np.datetime64("1066-10-14")  # Battle of Hastings
+        end = np.datetime64("2305-07-13")  # Jean-Luc Picard's birthday
+
+        dti = date_range(start, end, freq="D", unit="s")
+        assert dti.freq == "D"
+        assert dti.dtype == "M8[s]"
+
+        exp = np.arange(
+            start.astype("M8[s]").view("i8"),
+            (end + 1).astype("M8[s]").view("i8"),
+            24 * 3600,
+        ).view("M8[s]")
+
+        tm.assert_numpy_array_equal(dti.to_numpy(), exp)
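
As a quick illustration of the new keyword (not part of the patch above), here is a minimal usage sketch, assuming a pandas build that includes this change; the behavior shown in the comments follows the tests added in test_date_range.py and the error message added in _ranges.py.

    import pandas as pd

    # Ask date_range for a second-resolution index; without unit= the
    # result defaults to nanosecond resolution (datetime64[ns]).
    dti = pd.date_range("2017-01-01", periods=4, freq="D", unit="s")
    print(dti.dtype)  # datetime64[s]

    # A nanosecond freq cannot be represented losslessly at millisecond
    # resolution, so this raises with the ValueError introduced above,
    # roughly: "freq=... is incompatible with unit=ms. Use a lower freq
    # or a higher unit instead."
    try:
        pd.date_range("2016-01-01", "2016-01-02", freq="ns", unit="ms")
    except ValueError as err:
        print(err)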