Skip to content

Commit

Permalink
ENH: date_range support reso keyword (pandas-dev#49106)
Browse files Browse the repository at this point in the history
* ENH: date_range support reso keyword

* GH ref

* pyright ignore

* reso->unit

* raise if endpoints can't be cast losslessly

* add assertions

* mypy fixup

* example with unit

* typo fixup
  • Loading branch information
jbrockmendel authored and mliu08 committed Nov 27, 2022
1 parent 9639f2b commit 5433bf9
Show file tree
Hide file tree
Showing 8 changed files with 127 additions and 8 deletions.
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v2.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -61,8 +61,10 @@ Other enhancements
- :class:`.CategoricalConversionWarning`, :class:`.InvalidComparison`, :class:`.InvalidVersion`, :class:`.LossySetitemError`, and :class:`.NoBufferPresent` are now exposed in ``pandas.errors`` (:issue:`27656`)
- Fix ``test`` optional_extra by adding missing test package ``pytest-asyncio`` (:issue:`48361`)
- :func:`DataFrame.astype` exception message thrown improved to include column name when type conversion is not possible. (:issue:`47571`)
- :func:`date_range` now supports a ``unit`` keyword ("s", "ms", "us", or "ns") to specify the desired resolution of the output index (:issue:`49106`)
- :meth:`DataFrame.to_json` now supports a ``mode`` keyword with supported inputs 'w' and 'a'. Defaulting to 'w', 'a' can be used when lines=True and orient='records' to append record oriented json lines to an existing json file. (:issue:`35849`)
- Added ``name`` parameter to :meth:`IntervalIndex.from_breaks`, :meth:`IntervalIndex.from_arrays` and :meth:`IntervalIndex.from_tuples` (:issue:`48911`)
-

.. ---------------------------------------------------------------------------
.. _whatsnew_200.notable_bug_fixes:
Expand Down
2 changes: 1 addition & 1 deletion pandas/_libs/tslibs/dtypes.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ from pandas._libs.tslibs.np_datetime cimport NPY_DATETIMEUNIT


cpdef str npy_unit_to_abbrev(NPY_DATETIMEUNIT unit)
cdef NPY_DATETIMEUNIT abbrev_to_npy_unit(str abbrev)
cpdef NPY_DATETIMEUNIT abbrev_to_npy_unit(str abbrev)
cdef NPY_DATETIMEUNIT freq_group_code_to_npy_unit(int freq) nogil
cpdef int64_t periods_per_day(NPY_DATETIMEUNIT reso=*) except? -1
cpdef int64_t periods_per_second(NPY_DATETIMEUNIT reso) except? -1
Expand Down
1 change: 1 addition & 0 deletions pandas/_libs/tslibs/dtypes.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ def periods_per_second(reso: int) -> int: ...
def is_supported_unit(reso: int) -> bool: ...
def npy_unit_to_abbrev(reso: int) -> str: ...
def get_supported_reso(reso: int) -> int: ...
def abbrev_to_npy_unit(abbrev: str) -> int: ...

class PeriodDtypeBase:
_dtype_code: int # PeriodDtypeCode
Expand Down
2 changes: 1 addition & 1 deletion pandas/_libs/tslibs/dtypes.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -336,7 +336,7 @@ cpdef str npy_unit_to_abbrev(NPY_DATETIMEUNIT unit):
raise NotImplementedError(unit)


cdef NPY_DATETIMEUNIT abbrev_to_npy_unit(str abbrev):
cpdef NPY_DATETIMEUNIT abbrev_to_npy_unit(str abbrev):
if abbrev == "Y":
return NPY_DATETIMEUNIT.NPY_FR_Y
elif abbrev == "M":
Expand Down
19 changes: 17 additions & 2 deletions pandas/core/arrays/_ranges.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ def generate_regular_range(
end: Timestamp | Timedelta | None,
periods: int | None,
freq: BaseOffset,
unit: str = "ns",
) -> npt.NDArray[np.intp]:
"""
Generate a range of dates or timestamps with the spans between dates
Expand All @@ -37,14 +38,28 @@ def generate_regular_range(
Number of periods in produced date range.
freq : Tick
Describes space between dates in produced date range.
unit : str, default "ns"
The resolution the output is meant to represent.
Returns
-------
ndarray[np.int64] Representing nanoseconds.
ndarray[np.int64]
Representing the given resolution.
"""
istart = start.value if start is not None else None
iend = end.value if end is not None else None
stride = freq.nanos
freq.nanos # raises if non-fixed frequency
td = Timedelta(freq)
try:
td = td.as_unit( # pyright: ignore[reportGeneralTypeIssues]
unit, round_ok=False
)
except ValueError as err:
raise ValueError(
f"freq={freq} is incompatible with unit={unit}. "
"Use a lower freq or a higher unit instead."
) from err
stride = int(td.value)

if periods is None and istart is not None and iend is not None:
b = istart
Expand Down
28 changes: 24 additions & 4 deletions pandas/core/arrays/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
tz_convert_from_utc,
tzconversion,
)
from pandas._libs.tslibs.dtypes import abbrev_to_npy_unit
from pandas._typing import (
DateTimeErrorChoices,
IntervalClosedType,
Expand Down Expand Up @@ -380,6 +381,8 @@ def _generate_range( # type: ignore[override]
ambiguous: TimeAmbiguous = "raise",
nonexistent: TimeNonexistent = "raise",
inclusive: IntervalClosedType = "both",
*,
unit: str | None = None,
) -> DatetimeArray:

periods = dtl.validate_periods(periods)
Expand All @@ -402,6 +405,17 @@ def _generate_range( # type: ignore[override]
if start is NaT or end is NaT:
raise ValueError("Neither `start` nor `end` can be NaT")

if unit is not None:
if unit not in ["s", "ms", "us", "ns"]:
raise ValueError("'unit' must be one of 's', 'ms', 'us', 'ns'")
else:
unit = "ns"

if start is not None and unit is not None:
start = start.as_unit(unit, round_ok=False)
if end is not None and unit is not None:
end = end.as_unit(unit, round_ok=False)

left_inclusive, right_inclusive = validate_inclusive(inclusive)
start, end = _maybe_normalize_endpoints(start, end, normalize)
tz = _infer_tz_from_endpoints(start, end, tz)
Expand All @@ -416,6 +430,7 @@ def _generate_range( # type: ignore[override]
end = _maybe_localize_point(
end, end_tz, end, freq, tz, ambiguous, nonexistent
)

if freq is not None:
# We break Day arithmetic (fixed 24 hour) here and opt for
# Day to mean calendar day (23/24/25 hour). Therefore, strip
Expand All @@ -427,7 +442,7 @@ def _generate_range( # type: ignore[override]
end = end.tz_localize(None)

if isinstance(freq, Tick):
i8values = generate_regular_range(start, end, periods, freq)
i8values = generate_regular_range(start, end, periods, freq, unit=unit)
else:
xdr = _generate_range(
start=start, end=end, periods=periods, offset=freq
Expand All @@ -441,8 +456,13 @@ def _generate_range( # type: ignore[override]
if not timezones.is_utc(tz):
# short-circuit tz_localize_to_utc which would make
# an unnecessary copy with UTC but be a no-op.
creso = abbrev_to_npy_unit(unit)
i8values = tzconversion.tz_localize_to_utc(
i8values, tz, ambiguous=ambiguous, nonexistent=nonexistent
i8values,
tz,
ambiguous=ambiguous,
nonexistent=nonexistent,
creso=creso,
)

# i8values is localized datetime64 array -> have to convert
Expand Down Expand Up @@ -477,8 +497,8 @@ def _generate_range( # type: ignore[override]
if not right_inclusive and len(i8values) and i8values[-1] == end_i8:
i8values = i8values[:-1]

dt64_values = i8values.view("datetime64[ns]")
dtype = tz_to_dtype(tz)
dt64_values = i8values.view(f"datetime64[{unit}]")
dtype = tz_to_dtype(tz, unit=unit)
return cls._simple_new(dt64_values, freq=freq, dtype=dtype)

# -----------------------------------------------------------------
Expand Down
15 changes: 15 additions & 0 deletions pandas/core/indexes/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -818,6 +818,8 @@ def date_range(
normalize: bool = False,
name: Hashable = None,
inclusive: IntervalClosedType = "both",
*,
unit: str | None = None,
**kwargs,
) -> DatetimeIndex:
"""
Expand Down Expand Up @@ -856,6 +858,10 @@ def date_range(
Include boundaries; Whether to set each bound as closed or open.
.. versionadded:: 1.4.0
unit : str, default None
Specify the desired resolution of the result.
.. versionadded:: 2.0.0
**kwargs
For compatibility. Has no effect on the result.
Expand Down Expand Up @@ -966,6 +972,14 @@ def date_range(
>>> pd.date_range(start='2017-01-01', end='2017-01-04', inclusive='right')
DatetimeIndex(['2017-01-02', '2017-01-03', '2017-01-04'],
dtype='datetime64[ns]', freq='D')
**Specify a unit**
>>> pd.date_range(start="2017-01-01", periods=10, freq="100AS", unit="s")
DatetimeIndex(['2017-01-01', '2117-01-01', '2217-01-01', '2317-01-01',
'2417-01-01', '2517-01-01', '2617-01-01', '2717-01-01',
'2817-01-01', '2917-01-01'],
dtype='datetime64[s]', freq='100AS-JAN')
"""
if freq is None and com.any_none(periods, start, end):
freq = "D"
Expand All @@ -978,6 +992,7 @@ def date_range(
tz=tz,
normalize=normalize,
inclusive=inclusive,
unit=unit,
**kwargs,
)
return DatetimeIndex._simple_new(dtarr, name=name)
Expand Down
66 changes: 66 additions & 0 deletions pandas/tests/indexes/datetimes/test_date_range.py
Original file line number Diff line number Diff line change
Expand Up @@ -1184,3 +1184,69 @@ def test_date_range_with_custom_holidays():
freq=freq,
)
tm.assert_index_equal(result, expected)


class TestDateRangeNonNano:
    """Tests for the ``unit`` keyword of ``date_range`` (GH#49106)."""

    def test_date_range_reso_validation(self):
        """A unit outside 's', 'ms', 'us', 'ns' raises ValueError."""
        expected_msg = "'unit' must be one of 's', 'ms', 'us', 'ns'"
        with pytest.raises(ValueError, match=expected_msg):
            date_range("2016-01-01", "2016-03-04", periods=3, unit="h")

    def test_date_range_freq_higher_than_reso(self):
        """A freq finer than the requested unit raises ValueError."""
        # A "ns" step cannot be represented in "ms" without rounding.
        expected_msg = "Use a lower freq or a higher unit instead"
        with pytest.raises(ValueError, match=expected_msg):
            # TODO give a more useful or informative message?
            date_range("2016-01-01", "2016-01-02", freq="ns", unit="ms")

    def test_date_range_freq_matches_reso(self):
        """A freq equal to the requested unit is accepted (GH#49106)."""
        # Each case: (freq/unit, end label, numpy dtype, first i8 value,
        # exclusive i8 stop).  The stop is one past the end point so that
        # the inclusive end of the range is part of the expected values.
        cases = [
            (
                "ms",
                "2016-01-01 00:00:01",
                "M8[ms]",
                1_451_606_400_000,
                1_451_606_401_001,
            ),
            (
                "us",
                "2016-01-01 00:00:01",
                "M8[us]",
                1_451_606_400_000_000,
                1_451_606_401_000_001,
            ),
            (
                "ns",
                "2016-01-01 00:00:00.001",
                "M8[ns]",
                1_451_606_400_000_000_000,
                1_451_606_400_001_000_001,
            ),
        ]
        for unit, end_label, np_dtype, first_i8, stop_i8 in cases:
            result = date_range("2016-01-01", end_label, freq=unit, unit=unit)
            i8vals = np.arange(first_i8, stop_i8, dtype=np.int64)
            expected = DatetimeIndex(i8vals.view(np_dtype), freq=unit)
            tm.assert_index_equal(result, expected)

    def test_date_range_freq_lower_than_endpoints(self):
        """Endpoints that cannot be cast losslessly to the unit raise."""
        start = Timestamp("2022-10-19 11:50:44.719781")
        end = Timestamp("2022-10-19 11:50:47.066458")

        # Both endpoints carry sub-second precision, so casting them to "s"
        # would require lossy rounding; date_range refuses to do that.
        with pytest.raises(ValueError, match="Cannot losslessly convert units"):
            date_range(start, end, periods=3, unit="s")

        # Casting to "us" loses nothing, so this call succeeds.
        result = date_range(start, end, periods=2, unit="us")
        endpoint_i8 = [start.as_unit("us").value, end.as_unit("us").value]
        i8vals = np.array(endpoint_i8, dtype=np.int64)
        expected = DatetimeIndex(i8vals.view("M8[us]"))
        tm.assert_index_equal(result, expected)

    def test_date_range_non_nano(self):
        """A daily range in "s" unit between two far-apart dates."""
        start = np.datetime64("1066-10-14")  # Battle of Hastings
        end = np.datetime64("2305-07-13")  # Jean-Luc Picard's birthday

        result = date_range(start, end, freq="D", unit="s")
        assert result.freq == "D"
        assert result.dtype == "M8[s]"

        # Build the expected values by stepping one day's worth of seconds
        # from start; the stop is taken from ``end + 1`` so that ``end``
        # itself is included.
        first_i8 = start.astype("M8[s]").view("i8")
        stop_i8 = (end + 1).astype("M8[s]").view("i8")
        seconds_per_day = 24 * 3600
        expected = np.arange(first_i8, stop_i8, seconds_per_day).view("M8[s]")

        tm.assert_numpy_array_equal(result.to_numpy(), expected)

0 comments on commit 5433bf9

Please sign in to comment.