Skip to content

Commit

Permalink
"Backport PR #51978 on branch 2.0.x (BUG/API: preserve non-nano in factorize/unique)" (#52002)
Browse files Browse the repository at this point in the history
  • Loading branch information
jbrockmendel committed Mar 16, 2023
1 parent e28ba0e commit f184236
Show file tree
Hide file tree
Showing 8 changed files with 48 additions and 50 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -765,6 +765,7 @@ Other API changes
- Division by zero with :class:`ArrowDtype` dtypes returns ``-inf``, ``nan``, or ``inf`` depending on the numerator, instead of raising (:issue:`51541`)
- Added :func:`pandas.api.types.is_any_real_numeric_dtype` to check for real numeric dtypes (:issue:`51152`)
- :meth:`~arrays.ArrowExtensionArray.value_counts` now returns data with :class:`ArrowDtype` with ``pyarrow.int64`` type instead of ``"Int64"`` type (:issue:`51462`)
- :func:`factorize` and :func:`unique` preserve the original dtype when passed numpy timedelta64 or datetime64 with non-nanosecond resolution (:issue:`48670`)

.. note::

Expand Down
13 changes: 2 additions & 11 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@
from pandas.core.dtypes.cast import (
construct_1d_object_array_from_listlike,
infer_dtype_from_array,
sanitize_to_nanoseconds,
)
from pandas.core.dtypes.common import (
ensure_float64,
Expand All @@ -45,7 +44,6 @@
is_bool_dtype,
is_categorical_dtype,
is_complex_dtype,
is_datetime64_dtype,
is_extension_array_dtype,
is_float_dtype,
is_integer,
Expand All @@ -55,7 +53,6 @@
is_object_dtype,
is_scalar,
is_signed_integer_dtype,
is_timedelta64_dtype,
needs_i8_conversion,
)
from pandas.core.dtypes.concat import concat_compat
Expand Down Expand Up @@ -174,8 +171,6 @@ def _ensure_data(values: ArrayLike) -> np.ndarray:

# datetimelike
elif needs_i8_conversion(values.dtype):
if isinstance(values, np.ndarray):
values = sanitize_to_nanoseconds(values)
npvalues = values.view("i8")
npvalues = cast(np.ndarray, npvalues)
return npvalues
Expand Down Expand Up @@ -213,11 +208,6 @@ def _reconstruct_data(
values = cls._from_sequence(values, dtype=dtype)

else:
if is_datetime64_dtype(dtype):
dtype = np.dtype("datetime64[ns]")
elif is_timedelta64_dtype(dtype):
dtype = np.dtype("timedelta64[ns]")

values = values.astype(dtype, copy=False)

return values
Expand Down Expand Up @@ -768,7 +758,8 @@ def factorize(
codes, uniques = values.factorize(sort=sort)
return codes, uniques

elif not isinstance(values.dtype, np.dtype):
elif not isinstance(values, np.ndarray):
# i.e. ExtensionArray
codes, uniques = values.factorize(use_na_sentinel=use_na_sentinel)

else:
Expand Down
20 changes: 0 additions & 20 deletions pandas/core/dtypes/cast.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@
OutOfBoundsTimedelta,
Timedelta,
Timestamp,
astype_overflowsafe,
get_unit_from_dtype,
is_supported_unit,
)
Expand All @@ -50,8 +49,6 @@
)

from pandas.core.dtypes.common import (
DT64NS_DTYPE,
TD64NS_DTYPE,
ensure_int8,
ensure_int16,
ensure_int32,
Expand Down Expand Up @@ -1231,23 +1228,6 @@ def maybe_cast_to_datetime(
return dta


def sanitize_to_nanoseconds(values: np.ndarray, copy: bool = False) -> np.ndarray:
    """
    Return ``values`` with datetime64/timedelta64 data at nanosecond resolution.

    Parameters
    ----------
    values : np.ndarray
        Array to inspect. Only arrays whose dtype kind is "M" (datetime64)
        or "m" (timedelta64) and whose dtype is not already the nanosecond
        dtype are converted.
    copy : bool, default False
        Whether to return a copy when no conversion is needed.

    Returns
    -------
    np.ndarray
        The result of ``astype_overflowsafe`` when a conversion is needed;
        otherwise ``values`` itself, or a copy of it if ``copy`` is True.
    """
    # Pick the nanosecond dtype matching the array's kind, if it has one.
    kind = values.dtype.kind
    if kind == "M":
        target = DT64NS_DTYPE
    elif kind == "m":
        target = TD64NS_DTYPE
    else:
        target = None

    if target is not None and values.dtype != target:
        # Non-nanosecond datetimelike: cast through the overflow-safe helper.
        return astype_overflowsafe(values, dtype=target)

    # Already nanosecond, or not datetimelike at all: honor ``copy`` only.
    return values.copy() if copy else values


def _ensure_nanosecond_dtype(dtype: DtypeObj) -> None:
"""
Convert dtypes with granularity less than nanosecond to nanosecond
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/tools/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -307,7 +307,7 @@ def _convert_and_box_cache(
"""
from pandas import Series

result = Series(arg).map(cache_array)
result = Series(arg, dtype=cache_array.index.dtype).map(cache_array)
return _box_as_indexlike(result._values, utc=False, name=name)


Expand Down
18 changes: 18 additions & 0 deletions pandas/tests/indexes/datetimes/methods/test_factorize.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import numpy as np
import pytest

from pandas import (
DatetimeIndex,
Expand Down Expand Up @@ -105,3 +106,20 @@ def test_factorize_dst(self, index_or_series):
tm.assert_index_equal(res, idx)
if index_or_series is Index:
assert res.freq == idx.freq

@pytest.mark.parametrize("sort", [True, False])
def test_factorize_no_freq_non_nano(self, tz_naive_fixture, sort):
    # GH#51978 case that does not go through the fastpath based on
    # non-None freq
    shuffled = date_range(
        "2016-11-06", freq="H", periods=5, tz=tz_naive_fixture
    )[[0, 4, 1, 3, 2]]

    # The nanosecond result is the reference; uniques are compared after
    # being cast to second resolution.
    exp_codes, exp_uniques = shuffled.factorize(sort=sort)
    exp_uniques_s = exp_uniques.as_unit("s")

    # Check both the second-resolution Index and the Series built from it.
    for obj in (shuffled.as_unit("s"), shuffled.as_unit("s").to_series()):
        res_codes, res_uniques = obj.factorize(sort=sort)
        tm.assert_numpy_array_equal(res_codes, exp_codes)
        tm.assert_index_equal(res_uniques, exp_uniques_s)
4 changes: 2 additions & 2 deletions pandas/tests/io/parser/test_parse_dates.py
Original file line number Diff line number Diff line change
Expand Up @@ -1657,8 +1657,8 @@ def date_parser(dt, time):
datetimes = np.array(["2013-11-03T19:00:00"] * 3, dtype="datetime64[s]")
expected = DataFrame(
data={"rxstatus": ["00E80000"] * 3},
index=MultiIndex.from_tuples(
[(datetimes[0], 126), (datetimes[1], 23), (datetimes[2], 13)],
index=MultiIndex.from_arrays(
[datetimes, [126, 23, 13]],
names=["datetime", "prn"],
),
)
Expand Down
6 changes: 3 additions & 3 deletions pandas/tests/test_algos.py
Original file line number Diff line number Diff line change
Expand Up @@ -327,7 +327,7 @@ def test_object_factorize(self, writable):

def test_datetime64_factorize(self, writable):
# GH35650 Verify whether read-only datetime64 array can be factorized
data = np.array([np.datetime64("2020-01-01T00:00:00.000")])
data = np.array([np.datetime64("2020-01-01T00:00:00.000")], dtype="M8[ns]")
data.setflags(write=writable)
expected_codes = np.array([0], dtype=np.intp)
expected_uniques = np.array(
Expand Down Expand Up @@ -620,13 +620,13 @@ def test_datetime64_dtype_array_returned(self):
def test_datetime_non_ns(self):
a = np.array(["2000", "2000", "2001"], dtype="datetime64[s]")
result = pd.unique(a)
expected = np.array(["2000", "2001"], dtype="datetime64[ns]")
expected = np.array(["2000", "2001"], dtype="datetime64[s]")
tm.assert_numpy_array_equal(result, expected)

def test_timedelta_non_ns(self):
a = np.array(["2000", "2000", "2001"], dtype="timedelta64[s]")
result = pd.unique(a)
expected = np.array([2000000000000, 2001000000000], dtype="timedelta64[ns]")
expected = np.array([2000, 2001], dtype="timedelta64[s]")
tm.assert_numpy_array_equal(result, expected)

def test_timedelta64_dtype_array_returned(self):
Expand Down
34 changes: 21 additions & 13 deletions pandas/tests/tools/test_to_datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -1076,31 +1076,39 @@ def test_to_datetime_array_of_dt64s(self, cache, unit):
# Assuming all datetimes are in bounds, to_datetime() returns
# an array that is equal to Timestamp() parsing
result = to_datetime(dts, cache=cache)
expected = DatetimeIndex([Timestamp(x).asm8 for x in dts], dtype="M8[ns]")
if cache:
# FIXME: behavior should not depend on cache
expected = DatetimeIndex([Timestamp(x).asm8 for x in dts], dtype="M8[s]")
else:
expected = DatetimeIndex([Timestamp(x).asm8 for x in dts], dtype="M8[ns]")

tm.assert_index_equal(result, expected)

# A list of datetimes where the last one is out of bounds
dts_with_oob = dts + [np.datetime64("9999-01-01")]

msg = "Out of bounds nanosecond timestamp: 9999-01-01 00:00:00"
with pytest.raises(OutOfBoundsDatetime, match=msg):
to_datetime(dts_with_oob, errors="raise")
# As of GH#51978 we do not raise in this case
to_datetime(dts_with_oob, errors="raise")

tm.assert_index_equal(
to_datetime(dts_with_oob, errors="coerce", cache=cache),
DatetimeIndex(
result = to_datetime(dts_with_oob, errors="coerce", cache=cache)
if not cache:
# FIXME: shouldn't depend on cache!
expected = DatetimeIndex(
[Timestamp(dts_with_oob[0]).asm8, Timestamp(dts_with_oob[1]).asm8] * 30
+ [NaT],
),
)
)
else:
expected = DatetimeIndex(np.array(dts_with_oob, dtype="M8[s]"))
tm.assert_index_equal(result, expected)

# With errors='ignore', out of bounds datetime64s
# are converted to their .item(), which depending on the version of
# numpy is either a python datetime.datetime or datetime.date
tm.assert_index_equal(
to_datetime(dts_with_oob, errors="ignore", cache=cache),
Index(dts_with_oob),
)
result = to_datetime(dts_with_oob, errors="ignore", cache=cache)
if not cache:
# FIXME: shouldn't depend on cache!
expected = Index(dts_with_oob)
tm.assert_index_equal(result, expected)

def test_out_of_bounds_errors_ignore(self):
# https://github.com/pandas-dev/pandas/issues/50587
Expand Down

0 comments on commit f184236

Please sign in to comment.