Skip to content

Commit

Permalink
BUG: DataFrame from dict with non-nano Timedelta (pandas-dev#48901)
Browse files Browse the repository at this point in the history
* BUG: DataFrame from dict with non-nano Timedelta

* fix __hash__

* test for both hash cases
  • Loading branch information
jbrockmendel authored and mliu08 committed Nov 27, 2022
1 parent 5433bf9 commit 1510982
Show file tree
Hide file tree
Showing 4 changed files with 71 additions and 20 deletions.
1 change: 1 addition & 0 deletions pandas/_libs/tslibs/timedeltas.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ cdef class _Timedelta(timedelta):

cpdef timedelta to_pytimedelta(_Timedelta self)
cdef bint _has_ns(self)
cdef bint _is_in_pytimedelta_bounds(self)
cdef _ensure_components(_Timedelta self)
cdef inline bint _compare_mismatched_resos(self, _Timedelta other, op)
cdef _Timedelta _as_creso(self, NPY_DATETIMEUNIT reso, bint round_ok=*)
Expand Down
28 changes: 27 additions & 1 deletion pandas/_libs/tslibs/timedeltas.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1093,8 +1093,27 @@ cdef class _Timedelta(timedelta):
# non-invariant behavior.
# see GH#44504
return hash(self.value)
else:
elif self._is_in_pytimedelta_bounds() and (
self._creso == NPY_FR_ns or self._creso == NPY_DATETIMEUNIT.NPY_FR_us
):
# If we can defer to timedelta.__hash__, do so, as that
# ensures the hash is invariant to our _reso.
# We can only defer for ns and us, as for these two resos we
# call _Timedelta.__new__ with the correct input in
# _timedelta_from_value_and_reso; so timedelta.__hash__
# will be correct
return timedelta.__hash__(self)
else:
# We want to ensure that two equivalent Timedelta objects
# have the same hash. So we try downcasting to the next-lowest
# resolution.
try:
obj = (<_Timedelta>self)._as_creso(<NPY_DATETIMEUNIT>(self._creso + 1))
except OverflowError:
# Doesn't fit, so we're off the hook
return hash(self.value)
else:
return hash(obj)

def __richcmp__(_Timedelta self, object other, int op):
cdef:
Expand Down Expand Up @@ -1152,6 +1171,13 @@ cdef class _Timedelta(timedelta):
else:
raise NotImplementedError(self._creso)

cdef bint _is_in_pytimedelta_bounds(self):
"""
Check if we are within the bounds of datetime.timedelta.
"""
self._ensure_components()
return -999999999 <= self._d and self._d <= 999999999

cdef _ensure_components(_Timedelta self):
"""
compute the components
Expand Down
33 changes: 14 additions & 19 deletions pandas/tests/frame/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -846,30 +846,19 @@ def create_data(constructor):
tm.assert_frame_equal(result_Timestamp, expected)

@pytest.mark.parametrize(
"klass",
"klass,name",
[
pytest.param(
np.timedelta64,
marks=pytest.mark.xfail(
reason="hash mismatch (GH#44504) causes lib.fast_multiget "
"to mess up on dict lookups with equal Timedeltas with "
"mismatched resos"
),
),
timedelta,
Timedelta,
(lambda x: np.timedelta64(x, "D"), "timedelta64"),
(lambda x: timedelta(days=x), "pytimedelta"),
(lambda x: Timedelta(x, "D"), "Timedelta[ns]"),
(lambda x: Timedelta(x, "D").as_unit("s"), "Timedelta[s]"),
],
)
def test_constructor_dict_timedelta64_index(self, klass):
def test_constructor_dict_timedelta64_index(self, klass, name):
# GH 10160
td_as_int = [1, 2, 3, 4]

if klass is timedelta:
constructor = lambda x: timedelta(days=x)
else:
constructor = lambda x: klass(x, "D")

data = {i: {constructor(s): 2 * i} for i, s in enumerate(td_as_int)}
data = {i: {klass(s): 2 * i} for i, s in enumerate(td_as_int)}

expected = DataFrame(
[
Expand All @@ -881,7 +870,13 @@ def test_constructor_dict_timedelta64_index(self, klass):
index=[Timedelta(td, "D") for td in td_as_int],
)

result = DataFrame(data)
if name == "Timedelta[s]":
# TODO(2.0): passing index here shouldn't be necessary, is for now
# otherwise we raise in _extract_index
result = DataFrame(data, index=expected.index)
else:
result = DataFrame(data)

tm.assert_frame_equal(result, expected)

def test_constructor_period_dict(self):
Expand Down
29 changes: 29 additions & 0 deletions pandas/tests/libs/test_lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import pytest

from pandas._libs import (
Timedelta,
lib,
writers as libwriters,
)
Expand Down Expand Up @@ -42,6 +43,34 @@ def test_fast_unique_multiple_list_gen_sort(self):
out = lib.fast_unique_multiple_list_gen(gen, sort=False)
tm.assert_numpy_array_equal(np.array(out), expected)

def test_fast_multiget_timedelta_resos(self):
# This will become relevant for test_constructor_dict_timedelta64_index
# once Timedelta constructor preserves reso when passed a
# np.timedelta64 object
td = Timedelta(days=1)

mapping1 = {td: 1}
mapping2 = {td.as_unit("s"): 1}

oindex = Index([td * n for n in range(3)])._values.astype(object)

expected = lib.fast_multiget(mapping1, oindex)
result = lib.fast_multiget(mapping2, oindex)
tm.assert_numpy_array_equal(result, expected)

# case that can't be cast to td64ns
td = Timedelta(np.timedelta64(400, "Y"))
assert hash(td) == hash(td.as_unit("ms"))
assert hash(td) == hash(td.as_unit("us"))
mapping1 = {td: 1}
mapping2 = {td.as_unit("ms"): 1}

oindex = Index([td * n for n in range(3)])._values.astype(object)

expected = lib.fast_multiget(mapping1, oindex)
result = lib.fast_multiget(mapping2, oindex)
tm.assert_numpy_array_equal(result, expected)


class TestIndexing:
def test_maybe_indices_to_slice_left_edge(self):
Expand Down

0 comments on commit 1510982

Please sign in to comment.