Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: preserve dtype for right/outer merge of datetime with different resolutions #53233

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions pandas/core/reshape/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -994,6 +994,14 @@ def _maybe_add_join_keys(
else:
key_col = Index(lvals).where(~mask_left, rvals)
result_dtype = find_common_type([lvals.dtype, rvals.dtype])
if (
lvals.dtype.kind == "M"
and rvals.dtype.kind == "M"
and result_dtype.kind == "O"
):
# TODO(non-nano) Workaround for common_type not dealing
# with different resolutions
result_dtype = key_col.dtype
Comment on lines +997 to +1004
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is an ugly band-aid solution: the proper fix is to handle this in find_common_type. But I am not sure whether changing find_common_type would be in scope for a bug fix release. (xref #46587 (comment))

cc @jbrockmendel

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Changing find_common_type would affect concat, so I'm hesitant to call it a bugfix. But avoiding object may be nicer behavior, so I'd be open to it as an API change longer-term.

Do we know that both are tz-naive at this point? If one is tz-aware then object is correct (though I guess we should be able to determine that the merge will be empty?)

I've given a little bit of thought to something similar in the context of Index.get_indexer with mismatched resolutions. Instead of converting both to object, we could find a fill_value not present in left and convert right to left.unit, filling non-convertible entries with that fill_value. The "finding a fill_value" part seems really hacky though. (Update: when I wrote this paragraph I thought this chunk of code was in get_merge_keys.)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we know that both are tz-naive at this point? If one is tz-aware then object is correct (though I guess we should be able to determine that the merge will be empty?)

I think so, yes. Normally we should raise an error earlier in the code if you try to merge on a tz-aware and a tz-naive column:

In [1]: df1 = pd.DataFrame({"key": [pd.Timestamp("1970-01-01")], "a": [1]})

In [2]: df2 = pd.DataFrame({"key": [pd.Timestamp("1970-01-01", tz="Europe/Brussels")], "a": [1]})

In [3]: pd.merge(df1, df2)
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Input In [3], in <cell line: 1>()
----> 1 pd.merge(df1, df2)

File ~/scipy/pandas/pandas/core/reshape/merge.py:146, in merge(left, right, how, on, left_on, right_on, left_index, right_index, sort, suffixes, copy, indicator, validate)
    129 @Substitution("\nleft : DataFrame or named Series")
    130 @Appender(_merge_doc, indents=0)
    131 def merge(
   (...)
    144     validate: str | None = None,
    145 ) -> DataFrame:
--> 146     op = _MergeOperation(
    147         left,
    148         right,
    149         how=how,
    150         on=on,
    151         left_on=left_on,
    152         right_on=right_on,
    153         left_index=left_index,
    154         right_index=right_index,
    155         sort=sort,
    156         suffixes=suffixes,
    157         indicator=indicator,
    158         validate=validate,
    159     )
    160     return op.get_result(copy=copy)

File ~/scipy/pandas/pandas/core/reshape/merge.py:735, in _MergeOperation.__init__(self, left, right, how, on, left_on, right_on, axis, left_index, right_index, sort, suffixes, indicator, validate)
    727 (
    728     self.left_join_keys,
    729     self.right_join_keys,
    730     self.join_names,
    731 ) = self._get_merge_keys()
    733 # validate the merge keys dtypes. We may need to coerce
    734 # to avoid incompatible dtypes
--> 735 self._maybe_coerce_merge_keys()
    737 # If argument passed to validate,
    738 # check if columns specified as unique
    739 # are in fact unique.
    740 if validate is not None:

File ~/scipy/pandas/pandas/core/reshape/merge.py:1397, in _MergeOperation._maybe_coerce_merge_keys(self)
   1393     raise ValueError(msg)
   1394 elif not isinstance(lk.dtype, DatetimeTZDtype) and isinstance(
   1395     rk.dtype, DatetimeTZDtype
   1396 ):
-> 1397     raise ValueError(msg)
   1398 elif (
   1399     isinstance(lk.dtype, DatetimeTZDtype)
   1400     and isinstance(rk.dtype, DatetimeTZDtype)
   1401 ) or (lk.dtype.kind == "M" and rk.dtype.kind == "M"):
   1402     # allows datetime with different resolutions
   1403     continue

ValueError: You are trying to merge on datetime64[ns] and datetime64[ns, Europe/Brussels] columns for key 'key'. If you wish to proceed you should use pd.concat


if result._is_label_reference(name):
result[name] = result._constructor_sliced(
Expand Down
39 changes: 21 additions & 18 deletions pandas/tests/reshape/merge/test_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@

import numpy as np
import pytest
import pytz

from pandas.core.dtypes.common import is_object_dtype
from pandas.core.dtypes.dtypes import CategoricalDtype
Expand Down Expand Up @@ -2776,24 +2775,28 @@ def test_merge_arrow_and_numpy_dtypes(dtype):
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("tzinfo", [None, pytz.timezone("America/Chicago")])
def test_merge_datetime_different_resolution(tzinfo):
@pytest.mark.parametrize("how", ["inner", "left", "outer", "right"])
@pytest.mark.parametrize("tz", [None, "America/Chicago"])
def test_merge_datetime_different_resolution(tz, how):
# https://github.com/pandas-dev/pandas/issues/53200
df1 = DataFrame(
{
"t": [pd.Timestamp(2023, 5, 12, tzinfo=tzinfo, unit="ns")],
"a": [1],
}
)
df2 = df1.copy()
vals = [
pd.Timestamp(2023, 5, 12, tz=tz),
pd.Timestamp(2023, 5, 13, tz=tz),
pd.Timestamp(2023, 5, 14, tz=tz),
]
df1 = DataFrame({"t": vals[:2], "a": [1.0, 2.0]})
df1["t"] = df1["t"].dt.as_unit("ns")
df2 = DataFrame({"t": vals[1:], "b": [1.0, 2.0]})
df2["t"] = df2["t"].dt.as_unit("s")

expected = DataFrame(
{
"t": [pd.Timestamp(2023, 5, 12, tzinfo=tzinfo)],
"a_x": [1],
"a_y": [1],
}
)
result = df1.merge(df2, on="t")
expected = DataFrame({"t": vals, "a": [1.0, 2.0, np.nan], "b": [np.nan, 1.0, 2.0]})
expected["t"] = expected["t"].dt.as_unit("ns")
if how == "inner":
expected = expected.iloc[[1]].reset_index(drop=True)
elif how == "left":
expected = expected.iloc[[0, 1]]
elif how == "right":
expected = expected.iloc[[1, 2]].reset_index(drop=True)

result = df1.merge(df2, on="t", how=how)
tm.assert_frame_equal(result, expected)