Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: Merge timezone aware data with DST #22825

Merged
merged 11 commits into from
Oct 1, 2018
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.24.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -812,6 +812,7 @@ Reshaping
- Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` when dict is used as the ``to_replace`` value and one key in the dict is another key's value, the results were inconsistent between using integer key and using string key (:issue:`20656`)
- Bug in :meth:`DataFrame.drop_duplicates` for empty ``DataFrame`` which incorrectly raises an error (:issue:`20516`)
- Bug in :func:`pandas.wide_to_long` when a string is passed to the stubnames argument and a column name is a substring of that stubname (:issue:`22468`)
- Bug in :func:`merge` when merging ``datetime64[ns, tz]`` data that contained a DST transition (:issue:`18885`)

Build Changes
^^^^^^^^^^^^^
Expand Down
59 changes: 39 additions & 20 deletions pandas/core/indexes/datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -277,7 +277,7 @@ def _evaluate_compare(self, other, op):
except TypeError:
return result

def _ensure_localized(self, result, ambiguous='raise'):
def _ensure_localized(self, arg, ambiguous='raise', from_utc=False):
"""
ensure that we are re-localized

Expand All @@ -286,9 +286,11 @@ def _ensure_localized(self, result, ambiguous='raise'):

Parameters
----------
result : DatetimeIndex / i8 ndarray
ambiguous : str, bool, or bool-ndarray
default 'raise'
arg : DatetimeIndex / i8 ndarray
ambiguous : str, bool, or bool-ndarray, default 'raise'
from_utc : bool, default False
If True, localize the i8 ndarray to UTC first before converting to
the appropriate tz. If False, localize directly to the tz.

Returns
-------
Expand All @@ -297,10 +299,13 @@ def _ensure_localized(self, result, ambiguous='raise'):

# reconvert to local tz
if getattr(self, 'tz', None) is not None:
if not isinstance(result, ABCIndexClass):
result = self._simple_new(result)
result = result.tz_localize(self.tz, ambiguous=ambiguous)
return result
if not isinstance(arg, ABCIndexClass):
arg = self._simple_new(arg)
if from_utc:
arg = arg.tz_localize('UTC').tz_convert(self.tz)
else:
arg = arg.tz_localize(self.tz, ambiguous=ambiguous)
return arg

def _box_values_as_index(self):
"""
Expand Down Expand Up @@ -622,11 +627,11 @@ def repeat(self, repeats, *args, **kwargs):

@Appender(_index_shared_docs['where'] % _index_doc_kwargs)
def where(self, cond, other=None):
other = _ensure_datetimelike_to_i8(other)
values = _ensure_datetimelike_to_i8(self)
other = _ensure_datetimelike_to_i8(other, to_utc=True)
values = _ensure_datetimelike_to_i8(self, to_utc=True)
result = np.where(cond, values, other).astype('i8')

result = self._ensure_localized(result)
result = self._ensure_localized(result, from_utc=True)
return self._shallow_copy(result,
**self._get_attributes_dict())

Expand Down Expand Up @@ -695,23 +700,37 @@ def astype(self, dtype, copy=True):
return super(DatetimeIndexOpsMixin, self).astype(dtype, copy=copy)


def _ensure_datetimelike_to_i8(other):
""" helper for coercing an input scalar or array to i8 """
def _ensure_datetimelike_to_i8(other, to_utc=False):
    """
    Helper for coercing an input scalar or array to i8.

    Parameters
    ----------
    other : 1d array
    to_utc : bool, default False
        If True, convert the values to UTC before extracting the i8 values
        If False, extract the i8 values directly.

    Returns
    -------
    i8 1d array
    """
    if is_scalar(other) and isna(other):
        # a scalar NaT maps directly to the i8 sentinel value
        return iNaT
    elif isinstance(other, ABCIndexClass):
        # convert tz if needed
        if getattr(other, 'tz', None) is not None:
            if to_utc:
                other = other.tz_convert('UTC')
            else:
                other = other.tz_localize(None)
    else:
        try:
            return np.array(other, copy=False).view('i8')
        except TypeError:
            # period array cannot be coerced to int
            other = Index(other)
    return other.asi8


def wrap_arithmetic_op(self, other, result):
Expand Down
40 changes: 29 additions & 11 deletions pandas/tests/indexing/test_coercion.py
Original file line number Diff line number Diff line change
Expand Up @@ -590,11 +590,9 @@ def test_where_series_datetime64(self, fill_val, exp_dtype):
pd.Timestamp('2011-01-03'), values[3]])
self._assert_where_conversion(obj, cond, values, exp, exp_dtype)

@pytest.mark.parametrize("fill_val,exp_dtype", [
(pd.Timestamp('2012-01-01'), 'datetime64[ns]'),
(pd.Timestamp('2012-01-01', tz='US/Eastern'), np.object)],
ids=['datetime64', 'datetime64tz'])
def test_where_index_datetime(self, fill_val, exp_dtype):
def test_where_index_datetime(self):
fill_val = pd.Timestamp('2012-01-01')
exp_dtype = 'datetime64[ns]'
obj = pd.Index([pd.Timestamp('2011-01-01'),
pd.Timestamp('2011-01-02'),
pd.Timestamp('2011-01-03'),
Expand All @@ -613,13 +611,33 @@ def test_where_index_datetime(self, fill_val, exp_dtype):
pd.Timestamp('2011-01-03'),
pd.Timestamp('2012-01-04')])

if fill_val.tz:
self._assert_where_conversion(obj, cond, values, exp,
'datetime64[ns]')
pytest.xfail("ToDo: do not ignore timezone, must be object")
self._assert_where_conversion(obj, cond, values, exp, exp_dtype)
pytest.xfail("datetime64 + datetime64 -> datetime64 must support"
" scalar")

@pytest.mark.xfail(
reason="GH 22839: do not ignore timezone, must be object")
def test_where_index_datetimetz(self):
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This test was bizarrely structured above. It was given the wrong arguments for everything to pass before it hit the xfail...

Good news is that one of the cases is passing, so I am splitting the tz-aware and tz-naive case and properly xfailing the tz-aware case (which was already failing on master and is orthogonal to this change)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is there an issue for this case? if not can you create one and reference it here

fill_val = pd.Timestamp('2012-01-01', tz='US/Eastern')
exp_dtype = np.object
obj = pd.Index([pd.Timestamp('2011-01-01'),
pd.Timestamp('2011-01-02'),
pd.Timestamp('2011-01-03'),
pd.Timestamp('2011-01-04')])
assert obj.dtype == 'datetime64[ns]'
cond = pd.Index([True, False, True, False])

msg = ("Index\\(\\.\\.\\.\\) must be called with a collection "
"of some kind")
with tm.assert_raises_regex(TypeError, msg):
obj.where(cond, fill_val)

values = pd.Index(pd.date_range(fill_val, periods=4))
exp = pd.Index([pd.Timestamp('2011-01-01'),
pd.Timestamp('2012-01-02', tz='US/Eastern'),
pd.Timestamp('2011-01-03'),
pd.Timestamp('2012-01-04', tz='US/Eastern')],
dtype=exp_dtype)

self._assert_where_conversion(obj, cond, values, exp, exp_dtype)

def test_where_index_complex128(self):
pass
Expand Down
24 changes: 24 additions & 0 deletions pandas/tests/reshape/merge/test_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -601,6 +601,30 @@ def test_merge_on_datetime64tz(self):
assert result['value_x'].dtype == 'datetime64[ns, US/Eastern]'
assert result['value_y'].dtype == 'datetime64[ns, US/Eastern]'

def test_merge_datetime64tz_with_dst_transition(self):
# GH 18885
df1 = pd.DataFrame(pd.date_range(
'2017-10-29 01:00', periods=4, freq='H', tz='Europe/Madrid'),
columns=['date'])
df1['value'] = 1
df2 = pd.DataFrame({
'date': pd.to_datetime([
'2017-10-29 03:00:00', '2017-10-29 04:00:00',
'2017-10-29 05:00:00'
]),
'value': 2
})
df2['date'] = df2['date'].dt.tz_localize('UTC').dt.tz_convert(
'Europe/Madrid')
result = pd.merge(df1, df2, how='outer', on='date')
expected = pd.DataFrame({
'date': pd.date_range(
'2017-10-29 01:00', periods=7, freq='H', tz='Europe/Madrid'),
'value_x': [1] * 4 + [np.nan] * 3,
'value_y': [np.nan] * 4 + [2] * 3
})
assert_frame_equal(result, expected)

def test_merge_non_unique_period_index(self):
# GH #16871
index = pd.period_range('2016-01-01', periods=16, freq='M')
Expand Down