diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index a6f7395f5177e..f847c1d827186 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -229,8 +229,8 @@ Groupby/Resample/Rolling - Bug in :meth:`pandas.core.resample.Resampler.agg` with a timezone aware index where ``OverflowError`` would raise when passing a list of functions (:issue:`22660`) - Bug in :meth:`pandas.core.groupby.DataFrameGroupBy.nunique` in which the names of column levels were lost (:issue:`23222`) -- -- +- Bug in :func:`pandas.core.groupby.GroupBy.agg` when applying an aggregation function to timezone aware data (:issue:`23683`) +- Bug in :func:`pandas.core.groupby.GroupBy.first` and :func:`pandas.core.groupby.GroupBy.last` where timezone information would be dropped (:issue:`21603`) Reshaping diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index 507567cf480d7..517d59c399179 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -342,7 +342,9 @@ cdef class SeriesGrouper: index = None else: values = dummy.values - if dummy.dtype != self.arr.dtype: + # GH 23683: datetimetz types are equivalent to datetime types here + if (dummy.dtype != self.arr.dtype + and values.dtype != self.arr.dtype): raise ValueError('Dummy array must be same dtype') if not values.flags.contiguous: values = values.copy() diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index c364f069bf53d..926da40deaff2 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -26,7 +26,8 @@ class providing the base-class of operations. 
from pandas.core.dtypes.cast import maybe_downcast_to_dtype from pandas.core.dtypes.common import ( - ensure_float, is_extension_array_dtype, is_numeric_dtype, is_scalar) + ensure_float, is_datetime64tz_dtype, is_extension_array_dtype, + is_numeric_dtype, is_scalar) from pandas.core.dtypes.missing import isna, notna from pandas.api.types import ( @@ -766,7 +767,21 @@ def _try_cast(self, result, obj, numeric_only=False): dtype = obj.dtype if not is_scalar(result): - if is_extension_array_dtype(dtype): + if is_datetime64tz_dtype(dtype): + # GH 23683 + # Prior results _may_ have been generated in UTC. + # Ensure we localize to UTC first before converting + # to the target timezone + try: + result = obj._values._from_sequence( + result, dtype='datetime64[ns, UTC]' + ) + result = result.astype(dtype) + except TypeError: + # _try_cast was called at a point where the result + # was already tz-aware + pass + elif is_extension_array_dtype(dtype): # The function can return something of any type, so check # if the type is compatible with the calling EA. 
try: diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index b5214b11bddcc..cacfdb7694de1 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -512,3 +512,18 @@ def test_agg_list_like_func(): expected = pd.DataFrame({'A': [str(x) for x in range(3)], 'B': [[str(x)] for x in range(3)]}) tm.assert_frame_equal(result, expected) + + +def test_agg_lambda_with_timezone(): + # GH 23683 + df = pd.DataFrame({ + 'tag': [1, 1], + 'date': [ + pd.Timestamp('2018-01-01', tz='UTC'), + pd.Timestamp('2018-01-02', tz='UTC')] + }) + result = df.groupby('tag').agg({'date': lambda e: e.head(1)}) + expected = pd.DataFrame([pd.Timestamp('2018-01-01', tz='UTC')], + index=pd.Index([1], name='tag'), + columns=['date']) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_nth.py b/pandas/tests/groupby/test_nth.py index 255d9a8acf2d0..7a3d189d3020e 100644 --- a/pandas/tests/groupby/test_nth.py +++ b/pandas/tests/groupby/test_nth.py @@ -278,6 +278,26 @@ def test_first_last_tz(data, expected_first, expected_last): assert_frame_equal(result, expected[['id', 'time']]) +@pytest.mark.parametrize('method, ts, alpha', [ + ['first', Timestamp('2013-01-01', tz='US/Eastern'), 'a'], + ['last', Timestamp('2013-01-02', tz='US/Eastern'), 'b'] +]) +def test_first_last_tz_multi_column(method, ts, alpha): + # GH 21603 + df = pd.DataFrame({'group': [1, 1, 2], + 'category_string': pd.Series(list('abc')).astype( + 'category'), + 'datetimetz': pd.date_range('20130101', periods=3, + tz='US/Eastern')}) + result = getattr(df.groupby('group'), method)() + expepcted = pd.DataFrame({'category_string': [alpha, 'c'], + 'datetimetz': [ts, + Timestamp('2013-01-03', + tz='US/Eastern')]}, + index=pd.Index([1, 2], name='group')) + assert_frame_equal(result, expepcted) + + def test_nth_multi_index_as_expected(): # PR 9090, related to issue 8979 # test nth on MultiIndex