diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst
index a6f7395f5177e..f847c1d827186 100644
--- a/doc/source/whatsnew/v0.25.0.rst
+++ b/doc/source/whatsnew/v0.25.0.rst
@@ -229,8 +229,8 @@ Groupby/Resample/Rolling
 
 - Bug in :meth:`pandas.core.resample.Resampler.agg` with a timezone aware index where ``OverflowError`` would raise when passing a list of functions (:issue:`22660`)
 - Bug in :meth:`pandas.core.groupby.DataFrameGroupBy.nunique` in which the names of column levels were lost (:issue:`23222`)
--
--
+- Bug in :func:`pandas.core.groupby.GroupBy.agg` when applying an aggregation function to timezone aware data (:issue:`23683`)
+- Bug in :func:`pandas.core.groupby.GroupBy.first` and :func:`pandas.core.groupby.GroupBy.last` where timezone information would be dropped (:issue:`21603`)
 
 
 Reshaping
diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx
index 507567cf480d7..517d59c399179 100644
--- a/pandas/_libs/reduction.pyx
+++ b/pandas/_libs/reduction.pyx
@@ -342,7 +342,9 @@ cdef class SeriesGrouper:
             index = None
         else:
             values = dummy.values
-            if dummy.dtype != self.arr.dtype:
+            # GH 23683: datetimetz types are equivalent to datetime types here
+            if (dummy.dtype != self.arr.dtype
+                    and values.dtype != self.arr.dtype):
                 raise ValueError('Dummy array must be same dtype')
             if not values.flags.contiguous:
                 values = values.copy()
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index c364f069bf53d..926da40deaff2 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -26,7 +26,8 @@ class providing the base-class of operations.
 
 from pandas.core.dtypes.cast import maybe_downcast_to_dtype
 from pandas.core.dtypes.common import (
-    ensure_float, is_extension_array_dtype, is_numeric_dtype, is_scalar)
+    ensure_float, is_datetime64tz_dtype, is_extension_array_dtype,
+    is_numeric_dtype, is_scalar)
 from pandas.core.dtypes.missing import isna, notna
 
 from pandas.api.types import (
@@ -766,7 +767,21 @@ def _try_cast(self, result, obj, numeric_only=False):
             dtype = obj.dtype
 
         if not is_scalar(result):
-            if is_extension_array_dtype(dtype):
+            if is_datetime64tz_dtype(dtype):
+                # GH 23683
+                # Prior results _may_ have been generated in UTC.
+                # Ensure we localize to UTC first before converting
+                # to the target timezone
+                try:
+                    result = obj._values._from_sequence(
+                        result, dtype='datetime64[ns, UTC]'
+                    )
+                    result = result.astype(dtype)
+                except TypeError:
+                    # _try_cast was called at a point where the result
+                    # was already tz-aware
+                    pass
+            elif is_extension_array_dtype(dtype):
                 # The function can return something of any type, so check
                 # if the type is compatible with the calling EA.
                 try:
diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py
index b5214b11bddcc..cacfdb7694de1 100644
--- a/pandas/tests/groupby/aggregate/test_other.py
+++ b/pandas/tests/groupby/aggregate/test_other.py
@@ -512,3 +512,18 @@ def test_agg_list_like_func():
     expected = pd.DataFrame({'A': [str(x) for x in range(3)],
                              'B': [[str(x)] for x in range(3)]})
     tm.assert_frame_equal(result, expected)
+
+
+def test_agg_lambda_with_timezone():
+    # GH 23683: lambda agg on a tz-aware column must return tz-aware data
+    df = pd.DataFrame({
+        'tag': [1, 1],
+        'date': [
+            pd.Timestamp('2018-01-01', tz='UTC'),
+            pd.Timestamp('2018-01-02', tz='UTC')]
+    })
+    result = df.groupby('tag').agg({'date': lambda e: e.head(1)})
+    expected = pd.DataFrame([pd.Timestamp('2018-01-01', tz='UTC')],
+                            index=pd.Index([1], name='tag'),
+                            columns=['date'])
+    tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/groupby/test_nth.py b/pandas/tests/groupby/test_nth.py
index 255d9a8acf2d0..7a3d189d3020e 100644
--- a/pandas/tests/groupby/test_nth.py
+++ b/pandas/tests/groupby/test_nth.py
@@ -278,6 +278,26 @@ def test_first_last_tz(data, expected_first, expected_last):
     assert_frame_equal(result, expected[['id', 'time']])
 
 
+@pytest.mark.parametrize('method, ts, alpha', [
+    ['first', Timestamp('2013-01-01', tz='US/Eastern'), 'a'],
+    ['last', Timestamp('2013-01-02', tz='US/Eastern'), 'b']
+])
+def test_first_last_tz_multi_column(method, ts, alpha):
+    # GH 21603: first/last must keep tz info next to a categorical column
+    df = pd.DataFrame({'group': [1, 1, 2],
+                       'category_string': pd.Series(list('abc')).astype(
+                           'category'),
+                       'datetimetz': pd.date_range('20130101', periods=3,
+                                                   tz='US/Eastern')})
+    result = getattr(df.groupby('group'), method)()
+    expected = pd.DataFrame({'category_string': [alpha, 'c'],
+                             'datetimetz': [ts,
+                                            Timestamp('2013-01-03',
+                                                      tz='US/Eastern')]},
+                            index=pd.Index([1, 2], name='group'))
+    assert_frame_equal(result, expected)
+
+
 def test_nth_multi_index_as_expected():
     # PR 9090, related to issue 8979
     # test nth on MultiIndex