diff --git a/doc/source/v0.15.0.txt b/doc/source/v0.15.0.txt index 0f430e249f1c4..7e0931ca1b745 100644 --- a/doc/source/v0.15.0.txt +++ b/doc/source/v0.15.0.txt @@ -290,6 +290,12 @@ Bug Fixes +- Bug in ``DatetimeIndex.value_counts`` doesn't preserve tz (:issue:`7735`) +- Bug in ``PeriodIndex.value_counts`` results in ``Int64Index`` (:issue:`7735`) + + + + diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index cb6f200b259db..4abb6ed10d6a7 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -197,6 +197,7 @@ def value_counts(values, sort=True, ascending=False, normalize=False, from pandas.core.series import Series from pandas.tools.tile import cut + is_period = getattr(values, 'inferred_type', None) == 'period' values = Series(values).values is_category = com.is_categorical_dtype(values.dtype) @@ -212,11 +213,8 @@ def value_counts(values, sort=True, ascending=False, normalize=False, values = cat.codes dtype = values.dtype - if com.is_integer_dtype(dtype): - values = com._ensure_int64(values) - keys, counts = htable.value_count_int64(values) - elif issubclass(values.dtype.type, (np.datetime64, np.timedelta64)): + if issubclass(values.dtype.type, (np.datetime64, np.timedelta64)) or is_period: values = values.view(np.int64) keys, counts = htable.value_count_int64(values) @@ -227,6 +225,10 @@ def value_counts(values, sort=True, ascending=False, normalize=False, # convert the keys back to the dtype we came in keys = keys.astype(dtype) + elif com.is_integer_dtype(dtype): + values = com._ensure_int64(values) + keys, counts = htable.value_count_int64(values) + else: values = com._ensure_object(values) mask = com.isnull(values) diff --git a/pandas/core/base.py b/pandas/core/base.py index 243e34e35784a..d55196b56c784 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -275,8 +275,18 @@ def value_counts(self, normalize=False, sort=True, ascending=False, counts : Series """ from pandas.core.algorithms import value_counts - return value_counts(self.values, sort=sort, ascending=ascending, - normalize=normalize, bins=bins, dropna=dropna) + from pandas.tseries.api import DatetimeIndex, PeriodIndex + result = value_counts(self, sort=sort, ascending=ascending, + normalize=normalize, bins=bins, dropna=dropna) + + if isinstance(self, PeriodIndex): + # preserve freq + result.index = self._simple_new(result.index.values, self.name, + freq=self.freq) + elif isinstance(self, DatetimeIndex): + result.index = self._simple_new(result.index.values, self.name, + tz=getattr(self, 'tz', None)) + return result def unique(self): """ @@ -542,5 +552,3 @@ def __sub__(self, other): def _add_delta(self, other): return NotImplemented - - diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index 1b7db1451f6cf..494c0ee6b2bec 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -267,8 +267,9 @@ def test_value_counts_unique_nunique(self): # skips int64 because it doesn't allow to include nan or None continue - if o.values.dtype == 'datetime64[ns]' and _np_version_under1p7: - # Unable to assign None + if ((isinstance(o, Int64Index) and not isinstance(o, + (DatetimeIndex, PeriodIndex)))): + # skips int64 because it doesn't allow to include nan or None continue # special assign to the numpy array @@ -283,12 +284,8 @@ def test_value_counts_unique_nunique(self): else: o = klass(np.repeat(values, range(1, len(o) + 1))) - if isinstance(o, DatetimeIndex): - expected_s_na = Series(list(range(10, 2, -1)) + [3], index=values[9:0:-1]) - expected_s = Series(list(range(10, 2, -1)), index=values[9:1:-1]) - else: - expected_s_na = Series(list(range(10, 2, -1)) +[3], index=values[9:0:-1], dtype='int64') - expected_s = Series(list(range(10, 2, -1)), index=values[9:1:-1], dtype='int64') + expected_s_na = Series(list(range(10, 2, -1)) +[3], index=values[9:0:-1], dtype='int64') + expected_s = Series(list(range(10, 2, -1)), index=values[9:1:-1], dtype='int64') tm.assert_series_equal(o.value_counts(dropna=False), expected_s_na) tm.assert_series_equal(o.value_counts(), expected_s) @@ -709,6 +706,28 @@ def test_sub_isub(self): rng -= 1 tm.assert_index_equal(rng, expected) + def test_value_counts(self): + # GH 7735 + for tz in [None, 'UTC', 'Asia/Tokyo', 'US/Eastern']: + idx = pd.date_range('2011-01-01 09:00', freq='H', periods=10) + # create repeated values, 'n'th element is repeated by n+1 times + idx = DatetimeIndex(np.repeat(idx.values, range(1, len(idx) + 1)), tz=tz) + + exp_idx = pd.date_range('2011-01-01 18:00', freq='-1H', periods=10, tz=tz) + expected = Series(range(10, 0, -1), index=exp_idx, dtype='int64') + tm.assert_series_equal(idx.value_counts(), expected) + + idx = DatetimeIndex(['2013-01-01 09:00', '2013-01-01 09:00', '2013-01-01 09:00', + '2013-01-01 08:00', '2013-01-01 08:00', pd.NaT], tz=tz) + + exp_idx = DatetimeIndex(['2013-01-01 09:00', '2013-01-01 08:00'], tz=tz) + expected = Series([3, 2], index=exp_idx) + tm.assert_series_equal(idx.value_counts(), expected) + + exp_idx = DatetimeIndex(['2013-01-01 09:00', '2013-01-01 08:00', pd.NaT], tz=tz) + expected = Series([3, 2, 1], index=exp_idx) + tm.assert_series_equal(idx.value_counts(dropna=False), expected) + class TestPeriodIndexOps(Ops): _allowed = '_allow_period_index_ops' @@ -968,6 +987,30 @@ def test_sub_isub(self): rng -= 1 tm.assert_index_equal(rng, expected) + def test_value_counts(self): + # GH 7735 + idx = pd.period_range('2011-01-01 09:00', freq='H', periods=10) + # create repeated values, 'n'th element is repeated by n+1 times + idx = PeriodIndex(np.repeat(idx.values, range(1, len(idx) + 1)), freq='H') + + exp_idx = PeriodIndex(['2011-01-01 18:00', '2011-01-01 17:00', '2011-01-01 16:00', + '2011-01-01 15:00', '2011-01-01 14:00', '2011-01-01 13:00', + '2011-01-01 12:00', '2011-01-01 11:00', '2011-01-01 10:00', + '2011-01-01 09:00'], freq='H') + expected = Series(range(10, 0, -1), index=exp_idx, dtype='int64') + tm.assert_series_equal(idx.value_counts(), expected) + + idx = PeriodIndex(['2013-01-01 09:00', '2013-01-01 09:00', '2013-01-01 09:00', + '2013-01-01 08:00', '2013-01-01 08:00', pd.NaT], freq='H') + + exp_idx = PeriodIndex(['2013-01-01 09:00', '2013-01-01 08:00'], freq='H') + expected = Series([3, 2], index=exp_idx) + tm.assert_series_equal(idx.value_counts(), expected) + + exp_idx = PeriodIndex(['2013-01-01 09:00', '2013-01-01 08:00', pd.NaT], freq='H') + expected = Series([3, 2, 1], index=exp_idx) + tm.assert_series_equal(idx.value_counts(dropna=False), expected) + if __name__ == '__main__': import nose