Skip to content

Commit

Permalink
PERF: Improve Period hashing
Browse files Browse the repository at this point in the history
closes #12817

Author: sinhrks <sinhrks@gmail.com>

Closes #13705 from sinhrks/period_hash and squashes the following commits:

e1fb7f4 [sinhrks] PERF: Improve Period hashing
  • Loading branch information
sinhrks authored and jreback committed Jul 20, 2016
1 parent b25a2a1 commit 016b352
Show file tree
Hide file tree
Showing 5 changed files with 152 additions and 29 deletions.
26 changes: 25 additions & 1 deletion asv_bench/benchmarks/period.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from pandas import PeriodIndex, date_range
from pandas import Series, Period, PeriodIndex, date_range


class create_period_index_from_date_range(object):
Expand All @@ -7,3 +7,27 @@ class create_period_index_from_date_range(object):
def time_period_index(self):
# Simulate irregular PeriodIndex
PeriodIndex(date_range('1985', periods=1000).to_pydatetime(), freq='D')


class period_algorithm(object):
    # asv target time per benchmark run
    goal_time = 0.2

    def setup(self):
        # Four distinct monthly periods; the Series repeats them 1000x so
        # the duplicate-heavy hashing paths are what gets measured.
        months = ['2011-01', '2011-02', '2011-03', '2011-04']
        periods = [Period(m, freq='M') for m in months]
        self.s = Series(periods * 1000)
        self.i = PeriodIndex(periods, freq='M')

    def time_period_series_drop_duplicates(self):
        # De-duplication on a Period-valued Series.
        self.s.drop_duplicates()

    def time_period_index_drop_duplicates(self):
        # De-duplication on a PeriodIndex.
        self.i.drop_duplicates()

    def time_period_series_value_counts(self):
        # Hash-based counting on a Period-valued Series.
        self.s.value_counts()

    def time_period_index_value_counts(self):
        # Hash-based counting on a PeriodIndex.
        self.i.value_counts()


2 changes: 2 additions & 0 deletions doc/source/whatsnew/v0.19.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -628,6 +628,8 @@ Performance Improvements
- Improved performance of ``DataFrameGroupBy.transform`` (:issue:`12737`)
- Improved performance of ``Index.difference`` (:issue:`12044`)
- Improved performance of datetime string parsing in ``DatetimeIndex`` (:issue:`13692`)
- Improved performance of hashing ``Period`` (:issue:`12817`)


.. _whatsnew_0190.bug_fixes:

Expand Down
2 changes: 1 addition & 1 deletion pandas/src/period.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -727,7 +727,7 @@ cdef class _Period(object):
(type(self).__name__, type(other).__name__))

def __hash__(self):
return hash((self.ordinal, self.freq))
return hash((self.ordinal, self.freqstr))

def _add_delta(self, other):
if isinstance(other, (timedelta, np.timedelta64, offsets.Tick, Timedelta)):
Expand Down
138 changes: 111 additions & 27 deletions pandas/tseries/tests/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -491,13 +491,15 @@ def test_value_counts_unique(self):
for tz in [None, 'UTC', 'Asia/Tokyo', 'US/Eastern']:
idx = pd.date_range('2011-01-01 09:00', freq='H', periods=10)
# create repeated values, 'n'th element is repeated by n+1 times
idx = DatetimeIndex(
np.repeat(idx.values, range(1, len(idx) + 1)), tz=tz)
idx = DatetimeIndex(np.repeat(idx.values, range(1, len(idx) + 1)),
tz=tz)

exp_idx = pd.date_range('2011-01-01 18:00', freq='-1H', periods=10,
tz=tz)
expected = Series(range(10, 0, -1), index=exp_idx, dtype='int64')
tm.assert_series_equal(idx.value_counts(), expected)

for obj in [idx, Series(idx)]:
tm.assert_series_equal(obj.value_counts(), expected)

expected = pd.date_range('2011-01-01 09:00', freq='H', periods=10,
tz=tz)
Expand All @@ -507,15 +509,20 @@ def test_value_counts_unique(self):
'2013-01-01 09:00', '2013-01-01 08:00',
'2013-01-01 08:00', pd.NaT], tz=tz)

exp_idx = DatetimeIndex(
['2013-01-01 09:00', '2013-01-01 08:00'], tz=tz)
exp_idx = DatetimeIndex(['2013-01-01 09:00', '2013-01-01 08:00'],
tz=tz)
expected = Series([3, 2], index=exp_idx)
tm.assert_series_equal(idx.value_counts(), expected)

exp_idx = DatetimeIndex(
['2013-01-01 09:00', '2013-01-01 08:00', pd.NaT], tz=tz)
for obj in [idx, Series(idx)]:
tm.assert_series_equal(obj.value_counts(), expected)

exp_idx = DatetimeIndex(['2013-01-01 09:00', '2013-01-01 08:00',
pd.NaT], tz=tz)
expected = Series([3, 2, 1], index=exp_idx)
tm.assert_series_equal(idx.value_counts(dropna=False), expected)

for obj in [idx, Series(idx)]:
tm.assert_series_equal(obj.value_counts(dropna=False),
expected)

tm.assert_index_equal(idx.unique(), exp_idx)

Expand Down Expand Up @@ -654,6 +661,27 @@ def test_drop_duplicates_metadata(self):
self.assert_index_equal(idx, result)
self.assertIsNone(result.freq)

def test_drop_duplicates(self):
# to check Index/Series compat
base = pd.date_range('2011-01-01', '2011-01-31', freq='D', name='idx')
idx = base.append(base[:5])

res = idx.drop_duplicates()
tm.assert_index_equal(res, base)
res = Series(idx).drop_duplicates()
tm.assert_series_equal(res, Series(base))

res = idx.drop_duplicates(keep='last')
exp = base[5:].append(base[:5])
tm.assert_index_equal(res, exp)
res = Series(idx).drop_duplicates(keep='last')
tm.assert_series_equal(res, Series(exp, index=np.arange(5, 36)))

res = idx.drop_duplicates(keep=False)
tm.assert_index_equal(res, base[5:])
res = Series(idx).drop_duplicates(keep=False)
tm.assert_series_equal(res, Series(base[5:], index=np.arange(5, 31)))

def test_take(self):
# GH 10295
idx1 = pd.date_range('2011-01-01', '2011-01-31', freq='D', name='idx')
Expand Down Expand Up @@ -1303,23 +1331,29 @@ def test_value_counts_unique(self):

exp_idx = timedelta_range('1 days 18:00:00', freq='-1H', periods=10)
expected = Series(range(10, 0, -1), index=exp_idx, dtype='int64')
tm.assert_series_equal(idx.value_counts(), expected)

for obj in [idx, Series(idx)]:
tm.assert_series_equal(obj.value_counts(), expected)

expected = timedelta_range('1 days 09:00:00', freq='H', periods=10)
tm.assert_index_equal(idx.unique(), expected)

idx = TimedeltaIndex(
['1 days 09:00:00', '1 days 09:00:00', '1 days 09:00:00',
'1 days 08:00:00', '1 days 08:00:00', pd.NaT])
idx = TimedeltaIndex(['1 days 09:00:00', '1 days 09:00:00',
'1 days 09:00:00', '1 days 08:00:00',
'1 days 08:00:00', pd.NaT])

exp_idx = TimedeltaIndex(['1 days 09:00:00', '1 days 08:00:00'])
expected = Series([3, 2], index=exp_idx)
tm.assert_series_equal(idx.value_counts(), expected)

exp_idx = TimedeltaIndex(['1 days 09:00:00', '1 days 08:00:00', pd.NaT
])
for obj in [idx, Series(idx)]:
tm.assert_series_equal(obj.value_counts(), expected)

exp_idx = TimedeltaIndex(['1 days 09:00:00', '1 days 08:00:00',
pd.NaT])
expected = Series([3, 2, 1], index=exp_idx)
tm.assert_series_equal(idx.value_counts(dropna=False), expected)

for obj in [idx, Series(idx)]:
tm.assert_series_equal(obj.value_counts(dropna=False), expected)

tm.assert_index_equal(idx.unique(), exp_idx)

Expand Down Expand Up @@ -1454,6 +1488,27 @@ def test_drop_duplicates_metadata(self):
self.assert_index_equal(idx, result)
self.assertIsNone(result.freq)

def test_drop_duplicates(self):
# to check Index/Series compat
base = pd.timedelta_range('1 day', '31 day', freq='D', name='idx')
idx = base.append(base[:5])

res = idx.drop_duplicates()
tm.assert_index_equal(res, base)
res = Series(idx).drop_duplicates()
tm.assert_series_equal(res, Series(base))

res = idx.drop_duplicates(keep='last')
exp = base[5:].append(base[:5])
tm.assert_index_equal(res, exp)
res = Series(idx).drop_duplicates(keep='last')
tm.assert_series_equal(res, Series(exp, index=np.arange(5, 36)))

res = idx.drop_duplicates(keep=False)
tm.assert_index_equal(res, base[5:])
res = Series(idx).drop_duplicates(keep=False)
tm.assert_series_equal(res, Series(base[5:], index=np.arange(5, 31)))

def test_take(self):
# GH 10295
idx1 = pd.timedelta_range('1 day', '31 day', freq='D', name='idx')
Expand Down Expand Up @@ -2121,8 +2176,8 @@ def test_value_counts_unique(self):
# GH 7735
idx = pd.period_range('2011-01-01 09:00', freq='H', periods=10)
# create repeated values, 'n'th element is repeated by n+1 times
idx = PeriodIndex(
np.repeat(idx.values, range(1, len(idx) + 1)), freq='H')
idx = PeriodIndex(np.repeat(idx.values, range(1, len(idx) + 1)),
freq='H')

exp_idx = PeriodIndex(['2011-01-01 18:00', '2011-01-01 17:00',
'2011-01-01 16:00', '2011-01-01 15:00',
Expand All @@ -2131,24 +2186,31 @@ def test_value_counts_unique(self):
'2011-01-01 10:00',
'2011-01-01 09:00'], freq='H')
expected = Series(range(10, 0, -1), index=exp_idx, dtype='int64')
tm.assert_series_equal(idx.value_counts(), expected)

expected = pd.period_range('2011-01-01 09:00', freq='H', periods=10)
for obj in [idx, Series(idx)]:
tm.assert_series_equal(obj.value_counts(), expected)

expected = pd.period_range('2011-01-01 09:00', freq='H',
periods=10)
tm.assert_index_equal(idx.unique(), expected)

idx = PeriodIndex(['2013-01-01 09:00', '2013-01-01 09:00',
'2013-01-01 09:00', '2013-01-01 08:00',
'2013-01-01 08:00', pd.NaT], freq='H')

exp_idx = PeriodIndex(
['2013-01-01 09:00', '2013-01-01 08:00'], freq='H')
exp_idx = PeriodIndex(['2013-01-01 09:00', '2013-01-01 08:00'],
freq='H')
expected = Series([3, 2], index=exp_idx)
tm.assert_series_equal(idx.value_counts(), expected)

exp_idx = PeriodIndex(
['2013-01-01 09:00', '2013-01-01 08:00', pd.NaT], freq='H')
for obj in [idx, Series(idx)]:
tm.assert_series_equal(obj.value_counts(), expected)

exp_idx = PeriodIndex(['2013-01-01 09:00', '2013-01-01 08:00',
pd.NaT], freq='H')
expected = Series([3, 2, 1], index=exp_idx)
tm.assert_series_equal(idx.value_counts(dropna=False), expected)

for obj in [idx, Series(idx)]:
tm.assert_series_equal(obj.value_counts(dropna=False), expected)

tm.assert_index_equal(idx.unique(), exp_idx)

Expand All @@ -2164,6 +2226,28 @@ def test_drop_duplicates_metadata(self):
self.assert_index_equal(idx, result)
self.assertEqual(idx.freq, result.freq)

def test_drop_duplicates(self):
# to check Index/Series compat
base = pd.period_range('2011-01-01', '2011-01-31', freq='D',
name='idx')
idx = base.append(base[:5])

res = idx.drop_duplicates()
tm.assert_index_equal(res, base)
res = Series(idx).drop_duplicates()
tm.assert_series_equal(res, Series(base))

res = idx.drop_duplicates(keep='last')
exp = base[5:].append(base[:5])
tm.assert_index_equal(res, exp)
res = Series(idx).drop_duplicates(keep='last')
tm.assert_series_equal(res, Series(exp, index=np.arange(5, 36)))

res = idx.drop_duplicates(keep=False)
tm.assert_index_equal(res, base[5:])
res = Series(idx).drop_duplicates(keep=False)
tm.assert_series_equal(res, Series(base[5:], index=np.arange(5, 31)))

def test_order_compat(self):
def _check_freq(index, expected_index):
if isinstance(index, PeriodIndex):
Expand Down
13 changes: 13 additions & 0 deletions pandas/tseries/tests/test_period.py
Original file line number Diff line number Diff line change
Expand Up @@ -462,6 +462,19 @@ def test_period_deprecated_freq(self):
p = Period('2016-03-01 09:00', freq=exp)
tm.assertIsInstance(p, Period)

def test_hash(self):
self.assertEqual(hash(Period('2011-01', freq='M')),
hash(Period('2011-01', freq='M')))

self.assertNotEqual(hash(Period('2011-01-01', freq='D')),
hash(Period('2011-01', freq='M')))

self.assertNotEqual(hash(Period('2011-01', freq='3M')),
hash(Period('2011-01', freq='2M')))

self.assertNotEqual(hash(Period('2011-01', freq='M')),
hash(Period('2011-02', freq='M')))

def test_repr(self):
p = Period('Jan-2000')
self.assertIn('2000-01', repr(p))
Expand Down

0 comments on commit 016b352

Please sign in to comment.