From bda6baa97ce6458ad98ec997edae684af69957a0 Mon Sep 17 00:00:00 2001 From: Matthew Gilbert Date: Sat, 18 Aug 2018 14:37:45 -0400 Subject: [PATCH] Ensure index used in reindex() is unique pandas.Series.reindex allows non-unique indices to be used for reindexing, which results in duplicated data. This should not be allowed since it will lead to unexpected downstream behaviour. --- mapping/tests/test_util.py | 8 ++++++++ mapping/util.py | 3 +++ 2 files changed, 11 insertions(+) diff --git a/mapping/tests/test_util.py b/mapping/tests/test_util.py index cc2a57e..76dce60 100644 --- a/mapping/tests/test_util.py +++ b/mapping/tests/test_util.py @@ -766,3 +766,11 @@ def test_reindex(self): exp_rets = pd.Series([0.02, np.NaN], index=widx) assert_series_equal(exp_rets, new_rets) + + # check unique index to avoid duplicates from pd.Series.reindex + idx = pd.MultiIndex.from_tuples([(TS('2015-01-03'), 'CLF5'), + (TS('2015-01-04'), 'CLF5')]) + returns = pd.Series([0.02, -0.02], index=idx) + widx = pd.MultiIndex.from_tuples([(TS('2015-01-03'), 'CLF5'), + (TS('2015-01-03'), 'CLF5')]) + self.assertRaises(ValueError, util.reindex, returns, widx, limit=1) diff --git a/mapping/util.py b/mapping/util.py index ad0ee73..fbe63df 100644 --- a/mapping/util.py +++ b/mapping/util.py @@ -326,6 +326,9 @@ def reindex(returns, index, limit): to be dropped since it is not present in weights. A discussion of this issue available at https://github.com/matthewgilbert/mapping/issues/9 """ + if not index.is_unique: + raise ValueError("'index' must be unique") + cumulative_rets = (returns + 1).groupby(level=1).cumprod() # reindexing can both drop days and introduce NaNs for days not present cumulative_rets = cumulative_rets.reindex(index)