Add functionality to flatten/unflatten weights

Resolves: #5
matthewgilbert · Apr 14, 2018 · ed3dac2 · ed3dac2
1 parent 13bfaca
commit ed3dac2
Show file tree

Hide file tree

Showing 2 changed files with 189 additions and 1 deletion.
diff --git a/mapping/tests/test_util.py b/mapping/tests/test_util.py
@@ -18,8 +18,12 @@ def setUp(self):
     def tearDown(self):
         pass
 
-    def test_read_price_data(self):
+    def assert_dict_of_frames(self, dict1, dict2):
+        """Assert both dicts have the same keys and equal frames per key."""
+        # assertEqual, not the deprecated assertEquals alias, which newer
+        # Python versions no longer provide on TestCase
+        self.assertEqual(dict1.keys(), dict2.keys())
+        for key in dict1:
+            assert_frame_equal(dict1[key], dict2[key])
 
+    def test_read_price_data(self):
         # using default name_func in read_price_data()
         df = util.read_price_data(self.prices)
         dt1 = TS("2014-09-30")
@@ -581,3 +585,89 @@ def test_weighted_expiration_two_generics(self):
                                        TS('2015-01-05')],
                                 columns=["CL1", "CL2"])
         assert_frame_equal(wexp, exp_wexp)
+
+    def test_flatten(self):
+        """A weights frame flattens to one row per index row and generic."""
+        d3, d4 = TS('2015-01-03'), TS('2015-01-04')
+        index = pd.MultiIndex.from_tuples(
+            [(d3, 'CLF5'), (d3, 'CLG5'), (d4, 'CLG5'), (d4, 'CLH5')])
+        weights = pd.DataFrame([[1, 0], [0, 1], [1, 0], [0, 1]],
+                               index=index, columns=["CL1", "CL2"])
+
+        result = util.flatten(weights)
+
+        # each of the 4 index rows yields one output row per generic column
+        expected = pd.DataFrame(
+            {"date": [d3] * 4 + [d4] * 4,
+             "contract": ['CLF5'] * 2 + ['CLG5'] * 4 + ['CLH5'] * 2,
+             "generic": ["CL1", "CL2"] * 4,
+             "weight": [1, 0, 0, 1, 1, 0, 0, 1]},
+            columns=["date", "contract", "generic", "weight"])
+        assert_frame_equal(result, expected)
+
+    def test_flatten_dict(self):
+        """A dict of weights frames flattens to one frame with a "key" column."""
+        d3, d4 = TS('2015-01-03'), TS('2015-01-04')
+        cl_index = pd.MultiIndex.from_tuples(
+            [(d3, 'CLF5'), (d3, 'CLG5'), (d4, 'CLG5'), (d4, 'CLH5')])
+        cl_weights = pd.DataFrame([[1, 0], [0, 1], [1, 0], [0, 1]],
+                                  index=cl_index, columns=["CL1", "CL2"])
+        co_index = pd.MultiIndex.from_tuples([(d3, 'COF5')])
+        co_weights = pd.DataFrame(1, index=co_index, columns=["CO1"])
+
+        result = util.flatten({"CL": cl_weights, "CO": co_weights})
+
+        # the 8 "CL" rows come first, followed by the single "CO" row
+        expected = pd.DataFrame(
+            {"date": [d3] * 4 + [d4] * 4 + [d3],
+             "contract": (['CLF5'] * 2 + ['CLG5'] * 4 + ['CLH5'] * 2
+                          + ["COF5"]),
+             "generic": ["CL1", "CL2"] * 4 + ["CO1"],
+             "weight": [1, 0, 0, 1, 1, 0, 0, 1, 1],
+             "key": ["CL"] * 8 + ["CO"]},
+            columns=["date", "contract", "generic", "weight", "key"])
+        assert_frame_equal(result, expected)
+
+    def test_unflatten(self):
+        """A long weights frame pivots to a (date, contract) by generic frame."""
+        d3, d4 = TS('2015-01-03'), TS('2015-01-04')
+        flat = pd.DataFrame(
+            {"date": [d3] * 4 + [d4] * 4,
+             "contract": ['CLF5'] * 2 + ['CLG5'] * 4 + ['CLH5'] * 2,
+             "generic": ["CL1", "CL2"] * 4,
+             "weight": [1, 0, 0, 1, 1, 0, 0, 1]},
+            columns=["date", "contract", "generic", "weight"])
+
+        result = util.unflatten(flat)
+
+        # the expected index levels and columns carry the pivot field names
+        index = pd.MultiIndex.from_tuples(
+            [(d3, 'CLF5'), (d3, 'CLG5'), (d4, 'CLG5'), (d4, 'CLH5')],
+            names=("date", "contract"))
+        generics = pd.Index(["CL1", "CL2"], name="generic")
+        expected = pd.DataFrame([[1, 0], [0, 1], [1, 0], [0, 1]],
+                                index=index, columns=generics)
+        assert_frame_equal(result, expected)
+
+    def test_unflatten_dict(self):
+        """A long frame with a "key" column unflattens to a dict of frames."""
+        d3, d4 = TS('2015-01-03'), TS('2015-01-04')
+        flat = pd.DataFrame(
+            {"date": [d3] * 4 + [d4] * 4 + [d3],
+             "contract": (['CLF5'] * 2 + ['CLG5'] * 4 + ['CLH5'] * 2
+                          + ["COF5"]),
+             "generic": ["CL1", "CL2"] * 4 + ["CO1"],
+             "weight": [1, 0, 0, 1, 1, 0, 0, 1, 1],
+             "key": ["CL"] * 8 + ["CO"]},
+            columns=["date", "contract", "generic", "weight", "key"])
+
+        result = util.unflatten(flat)
+
+        index_names = ("date", "contract")
+        cl_index = pd.MultiIndex.from_tuples(
+            [(d3, 'CLF5'), (d3, 'CLG5'), (d4, 'CLG5'), (d4, 'CLH5')],
+            names=index_names)
+        cl_expected = pd.DataFrame(
+            [[1, 0], [0, 1], [1, 0], [0, 1]], index=cl_index,
+            columns=pd.Index(["CL1", "CL2"], name="generic"))
+        co_index = pd.MultiIndex.from_tuples([(d3, 'COF5')],
+                                             names=index_names)
+        co_expected = pd.DataFrame(
+            1, index=co_index, columns=pd.Index(["CO1"], name="generic"))
+
+        self.assert_dict_of_frames(result,
+                                   {"CL": cl_expected, "CO": co_expected})
diff --git a/mapping/util.py b/mapping/util.py
@@ -40,6 +40,104 @@ def name_func(x):
     return pd.concat(dfs, axis=0).sort_index()
 
 
+def flatten(weights):
+    """
+    Flatten weights into a long DataFrame.
+
+    Parameters
+    ----------
+    weights: pandas.DataFrame or dict
+        A DataFrame of instrument weights with a MultiIndex where the top level
+        contains pandas.Timestamps and the second level is instrument names.
+        The columns consist of generic names. If dict is given this should be
+        a dict of pandas.DataFrame in the above format, with keys for different
+        root generics, e.g. 'CL'
+
+    Returns
+    -------
+    A long DataFrame of weights, where columns are "date", "contract",
+    "generic" and "weight". If a dictionary is passed, DataFrame will contain
+    additional column "key" containing the key value and be sorted according
+    to this key value.
+
+    Raises
+    ------
+    ValueError
+        If weights is neither a pandas.DataFrame nor a dict.
+
+    Example
+    -------
+    >>> vals = [[1, 0], [0, 1], [1, 0], [0, 1]]
+    >>> widx = pd.MultiIndex.from_tuples([(pd.Timestamp('2015-01-03'), 'CLF5'),
+    ...                                   (pd.Timestamp('2015-01-03'), 'CLG5'),
+    ...                                   (pd.Timestamp('2015-01-04'), 'CLG5'),
+    ...                                   (pd.Timestamp('2015-01-04'), 'CLH5')])
+    >>> weights = pd.DataFrame(vals, index=widx, columns=["CL1", "CL2"])
+    >>> util.flatten(weights)
+    """  # NOQA
+    flat_columns = ["date", "contract", "generic", "weight"]
+
+    if isinstance(weights, pd.DataFrame):
+        wts = weights.stack().reset_index()
+        wts.columns = flat_columns
+        return wts
+
+    if isinstance(weights, dict):
+        frames = []
+        # iterate keys in sorted order so the output is grouped by sorted key
+        for key in sorted(weights):
+            wt = weights[key].stack().reset_index()
+            wt.columns = flat_columns
+            wt["key"] = key
+            frames.append(wt)
+        return pd.concat(frames, axis=0).reset_index(drop=True)
+
+    # the exception must actually be raised; merely constructing it lets
+    # unsupported input fall through
+    raise ValueError("weights must be pd.DataFrame or dict")
+
+
+def _pivot_weights(flat_weights):
+    # Pivot one long weights frame into a (date, contract) by generic matrix.
+    wts = flat_weights.pivot_table(index=["date", "contract"],
+                                   columns=["generic"],
+                                   values=["weight"])
+    # values is passed as a list, so the pivoted columns carry an extra outer
+    # "weight" level; drop it so only the generic names remain
+    wts.columns = wts.columns.droplevel(0)
+    return wts
+
+
+def unflatten(flat_weights):
+    """
+    Pivot weights from long DataFrame into weighting matrix.
+
+    Parameters
+    ----------
+    flat_weights: pandas.DataFrame
+        A long DataFrame of weights, where columns are "date", "contract",
+        "generic", "weight" and optionally "key". If "key" column is
+        present a dictionary of unflattened DataFrames is returned with the
+        dictionary keys corresponding to the "key" column and each sub
+        DataFrame containing rows for this key.
+
+    Returns
+    -------
+    A DataFrame or dict of DataFrames of instrument weights with a MultiIndex
+    where the top level contains pandas.Timestamps and the second level is
+    instrument names. The columns consist of generic names. If dict is returned
+    the dict keys correspond to the "key" column of the input.
+
+    Notes
+    -----
+    The pivot uses pandas.DataFrame.pivot_table with its default aggregation,
+    so rows that repeat the same date, contract and generic (within one key)
+    are combined rather than rejected.
+
+    Example
+    -------
+    >>> TS = pd.Timestamp
+    >>> long_wts = pd.DataFrame(
+    ...         {"date": [TS('2015-01-03')] * 4 + [TS('2015-01-04')] * 4,
+    ...          "contract": ['CLF5'] * 2 + ['CLG5'] * 4 + ['CLH5'] * 2,
+    ...          "generic": ["CL1", "CL2"] * 4,
+    ...          "weight": [1, 0, 0, 1, 1, 0, 0, 1]}
+    ...     ).loc[:, ["date", "contract", "generic", "weight"]]
+    >>> util.unflatten(long_wts)
+
+    See also: calc_rets()
+    """  # NOQA
+    # membership test instead of Index.contains(), which newer pandas
+    # versions no longer provide
+    if "key" not in flat_weights.columns:
+        return _pivot_weights(flat_weights)
+
+    weights = {}
+    # unique() keeps first-appearance order, which becomes the dict order
+    for key in flat_weights.loc[:, "key"].unique():
+        rows = flat_weights.loc[flat_weights.loc[:, "key"] == key, :]
+        weights[key] = _pivot_weights(rows)
+    return weights
+
+
 def calc_rets(returns, weights):
     """
     Calculate continuous return series for futures instruments. These consist