# Cleaning Return Series for "Death of Factors"

The goal of this notebook is to go through all of the CSV files and compile a single long-format dataset of returns. The dataset should be structured as follows:

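Metadata columns describing each strategy (`Asset Class`, `Descriptor`, `Country`) - Month-end date (`Month_date`) - Monthly return (`Month_ret`), stored as a decimal fraction (e.g. 0.015 for a 1.5% return)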

The file Data/Index.csv lists all of the factors that we're studying. They are a mix of time-series and cross-sectional factors, spanning several asset classes. Most of the data comes from the AQR data library.

## Cross Sectional Value + Momentum

In [11]:
import pandas as pd
from pandas.tseries.offsets import MonthEnd
import numpy as np
from beakerx import *
from beakerx.object import beakerx
import seaborn as sns
import matplotlib.pyplot as plt

def drop_missing(self, var_name, inplace = True):
    """Return the rows of this frame whose ``var_name`` entry is not missing.

    Parameters
    ----------
    self : pd.DataFrame
        Frame to filter. The function is attached to ``pd.DataFrame`` right
        after its definition, so it is normally called as
        ``frame.drop_missing('col')``.
    var_name : column label
        Column that is checked for missing values.
    inplace : bool
        Accepted but never read: the frame itself is not modified and the
        filtered result is always returned.

    Returns
    -------
    pd.DataFrame
        The rows where ``var_name`` is not null, keeping their index labels.
    """
    has_value = pd.notnull(self[var_name])
    return self.loc[has_value]


# Expose the helper as a DataFrame method so it can be used in method chains.
pd.DataFrame.drop_missing = drop_missing

In [12]:
# Read the AQR value/momentum CSV, discard the six columns listed below and
# rename the date column. `index = str` casts the row labels to strings.
aqr_mom_value = (
    pd.read_csv('../Data/CSV/aqr_val_momentum.csv', sep = ',')
      .drop(columns = ['VAL', 'MOM', 'VAL^SS', 'MOM^SS', 'VAL^AA', 'MOM^AA'])
      .rename(index = str, columns = {'DATE': 'Month_date'})
)

# Parse the m/d/Y date strings, then roll each date forward to its month end
# (MonthEnd(0) leaves a date that already is a month end unchanged).
aqr_mom_value['Month_date'] = (
    pd.to_datetime(aqr_mom_value['Month_date'], format = '%m/%d/%Y') + MonthEnd(0)
)

In [13]:
# Reshape to long format: one row per (Month_date, original column name).
aqr_mom_value_melt = aqr_mom_value.melt(id_vars = 'Month_date')

# Values are handled as strings that may carry a '%' sign: strip it, convert
# to numbers and rescale to a decimal fraction, then discard missing returns.
aqr_mom_value_melt['value'] = pd.to_numeric(aqr_mom_value_melt['value'].str.replace('%', '')) / 100
aqr_mom_value_melt = aqr_mom_value_melt.drop_missing('value')

# Original column names are split on '_' into at most three pieces: the first
# piece is the factor code, the third is the market / asset-class code.
var_names = aqr_mom_value_melt['variable'].str.split('_', n = 2, expand = True)
aqr_mom_value_melt['Descriptor'] = var_names[0]
aqr_mom_value_melt['Country_Asset'] = var_names[2]

# Lookup table from market / asset-class code to country and asset class,
# keyed by the code so it can be joined on the index.
conversion_dict = {
    'Country_Asset': ['US90', 'UK90', 'ROE90', 'JP90', 'EQ', 'FX', 'FI', 'COM'],
    'Country': ['USA', 'GBR', 'EUR', 'JPN', 'WLD', 'WLD', 'WLD', 'WLD'],
    'Asset Class': ['Equities', 'Equities', 'Equities', 'Equities', 'Equities', 'FX', 'Bonds', 'Cmd'],
}
conversion_frame = pd.DataFrame.from_dict(conversion_dict).set_index('Country_Asset')

# Left join on the code: rows whose code is not in the table get NaN for
# Country and Asset Class.
aqr_mom_value_melt = aqr_mom_value_melt.set_index('Country_Asset').join(conversion_frame)

# Human-readable factor names; any code other than VALLS / MOMLS becomes NaN.
aqr_mom_value_melt['Descriptor'] = [
    {'VALLS': 'Value', 'MOMLS': 'Momentum'}.get(code, np.nan)
    for code in aqr_mom_value_melt['Descriptor']
]

# Final layout: rename the return column, move the code back out of the index
# and drop the helper columns that are no longer needed.
aqr_mom_value_melt = (
    aqr_mom_value_melt
    .rename(columns = {'value': 'Month_ret'})
    .reset_index()
    .drop(columns = ['Country_Asset', 'variable'])
)

In [14]:
def check_data(df):
    """Validate that ``df`` is in the long factor-return layout.

    Three conditions are checked, in this order:

    1. The columns are exactly 'Asset Class', 'Country', 'Descriptor',
       'Month_date' and 'Month_ret' (in any order).
    2. 'Month_date' holds timezone-naive datetime64 values. Any time unit and
       byte order is accepted, not only the literal '<M8[ns]'.
    3. No two rows share the same (Asset Class, Country, Descriptor,
       Month_date) combination.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to validate. The strategy keys must still be ordinary columns,
        i.e. this is called before they are moved into the index.

    Raises
    ------
    AssertionError
        If any of the conditions above does not hold.
    """
    expected_columns = ['Asset Class', 'Country', 'Descriptor', 'Month_date', 'Month_ret']
    key_columns = ['Asset Class', 'Country', 'Descriptor', 'Month_date']

    actual_columns = sorted(df.columns.tolist())
    assert actual_columns == expected_columns, (
        'unexpected columns: {} (expected {})'.format(actual_columns, expected_columns)
    )

    date_dtype = df['Month_date'].dtype
    assert pd.api.types.is_datetime64_dtype(date_dtype), (
        'Month_date must be datetime64, got {}'.format(date_dtype)
    )

    # The column check above guarantees every key is a regular column, so the
    # duplicate test can run on the frame directly.
    n_duplicates = int(df.duplicated(subset = key_columns).sum())
    assert n_duplicates == 0, (
        '{} duplicated (Asset Class, Country, Descriptor, Month_date) rows'.format(n_duplicates)
    )
# Validate the value/momentum frame before it is indexed and combined.
check_data(aqr_mom_value_melt)

In [15]:
# Index by the strategy metadata, leaving Month_date and Month_ret as columns.
aqr_mom_value_melt = aqr_mom_value_melt.set_index(['Asset Class', 'Descriptor', 'Country'])

## Quality

In [16]:
# Read the AQR quality CSV and add a month-end date column parsed from the
# m/d/Y strings in 'DATE'.
aqr_qual = pd.read_csv('../Data/CSV/aqr_quality.csv', sep = ',')
aqr_qual['Month_date'] = pd.to_datetime(aqr_qual['DATE'], format = '%m/%d/%Y') + MonthEnd(0)

# Long format: one row per (Month_date, original column name).
aqr_qual_melt = aqr_qual.drop(columns = ['DATE']).melt(id_vars = 'Month_date')

# Strip the '%' sign, convert to numbers and rescale to a decimal fraction.
aqr_qual_melt['value'] = pd.to_numeric(aqr_qual_melt['value'].str.replace('%', '')) / 100

# Discard missing returns, renumber the rows, and give the columns their final
# names: the original column name is used as the Country.
aqr_qual_melt = (
    aqr_qual_melt
    .drop_missing('value')
    .reset_index(drop = True)
    .rename(columns = {'variable': 'Country', 'value': 'Month_ret'})
)

# Every series in this file is labelled as an equity quality factor.
aqr_qual_melt['Asset Class'] = 'Equities'
aqr_qual_melt['Descriptor'] = 'Quality'

# Validate the layout, then index by the strategy metadata.
check_data(aqr_qual_melt)
aqr_qual_melt = aqr_qual_melt.set_index(['Asset Class', 'Descriptor', 'Country'])

In [17]:
# Show the indexed quality-factor frame as this cell's output.
aqr_qual_melt

## Time Series Momentum

In [18]:
# Read the AQR time-series momentum CSV and reshape it to long format. The
# 'TSMOM' column is discarded; the remaining columns are expected to be named
# '<prefix>^<asset-class code>'.
aqr_ts = pd.read_csv('../Data/CSV/aqr_ts_momentum.csv', sep = ',')
aqr_ts = aqr_ts.rename(columns = {'Date': 'Month_date'})
aqr_ts_melt = aqr_ts.drop(columns = ['TSMOM']).melt(id_vars = 'Month_date')

# Split each original column name at the first '^'; the second piece is the
# asset-class code.
var_names = aqr_ts_melt['variable'].str.split('^', n = 1, expand = True)

asset_class_dict = {'CM': 'Cmd',
                    'EQ': 'Equities',
                    'FI': 'Bonds',
                    'FX': 'FX'}

aqr_ts_melt['Descriptor'] = 'TS Momentum'
# Plain dict lookup on purpose: an unrecognised code raises KeyError instead of
# silently producing a missing asset class.
aqr_ts_melt['Asset Class'] = [asset_class_dict[x] for x in var_names.iloc[:, 1]]
aqr_ts_melt['Country'] = 'WLD'

# Strip the '%' sign, convert to numbers and rescale to a decimal fraction.
aqr_ts_melt['value'] = pd.to_numeric(aqr_ts_melt['value'].str.replace('%', '')) / 100

# Parse the m/d/Y date strings and roll each date forward to its month end.
aqr_ts_melt['Month_date'] = pd.to_datetime(aqr_ts_melt['Month_date'], format = '%m/%d/%Y') + MonthEnd(0)

aqr_ts_melt = (
    aqr_ts_melt
    .drop(columns = ['variable'])
    .rename(columns = {'value': 'Month_ret'})
    # Discard rows without a return, as the value/momentum and quality
    # sections do, so no NaN returns reach the combined dataset.
    .drop_missing('Month_ret')
)

check_data(aqr_ts_melt)

In [19]:
# Index by the strategy metadata, leaving Month_date and Month_ret as columns.
aqr_ts_melt = aqr_ts_melt.set_index(['Asset Class', 'Descriptor', 'Country'])

In [20]:
# Show the indexed time-series momentum frame as this cell's output.
aqr_ts_melt

## Combine all the data

In [21]:
# Stack the three factor frames (each indexed by Asset Class / Descriptor /
# Country) into one long dataset; sort = False leaves the columns unsorted.
combined_factors = pd.concat([aqr_mom_value_melt, aqr_ts_melt, aqr_qual_melt], sort = False)

# Persist the result under the key 'all_factors'. The context manager closes
# the HDF5 file even if put() raises, so the handle is never left open.
with pd.HDFStore('../Data/data.h5') as store:
    store.put('all_factors', combined_factors)