# Libraries

In [None]:
import pandas as pd
from pathlib import Path
import os
# Resolve the parent of the current working directory once and derive both
# project locations from it. Both results are plain strings because later
# cells build file paths by string concatenation.
project_root = str(Path(os.getcwd()).parent.absolute())
data_path = f"{project_root}/data"
figures_path = f"{project_root}/reports/figures"

# Load Data

In [None]:
# Load the raw inputs as pandas DataFrames.
raw_dir = data_path + "/raw"

# Factor data from Feng et al. (2020) "Taming the Factor Zoo",
# from 7/30/1976 to 12/29/2017. The first CSV column becomes the index.
factors = pd.read_csv(raw_dir + "/factors.csv", index_col=0)

# Portfolio data: 3×2 bivariate-sorted portfolios. The file is read without a
# header row (header=None) and its first column becomes the index.
portfolio_returns = pd.read_csv(
    raw_dir + "/port_3x2.csv",
    index_col=0,
    header=None,
)

In [None]:
# Give every portfolio column a readable name by prepending a fixed prefix to
# its current label (rebinding the name to the renamed frame).
column_prefix = 'portfolio_'
portfolio_returns = portfolio_returns.rename(
    columns=lambda label: f"{column_prefix}{label}"
)

# Clean/prepare the data

In [None]:
# Give both tables the same DatetimeIndex.
#
# The portfolio table simply reuses the factor dates, so the two tables must
# have the same number of rows. Only the row counts are compared here; this
# assumes the rows of both files are already in the same date order
# (TODO confirm against the raw files).
#
# A mismatch is raised as an error instead of only being printed: continuing
# would leave both tables with unparsed, unaligned indices and every later
# cell would operate on inconsistent data.
if len(portfolio_returns) != len(factors):
    raise ValueError(
        "DATE RANGE ERRORS: portfolio_returns has "
        f"{len(portfolio_returns)} rows but factors has {len(factors)} rows"
    )

# Parse the YYYYMMDD index values of the factor table into timestamps ...
factors.set_index(pd.to_datetime(factors.index, format="%Y%m%d"), inplace=True)
# ... and stamp the portfolio rows with exactly the same dates.
portfolio_returns.set_index(factors.index, inplace=True)

In [None]:
# Remove every date on which either table has a missing value, keeping the two
# tables row-aligned by dropping the same index labels from both.
#
# The printed numbers are the count of ROWS that are dropped (a row with
# several NaN cells is counted once), which is what the messages state.

# Dates on which at least one factor is missing.
factor_nan_rows = factors[factors.isna().any(axis=1)].index
print(f"Dropping {len(factor_nan_rows)} rows containing NaN values in factors")
portfolio_returns.drop(factor_nan_rows, axis=0, inplace=True)
factors.drop(factor_nan_rows, axis=0, inplace=True)

# Dates (among those still present) on which at least one portfolio return is
# missing. Computed after the first pass so that no label is dropped twice.
portfolio_nan_rows = portfolio_returns[portfolio_returns.isna().any(axis=1)].index
print(f"Dropping {len(portfolio_nan_rows)} rows containing NaN values in portfolio_returns")
factors.drop(portfolio_nan_rows, axis=0, inplace=True)
portfolio_returns.drop(portfolio_nan_rows, axis=0, inplace=True)

In [None]:
# Convert all monthly returns to monthly excess returns: subtract the 'RF'
# column of factors (presumably the risk-free rate; confirm against the data
# description) from every portfolio column.
#
# axis=0 aligns the RF series with the rows (dates) of portfolio_returns, so
# the same RF value is subtracted from each portfolio on a given date. The
# result is a new DataFrame; portfolio_returns itself is left untouched.
portfolio_excess_returns = portfolio_returns.sub(factors['RF'], axis=0)

In [None]:
# The RF column is not needed any more, so remove it from factors in place.
factors.drop(columns=["RF"], inplace=True)

# Save the data

In [None]:
# Write the cleaned factor table and the portfolio excess returns to the
# interim data folder as CSV files.
interim_dir = data_path + "/interim"
factors.to_csv(interim_dir + "/factors.csv")
portfolio_excess_returns.to_csv(interim_dir + "/portfolio_excess_returns.csv")