In [63]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
import os

In [64]:
# Directory holding the monthly CSV files. Kept relative to the working
# directory (the same convention as the 'Data/...' macro file loaded later in
# this notebook) so the notebook is not tied to one user's machine.
folder_path = os.path.join('Data', 'monthly_data')

# os.listdir gives no ordering guarantee; sort for a deterministic listing.
files = sorted(os.listdir(folder_path))

In [65]:
# Read every monthly CSV into its own DataFrame, ordered by file name.
# NOTE(review): lexicographic file-name order is assumed to match chronological
# order -- confirm against the actual file naming scheme.
datasets = []

for file in sorted(files):
    path = os.path.join(folder_path, file)
    # Skip hidden entries ('.DS_Store', '.ipynb_checkpoints', ...) and anything
    # that is not a regular file: read_csv would fail on them.
    if file.startswith('.') or not os.path.isfile(path):
        continue
    df = pd.read_csv(path, encoding='utf-8')
    # Tag every row with its source file; later cells drop this column again.
    df['name'] = file
    datasets.append(df)

In [66]:
# Load the macro data file and keep only the rows whose 'yyyymm' value is
# greater than 199000.
macro = pd.read_csv('Data/macro_data_amit_goyal.csv', encoding='utf-8')
macro = macro.loc[macro['yyyymm'].gt(199000)]

In [67]:
def standardize(df):
    """Centre each column of ``df`` on its mean and divide by its standard deviation.

    Uses ``df.mean()`` and ``df.std()`` (pandas defaults), so a column whose
    standard deviation is zero or undefined comes back as NaN.
    """
    centered = df - df.mean()
    scaled = centered / df.std()
    return scaled

In [68]:
def fill_missing(df):
    """Return a new object equal to ``df`` with every missing value replaced by 0."""
    filled = df.fillna(value=0)
    return filled

In [69]:
# Build, for every monthly frame, the standardised characteristics matrix
# (`data`) and the matching return series minus the macro 'Rfree' value (`ret`).
#
# The frames stored in `datasets` are never modified in place, so this cell can
# be re-run without failing on already-dropped columns.
data = []
ret = []

# Identifier / helper columns that are not used as characteristics.
columns_to_drop = ['permno', 'DATE', 'Unnamed: 0', 'mve0', 'prc', 'SHROUT', 'sic2', 'name']

for i, raw in enumerate(datasets):

    month = (
        # mcap = SHROUT * prc (presumably market capitalisation); only used for ranking
        raw.assign(mcap=raw['SHROUT'] * raw['prc'])
        .drop(columns=columns_to_drop)
        # keep rows that have at least 60 non-missing values
        .dropna(thresh=60, axis=0)
    )
    # rows with RET <= -1 (or a missing RET) are discarded
    month = month[month['RET'] > -1]
    # keep the 100 rows with the largest mcap, then discard the ranking column
    month = month.sort_values(by=['mcap'], ascending=False).drop(columns=['mcap']).head(100)

    # NOTE(review): assumes row i of `macro` is the same month as datasets[i] -- confirm.
    ret.append(month['RET'] - macro['Rfree'].values[i])

    features = month.drop(columns=['RET'])
    # standardise column-wise, then replace the remaining NaNs with 0
    data.append(fill_missing(standardize(features)))

In [70]:
def solve_f(ret, data, gamma, idx):
    """Solve for the single factor vector associated with index ``idx``.

    With ``Z = data[idx-1].values`` and ``r = ret[idx].values`` this solves the
    linear system ``(gamma' Z' Z gamma) f = gamma' Z' r`` and returns ``f``.
    Only one vector is returned; the caller collects them into a list.
    """
    Z = data[idx - 1].values
    projected = gamma.T @ Z.T
    lhs = projected @ Z @ gamma
    rhs = projected @ ret[idx].values
    return np.linalg.solve(lhs, rhs)

In [71]:
def solve_gamma(ret, data, f):
    """Solve for the loading matrix gamma given the list of factor vectors ``f``.

    For every t in ``range(len(data) - 1)``, with ``Z = data[t]``,
    ``f_t = f[t]`` and ``r = ret[t + 1]``, the function accumulates

        A += kron(Z' Z, f_t f_t')
        B += kron(Z, f_t')' r

    and solves ``A vec = B``.

    Parameters
    ----------
    ret : sequence of 1-D array-likes; ``ret[t + 1]`` is paired with ``data[t]``.
    data : sequence of 2-D array-likes (DataFrames or arrays) sharing one column count.
    f : list of 1-D factor vectors, one per t, all of the same length.

    Returns
    -------
    numpy.ndarray of shape ``(number of columns of data[0], len(f[0]))``.

    Raises
    ------
    ValueError
        If ``data`` holds fewer than two periods (nothing to accumulate).
    """
    n_periods = len(data) - 1
    if n_periods < 1:
        raise ValueError("solve_gamma needs at least two periods in `data`")

    # Shape is taken from the inputs instead of a hard-coded column count.
    n_characteristics = np.asarray(data[0]).shape[1]
    n_factors = len(f[0])

    # Accumulate term by term so that only one Kronecker product is alive at a time.
    A = 0.0
    B = 0.0
    for t in range(n_periods):
        Z = np.asarray(data[t])
        f_row = np.asarray(f[t]).reshape(1, -1)
        A = A + np.kron(Z.T @ Z, f_row.T @ f_row)
        B = B + np.kron(Z, f_row).T @ np.asarray(ret[t + 1])

    vec_gamma = np.linalg.solve(A, B)
    # The solution is laid out row by row, so a C-order reshape recovers the matrix.
    return vec_gamma.reshape((n_characteristics, n_factors))

In [72]:
# Alternate between solving for the factor vectors given gamma and solving for
# gamma given the factor vectors, for a fixed number of sweeps.
n_factors = 5
n_iterations = 100
n_characteristics = data[0].shape[1]  # derived from the data, not hard-coded

# Fixed seed so that a fresh "Restart & Run All" reproduces the same result.
rng = np.random.default_rng(0)
gamma = rng.random((n_characteristics, n_factors))

for _ in range(n_iterations):
    # f_list[t] is computed from data[t] and ret[t + 1]
    f_list = [solve_f(ret, data, gamma, t + 1) for t in range(len(data) - 1)]
    gamma = solve_gamma(ret, data, f_list)
    

In [73]:
def solve_f_reg(ret, data, gamma, idx, lambda_):
    """Ridge-penalised version of the single-factor solve for index ``idx``.

    With ``Z = data[idx-1].values`` and ``r = ret[idx].values`` this solves
    ``(gamma' Z' Z gamma + lambda_ * I) f = gamma' Z' r`` and returns ``f``.
    Only one vector is returned; the caller collects them into a list.
    """
    Z = data[idx - 1].values
    projected = gamma.T @ Z.T
    ridge = lambda_ * np.eye(gamma.shape[1])
    lhs = projected @ Z @ gamma + ridge
    rhs = projected @ ret[idx].values
    return np.linalg.solve(lhs, rhs)

In [74]:
def solve_gamma_reg(ret, data, f, lambda_):
    """Ridge-penalised solve for the loading matrix gamma given factor vectors ``f``.

    For every t in ``range(len(data) - 1)``, with ``Z = data[t]``,
    ``f_t = f[t]`` and ``r = ret[t + 1]``, the function accumulates

        A += kron(Z' Z, f_t f_t')
        B += kron(Z, f_t')' r

    and solves ``(A + lambda_ * I) vec = B``.

    Parameters
    ----------
    ret : sequence of 1-D array-likes; ``ret[t + 1]`` is paired with ``data[t]``.
    data : sequence of 2-D array-likes (DataFrames or arrays) sharing one column count.
    f : list of 1-D factor vectors, one per t, all of the same length.
    lambda_ : scalar multiplying the identity added to ``A``.

    Returns
    -------
    numpy.ndarray of shape ``(number of columns of data[0], len(f[0]))``.

    Raises
    ------
    ValueError
        If ``data`` holds fewer than two periods (nothing to accumulate).
    """
    n_periods = len(data) - 1
    if n_periods < 1:
        raise ValueError("solve_gamma_reg needs at least two periods in `data`")

    # Shape is taken from the inputs instead of a hard-coded column count.
    n_characteristics = np.asarray(data[0]).shape[1]
    n_factors = len(f[0])

    # Accumulate term by term so that only one Kronecker product is alive at a time.
    A = 0.0
    B = 0.0
    for t in range(n_periods):
        Z = np.asarray(data[t])
        f_row = np.asarray(f[t]).reshape(1, -1)
        A = A + np.kron(Z.T @ Z, f_row.T @ f_row)
        B = B + np.kron(Z, f_row).T @ np.asarray(ret[t + 1])

    # The penalty is sized from A itself, not from a global `gamma` variable.
    A = A + lambda_ * np.eye(A.shape[0])

    vec_gamma = np.linalg.solve(A, B)
    # The solution is laid out row by row, so a C-order reshape recovers the matrix.
    return vec_gamma.reshape((n_characteristics, n_factors))

In [75]:
# Regularised variant: alternate between the ridge-penalised factor solve and
# the ridge-penalised gamma solve for a fixed number of sweeps.
n_factors = 5
n_iterations = 100
n_characteristics = data[0].shape[1]  # derived from the data, not hard-coded

# Fixed seed so that a fresh "Restart & Run All" reproduces the same result.
rng = np.random.default_rng(0)
gamma_reg = rng.random((n_characteristics, n_factors))

lambda1 = 0.1  # penalty used in the factor solve
lambda2 = 0.1  # penalty used in the gamma solve

for _ in range(n_iterations):
    # f_list_reg[t] is computed from data[t] and ret[t + 1]
    f_list_reg = [
        solve_f_reg(ret, data, gamma_reg, t + 1, lambda1)
        for t in range(len(data) - 1)
    ]
    gamma_reg = solve_gamma_reg(ret, data, f_list_reg, lambda2)

In [76]:
def total_R_squared(ret, data, gamma, f_list):
    """Return ``1 - sum((r - Z gamma f)**2) / sum(r**2)`` pooled over all periods.

    For every ``t`` in ``range(len(f_list))`` the fitted values are
    ``data[t] @ gamma @ f_list[t]`` and they are compared with ``ret[t + 1]``.

    Parameters
    ----------
    ret : sequence of 1-D array-likes; ``ret[t + 1]`` is paired with ``data[t]``.
    data : sequence of 2-D array-likes (DataFrames or arrays).
    gamma : 2-D array mapping the columns of ``data[t]`` to factors.
    f_list : list of 1-D factor vectors, one per period.

    Every row of every period is used, so periods may have different row
    counts (the row count is not taken from ``ret[0]``).
    """
    squared_residuals = 0.0
    squared_returns = 0.0
    for t, f_t in enumerate(f_list):
        realised = np.asarray(ret[t + 1], dtype=float)
        fitted = np.asarray(data[t]) @ gamma @ f_t
        squared_residuals += np.sum((realised - fitted) ** 2)
        squared_returns += np.sum(realised ** 2)

    return 1 - squared_residuals / squared_returns

In [77]:
# Total R^2 obtained with the unregularised gamma and factor list.
print(total_R_squared(ret=ret, data=data, gamma=gamma, f_list=f_list))

0.10402833913181775


In [78]:
# Total R^2 obtained with the regularised gamma and factor list.
print(total_R_squared(ret=ret, data=data, gamma=gamma_reg, f_list=f_list_reg))

0.10355905078649752
