In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
import os

In [2]:
# Directory holding one CSV per month. The path is relative to the working
# directory (the repository root), matching how the macro data file is read,
# so the notebook does not depend on one user's home directory.
folder_path = os.path.join("Data", "monthly_data")
# os.listdir returns entries in arbitrary order; sort for a reproducible listing.
files = sorted(os.listdir(folder_path))

In [3]:
# Load every monthly CSV into its own DataFrame and tag each frame with the
# name of the file it came from, so the list can be ordered by file name.
datasets = []

for file in files:
    # Hidden entries (e.g. macOS '.DS_Store', checkpoint folders) are not data files.
    if file.startswith('.'):
        continue
    df = pd.read_csv(os.path.join(folder_path, file), encoding='utf-8')
    df['name'] = file
    datasets.append(df)

# Every row of a frame carries the same file name, so the first row is enough
# to order the frames. It is taken by position (.iloc) rather than by the index
# label 0, which only works while the index happens to contain that label.
datasets.sort(key=lambda frame: frame['name'].iloc[0])

In [4]:
# Macro series, restricted to rows whose 'yyyymm' value is above 199000
# (presumably January 1990 onwards, given the column name).
macro = (
    pd.read_csv('Data/macro_data_amit_goyal.csv', encoding='utf-8')
    .loc[lambda frame: frame['yyyymm'] > 199000]
)

In [5]:
def standardize(df):
    """Center each column of ``df`` on its mean and scale it by its standard deviation.

    The mean and standard deviation are the pandas column-wise defaults.
    A constant column yields NaN (0 / 0) rather than raising.
    """
    centered = df - df.mean()
    scale = df.std()
    return centered / scale

In [6]:
def fill_missing(df):
    """Return a copy of ``df`` in which every missing value is replaced by 0."""
    filled = df.fillna(value=0)
    return filled

In [7]:
# Build, for every month, the characteristic matrix (data) and the vector of
# excess returns (ret) for the N largest stocks by market capitalisation.
#
# The monthly frames in `datasets` are NOT modified: every step returns a new
# frame, so this cell can be re-run without first reloading the CSV files.
data = []
ret = []

N = 100

for i, monthly_raw in enumerate(datasets):

    monthly = (
        monthly_raw
        # Market capitalisation = shares outstanding * price; used only for ranking.
        .assign(mcap=monthly_raw['SHROUT'] * monthly_raw['prc'])
        # Identifier / bookkeeping columns that must not enter the characteristics.
        .drop(columns=['permno', 'DATE', 'Unnamed: 0', 'mve0', 'prc', 'SHROUT', 'sic2', 'name'])
        # Keep only rows that have at least 60 non-missing values.
        .dropna(thresh=60, axis=0)
    )
    monthly = monthly[monthly['RET'] > -1]
    monthly = (
        monthly
        .sort_values(by=['mcap'], ascending=False)
        .drop(columns=['mcap'])
        .head(N)
    )
    # Excess return: raw return minus the i-th risk-free rate in `macro`.
    # NOTE(review): this pairs the i-th monthly file with the i-th macro row by
    # position only -- confirm both are in the same chronological order.
    ret.append(monthly['RET'] - macro['Rfree'].values[i])
    characteristics = monthly.drop(columns=['RET'])
    # Standardise column-wise, then replace remaining gaps (missing inputs or
    # constant columns, which standardise to NaN) with 0.
    data.append(fill_missing(standardize(characteristics)))

T = len(data)

In [8]:
def solve_f(ret, data, gamma, idx):
    """Least-squares factor estimate for a single period.

    Solves (gamma' X' X gamma) f = gamma' X' r, where X is ``data[idx-1]``
    and r is ``ret[idx]``. Only one period is solved per call; the caller is
    responsible for collecting the per-period results into a list.
    """
    X_prev = data[idx-1].values
    lhs = gamma.T@X_prev.T@X_prev@gamma
    rhs = gamma.T@X_prev.T@ret[idx].values
    return np.linalg.solve(lhs, rhs)

In [9]:
def solve_gamma(ret, data, f):
    """Least-squares estimate of the loading matrix gamma given the factors.

    Parameters
    ----------
    ret : sequence whose element ``i+1`` holds the returns paired with ``data[i]``.
    data : sequence of DataFrames (rows x characteristics), one per period.
    f : list of 1-D factor vectors; ``f[i]`` is paired with ``data[i]``.

    Returns
    -------
    ndarray of shape (number of characteristics, number of factors).

    The normal equations are built in vectorised form:
        A = sum_i kron(X_i' X_i, f_i f_i'),   B = sum_i kron(X_i' r_{i+1}, f_i),
    and ``A vec(gamma) = B`` is solved with gamma flattened row by row. The
    output shape comes from the inputs rather than a fixed number of
    characteristics.
    """
    n_chars = data[0].shape[1]
    n_factors = len(f[0])
    periods = range(len(data) - 1)

    A = sum(
        np.kron(data[i].values.T @ data[i].values, np.outer(f[i], f[i]))
        for i in periods
    )
    # kron(X, f').T @ r equals kron(X' r, f); the latter avoids building the
    # (rows x characteristics*factors) Kronecker product.
    B = sum(
        np.kron(data[i].values.T @ np.asarray(ret[i + 1]), np.asarray(f[i]).reshape(-1))
        for i in periods
    )
    vec_gamma = np.linalg.solve(A, B)
    return vec_gamma.reshape((n_chars, n_factors))

Parameter initialization

In [10]:
# x[t]: characteristics at t times next-period excess returns, divided by the
# number of stocks in the first period.
x = []
for t in range(len(ret)-1):
    x.append(data[t].values.T@ret[t+1].values/len(ret[0]))

# Sum of outer products x_t x_t'; symmetric by construction.
x_cov = np.sum([np.outer(x_t, x_t) for x_t in x], axis = 0)
# eigh is the solver for symmetric matrices: it guarantees real eigenvalues and
# orthonormal eigenvectors, whereas the general-purpose eig may return complex
# output because of rounding.
eigval_x, eigvec_x = np.linalg.eigh(x_cov)

# Order eigenvectors by decreasing eigenvalue magnitude and keep the first k
# as the initial loading matrix.
idx = np.argsort(np.abs(eigval_x))[::-1]
sort_eigvec_x = eigvec_x[:,idx]
k = 5
# Independent copies so the three estimation runs never share a buffer.
gamma = sort_eigvec_x[:,:k].copy()
gamma_reg = sort_eigvec_x[:,:k].copy()
gamma_reg_w = sort_eigvec_x[:,:k].copy()

Regression without weights or regularization

In [11]:
# Alternating least squares: with gamma fixed solve every period's factors,
# then with the factors fixed solve gamma; repeat until the largest absolute
# change in any factor or loading is at most 1e-3.
first = False  # set to True once a previous factor estimate exists to compare against
while True:

    temp = []
    f_list_new = []

    for i in range(len(data)-1):
        f = solve_f(ret, data, gamma, i+1)
        f_list_new.append(f)
        if first:
            # Absolute change: a large negative move must count as much as a
            # positive one, as it already does for gamma below.
            f_change = np.abs(f-f_list[i])
            temp.append(np.max(f_change))
    first = True
    f_list = f_list_new.copy()

    gamma_new = solve_gamma(ret, data, f_list)
    gamma_change = np.abs(gamma_new-gamma)
    temp.append(np.max(gamma_change))
    gamma = gamma_new.copy()
    if (max(temp)<=1e-3):
        break

In [15]:
#for i in range(1000):
#    f_list = []
#    for i in range(len(data)-1):
#        f_list.append(solve_f(ret, data, gamma, i+1))
#    gamma = solve_gamma(ret, data, f_list)
    

Regression with regularization, no weights

In [16]:
def solve_f_reg(ret, data, gamma, idx, lambda_):
    """Ridge-regularised factor estimate for a single period.

    Solves (gamma' X' X gamma + lambda_ * I) f = gamma' X' r, where X is
    ``data[idx-1]`` and r is ``ret[idx]``. One period per call; the caller
    gathers the results into a list.
    """
    X_prev = data[idx-1].values
    ridge = lambda_*np.eye(gamma.shape[1])
    lhs = gamma.T@X_prev.T@X_prev@gamma + ridge
    rhs = gamma.T@X_prev.T@ret[idx].values
    return np.linalg.solve(lhs, rhs)

In [17]:
def solve_gamma_reg(ret, data, f, lambda_):
    """Ridge-regularised estimate of the loading matrix gamma given the factors.

    Parameters
    ----------
    ret : sequence whose element ``i+1`` holds the returns paired with ``data[i]``.
    data : sequence of DataFrames (rows x characteristics), one per period.
    f : list of 1-D factor vectors; ``f[i]`` is paired with ``data[i]``.
    lambda_ : ridge penalty added to the diagonal of the normal-equation matrix.

    Returns
    -------
    ndarray of shape (number of characteristics, number of factors).

    All dimensions are derived from ``data`` and ``f``; the function does not
    read any module-level variable.
    """
    n_chars = data[0].shape[1]
    n_factors = len(f[0])
    periods = range(len(data) - 1)

    A = sum(
        np.kron(data[i].values.T @ data[i].values, np.outer(f[i], f[i]))
        for i in periods
    )
    A = A + lambda_ * np.eye(n_chars * n_factors)
    # kron(X, f').T @ r equals kron(X' r, f); the latter avoids building the
    # (rows x characteristics*factors) Kronecker product.
    B = sum(
        np.kron(data[i].values.T @ np.asarray(ret[i + 1]), np.asarray(f[i]).reshape(-1))
        for i in periods
    )
    vec_gamma = np.linalg.solve(A, B)
    return vec_gamma.reshape((n_chars, n_factors))

In [18]:
# Alternating ridge regressions: lambda1 penalises the factor solve, lambda2
# the loading solve. Iterate until the largest absolute change in any factor
# or loading is at most 1e-3.
lambda1 = 0.1
lambda2 = 0.1
first = False  # set to True once a previous factor estimate exists to compare against
while True:

    temp = []
    f_list_new = []

    for i in range(len(data)-1):
        f = solve_f_reg(ret, data, gamma_reg, i+1, lambda1)
        f_list_new.append(f)
        if first:
            # Absolute change: a large negative move must count as much as a
            # positive one, as it already does for gamma below.
            f_change = np.abs(f-f_list_reg[i])
            temp.append(np.max(f_change))
    first = True
    f_list_reg = f_list_new.copy()

    gamma_new = solve_gamma_reg(ret, data, f_list_reg, lambda2)
    gamma_change = np.abs(gamma_new-gamma_reg)
    temp.append(np.max(gamma_change))
    gamma_reg = gamma_new.copy()
    if (max(temp)<=1e-3):
        break

In [19]:
#for i in range(100):
#    f_list_reg = []
#    for i in range(len(data)-1):
#        f_list_reg.append(solve_f_reg(ret, data, gamma_reg, i+1, lambda1))
#    gamma_reg = solve_gamma_reg(ret, data, f_list_reg, lambda2)

Regression with regularization and weights

In [20]:
# One weighting matrix per estimation period. Every entry is the same N x N
# identity object, i.e. all observations are weighted equally.
W = np.identity(N)

W_list = [W for _ in range(len(data) - 1)]

In [21]:
def solve_f_reg_w(ret, data, gamma, idx,lambda_, W ):
    """Weighted, ridge-regularised factor estimate for a single period.

    Solves (gamma' X' W X gamma + lambda_ * I) f = gamma' X' W r, where X is
    ``data[idx-1]``, r is ``ret[idx]`` and W is the weighting matrix for that
    period. One period per call; the caller gathers the results into a list.
    """
    X_prev = data[idx-1].values
    ridge = lambda_*np.eye(gamma.shape[1])
    lhs = gamma.T@X_prev.T@W@X_prev@gamma + ridge
    rhs = gamma.T@X_prev.T@W@ret[idx].values
    return np.linalg.solve(lhs, rhs)

In [22]:
def solve_gamma_reg_w(ret, data, f,lambda_, W):
    """Weighted, ridge-regularised estimate of the loading matrix gamma.

    Parameters
    ----------
    ret : sequence whose element ``i+1`` holds the returns paired with ``data[i]``.
    data : sequence of DataFrames (rows x characteristics), one per period.
    f : list of 1-D factor vectors; ``f[i]`` is paired with ``data[i]``.
    lambda_ : ridge penalty added to the diagonal of the normal-equation matrix.
    W : sequence of weighting matrices; ``W[i]`` is paired with ``data[i]``.

    Returns
    -------
    ndarray of shape (number of characteristics, number of factors).

    The normal equations are
        A = sum_i kron(X_i' W_i X_i, f_i f_i') + lambda_ * I,
        B = sum_i kron(X_i' W_i r_{i+1}, f_i).
    W_i enters B directly, so the result is correct for any weighting matrix.
    An element-wise square root of W_i would only be a valid matrix square
    root when W_i is diagonal. All dimensions are derived from ``data`` and
    ``f``; no module-level variable is read.
    """
    n_chars = data[0].shape[1]
    n_factors = len(f[0])
    periods = range(len(data) - 1)

    A = sum(
        np.kron(data[i].values.T @ W[i] @ data[i].values, np.outer(f[i], f[i]))
        for i in periods
    )
    A = A + lambda_ * np.eye(n_chars * n_factors)

    B = sum(
        np.kron(data[i].values.T @ W[i] @ np.asarray(ret[i + 1]), np.asarray(f[i]).reshape(-1))
        for i in periods
    )

    vec_gamma = np.linalg.solve(A, B)
    return vec_gamma.reshape((n_chars, n_factors))

In [23]:
# Alternating weighted ridge regressions: lambda1 penalises the factor solve,
# lambda2 the loading solve. Iterate until the largest absolute change in any
# factor or loading is at most 1e-3.
lambda1 = 0.1
lambda2 = 0.1
first = False  # set to True once a previous factor estimate exists to compare against
while True:

    temp = []
    f_list_new = []

    for i in range(len(data)-1):
        f = solve_f_reg_w(ret, data, gamma_reg_w, i+1, lambda1, W_list[i])
        f_list_new.append(f)
        if first:
            # Compare with this run's own previous factors (f_list_reg_w), not
            # with the factors of the unweighted regularised run, and use the
            # absolute change so negative moves count too.
            f_change = np.abs(f-f_list_reg_w[i])
            temp.append(np.max(f_change))
    first = True
    f_list_reg_w = f_list_new.copy()

    gamma_new = solve_gamma_reg_w(ret, data, f_list_reg_w, lambda2, W_list)
    gamma_change = np.abs(gamma_new-gamma_reg_w)
    temp.append(np.max(gamma_change))
    gamma_reg_w = gamma_new.copy()
    if (max(temp)<=1e-3):
        break

Evaluation metrics

In [24]:
def total_R_squared(ret, data, gamma, f_list):
    """Total R^2 of the fitted factor model.

    Computes 1 - SSE / SST over all periods, where the fitted value for period
    t+1 is ``data[t] @ gamma @ f_list[t]``, the realised value is ``ret[t+1]``,
    and SST is the sum of squared realised returns (no demeaning).

    Parameters
    ----------
    ret : sequence whose element ``t+1`` holds the returns paired with ``data[t]``.
    data : sequence of DataFrames (rows x characteristics), one per period.
    gamma : loading matrix (characteristics x factors).
    f_list : per-period factor vectors; ``f_list[t]`` is paired with ``data[t]``.

    The number of periods is taken from ``data``, so the function does not
    depend on any module-level variable.
    """
    squared_residuals = 0.0
    squared_returns = 0.0
    for t in range(len(data) - 1):
        realised = np.asarray(ret[t + 1])
        fitted = data[t].values @ gamma @ f_list[t]
        squared_residuals += np.sum((realised - fitted) ** 2)
        squared_returns += np.sum(realised ** 2)

    return 1 - squared_residuals / squared_returns

In [25]:
def pred_R_squared(ret, data, gamma, f_list):
    """Predictive R^2: like the total R^2, but using the average factor.

    Every period's factor realisation is replaced by the mean of ``f_list``
    over periods, so the fitted value for period t+1 is
    ``data[t] @ gamma @ mean(f_list)``. Returns 1 - SSE / SST, where SST is
    the sum of squared realised returns (no demeaning).

    Parameters
    ----------
    ret : sequence whose element ``t+1`` holds the returns paired with ``data[t]``.
    data : sequence of DataFrames (rows x characteristics), one per period.
    gamma : loading matrix (characteristics x factors).
    f_list : per-period factor vectors.

    The number of periods is taken from ``data``, so the function does not
    depend on any module-level variable.
    """
    lambda_t = np.mean(np.array(f_list), axis = 0)
    squared_residuals = 0.0
    squared_returns = 0.0
    for t in range(len(data) - 1):
        realised = np.asarray(ret[t + 1])
        predicted = data[t].values @ gamma @ lambda_t
        squared_residuals += np.sum((realised - predicted) ** 2)
        squared_returns += np.sum(realised ** 2)

    return 1 - squared_residuals / squared_returns

In [26]:
# Total R^2 of the fit without weights or regularization.
print(total_R_squared(ret, data, gamma, f_list))

0.1052400899951863


In [27]:
# Total R^2 of the regularized, unweighted fit.
print(total_R_squared(ret, data, gamma_reg, f_list_reg))

0.10418398722060174


In [28]:
# Total R^2 of the regularized, weighted fit.
print(total_R_squared(ret, data, gamma_reg_w, f_list_reg_w))

0.10418398722060251


In [29]:
# Predictive R^2 of the fit without weights or regularization.
print(pred_R_squared(ret, data, gamma, f_list))

-0.0019198165602634099


In [30]:
# Predictive R^2 of the regularized, unweighted fit.
print(pred_R_squared(ret, data, gamma_reg, f_list_reg))

-0.0009789326297886536


In [31]:
# Predictive R^2 of the regularized, weighted fit.
print(pred_R_squared(ret, data, gamma_reg_w,f_list_reg_w))

-0.0009789326297888756


## Kernel regressions:

In [30]:
def kernel(x,xt, tk=0, alpha=1, l=1, gamma=1):
    """Evaluate a kernel between two vectors or two sets of row vectors.

    Parameters
    ----------
    x, xt : array-like. Either two 1-D vectors of equal length, or arrays
        whose rows are the points to compare (shapes (n, d) and (m, d)).
    tk : kernel type.
        0 -- linear:               x @ xt.T
        1 -- Gaussian:             exp(-d2 / (2 l^2))
        2 -- rational quadratic:   (1 + d2 / (2 alpha l^2)) ** (-alpha)
        3 -- Gaussian exponential: exp(-d2 ** (gamma / 2) / l ** gamma)
        where d2 is the squared Euclidean distance between two points.
    alpha, l, gamma : kernel hyper-parameters used in the formulas above.

    Returns
    -------
    For two 1-D vectors, a scalar. Otherwise an (n, m) array whose entry
    (i, j) is the kernel between row i of ``x`` and row j of ``xt`` (a 1-D
    input counts as a single row), so every kernel type yields a Gram matrix
    of the same shape as the linear one.

    Raises
    ------
    ValueError if ``tk`` is not one of 0, 1, 2, 3.
    """
    x = np.asarray(x)
    xt = np.asarray(xt)

    #Linear: needs no distances, so inputs may have different numbers of rows
    if tk==0:
        return x@xt.T

    if tk not in (1, 2, 3):
        raise ValueError(f"unknown kernel type tk={tk!r}; expected 0, 1, 2 or 3")

    if x.ndim <= 1 and xt.ndim <= 1:
        # Two single points: plain squared Euclidean distance.
        norm = np.linalg.norm(x-xt)**2
    else:
        # Pairwise squared distances via |a|^2 + |b|^2 - 2 a.b; rounding can
        # push values slightly below zero, hence the clip.
        rows = np.atleast_2d(x)
        cols = np.atleast_2d(xt)
        norm = (
            np.sum(rows**2, axis=1)[:, None]
            + np.sum(cols**2, axis=1)[None, :]
            - 2.0*(rows@cols.T)
        )
        norm = np.maximum(norm, 0.0)

    #Gaussian
    if tk==1:
        return np.exp(-norm/(2*l**2))

    #Rational Quadratic
    if tk==2:
        return (1+norm/(2*alpha*l**2))**(-alpha)

    #Gaussian Exponential (tk==3)
    return np.exp(-norm**(gamma/2)/(l**gamma))

In [None]:
def solve_v_kernel (ret, lambda1, data, f):
    """Solve the regularised kernel system (Q + lambda1 * I) v = R.

    Parameters
    ----------
    ret : sequence whose element ``t+1`` holds the returns paired with ``data[t]``.
    lambda1 : ridge penalty added to the diagonal of Q.
    data : sequence of DataFrames (rows x characteristics), one per period.
    f : list of per-period factor vectors; ``f[t]`` is paired with ``data[t]``.

    Returns
    -------
    1-D ndarray v with one entry per (period, row) pair, periods stacked in order.

    Q is a block matrix: block (t, s) is ``(f[t] . f[s]) * kernel(data[t], data[s])``
    using the default (linear) kernel. R stacks ``ret[t+1]`` for the same
    periods, so it lines up with the rows of Q.

    NOTE(review): assumes every period has the same number of rows -- confirm.
    The dense system has (periods * rows)^2 entries, so memory use grows
    quickly with the sample size.
    """
    #f is a list
    n_periods = len(data) - 1
    n_rows = data[0].shape[0]

    # Inner products between the factor vectors of every pair of periods.
    factors = np.array(f)[:n_periods]
    A = factors @ factors.T

    #compute Q matrix
    Q = np.zeros((n_periods*n_rows, n_periods*n_rows))
    for t in range(n_periods):
        for s in range(n_periods):
            Q[t*n_rows:(t+1)*n_rows, s*n_rows:(s+1)*n_rows] = (
                A[t,s]*kernel(data[t].values, data[s].values)
            )

    R = np.concatenate([np.asarray(ret[t+1]) for t in range(n_periods)])
    return np.linalg.solve(Q + lambda1*np.eye(Q.shape[0]), R)

In [43]:
T=len(data)
# Inner products between the factor vectors of every pair of periods.
A=np.array(f_list)@np.array(f_list).T
Q=np.zeros([(T-1)*N, (T-1)*N])

# Block (t, s) of Q is the factor inner product A[t, s] times the (default,
# linear) kernel between the characteristics of periods t and s. The block size
# is N, the number of stocks kept per period, so it stays consistent with the
# shape of Q if N is changed.
for t in range(0,T-1):
    for s in range(0,T-1):
        Q[t*N:(t+1)*N,s*N:(s+1)*N]=A[t,s]*kernel(data[t].values, data[s].values)

In [44]:
# Identity matrix with the same dimension as Q.
Omega=np.eye((T-1)*N)

In [45]:
lambda1=1
# Stack the next-period returns ret[1], ..., ret[T-1]: each data[t] is paired
# with ret[t+1], so ret[0] has no matching characteristics. R then has
# (T-1)*N entries and lines up with the rows of Q.
R=np.array(ret[1:]).flatten()

: 

In [42]:
# Solve the regularized linear system (Q + lambda1 * Omega) v = R.
np.linalg.solve(Q+lambda1*Omega, R)

(37200,)

In [31]:
# Sanity check: shape of the default (tk=0, linear) kernel of one period's
# characteristic matrix against itself.
kernel(np.array(data[1].values), np.array(data[1].values)).shape

(100, 100)

In [27]:
# Sanity check: shape of one period's characteristic matrix.
np.array(data[1].values).shape

(100, 94)

In [29]:
# Sanity check: shape of one period's characteristic matrix.
np.array(data[1].values).shape

(100, 94)