In [16]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
import os
import importlib
import download_clean_data as dc
import ipca
import metrics
import kernel_regression as kr
# Re-import the project modules so that edits made to their source files are
# picked up without restarting the kernel. The last call is the cell's final
# expression, so its return value (the reloaded module) is what gets displayed.
importlib.reload(dc) 
importlib.reload(ipca)
importlib.reload(metrics)
importlib.reload(kr)

<module 'kernel_regression' from '/Users/matteoferrazzi/Documents/GitHub/ML-project-2/kernel_regression.py'>

In [2]:
# Location of the monthly data files handed to dc.download_clean_data.
# The path can be overridden with the ML_PROJECT_DATA_DIR environment variable
# so the notebook also runs on machines other than the original author's;
# the previously hard-coded path remains the default.
folder_path = os.environ.get(
    "ML_PROJECT_DATA_DIR",
    "/Users/matteoferrazzi/Documents/GitHub/ML-project-2/Data/monthly_data",
)

In [3]:
# Arguments forwarded to dc.download_clean_data. N is also reused further down
# as the size of the identity matrices W and Omega2.
# NOTE(review): `date` looks like a YYYYMMDD-style integer cutoff — confirm
# against download_clean_data.
N = 100
date = 20150000
# `data` and `ret` are indexed by period below (data[t].values, ret[t+1].values).
data, ret = dc.download_clean_data(folder_path, date, N)

In [4]:
# Initial guess for the IPCA loading matrix.
# For every period t, project next-period returns on the current characteristics:
#     x[t] = data[t]' @ ret[t+1] / len(ret[0])
x = []
for t in range(len(ret)-1):
    x.append(data[t].values.T@ret[t+1].values/len(ret[0]))

# Sum of the outer products x_t x_t'. The result is symmetric by construction,
# so the symmetric solver `eigh` is used: it returns real eigenvalues and
# orthonormal eigenvectors, whereas the general `eig` may return complex output
# that then has to be truncated with `.real`.
x_cov = np.sum([np.outer(x_t, x_t) for x_t in x], axis = 0)
eigval_x, eigvec_x = np.linalg.eigh(x_cov)

# Order the eigenvectors by decreasing eigenvalue magnitude and keep the first k.
idx = np.argsort(np.abs(eigval_x))[::-1]
sort_eigvec_x = eigvec_x[:,idx]
k = 5
gamma_first = sort_eigvec_x[:,:k]

In [5]:
# --- Plain IPCA on the full sample, started from the eigenvector guess ---
max_iter = 100
gamma, f_list = ipca.ipca(data, ret, gamma_first.copy(), max_iter)

# --- Regularised / weighted IPCA with identity weights ---
# The same identity matrix object is repeated once per period transition
# (len(data) - 1 entries).
lambda1 = lambda2 = 0.1
W = np.eye(N)
W_list = [W for _ in range(len(data) - 1)]
gamma_reg_w, f_list_reg_w = ipca.ipca_reg_w(
    data, ret, gamma_first.copy(), max_iter, lambda1, lambda2, W_list
)

In [31]:
# Full-sample scores of the plain IPCA fit: metrics.total_R_squared followed by
# metrics.pred_R_squared, both evaluated with (gamma, f_list).
print(metrics.total_R_squared(ret, data, gamma, f_list),metrics.pred_R_squared(ret, data, gamma, f_list))

0.15335737000068583 -0.004556714482517377


In [32]:
# Same two scores as for the plain fit, now for the regularised / weighted
# IPCA estimates (gamma_reg_w, f_list_reg_w).
print(metrics.total_R_squared(ret, data, gamma_reg_w, f_list_reg_w),metrics.pred_R_squared(ret, data, gamma_reg_w, f_list_reg_w))

0.15167256959390363 -0.0028241664513473097


In [6]:
def split_dataset(x,y, trsh):
    """Cut two aligned sequences into a leading and a trailing part.

    The cut position is ``floor(len(y) * trsh)``; elements before it form the
    training part and the remaining elements the test part. No shuffling is
    performed, so the original order is kept.

    Parameters
    ----------
    x, y : sliceable sequences
        Inputs to split; the cut position is derived from ``len(y)`` only.
    trsh : float
        Fraction of ``y`` assigned to the training part.

    Returns
    -------
    tuple
        ``(x_train, y_train, x_test, y_test)``.
    """
    cut = int(np.floor(len(y) * trsh))
    return x[:cut], y[:cut], x[cut:], y[cut:]

In [7]:
def cross_val_IPCA(y,x, trsh, gamma_first, max_iter):
    """Fit plain IPCA on the leading part of the sample and score it on the rest.

    The data are split with ``split_dataset(x, y, trsh)``. ``ipca.ipca`` is run
    on the training part (starting from a copy of ``gamma_first``) and only the
    fitted ``gamma`` is kept. For each test step ``1 .. len(xx_test) - 1`` the
    factors are re-solved with ``ipca.solve_f`` on the test part, and the
    resulting list is passed to the two metric functions.

    Returns
    -------
    (dict, dict)
        ``total_R2_dict`` and ``pred_R2_dict``, each with the single key
        ``'IPCA'``.
    """
    xx_train, yy_train, xx_test, yy_test = split_dataset(x, y, trsh)

    # Only the loadings are reused out of sample; the in-sample factors are dropped.
    gamma, _ = ipca.ipca(xx_train, yy_train, gamma_first.copy(), max_iter)
    print('done')

    factors_test = [
        ipca.solve_f(yy_test, xx_test, gamma, step)
        for step in range(1, len(xx_test))
    ]

    total_R2_dict = {
        'IPCA': metrics.total_R_squared(yy_test, xx_test, gamma, factors_test)
    }
    pred_R2_dict = {
        'IPCA': metrics.pred_R_squared(yy_test, xx_test, gamma, factors_test)
    }
    return total_R2_dict, pred_R2_dict

In [8]:
def cross_val_IPCA_reg(y,x, trsh, lambda1_v, lambda2_v, gamma_first, max_iter):
    """Grid-evaluate regularised IPCA over every (lambda1, lambda2) pair.

    The sample is split once with ``split_dataset(x, y, trsh)``. For each pair
    of penalties, ``ipca.ipca_reg_w`` is fitted on the training part (from a
    copy of ``gamma_first``), the test-part factors are re-solved step by step
    with ``ipca.solve_f_reg_w``, and both metrics are stored.

    NOTE(review): ``W_list`` is not a parameter; it is read from the notebook's
    global namespace, both for the fit and (as ``W_list[step]``, indexed from 0)
    for the test-part factor solves. The function fails if that global is
    missing.

    Returns
    -------
    (dict, dict)
        ``total_R2_dict`` and ``pred_R2_dict`` keyed by
        ``('IPCA_reg', lambda1, lambda2)``.
    """
    xx_train, yy_train, xx_test, yy_test = split_dataset(x, y, trsh)
    n_test_steps = len(xx_test) - 1

    total_R2_dict, pred_R2_dict = {}, {}

    for lambda1 in lambda1_v:
        for lambda2 in lambda2_v:
            # Keep only the fitted loadings; in-sample factors are not needed.
            gamma_fit, _ = ipca.ipca_reg_w(
                xx_train, yy_train, gamma_first.copy(), max_iter, lambda1, lambda2, W_list
            )

            factors_test = [
                ipca.solve_f_reg_w(yy_test, xx_test, gamma_fit, step + 1, lambda1, W_list[step])
                for step in range(n_test_steps)
            ]

            key = ('IPCA_reg', lambda1, lambda2)
            total_R2_dict[key] = metrics.total_R_squared(yy_test, xx_test, gamma_fit, factors_test)
            pred_R2_dict[key] = metrics.pred_R_squared(yy_test, xx_test, gamma_fit, factors_test)

    return total_R2_dict, pred_R2_dict

In [9]:
# Inputs for the hold-out evaluations. Note that `x` is rebound here to the
# characteristics, replacing the list built for the initial Gamma guess.
y, x = ret, data
trsh = 0.8

# Single-point penalty grids.
lambda1_v = [0.1]
lambda2_v = [0.1]

cross_val_IPCA_reg(y, x, trsh, lambda1_v, lambda2_v, gamma_first, max_iter)

({('IPCA_reg', 0.1, 0.1): 0.024884449223020466},
 {('IPCA_reg', 0.1, 0.1): 0.0005109145409363247})

In [11]:
# Hold-out evaluation of plain IPCA: the first floor(0.9 * len(y)) periods are
# used for fitting, the remaining ones for scoring.
cross_val_IPCA(y,x, 0.9, gamma_first, max_iter)

done


({'IPCA': 0.029925434516756555}, {'IPCA': -0.21153091124492174})

In [10]:
# Flatten the per-period characteristic matrices into one 2-D design matrix:
# one row per (period, asset) pair, one column per characteristic.
# The row count is derived from len(data) and N and the column count is
# inferred (-1), instead of the previously hard-coded 72*100 x 94, so the cell
# keeps working when the sample length or number of characteristics changes.
data2 = np.array(data).reshape(len(data)*N, -1)

# Second and third arguments of kr.K_LR.
# NOTE(review): presumably kernel hyper-parameters — confirm in kernel_regression.py.
tk = 1
l = 1
K = kr.K_LR(data2, tk, l)

In [11]:
# N x N identity matrix, passed as the `Omega2` argument to the kr.* calls and
# to cross_val_gaussian further down.
Omega2=np.eye(N)

In [54]:
# Kernel regression on the full sample with identity Omega matrices:
# Omega1 has (len(data)-1)*N rows/columns, Omega2 has N. The literal 10 sits in
# the argument position that cross_val_gaussian fills with `max_iter`.
# NOTE(review): this call unpacks three return values, whereas
# cross_val_gaussian unpacks five (f_list, Q, v, g, G) from the same
# kr.kernel_regression — confirm which matches the current module, and whether
# the (v, Q) naming order here is the intended one.
Omega1=np.eye((len(data)-1)*N)
Omega2=np.eye(N)
f_list_kr, v_kr, Q_kr = kr.kernel_regression(data, ret, f_list.copy(), lambda1, lambda2, Omega1, Omega2, 10, N, K)

0
1
2
3
4
5
6
7
8
9


In [55]:
# Score of the kernel-regression fit as computed by metrics.total_R_squared_kr.
print(metrics.total_R_squared_kr(ret, v_kr, Q_kr))

0.14609966865070023


In [44]:
# Run kr.pivoted_chol on the kernel matrix K; B and m_hat are reused by the
# kernel_regression_LR call below.
# NOTE(review): m_hat is presumably the target rank of the factorisation — confirm.
m_hat = 10
L, B = kr.pivoted_chol(K, m_hat)

1
2
3
4
5
6
7
8
9


In [49]:
# Low-rank variant of the kernel regression (kr.kernel_regression_LR), fed with
# the kernel matrix K and the B factor from kr.pivoted_chol; starts from a copy
# of the IPCA factors f_list.
f_list_kr_LR, v_kr_LR = kr.kernel_regression_LR(data, K, B, ret, f_list.copy(), lambda1, lambda2, Omega2, 10, m_hat, N)

0
1
2
3
4
5
6
7
8
9


In [50]:
# Score of the low-rank kernel-regression fit as computed by
# metrics.total_R_squared_kr_LR.
print(metrics.total_R_squared_kr_LR(ret, B, K, v_kr_LR, f_list_kr_LR))

0.0017402896001241785


In [24]:
# Split characteristics and returns at floor(0.8 * len(ret)): leading part for
# training, trailing part for testing. These four names are not referenced by
# any later cell visible in this notebook.
x_train,y_train,x_test,y_test = split_dataset(data, ret, 0.8)

In [17]:
def cross_val_gaussian(y,x, trsh, lambda1_v, lambda2_v, alphas_v, N, f_list_input, Omega2, max_iter):
    """Hold-out evaluation of the kernel regression over a hyper-parameter grid.

    The sample is split once with ``split_dataset(x, y, trsh)``. For every
    ``alpha`` a kernel matrix is built with ``kr.K_LR`` for the training part
    and for the test part. For every ``(lambda1, lambda2)`` pair the model is
    fitted with ``kr.kernel_regression`` on the training part; the returned
    ``g`` and ``G`` are then used to compute ``g.T @ solve(G + lambda2*Omega2,
    yy_test[t+1])`` for each test step, ``kr.solve_v_kernel`` is run on the
    test part, and the score from ``metrics.total_R_squared_kr`` is stored.

    Parameters
    ----------
    y, x : sequences indexed by period
        Returns and characteristics; forwarded to ``split_dataset`` as
        ``(x, y)``.
    trsh : float
        Fraction of periods assigned to the training part.
    lambda1_v, lambda2_v, alphas_v : iterables
        Grids of the two penalties and of the third ``kr.K_LR`` argument.
    N : int
        Number of rows per period used when flattening the characteristics and
        sizing the identity Omega matrices.
    f_list_input : list
        Starting factors; a copy is handed to every ``kr.kernel_regression``
        call.
    Omega2 : array
        Matrix added (scaled by ``lambda2``) to ``G`` and passed through to
        ``kr.kernel_regression``.
    max_iter : int
        Iteration-count argument of ``kr.kernel_regression``.

    Returns
    -------
    dict
        Scores keyed by ``('Gaussian', lambda1, lambda2, alpha)``.
    """
    total_R2_dict = {}

    xx_train,yy_train,xx_test,yy_test = split_dataset(x,y, trsh)
    Omega_test = np.eye((len(xx_test)-1)*N)
    Omega_train = np.eye((len(xx_train)-1)*N)
    n_test_steps = len(xx_test)-1

    for alpha in alphas_v:

        # Flatten to one row per (period, asset). The column count is inferred
        # (-1) rather than hard-coded to 94 characteristics.
        data2_train = np.array(xx_train).reshape(len(xx_train)*N, -1)
        data2_test = np.array(xx_test).reshape(len(xx_test)*N, -1)
        K_train = kr.K_LR(data2_train, 1, alpha)
        K_test = kr.K_LR(data2_test, 1, alpha)

        for lambda1 in lambda1_v:
            for lambda2 in lambda2_v:
                # Only g and G are needed for the out-of-sample step.
                _, _, _, g, G = kr.kernel_regression(xx_train, yy_train, f_list_input.copy(), lambda1, lambda2, Omega_train, Omega2, max_iter, N, K_train)

                # The system matrix does not depend on t: build it once.
                system = G + lambda2*Omega2
                yy_pred = [
                    g.T@np.linalg.solve(system, yy_test[t+1])
                    for t in range(n_test_steps)
                ]

                vt, Qt = kr.solve_v_kernel(yy_test, lambda1, xx_test, yy_pred, N, Omega_test, K_test)

                total_R2_dict[('Gaussian', lambda1, lambda2, alpha)] = metrics.total_R_squared_kr(yy_test, Qt, vt)

    return total_R2_dict

In [18]:
# Single-point grid for the kernel parameter; the remaining arguments reuse
# objects defined in earlier cells.
alphas_v = [0.1]
cross_val_gaussian(
    y, x, 0.8, lambda1_v, lambda2_v, alphas_v, N,
    f_list_input=f_list, Omega2=Omega2, max_iter=10,
)

0
1
2
3
4
5
6
7
8
9
done1
done1
done1
done1
done1
done1
done1
done1
done1
done1
done1
done1
done1
done1
done


{('Gaussian', 0.1, 0.1, 0.1): 0.5639048057945424}

In [None]:
def cross_val_kernel_LR(y,x, trsh, lambda1_v, lambda2_v, alphas_v, m_hat):
    """Hold-out evaluation of the low-rank kernel regression (unfinished).

    Scaffolding only: the sample is split with ``split_dataset(x, y, trsh)``,
    and for every ``alpha`` the training kernel matrix (``kr.K_LR``) and its
    factorisation (``kr.pivoted_chol``) are computed. The fit/score step inside
    the penalty grid has not been written yet, so the function raises
    ``NotImplementedError`` as soon as that step is reached.

    Fixes relative to the earlier draft:
    * the factorisation is computed after ``K`` exists (it was previously
      called before ``K`` was assigned inside the function);
    * the training characteristics are flattened using their own shape rather
      than the hard-coded full-sample size ``72*100 x 94``;
    * the empty inner loop body, which made the definition a syntax error, now
      holds an explicit placeholder.

    Returns
    -------
    dict
        Scores keyed by hyper-parameters. It is only returned (empty) when one
        of the grids is empty, because the scoring step is missing.

    Raises
    ------
    NotImplementedError
        When the (missing) fit/score step would be executed.
    """
    total_R2_dict = {}

    xx_train,yy_train,xx_test,yy_test = split_dataset(x,y, trsh)

    for alpha in alphas_v:

        # Flatten to one row per (period, asset); the last axis is kept as columns.
        stacked = np.array(xx_train)
        data2 = stacked.reshape(-1, stacked.shape[-1])
        K = kr.K_LR(data2, 1, alpha)
        L, B = kr.pivoted_chol(K, m_hat)

        for lambda1 in lambda1_v:
            for lambda2 in lambda2_v:
                # TODO: fit kr.kernel_regression_LR on the training part and
                # store the test score in total_R2_dict.
                raise NotImplementedError(
                    "cross_val_kernel_LR: the fit/score step has not been implemented yet"
                )

    return total_R2_dict