In [1]:
# Enable IPython's autoreload extension in mode 2, so that imported modules
# are reloaded before each cell runs and edits to project code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

## M-day normalized mean

$ R_t^{(M)} = \frac{1}{\sqrt{M}} \sum_{k=1}^{M} R_{t+1-k} $

We first propose the following strategy:

$ S_{t+1} = \beta_{1} R_{t}^{(5)} + \beta_{2} R_{t-20}^{(230)} $

We start by fitting the coefficients $\beta_1$ and $\beta_2$ with various models.

In [2]:
import os

# Later cells read files through relative paths such as 'data/...', so the
# working directory has to be the parent of the `notebooks/` folder when the
# kernel is started from inside it.
# Only the last path component is compared: a substring test on the whole path
# would also fire when any ancestor directory merely contains 'notebooks', and
# would then climb one more level every time this cell is re-run.
if os.path.basename(os.getcwd()) == 'notebooks':
    os.chdir('..')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression

from src.metrics.custom_metric_QRT_22_6BhdSkn import metric, \
    transform_submission_to_ypred

from src.metrics.benchmark import get_benchmark

In [3]:
# 250 x 10 matrix of zeros; only columns 0 and 1 are populated in this cell.
A = np.zeros((250, 10))
A[:5, 0] = 1 / np.sqrt(5)      # 5-day return factor: equal weights on rows 0..4
A[20:, 1] = 1 / np.sqrt(230)   # momentum factor: equal weights on rows 20..249

In [4]:
# Display A; at this point only columns 0 and 1 carry non-zero weights.
A

array([[0.4472136 , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.4472136 , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.4472136 , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.06593805, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.06593805, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.06593805, 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [5]:
# Training inputs: the first CSV column is used as the row index and the
# column axis is labelled 'date'.
X_train = (
    pd.read_csv('data/X_train_YG7NZSq.csv', index_col=0)
    .rename_axis(columns='date')
)

In [6]:
# Training targets: the first CSV column is used as the row index and the
# column axis is labelled 'date'.
Y_train = (
    pd.read_csv('data/Y_train_wz11VM6.csv', index_col=0)
    .rename_axis(columns='date')
)

In [7]:
# Lagged design matrix: one row per (date, stocksID) pair and one column per
# lag, where column `timeLag = k` holds the value observed k rows (dates)
# before `date`. Rows containing any NaN -- in particular the dates that do
# not have 250 earlier observations -- are dropped.
#
# `axis=1` is passed by keyword: the positional form emits a FutureWarning on
# pandas 1.3+ and is rejected by pandas 2.x.
X_train_reshape = pd.concat(
    [X_train.T.shift(lag).stack(dropna=False) for lag in range(1, 251)],
    axis=1,
).dropna()
X_train_reshape.columns = pd.Index(range(1, 251), name='timeLag')

  X_train_reshape = pd\


In [8]:
# Boolean row selector for the stock whose stocksID is 0, used to eyeball the
# lag structure of a single series.
first_company_mask = X_train_reshape.index.get_level_values("stocksID") == 0

X_train_reshape.loc[first_company_mask]

Unnamed: 0_level_0,timeLag,1,2,3,4,5,6,7,8,9,10,...,241,242,243,244,245,246,247,248,249,250
date,stocksID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
250,0,0.000103,0.012387,0.011243,0.002595,-0.008509,-0.002711,0.008934,0.006571,-0.018546,-0.008353,...,0.009119,-0.008451,0.007120,-0.011745,0.009092,-0.005110,-0.016676,-0.010776,-0.013002,-0.018647
251,0,0.001128,0.000103,0.012387,0.011243,0.002595,-0.008509,-0.002711,0.008934,0.006571,-0.018546,...,0.002627,0.009119,-0.008451,0.007120,-0.011745,0.009092,-0.005110,-0.016676,-0.010776,-0.013002
252,0,-0.001046,0.001128,0.000103,0.012387,0.011243,0.002595,-0.008509,-0.002711,0.008934,0.006571,...,-0.016794,0.002627,0.009119,-0.008451,0.007120,-0.011745,0.009092,-0.005110,-0.016676,-0.010776
253,0,-0.007027,-0.001046,0.001128,0.000103,0.012387,0.011243,0.002595,-0.008509,-0.002711,0.008934,...,-0.008695,-0.016794,0.002627,0.009119,-0.008451,0.007120,-0.011745,0.009092,-0.005110,-0.016676
254,0,-0.009757,-0.007027,-0.001046,0.001128,0.000103,0.012387,0.011243,0.002595,-0.008509,-0.002711,...,0.000734,-0.008695,-0.016794,0.002627,0.009119,-0.008451,0.007120,-0.011745,0.009092,-0.005110
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
749,0,0.018142,0.009965,0.003532,-0.011716,-0.012525,0.005541,-0.006656,0.006609,-0.004703,-0.007820,...,-0.016788,-0.011269,0.004981,-0.016582,-0.015195,0.002668,-0.008713,0.003021,0.005091,-0.008325
750,0,-0.001236,0.018142,0.009965,0.003532,-0.011716,-0.012525,0.005541,-0.006656,0.006609,-0.004703,...,-0.006459,-0.016788,-0.011269,0.004981,-0.016582,-0.015195,0.002668,-0.008713,0.003021,0.005091
751,0,-0.002732,-0.001236,0.018142,0.009965,0.003532,-0.011716,-0.012525,0.005541,-0.006656,0.006609,...,-0.015513,-0.006459,-0.016788,-0.011269,0.004981,-0.016582,-0.015195,0.002668,-0.008713,0.003021
752,0,0.013074,-0.002732,-0.001236,0.018142,0.009965,0.003532,-0.011716,-0.012525,0.005541,-0.006656,...,-0.032283,-0.015513,-0.006459,-0.016788,-0.011269,0.004981,-0.016582,-0.015195,0.002668,-0.008713


In [9]:
# Step 1: Create a 250x10 matrix A with the two first columns representing the factors of interest

A = np.zeros((250, 10))

A[0:5, 0] = 1/np.sqrt(5) # 5-day return factor (rows 0..4, i.e. lags 1..5)
A[20:250, 1] = 1/np.sqrt(230) # momentum factor (rows 20..249, i.e. lags 21..250)

# Step 2: Fill the remaining columns of A with random orthonormal vectors that are orthogonal to the first two columns

# Seeded generator so that the random columns (and hence A) are reproducible across kernel restarts.
rng = np.random.default_rng(0)

# Projection onto the orthogonal complement of span(A[:, 0], A[:, 1]).
# The formula I - a0 a0^T - a1 a1^T is a valid projector because the two columns are unit-norm and have disjoint supports.
orthoProj = np.eye(250) - np.outer(A[:, 0], A[:, 0]) - np.outer(A[:, 1], A[:, 1])
A_remaining_columns = orthoProj @ rng.standard_normal((250, 8)) # random vectors orthogonal to the first two columns of A
A_remaining_columns = np.linalg.qr(A_remaining_columns)[0] # orthonormalize them: keep the Q factor of a QR decomposition
A[:, 2:] = A_remaining_columns

# Sanity check: all ten columns of A must be orthonormal.
np.testing.assert_allclose(A.T @ A, np.eye(10), atol=1e-10)

In [10]:
# Regressors: the lagged returns projected onto the first two columns of A.
features_df = X_train_reshape.dot(A[:, :2])
# Targets: Y_train transposed and stacked into a single Series.
target_df = Y_train.transpose().stack()

In [11]:
# Display the two-column feature table (column 0 and column 1 correspond to
# the first and second column of A).
features_df

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1
date,stocksID,Unnamed: 2_level_1,Unnamed: 3_level_1
250,0,0.007969,0.010945
250,1,0.000319,0.007865
250,2,0.009481,0.000716
250,3,0.008433,0.012034
250,4,0.010518,-0.002022
...,...,...,...
753,45,0.000138,-0.067422
753,46,-0.006623,-0.007351
753,47,-0.003211,0.003897
753,48,-0.000640,0.005506


In [12]:
# Recompute the stocksID == 0 row selector, this time from the feature table's
# index, and show that stock's factor values.
first_company_mask = features_df.index.get_level_values('stocksID') == 0

features_df.loc[first_company_mask]

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1
date,stocksID,Unnamed: 2_level_1,Unnamed: 3_level_1
250,0,0.007969,0.010945
251,0,0.012278,0.011471
252,0,0.010650,0.011916
253,0,0.002480,0.013004
254,0,-0.007423,0.014245
...,...,...,...
749,0,0.003308,-0.014903
750,0,0.008357,-0.014173
751,0,0.012375,-0.012450
752,0,0.016642,-0.013392


In [13]:
# NOTE: `first_company_mask` was built from features_df.index; reusing it on
# target_df is only meaningful if both objects share the same row order.
target_df[first_company_mask]

date  stocksID
250   0           0.001128
251   0          -0.001046
252   0          -0.007027
253   0          -0.009757
254   0          -0.005868
                    ...   
749   0          -0.001236
750   0          -0.002732
751   0           0.013074
752   0          -0.005843
753   0          -0.003823
Length: 504, dtype: float64

In [14]:
# Features and targets must be aligned row by row before fitting.
np.all(target_df.index == features_df.index)

True

## Machine learning models 

In [15]:
# Ordinary least squares with unconstrained coefficient signs.
# NOTE(review): `fit_intercept` is left at scikit-learn's default, so an
# intercept is estimated, but only `model.coef_` is copied into `beta` later --
# confirm that dropping the intercept is intended.
model = LinearRegression(positive= False)

In [16]:
# Regress the stacked targets on the two factor features.
model.fit(X=features_df, y=target_df)

LinearRegression()

In [18]:
# Coefficient vector of length 10, initialised to zero.
beta = np.zeros(shape=(10,))

In [19]:
# Write the fitted coefficients into the leading entries of beta; the
# remaining entries are left untouched.
beta[:len(model.coef_)] = model.coef_

beta

array([-0.00711245,  0.01444704,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ])

In [20]:
# Flat 1-D vector (despite the `df_` prefix this is an ndarray, not a
# DataFrame): the rows of A one after another, followed by the 10 entries of beta.
df_A_beta = np.vstack([A, beta.reshape((1, 10))]).ravel()

In [21]:
# Wrap the flat vector in a single-column DataFrame for display.
pd.DataFrame(df_A_beta)

Unnamed: 0,0
0,0.447214
1,0.000000
2,-0.102736
3,-0.060199
4,0.048090
...,...
2505,0.000000
2506,0.000000
2507,0.000000
2508,0.000000


In [22]:
# Turn the (A, beta) pair into predictions on the training inputs using the
# project helper.
# NOTE(review): the helper's exact contract lives in
# src.metrics.custom_metric_QRT_22_6BhdSkn and is not visible here -- confirm.
y_pred = transform_submission_to_ypred(A, beta, X_train, Y_train)

  x_test_reshape = pd.concat([x_test.shift(i+1).stack(dropna=False) for i in range(250)], 1).dropna()


In [24]:
# In-sample score of the fitted strategy under the project metric.
metric(Y_train, y_pred)

0.01897622481536137

In [27]:
# Run the project benchmark on the training data for comparison.
# NOTE: the recorded output below ends with a KeyboardInterrupt, i.e. this
# call was stopped manually before completing in the saved run.
get_benchmark(X_train, Y_train)

0 metric_train: 0.02331346933035977
1 metric_train: 0.033196962063374115
3 metric_train: 0.035302048607635673
7 metric_train: 0.039995449146626075


KeyboardInterrupt: 