In [4]:
%load_ext autoreload
%autoreload 2

import numpy as np
from utils.data_helper import *
from utils.data import *
from utils.performance import *
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso, Ridge
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from strategy_v4.Data.data import DataLayer

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
start_date = datetime(2017,1,1)
end_date = datetime(2024,1,29)

In [19]:
data = DataLayer(start_date, end_date)
data.load()
data.process()
data.upload()
df = data.df

[32;20m2025-01-30 00:53:56,832 - Data Layer - INFO - start_date: 2017-01-01[0m
[32;20m2025-01-30 00:53:56,833 - Data Layer - INFO - end_date: 2024-01-29[0m


# Try to use price to generate more non-linear combination of features

In [None]:
assets = list(df['Stock'].unique())
pred_res = []
df['label'] = df['Return5d'].shift(-5)

with tqdm(total=len(assets)) as pbar:
    for asset in assets:
        df_ = df[df['Stock'] == asset].drop(columns=['Stock']).set_index('Date')
        df_ = df_.iloc[60:]
        df_ = df_.dropna()

        features = [x for x in df_.columns if x != 'label']
        X, y = df_[features], df_['label']
        X = StandardScaler().fit_transform(X)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

        model = Lasso(alpha=0.05)        
        model = model.fit(X_train, y_train)        

        y_pred_in = model.predict(X_train)
        y_pred_out = model.predict(X_test)
        
        y_pred_out = pd.DataFrame({'actual': y_test, 'pred': y_pred_out})
        y_pred_out['type'] = 'out-sample'

        y_pred_in = pd.DataFrame({'actual': y_train, 'pred': y_pred_in})
        y_pred_in['type'] = 'in-sample'

        res = pd.concat([y_pred_out, y_pred_in])
        res['Stock'] = asset
        pred_res.append(res)                
        pbar.update(1)        

 28%|██▊       | 140/503 [00:57<02:45,  2.20it/s]

In [45]:
def eval_metrics(x):
    e = {}
    e['r2'] = r2_score(x['actual'], x['pred'])
    e['mse'] = mean_squared_error(x['actual'], x['pred'])
    e['mae'] = mean_absolute_error(x['actual'], x['pred'])

    return pd.Series(e)

df_pred = pd.concat(pred_res)
df_pred = df_pred.groupby('type').apply(eval_metrics)
df_pred

Unnamed: 0_level_0,r2,mse,mae
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
in-sample,0.042307,0.843983,0.622948
out-sample,0.012569,0.881048,0.629444
