In [1]:
import os, sys
dir2 = os.path.abspath('')
dir1 = os.path.dirname(dir2)

if not dir1 in sys.path:
    sys.path.append(dir1)

os.chdir('..')

In [54]:
import datetime
import pandas as pd
import numpy as np
import torch

from chronos import ChronosPipeline

from sklearn.metrics import mean_absolute_error as MAE
from sklearn.metrics import mean_absolute_percentage_error as MAPE
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

from src.data.load_data import pipeline_data
from src.models.ts2vec_src.ts2vec import TS2Vec

from src.experiments import (
    LagModelExperint, 
    SelfSupervisedExperint, 
    FoundationZeroShort, 
    ConstPredExperiment
)

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Config

In [3]:
col_agg_finctions = {'Open': 'first', 'High': 'max', 'Low': 'min', 'Close': 'last', 'Volume': 'sum'}

In [4]:
train_start, train_end = '2023-10-01', '2023-11-01'
test_start, test_end = '2023-11-01', '2023-11-07'

In [5]:
use_pct_changes_X = False
use_pct_changes_ts2v = False
use_pct_changes_labels = False

In [6]:
ts2vec_device = 1
ts2vec_out_dim = 128

n_shifts = 18

In [7]:
ticker_data_path = 'configs/best_stocks_nans_rate.yaml'
ticker_data_path = 'data/all_tickers.csv'

# DataLoading

In [8]:
df = pipeline_data(col_agg_finctions=col_agg_finctions)

In [22]:
emb_model = TS2Vec(input_dims=4, device=2, output_dims=ts2vec_out_dim)
lag_model_exp = SelfSupervisedExperint(
    model = LinearRegression(),
    emb_model = emb_model,
    train_start = train_start, 
    train_end = train_end, 
    test_start = test_start, 
    test_end = test_end,
    label_name = 'Close',
    use_pct_changes_data = False,
    use_pct_changes_labels = True,
)

results, preds = lag_model_exp.pipeline(df)
results

  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


0.004597488752695014

In [19]:



lag_model_exp = LagModelExperint(
    lag_model = LinearRegression(),
    train_start = train_start, 
    train_end = train_end, 
    test_start = test_start, 
    test_end = test_end,
    label_name = 'Close',
    use_pct_changes_data = True,
    use_pct_changes_labels = True,
)

results, preds = lag_model_exp.pipeline(df)
results

0.0037235051873516062

In [38]:
const_exp = ConstPredExperiment(
    train_start = train_start, 
    train_end = train_end, 
    test_start = test_start, 
    test_end = test_end,
    label_name = 'Close',
    use_pct_changes_data = False,
    use_pct_changes_labels = False,
)

results, preds_const = const_exp.pipeline(df)
results

0.0036556287716867633

In [42]:
pipeline = ChronosPipeline.from_pretrained(
    "amazon/chronos-t5-base",
    device_map="cuda:2",  # use "cpu" for CPU inference and "mps" for Apple Silicon
    torch_dtype=torch.bfloat16,
)

zs_model_exp = FoundationZeroShort(
    model = pipeline,
    train_start = train_start, 
    train_end = train_end, 
    test_start = test_start, 
    test_end = test_end,
    label_name = 'Close',
    use_pct_changes_data = True,
    use_pct_changes_labels = True,
)

results, preds_zs = zs_model_exp.pipeline(df)
results



  0%|          | 0/24 [00:00<?, ?it/s]

0.0036549642681785394

## multirun

In [58]:
models = {
    'lr': LinearRegression(),
    'rf': RandomForestRegressor(),
    'knn': KNeighborsRegressor(),
    'svm': SVR(),
    'lasso': Lasso(),
    'xgb': XGBRegressor(),
    'catbst': CatBoostRegressor(),
}

cols = ['exp', 'model', 'pct_label', 'pct_data', 'metric']
res_df = pd.DataFrame(columns=cols)

In [59]:
emb_model = TS2Vec(input_dims=4, device=2, output_dims=ts2vec_out_dim)
pipeline = ChronosPipeline.from_pretrained(
    "amazon/chronos-t5-base",
    device_map="cuda:2",  # use "cpu" for CPU inference and "mps" for Apple Silicon
    torch_dtype=torch.bfloat16,
)

for pct_lab in [True, False]:
    for pct_data in [True, False]:

        print('pct_lab:', pct_lab, ',pct_data:', pct_data)
        if pct_data == pct_lab:

            res_zs, _ = FoundationZeroShort(
                model = pipeline,
                train_start = train_start, 
                train_end = train_end, 
                test_start = test_start, 
                test_end = test_end,
                label_name = 'Close',
                use_pct_changes_data = pct_data,
                use_pct_changes_labels = pct_lab,
            ).pipeline(df)
            res_df = pd.concat([
                res_df, 
                pd.DataFrame([['zero-short', '-', pct_lab, pct_data, res_zs]], columns=cols)
            ])

            res_const, _ = ConstPredExperiment(
                train_start = train_start, 
                train_end = train_end, 
                test_start = test_start, 
                test_end = test_end,
                label_name = 'Close',
                use_pct_changes_data = pct_data,
                use_pct_changes_labels = pct_lab,
            ).pipeline(df)
            res_df = pd.concat([
                res_df, 
                pd.DataFrame([['const', '-', pct_lab, pct_data, res_const]], columns=cols)
            ])

        for model_name, model in models.items():
            res_ss, _ = SelfSupervisedExperint(
                model = model,
                emb_model = emb_model,
                train_start = train_start, 
                train_end = train_end, 
                test_start = test_start, 
                test_end = test_end,
                label_name = 'Close',
                use_pct_changes_data = pct_data,
                use_pct_changes_labels = pct_lab,
            ).pipeline(df)

            res_df = pd.concat([
                res_df, 
                pd.DataFrame([['self-supervised', model_name, pct_lab, pct_data, res_ss]], columns=cols)
            ])

            res_lag, _ = LagModelExperint(
                model = model,
                train_start = train_start, 
                train_end = train_end, 
                test_start = test_start, 
                test_end = test_end,
                label_name = 'Close',
                use_pct_changes_data = pct_data,
                use_pct_changes_labels = pct_lab,
            ).pipeline(df)

            res_df = pd.concat([
                res_df, 
                pd.DataFrame([['lag', model_name, pct_lab, pct_data, res_lag]], columns=cols)
            ])




pct_lab: True ,pct_data: True


  0%|          | 0/24 [00:00<?, ?it/s]

  res_df = pd.concat([


Learning rate set to 0.050336
0:	learn: 0.0076696	total: 53.3ms	remaining: 53.3s
1:	learn: 0.0076500	total: 58.9ms	remaining: 29.4s
2:	learn: 0.0076200	total: 64.2ms	remaining: 21.3s
3:	learn: 0.0075986	total: 69.2ms	remaining: 17.2s
4:	learn: 0.0075777	total: 74.3ms	remaining: 14.8s
5:	learn: 0.0075511	total: 78.4ms	remaining: 13s
6:	learn: 0.0075266	total: 82.4ms	remaining: 11.7s
7:	learn: 0.0075172	total: 86.1ms	remaining: 10.7s
8:	learn: 0.0074917	total: 89.9ms	remaining: 9.9s
9:	learn: 0.0074549	total: 93.4ms	remaining: 9.25s
10:	learn: 0.0074301	total: 97ms	remaining: 8.72s
11:	learn: 0.0074007	total: 100ms	remaining: 8.27s
12:	learn: 0.0073801	total: 104ms	remaining: 7.9s
13:	learn: 0.0073558	total: 107ms	remaining: 7.56s
14:	learn: 0.0073320	total: 111ms	remaining: 7.27s
15:	learn: 0.0073134	total: 114ms	remaining: 7.01s
16:	learn: 0.0072973	total: 117ms	remaining: 6.76s
17:	learn: 0.0072821	total: 120ms	remaining: 6.53s
18:	learn: 0.0072651	total: 122ms	remaining: 6.32s
19:	le

  0%|          | 0/24 [00:00<?, ?it/s]

  model = cd_fast.enet_coordinate_descent(


Learning rate set to 0.050336
0:	learn: 102.9179486	total: 9.06ms	remaining: 9.05s
1:	learn: 102.4152318	total: 13.3ms	remaining: 6.64s
2:	learn: 101.8824234	total: 17.2ms	remaining: 5.73s
3:	learn: 101.3788088	total: 21.1ms	remaining: 5.26s
4:	learn: 100.8579807	total: 25.1ms	remaining: 5s
5:	learn: 100.4504707	total: 28.9ms	remaining: 4.79s
6:	learn: 99.9169343	total: 32.6ms	remaining: 4.63s
7:	learn: 99.5989670	total: 36ms	remaining: 4.46s
8:	learn: 99.2730258	total: 39.6ms	remaining: 4.36s
9:	learn: 98.9405351	total: 43ms	remaining: 4.25s
10:	learn: 98.4879531	total: 46.3ms	remaining: 4.17s
11:	learn: 98.1771908	total: 50ms	remaining: 4.12s
12:	learn: 97.7942371	total: 53.2ms	remaining: 4.04s
13:	learn: 97.3467341	total: 56.4ms	remaining: 3.97s
14:	learn: 96.9623555	total: 59.7ms	remaining: 3.92s
15:	learn: 96.6910793	total: 62.7ms	remaining: 3.85s
16:	learn: 96.3483565	total: 65.7ms	remaining: 3.8s
17:	learn: 95.9957698	total: 68.7ms	remaining: 3.75s
18:	learn: 95.6485692	total: 7

In [62]:
res_df.sort_values('metric')

Unnamed: 0,exp,model,pct_label,pct_data,metric
0,zero-short,-,True,True,0.003652
0,const,-,False,False,0.003656
0,self-supervised,lasso,True,True,0.003689
0,lag,lasso,True,False,0.003689
0,lag,lasso,True,True,0.003689
0,lag,lasso,False,False,0.003693
0,lag,lr,True,False,0.003704
0,lag,lr,True,True,0.003724
0,self-supervised,catbst,True,True,0.003738
0,self-supervised,lasso,True,False,0.003787


In [61]:
res_df.to_csv('results/baseline_MAPE.csv')