In [1]:
import os, sys
# Directory the kernel was started in, and its parent.
# NOTE(review): the parent is presumably the project root that holds `src/` — confirm.
dir2 = os.path.abspath('')
dir1 = os.path.dirname(dir2)

# Make the parent directory importable (only once, to avoid duplicate entries).
if dir1 not in sys.path:
    sys.path.append(dir1)

# Relative paths used later in the notebook are resolved from the parent directory.
# WARNING: this is not idempotent — re-running the cell moves one more level up.
os.chdir('..')

In [17]:
import random
import pandas as pd
import numpy as np
import torch

from chronos import ChronosPipeline

from sklearn.metrics import mean_absolute_error as MAE
from sklearn.metrics import mean_absolute_percentage_error as MAPE
from sklearn.metrics import accuracy_score, roc_auc_score

from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier

from sklearn.svm import SVR
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from xgboost import XGBRegressor, XGBClassifier
from catboost import CatBoostRegressor, CatBoostClassifier

from src.data.load_data import pipeline_data
from src.models.ts2vec_src.ts2vec import TS2Vec

from src.experiments import (
    LagModelExperint, 
    SelfSupervisedExperint, 
    FoundationZeroShort, 
    ConstPredExperiment
)

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# One seed shared by every random number generator used in the notebook.
SEED = 123

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)

# cuDNN is switched off entirely; the deterministic flag is set as well.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.enabled = False

# Config

In [4]:
# Aggregation applied to each price/volume column; passed to `pipeline_data`.
col_agg_finctions = {
    'Open': 'first',
    'High': 'max',
    'Low': 'min',
    'Close': 'last',
    'Volume': 'sum',
}

In [5]:
# Training window.
train_start = '2023-10-01'
train_end = '2023-11-01'

# Evaluation window.
# NOTE(review): it starts on the same date the training window ends — confirm
# whether the experiment classes treat the bounds as inclusive.
test_start = '2023-11-01'
test_end = '2023-11-07'

In [6]:
# Percent-change switches, all disabled.
# NOTE(review): none of the cells visible here read these names — confirm they are still needed.
use_pct_changes_X = use_pct_changes_ts2v = use_pct_changes_labels = False

In [7]:
# TS2Vec settings.
# NOTE(review): the TS2Vec(...) calls visible in this notebook pass `device=2`
# literally and do not read `ts2vec_device` — confirm which device is intended.
ts2vec_device, ts2vec_out_dim = 1, 128

# NOTE(review): presumably the number of lagged steps; not read by any visible cell.
n_shifts = 18

In [8]:
# Path of the ticker list used for this run.
# The alternative 'configs/best_stocks_nans_rate.yaml' used to be assigned first and
# was immediately overwritten (a dead store); swap the value below to use it instead.
ticker_data_path = 'data/all_tickers.csv'

# Data loading

In [9]:
# Load the dataset through the project's `pipeline_data`, passing the per-column
# aggregation mapping. Every experiment below receives `df` as its input.
# NOTE(review): the exact structure of the returned object is defined in src.data.load_data.
df = pipeline_data(col_agg_finctions=col_agg_finctions)

In [10]:
# TS2Vec encoder with 4 input channels and `ts2vec_out_dim` output dimensions.
emb_model = TS2Vec(input_dims=4, device=2, output_dims=ts2vec_out_dim)

# Classification experiment on 'Close': raw inputs, percent-change labels,
# logistic regression fitted on top of the embedding model.
# NOTE(review): the saved output of this cell shows LogisticRegression hitting its
# iteration limit — consider a larger `max_iter` or feature scaling.
ssl_exp_kwargs = dict(
    model=LogisticRegression(),
    emb_model=emb_model,
    train_start=train_start,
    train_end=train_end,
    test_start=test_start,
    test_end=test_end,
    label_name='Close',
    use_pct_changes_data=False,
    use_pct_changes_labels=True,
    task='classification',
)
emb_model_exp = SelfSupervisedExperint(**ssl_exp_kwargs)

results, preds = emb_model_exp.pipeline(df, metric_func=accuracy_score)
results

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.42410714285714285

In [43]:
# Lag-feature classification experiment on 'Close' with CatBoost; both the inputs
# and the labels use percent changes.
lag_exp_kwargs = dict(
    model=CatBoostClassifier(),
    train_start=train_start,
    train_end=train_end,
    test_start=test_start,
    test_end=test_end,
    label_name='Close',
    use_pct_changes_data=True,
    use_pct_changes_labels=True,
    task='classification',
)
lag_model_exp = LagModelExperint(**lag_exp_kwargs)

results, preds = lag_model_exp.pipeline(df, metric_func=accuracy_score)
results

0.4851190476190476

In [26]:
# Constant-prediction baseline for the classification task on 'Close', with
# percent changes disabled for both inputs and labels.
const_exp_kwargs = dict(
    train_start=train_start,
    train_end=train_end,
    test_start=test_start,
    test_end=test_end,
    label_name='Close',
    use_pct_changes_data=False,
    use_pct_changes_labels=False,
    task='classification',
)
const_exp = ConstPredExperiment(**const_exp_kwargs)

results, preds = const_exp.pipeline(df, metric_func=accuracy_score)
results

0.5714285714285714

In [28]:
# Pretrained Chronos (tiny T5 variant) loaded in bfloat16.
chronos_load_kwargs = dict(
    device_map="cuda:2",  # use "cpu" for CPU inference and "mps" for Apple Silicon
    torch_dtype=torch.bfloat16,
)
pipeline = ChronosPipeline.from_pretrained("amazon/chronos-t5-tiny", **chronos_load_kwargs)

# Zero-shot classification experiment on 'Close'; inputs and labels both use
# percent changes.
zs_exp_kwargs = dict(
    model=pipeline,
    train_start=train_start,
    train_end=train_end,
    test_start=test_start,
    test_end=test_end,
    label_name='Close',
    use_pct_changes_data=True,
    use_pct_changes_labels=True,
    task='classification',
)
zs_model_exp = FoundationZeroShort(**zs_exp_kwargs)

results, preds = zs_model_exp.pipeline(df, metric_func=accuracy_score)
results

100%|██████████| 24/24 [00:03<00:00,  6.68it/s]


0.5535714285714286

## Multirun

In [15]:
# Downstream regressors, keyed by the short name reported in the results table.
regression_models = dict(
    lr=LinearRegression(),
    rf=RandomForestRegressor(),
    knn=KNeighborsRegressor(),
    svm=SVR(),
    lasso=Lasso(),
    xgb=XGBRegressor(),
    catbst=CatBoostRegressor(),
)

# Downstream classifiers, keyed the same way.
classification_models = dict(
    lr=LogisticRegression(),
    rf=RandomForestClassifier(),
    knn=KNeighborsClassifier(),
    xgb=XGBClassifier(),
    catbst=CatBoostClassifier(),
)

In [39]:
def multirun_pipeline(
    emb_model,
    zs_model,
    model_pool,
    main_task: str = 'regression',
    metrics=None,
    data=None,
):
    """Run every experiment type over the percent-change settings and tabulate the scores.

    For each (pct_label, pct_data) combination:

    * when both flags are equal, the zero-shot model and the constant baseline
      are evaluated once each;
    * every model in ``model_pool`` is evaluated twice, once through
      ``SelfSupervisedExperint`` (with ``emb_model``) and once through
      ``LagModelExperint``.

    The same model instances from ``model_pool`` are passed to both experiment
    types and reused for every setting.

    Args:
        emb_model: Embedding model handed to ``SelfSupervisedExperint``.
        zs_model: Model handed to ``FoundationZeroShort``.
        model_pool: Mapping ``name -> model``; the name is stored in the
            ``model`` column of the result.
        main_task: Task name forwarded to the experiments. ``'regression'``
            evaluates labels both with and without percent changes; any other
            value evaluates percent-change labels only.
        metrics: Mapping ``metric name -> metric function`` forwarded as
            ``metric_dict``. Defaults to ``{'MAPE': MAPE}``.
        data: Dataset passed to each experiment's ``pipeline``. Defaults to the
            notebook-level ``df``.

    Returns:
        pandas.DataFrame with one row per run, holding the columns ``exp``,
        ``model``, ``pct_label``, ``pct_data`` followed by one column per
        returned metric. Rows are in execution order with a 0..n-1 index.
    """
    # Resolve defaults at call time: avoids a shared mutable default argument and
    # keeps the dependency on the notebook-level frame explicit.
    if metrics is None:
        metrics = {'MAPE': MAPE}
    if data is None:
        data = df

    # Rows are collected as plain dicts and turned into a frame once at the end,
    # instead of concatenating one-row frames inside the loops.
    records = []

    def _record_run(experiment, exp_name, model_name, pct_lab, pct_data):
        """Run one experiment, store its scores as a result row, return the scores."""
        scores, _ = experiment.pipeline(data, metric_dict=metrics)
        row = {
            'exp': exp_name,
            'model': model_name,
            'pct_label': pct_lab,
            'pct_data': pct_data,
        }
        row.update(scores.items())
        records.append(row)
        return scores

    pct_lab_base = [True, False] if main_task == 'regression' else [True]
    for pct_lab in pct_lab_base:
        for pct_data in [True, False]:
            print('pct_lab:', pct_lab, ',pct_data:', pct_data)

            default_params = {
                'train_start': train_start,
                'train_end': train_end,
                'test_start': test_start,
                'test_end': test_end,
                'label_name': 'Close',
                'use_pct_changes_data': pct_data,
                'use_pct_changes_labels': pct_lab,
                'task': main_task,
            }

            # Model-free baselines are only evaluated when data and labels share
            # the same representation.
            if pct_data == pct_lab:
                zs_scores = _record_run(
                    FoundationZeroShort(model=zs_model, **default_params),
                    'zero-short', '-', pct_lab, pct_data,
                )
                print(zs_scores)
                _record_run(
                    ConstPredExperiment(**default_params),
                    'const', '-', pct_lab, pct_data,
                )

            for model_name, model in model_pool.items():
                _record_run(
                    SelfSupervisedExperint(model=model, emb_model=emb_model, **default_params),
                    'self-supervised', model_name, pct_lab, pct_data,
                )
                _record_run(
                    LagModelExperint(model=model, **default_params),
                    'lag', model_name, pct_lab, pct_data,
                )

    return pd.DataFrame(records)



In [40]:
# Fresh TS2Vec encoder for the multirun (4 input channels, `ts2vec_out_dim` outputs).
emb_model = TS2Vec(input_dims=4, device=2, output_dims=ts2vec_out_dim)

# Pretrained Chronos (tiny T5 variant) in bfloat16, used as the zero-shot model.
zs_load_kwargs = dict(
    device_map="cuda:2",  # use "cpu" for CPU inference and "mps" for Apple Silicon
    torch_dtype=torch.bfloat16,
)
zs_model = ChronosPipeline.from_pretrained("amazon/chronos-t5-tiny", **zs_load_kwargs)



In [41]:
# Metrics reported for every classification run.
classification_metrics = {'Accuracy': accuracy_score, 'ROCAUC': roc_auc_score}

# Evaluate all classifiers plus the baselines on the classification task.
res_df_classification = multirun_pipeline(
    emb_model,
    zs_model,
    classification_models,
    main_task='classification',
    metrics=classification_metrics,
)

# res_df_classification.to_csv('results/baseline_classification.csv')

pct_lab: True ,pct_data: True


100%|██████████| 24/24 [00:00<00:00, 24.80it/s]


{'Accuracy': 0.5625, 'ROCAUC': 0.5260416666666666}
pct_lab: True ,pct_data: False


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [42]:
# Show the classification runs ranked from highest to lowest accuracy.
res_df_classification.sort_values(by='Accuracy', ascending=False)

Unnamed: 0,exp,model,pct_label,pct_data,Accuracy,ROCAUC
0,const,-,True,True,0.571429,0.558594
0,zero-short,-,True,True,0.5625,0.526042
0,self-supervised,lr,True,True,0.40625,0.470007
0,self-supervised,rf,True,False,0.358631,0.516091
0,self-supervised,lr,True,False,0.327381,0.436352
0,self-supervised,xgb,True,False,0.327381,0.491898
0,self-supervised,catbst,True,False,0.321429,0.462565
0,self-supervised,knn,True,False,0.299107,0.525124
0,self-supervised,catbst,True,True,0.297619,0.439028
0,lag,knn,True,False,0.297619,0.520124


In [None]:
# Evaluate all regressors plus the baselines on the regression task, using the
# default metric set of `multirun_pipeline`.
res_df_regression = multirun_pipeline(
    emb_model,
    zs_model,
    regression_models,
    main_task='regression',
)
#res_df_regression.to_csv('results/baseline_regression_MAPE.csv')

In [17]:
# Display the regression results table. This needs the regression multirun cell
# to have been executed first, since that cell is what defines `res_df_regression`.
res_df_regression