In [5]:
import os, sys

# Resolve the notebook directory only once per kernel session. The cell used to
# run `os.chdir('..')` unconditionally, so every re-execution climbed one more
# directory level and appended yet another parent to sys.path.
if '_notebook_dir' not in globals():
    _notebook_dir = os.path.abspath('')

dir2 = _notebook_dir           # directory the kernel was in on first execution
dir1 = os.path.dirname(dir2)   # its parent; presumably the project root that holds `src` — confirm

if dir1 not in sys.path:
    sys.path.append(dir1)

# An absolute target makes this a no-op when the cell is executed again.
os.chdir(dir1)

In [7]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import yaml
from xgboost import XGBRegressor

from sklearn.metrics import mean_absolute_error as MAE
from sklearn.metrics import mean_absolute_percentage_error as MAPE
from matplotlib import pyplot as plt
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from src.data.preprocessing import read_data, data_to_np_tensor, preprocess_split
from src.models.ts2vec_src.ts2vec import TS2Vec

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Functions

In [3]:
def stock_embeddigns_to_df(data_tensor: np.ndarray, stocks, dates) -> pd.DataFrame:
    """Flatten a per-stock embedding tensor into one long DataFrame.

    Args:
        data_tensor: array indexed as ``data_tensor[i]`` per stock; each slice is
            turned into a frame with one row per entry of ``dates``.
        stocks: iterable of stock identifiers, in the same order as the first
            axis of ``data_tensor``.
        dates: index values assigned to the rows of every per-stock slice.

    Returns:
        DataFrame with columns ``Datetime``, ``emb_0 .. emb_{k-1}`` and ``Stock``;
        the blocks of the individual stocks are stacked in the order of ``stocks``.
    """
    frames = []
    for stock_idx, stock in enumerate(stocks):
        emb = pd.DataFrame(data_tensor[stock_idx], index=dates)
        emb.columns = [f'emb_{col_idx}' for col_idx in range(emb.shape[1])]
        emb['Stock'] = stock
        frames.append(emb)

    # Concatenate once instead of growing a frame inside the loop (quadratic copying).
    res = pd.concat(frames) if frames else pd.DataFrame()

    return res.reset_index(drop=False, names='Datetime')

def estimate_result(y_test, y_pred, X_test=None, y_start_test=None, metric_func=MAE, pct_change=True):
    """Score predictions, optionally after rebuilding price levels from relative changes.

    With ``pct_change=False`` the metric is applied directly to ``y_test`` and
    ``y_pred``. Otherwise both are treated as per-step relative changes:
    ``1 + change`` is cumulatively multiplied per stock along ``Datetime`` and
    scaled by that stock's ``Close`` from ``y_start_test``; the metric is then
    computed on the reconstructed levels.

    Args:
        y_test: true targets, row-aligned (by position) with ``X_test``.
        y_pred: predicted targets, row-aligned (by position) with ``X_test``.
        X_test: frame providing the ``Stock`` and ``Datetime`` of every row.
            Required when ``pct_change`` is true.
        y_start_test: frame with one row per stock holding ``Stock`` and the
            starting ``Close``. Required when ``pct_change`` is true.
        metric_func: callable ``metric_func(y_true, y_pred)``.
        pct_change: whether targets are relative changes that must be turned
            back into levels before scoring.

    Raises:
        ValueError: if ``pct_change`` is true and ``X_test`` / ``y_start_test``
            is missing, or a stock in ``X_test`` has no start price.
    """
    if not pct_change:
        return metric_func(y_test, y_pred)

    if X_test is None or y_start_test is None:
        raise ValueError('X_test and y_start_test are required when pct_change=True')

    # Align targets with X_test by position. Index-based assignment silently
    # produced NaN whenever X_test did not carry a fresh 0..n-1 index.
    growth = X_test[['Stock', 'Datetime']].reset_index(drop=True)
    growth = growth.assign(Preds=np.ravel(y_pred) + 1, Close=np.ravel(y_test) + 1)

    pred_wide = growth.pivot(index='Datetime', columns='Stock', values='Preds')
    true_wide = growth.pivot(index='Datetime', columns='Stock', values='Close')

    # Match start prices to pivot columns by stock name rather than by sorted position.
    starts = y_start_test.set_index('Stock')['Close'].reindex(pred_wide.columns)
    if starts.isna().any():
        missing = list(starts.index[starts.isna()])
        raise ValueError(f'y_start_test has no start price for: {missing}')

    pred_close = pred_wide.cumprod() * starts
    orig_close = true_wide.cumprod() * starts

    pred_long = pred_close.reset_index().melt(id_vars=['Datetime'], var_name='Stock', value_name='Pred')
    true_long = orig_close.reset_index().melt(id_vars=['Datetime'], var_name='Stock', value_name='True')

    metric_df = pd.merge(pred_long, true_long, how='inner', on=['Stock', 'Datetime'])

    return metric_func(metric_df['True'], metric_df['Pred'])

# Config

In [4]:
# Read the stock selection from the YAML config into `best_stocks`.
# NOTE(review): yaml.FullLoader can build more than plain data types; if this file
# only holds a plain list/mapping, yaml.safe_load would be the safer call — confirm.
with open('configs//best_stocks_nans_rate.yaml') as f:
    best_stocks = yaml.load(f, Loader=yaml.FullLoader)


In [5]:
# Reducer applied to each OHLCV column when rows are aggregated per group.
col_agg_finctions = {
    'Open': 'first',
    'High': 'max',
    'Low': 'min',
    'Close': 'last',
    'Volume': 'sum',
}

In [6]:
# Date windows; the filtering cells treat the start as inclusive and the end as exclusive.
train_start = '2023-10-01'
train_end = '2023-11-01'
test_start = '2023-11-01'
test_end = '2023-11-07'

In [7]:
# Switches selecting raw values vs. percent changes for the baseline features (X),
# the TS2Vec input (ts2v) and the regression labels.
use_pct_changes_X = use_pct_changes_ts2v = use_pct_changes_labels = False

In [8]:
# TS2Vec settings: intended device (presumably a CUDA index — confirm) and embedding width.
ts2vec_device, ts2vec_out_dim = 2, 128

# Number of lagged Close columns generated for the baseline features.
n_shifts = 18

# Data loading

In [None]:
# Load the full ticker table, then keep only rows whose Stock appears in the
# `best_stocks` config.
df = read_data('data/all_tickers.csv')
df_best = df.query("Stock in @best_stocks")
df_best.head()

# Preprocessing

In [154]:
# One aggregated OHLCV row per (Stock, hour); the reducers come from col_agg_finctions.
df_agg = (
    df_best
    .groupby(['Stock', pd.Grouper(key='Datetime', freq='h')])
    .agg(col_agg_finctions)
)

df_agg

Unnamed: 0_level_0,Unnamed: 1_level_0,Open,High,Low,Close,Volume
Stock,Datetime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AAPL,2023-01-30 17:00:00,145.15,145.53,144.18,145.13,291457
AAPL,2023-01-30 18:00:00,143.97,145.33,143.38,144.96,348830
AAPL,2023-01-30 19:00:00,144.07,144.30,143.50,143.95,215404
AAPL,2023-01-30 20:00:00,143.38,144.24,143.18,144.17,254970
AAPL,2023-01-30 21:00:00,143.31,143.81,143.01,143.38,192383
...,...,...,...,...,...,...
XOM,2024-01-30 18:00:00,103.69,103.70,102.95,103.22,159389
XOM,2024-01-30 19:00:00,103.79,103.96,103.49,103.67,90110
XOM,2024-01-30 20:00:00,103.68,103.98,103.63,103.76,54009
XOM,2024-01-30 21:00:00,104.15,104.16,103.67,103.68,61363


In [155]:
# Flat copy of the hourly table; `df` becomes either that table or its
# per-stock percent changes, depending on use_pct_changes_X.
df_original = df_agg.reset_index()
df = df_agg.groupby('Stock').pct_change().reset_index() if use_pct_changes_X else df_original
df.head()

Unnamed: 0,Stock,Datetime,Open,High,Low,Close,Volume
0,AAPL,2023-01-30 17:00:00,145.15,145.53,144.18,145.13,291457
1,AAPL,2023-01-30 18:00:00,143.97,145.33,143.38,144.96,348830
2,AAPL,2023-01-30 19:00:00,144.07,144.3,143.5,143.95,215404
3,AAPL,2023-01-30 20:00:00,143.38,144.24,143.18,144.17,254970
4,AAPL,2023-01-30 21:00:00,143.31,143.81,143.01,143.38,192383


In [156]:
# Keep both representations available: raw hourly values and per-stock percent changes.
df_standart = df_original
df_use_pct = df_agg.groupby('Stock').pct_change().reset_index()
df_standart.head()

Unnamed: 0,Stock,Datetime,Open,High,Low,Close,Volume
0,AAPL,2023-01-30 17:00:00,145.15,145.53,144.18,145.13,291457
1,AAPL,2023-01-30 18:00:00,143.97,145.33,143.38,144.96,348830
2,AAPL,2023-01-30 19:00:00,144.07,144.3,143.5,143.95,215404
3,AAPL,2023-01-30 20:00:00,143.38,144.24,143.18,144.17,254970
4,AAPL,2023-01-30 21:00:00,143.31,143.81,143.01,143.38,192383


In [157]:
def create_label(df):
    """Return the mean ``Close`` per (Stock, hour) as a flat frame.

    The result has the columns ``Stock``, ``Datetime`` (hour bucket) and ``Close``.
    """
    hourly_mean = df.groupby(
        ['Stock', pd.Grouper(key='Datetime', freq='h')]
    ).agg(Close=('Close', 'mean'))
    return hourly_mean.reset_index()

def process_time_labels(train_start, train_end, test_start, test_end, use_pct_changes_labels=False):
    """Build the label table and cut it into train and test windows.

    Labels come from ``create_label`` applied to the notebook-level
    ``df_use_pct`` (when ``use_pct_changes_labels`` is true) or ``df_standart``.
    A row belongs to a window when its calendar date is >= the start date and
    < the end date.

    Returns:
        Tuple ``(y_train, y_test)`` of label frames.
    """
    source = df_use_pct if use_pct_changes_labels else df_standart
    y = create_label(source)
    days = y['Datetime'].dt.date

    def _window(start, end):
        # Start inclusive, end exclusive, compared on calendar dates.
        inside = (days >= pd.Timestamp(start).date()) & (days < pd.Timestamp(end).date())
        return y[inside]

    return _window(train_start, train_end), _window(test_start, test_end)

y_train, y_test = process_time_labels(train_start, train_end, test_start, test_end, use_pct_changes_labels)


In [158]:
def preprocess_data(df, columns, start_date, end_date, tickers_save):
    """Forward the arguments to ``preprocess_split`` and return its result unchanged."""
    return preprocess_split(
        df,
        columns,
        start_date=start_date,
        end_date=end_date,
        tickers_save=tickers_save,
    )

# Percent-change inputs, train then test window.
train_data_use_pct = preprocess_data(
    df=df_use_pct, columns=['Open', 'High', 'Low', 'Close', 'Volume'],
    start_date=train_start, end_date=train_end, tickers_save=best_stocks,
)
test_data_use_pct = preprocess_data(
    df=df_use_pct, columns=['Open', 'High', 'Low', 'Close', 'Volume'],
    start_date=test_start, end_date=test_end, tickers_save=best_stocks,
)

# Raw-value inputs, train then test window.
train_data_standart = preprocess_data(
    df=df_standart, columns=['Open', 'High', 'Low', 'Close', 'Volume'],
    start_date=train_start, end_date=train_end, tickers_save=best_stocks,
)
test_data_standart = preprocess_data(
    df=df_standart, columns=['Open', 'High', 'Low', 'Close', 'Volume'],
    start_date=test_start, end_date=test_end, tickers_save=best_stocks,
)

NAN in data  0
NAN in data  0
NAN in data  0
NAN in data  0
NAN in data  0
NAN in data  0
NAN in data  0
NAN in data  0
NAN in data  0
NAN in data  0
NAN in data  0
NAN in data  0
NAN in data  0
NAN in data  0
NAN in data  0
NAN in data  0
NAN in data  0
NAN in data  0
NAN in data  0
NAN in data  0


## TS2Vec

In [159]:
# Convert every preprocessed split into a numpy tensor (same order as before:
# pct train, pct test, raw train, raw test).
train_ts_use_pct, test_ts_use_pct = (
    data_to_np_tensor(split) for split in (train_data_use_pct, test_data_use_pct)
)
train_ts_standart, test_ts_standart = (
    data_to_np_tensor(split) for split in (train_data_standart, test_data_standart)
)
train_ts_use_pct.shape, test_ts_use_pct.shape

((28, 132, 5), (28, 24, 5))

In [160]:
def train_model_ts(input_dims, output_dims, use_pct_changes_ts2v=False, device=0):
    """Create a TS2Vec model and fit it on the notebook-level training tensor.

    Args:
        input_dims: forwarded to ``TS2Vec(input_dims=...)``.
        output_dims: forwarded to ``TS2Vec(output_dims=...)``.
        use_pct_changes_ts2v: fit on ``train_ts_use_pct`` when true, otherwise
            on ``train_ts_standart``.
        device: forwarded to ``TS2Vec(device=...)``.

    Returns:
        The fitted TS2Vec model.
    """
    model = TS2Vec(
        input_dims=input_dims,
        device=device,
        output_dims=output_dims,
    )
    train_data = train_ts_use_pct if use_pct_changes_ts2v else train_ts_standart
    # The value returned by fit() was bound to an unused local before; it is not needed.
    model.fit(train_data, verbose=False)

    return model

# Pass the configured device explicitly; without it the function default (0) was
# always used and the `ts2vec_device` setting had no effect.
model = train_model_ts(
    train_ts_standart.shape[2],
    ts2vec_out_dim,
    use_pct_changes_ts2v,
    device=ts2vec_device,
)


In [161]:
# Encode the same representation (pct or raw) that the TS2Vec flag selects.
if use_pct_changes_ts2v:
    train_data, test_data = train_ts_use_pct, test_ts_use_pct
else:
    train_data, test_data = train_ts_standart, test_ts_standart

train_repr = model.encode(train_data)
test_repr = model.encode(test_data)
train_repr.shape, test_repr.shape

((28, 132, 128), (28, 24, 128))

In [162]:
# Stock order and timestamps are taken from the 'Open' table of the raw-value splits.
X_train_emb = stock_embeddigns_to_df(
    train_repr,
    stocks=train_data_standart['Open'].columns,
    dates=train_data_standart['Open'].index,
)
X_test_emb = stock_embeddigns_to_df(
    test_repr,
    stocks=test_data_standart['Open'].columns,
    dates=test_data_standart['Open'].index,
)


In [None]:
# Stack train and test parts and persist them as CSV.
X_emb = pd.concat([X_train_emb, X_test_emb])
y = pd.concat([y_train, y_test])

# NOTE(review): the file names say "pct_change" regardless of the use_pct_changes_*
# flags, so a run with raw values writes to the same files — confirm this is intended.
X_emb.to_csv('data//TS2Vec//x_emb_pct_change.csv', index=False)
y.to_csv('data//TS2Vec//y_pct_change.csv', index=False)

## baseline

In [163]:
def calculate_mean_close(df):
    """Average ``Close`` per (Stock, hour) and return it as a flat frame.

    Output columns: ``Stock``, ``Datetime`` (hour bucket), ``Close``.
    """
    by_stock_hour = df.set_index('Datetime').groupby(['Stock', pd.Grouper(freq='h')])
    return by_stock_hour['Close'].mean().reset_index()

# `df` is the hourly table (raw or pct-change, depending on use_pct_changes_X).
X_baseline = calculate_mean_close(df)

In [164]:
def add_shifts(df, n_shifts):
    """Replace ``Close`` with its previous ``n_shifts`` values per stock.

    Args:
        df: frame with at least ``Stock`` and ``Close`` columns; it is not modified.
        n_shifts: number of lag columns (``shift_1`` .. ``shift_{n_shifts}``) to add.

    Returns:
        A new frame with the lag columns appended, rows containing any NaN
        removed, and the ``Close`` column dropped.
    """
    # Group only the Close series, and only once: shifting the whole (growing)
    # frame on every iteration did redundant work just to read one column.
    close_by_stock = df.groupby('Stock')['Close']
    lagged = {f'shift_{lag}': close_by_stock.shift(lag) for lag in range(1, n_shifts + 1)}
    return df.assign(**lagged).dropna().drop(columns='Close')

# NOTE(review): this overwrites X_baseline with a frame that no longer has 'Close',
# so running the cell a second time without re-running the previous one fails.
X_baseline= add_shifts(X_baseline, n_shifts)
X_baseline.head()  

Unnamed: 0,Stock,Datetime,shift_1,shift_2,shift_3,shift_4,shift_5,shift_6,shift_7,shift_8,shift_9,shift_10,shift_11,shift_12,shift_13,shift_14,shift_15,shift_16,shift_17,shift_18
18,AAPL,2023-02-02 17:00:00,141.93,142.15,142.4,142.59,142.89,143.87,143.67,143.46,143.29,143.22,142.44,142.46,143.32,143.38,144.17,143.95,144.96,145.13
19,AAPL,2023-02-02 18:00:00,149.35,141.93,142.15,142.4,142.59,142.89,143.87,143.67,143.46,143.29,143.22,142.44,142.46,143.32,143.38,144.17,143.95,144.96
20,AAPL,2023-02-02 19:00:00,148.78,149.35,141.93,142.15,142.4,142.59,142.89,143.87,143.67,143.46,143.29,143.22,142.44,142.46,143.32,143.38,144.17,143.95
21,AAPL,2023-02-02 20:00:00,150.02,148.78,149.35,141.93,142.15,142.4,142.59,142.89,143.87,143.67,143.46,143.29,143.22,142.44,142.46,143.32,143.38,144.17
22,AAPL,2023-02-02 21:00:00,149.81,150.02,148.78,149.35,141.93,142.15,142.4,142.59,142.89,143.87,143.67,143.46,143.29,143.22,142.44,142.46,143.32,143.38


In [165]:
def filter_by_date_range(df, start_date, end_date):
    """Return the rows whose ``Datetime`` calendar date is in ``[start_date, end_date)``."""
    days = df['Datetime'].dt.date
    first_day = pd.Timestamp(start_date).date()
    stop_day = pd.Timestamp(end_date).date()
    in_range = (days >= first_day) & (days < stop_day)
    return df.loc[in_range]


# Cut the lagged baseline features into the train and test windows.
X_train_base = filter_by_date_range(X_baseline, train_start, train_end)
X_test_base= filter_by_date_range(X_baseline, test_start, test_end)

In [166]:
def merge_data(df_original, X_train_base, column_name):
    """Pick, for every stock, the ``df_original`` row at its last training timestamp.

    Args:
        df_original: frame with ``Stock`` and ``Datetime`` columns; its index is
            reset first, so the old index shows up as an extra column.
        X_train_base: training features; the last ``Datetime`` per ``Stock``
            (in row order) defines which rows are kept.
        column_name: accepted but not used.

    Returns:
        Inner join of the reset ``df_original`` with the per-stock last
        timestamps on ``['Stock', 'Datetime']``.
    """
    last_seen = X_train_base.groupby(['Stock'], as_index=False)['Datetime'].last()
    return df_original.reset_index().merge(last_seen, how='inner', on=['Stock', 'Datetime'])


# Row of the raw hourly table at each stock's last training timestamp; its Close
# is later used as the starting level when rebuilding prices from pct changes.
y_start_test = merge_data(df_original, X_train_base, 'Stock')


y_start_test.head()


Unnamed: 0,index,Stock,Datetime,Open,High,Low,Close,Volume
0,1142,AAPL,2023-10-31 22:00:00,170.85,170.9,170.385,170.62,338198
1,2649,ABBV,2023-10-31 22:00:00,141.42,141.65,139.91,140.19,86557
2,4156,ABT,2023-10-31 22:00:00,94.53,94.88,94.35,94.84,56242
3,5663,AMD,2023-10-31 22:00:00,98.49,98.8,98.1,98.44,386922
4,7170,BAC,2023-10-31 22:00:00,26.355,26.375,26.275,26.28,579334


## ts2vec + baseline

In [167]:
# Join lag features and embeddings row by row on (Stock, Datetime); only keys
# present in both tables survive.
X_train_bs_emb = X_train_base.merge(X_train_emb, how='inner', on=['Stock', 'Datetime'])
X_test_bs_emb = X_test_base.merge(X_test_emb, how='inner', on=['Stock', 'Datetime'])

## Pipeline sanity check

In [36]:
# Active flag combination, mean label value, and a peek at the merged features.
print(f'{use_pct_changes_X} {use_pct_changes_ts2v} {use_pct_changes_labels}')
print(y_train['Close'].mean())
X_train_bs_emb.head()

True True True
-0.0001919874370703276


Unnamed: 0,Stock,Datetime,shift_1,shift_2,shift_3,shift_4,shift_5,shift_6,shift_7,shift_8,...,emb_118,emb_119,emb_120,emb_121,emb_122,emb_123,emb_124,emb_125,emb_126,emb_127
0,AAPL,2023-10-02 17:00:00,0.005391,-0.003155,-0.004131,-0.003247,0.000464,0.009961,-0.000293,-0.002571,...,-0.040635,-0.091305,-0.072076,-0.304096,-0.070058,-0.291516,-0.277115,-0.172606,-0.45235,0.258093
1,AAPL,2023-10-02 18:00:00,0.008627,0.005391,-0.003155,-0.004131,-0.003247,0.000464,0.009961,-0.000293,...,-0.438274,-0.210274,-0.122721,-0.211918,0.030223,-0.173067,-0.031121,-0.292909,-0.016628,0.041779
2,AAPL,2023-10-02 19:00:00,-0.000405,0.008627,0.005391,-0.003155,-0.004131,-0.003247,0.000464,0.009961,...,-0.055352,-0.229474,-0.140746,0.057177,0.024548,-0.327634,-0.559056,-0.162226,-0.351382,0.347384
3,AAPL,2023-10-02 20:00:00,0.000463,-0.000405,0.008627,0.005391,-0.003155,-0.004131,-0.003247,0.000464,...,-0.368141,-0.242813,-0.053721,-0.168639,-0.078961,-0.092771,-0.05182,-0.309598,-0.29035,0.074978
4,AAPL,2023-10-02 21:00:00,0.000693,0.000463,-0.000405,0.008627,0.005391,-0.003155,-0.004131,-0.003247,...,-0.095751,-0.077043,-0.236742,-0.216909,-0.01806,-0.489334,-0.412079,-0.275791,-0.285077,0.228574


In [55]:
# Active flag combination, mean label value, and a peek at the merged features.
print(f'{use_pct_changes_X} {use_pct_changes_ts2v} {use_pct_changes_labels}')
print(y_train['Close'].mean())
X_train_bs_emb.head()

True True False
143.50392721861473


Unnamed: 0,Stock,Datetime,shift_1,shift_2,shift_3,shift_4,shift_5,shift_6,shift_7,shift_8,...,emb_118,emb_119,emb_120,emb_121,emb_122,emb_123,emb_124,emb_125,emb_126,emb_127
0,AAPL,2023-10-02 17:00:00,0.005391,-0.003155,-0.004131,-0.003247,0.000464,0.009961,-0.000293,-0.002571,...,-0.309834,-0.143693,-0.170927,-0.156656,-0.253392,-0.364222,0.02963,-0.212489,-0.062158,-0.019007
1,AAPL,2023-10-02 18:00:00,0.008627,0.005391,-0.003155,-0.004131,-0.003247,0.000464,0.009961,-0.000293,...,-0.2982,-0.121974,-0.269475,-0.132354,0.010465,0.073027,-0.453286,-0.212364,-0.339286,-0.294089
2,AAPL,2023-10-02 19:00:00,-0.000405,0.008627,0.005391,-0.003155,-0.004131,-0.003247,0.000464,0.009961,...,-0.160934,-0.080404,-0.107652,-0.2062,-0.313743,-0.120745,0.038665,-0.026673,-0.181073,0.05308
3,AAPL,2023-10-02 20:00:00,0.000463,-0.000405,0.008627,0.005391,-0.003155,-0.004131,-0.003247,0.000464,...,-0.400727,-0.182231,-0.062356,-0.337698,0.04733,-0.196712,-0.479843,-0.212152,-0.207372,-0.215807
4,AAPL,2023-10-02 21:00:00,0.000693,0.000463,-0.000405,0.008627,0.005391,-0.003155,-0.004131,-0.003247,...,-0.226171,-0.087146,-0.38642,0.036291,-0.336852,-0.098545,0.03768,-0.169711,-0.190779,-0.091177


In [79]:
# Active flag combination, mean label value, and a peek at the merged features.
print(f'{use_pct_changes_X} {use_pct_changes_ts2v} {use_pct_changes_labels}')
print(y_train['Close'].mean())
X_train_bs_emb.head()

True False True
-0.0001919874370703276


Unnamed: 0,Stock,Datetime,shift_1,shift_2,shift_3,shift_4,shift_5,shift_6,shift_7,shift_8,...,emb_118,emb_119,emb_120,emb_121,emb_122,emb_123,emb_124,emb_125,emb_126,emb_127
0,AAPL,2023-10-02 17:00:00,0.005391,-0.003155,-0.004131,-0.003247,0.000464,0.009961,-0.000293,-0.002571,...,-3191.566895,-1464.264893,-4625.992188,-5421.251953,-156.910156,-3237.252197,-4350.890137,-3596.445801,-703.334106,-3664.669434
1,AAPL,2023-10-02 18:00:00,0.008627,0.005391,-0.003155,-0.004131,-0.003247,0.000464,0.009961,-0.000293,...,-5874.883789,-2687.276855,-4369.288086,-3713.804688,-390.033691,-3859.708496,-5348.638672,-3832.731445,-3081.747314,-5196.202637
2,AAPL,2023-10-02 19:00:00,-0.000405,0.008627,0.005391,-0.003155,-0.004131,-0.003247,0.000464,0.009961,...,-1400.33374,-3656.311035,-4054.303711,-6859.186035,-2460.474854,-1569.753906,-3847.648438,-2874.88501,-1995.522217,-3313.355469
3,AAPL,2023-10-02 20:00:00,0.000463,-0.000405,0.008627,0.005391,-0.003155,-0.004131,-0.003247,0.000464,...,-1080.904541,-4147.553711,-3684.722168,-8563.469727,-2090.42749,-2621.342285,-3669.620605,-4134.994629,-578.968384,-1909.035767
4,AAPL,2023-10-02 21:00:00,0.000693,0.000463,-0.000405,0.008627,0.005391,-0.003155,-0.004131,-0.003247,...,-3680.671875,-600.296326,-4862.900391,-5905.69873,-3349.009521,-4230.870605,-3139.35083,-2948.955566,-2569.77832,-1573.559814


In [99]:
# Active flag combination, mean label value, and a peek at the merged features.
print(f'{use_pct_changes_X} {use_pct_changes_ts2v} {use_pct_changes_labels}')
print(y_train['Close'].mean())
X_train_bs_emb.head()

True False False
143.50392721861473


Unnamed: 0,Stock,Datetime,shift_1,shift_2,shift_3,shift_4,shift_5,shift_6,shift_7,shift_8,...,emb_118,emb_119,emb_120,emb_121,emb_122,emb_123,emb_124,emb_125,emb_126,emb_127
0,AAPL,2023-10-02 17:00:00,0.005391,-0.003155,-0.004131,-0.003247,0.000464,0.009961,-0.000293,-0.002571,...,-2095.422363,-5274.339844,-8092.722656,468.586975,296.529907,1481.156982,287.504395,-1797.963135,-3061.157471,-2497.845703
1,AAPL,2023-10-02 18:00:00,0.008627,0.005391,-0.003155,-0.004131,-0.003247,0.000464,0.009961,-0.000293,...,-6129.604004,-5751.72168,-8189.789551,-4489.480957,-4392.316406,-1445.035889,-2663.813721,-256.984131,-5639.729492,-5081.244629
2,AAPL,2023-10-02 19:00:00,-0.000405,0.008627,0.005391,-0.003155,-0.004131,-0.003247,0.000464,0.009961,...,-5116.721191,-3425.077148,-2828.352051,-3077.005127,-6514.491211,717.20813,1142.930908,-1659.281982,-3843.648926,-3800.530273
3,AAPL,2023-10-02 20:00:00,0.000463,-0.000405,0.008627,0.005391,-0.003155,-0.004131,-0.003247,0.000464,...,-2655.562988,-4310.99707,-6352.522949,-3391.101807,-8335.350586,1317.139771,1560.960693,-1192.879395,-4605.547363,-3385.725342
4,AAPL,2023-10-02 21:00:00,0.000693,0.000463,-0.000405,0.008627,0.005391,-0.003155,-0.004131,-0.003247,...,-3775.010498,-3674.72168,-3012.789307,-1623.000244,-4909.516113,-1567.182861,-1190.861084,-1274.987061,-2423.763916,-2306.984619


In [118]:
# Active flag combination, mean label value, and a peek at the merged features.
print(f'{use_pct_changes_X} {use_pct_changes_ts2v} {use_pct_changes_labels}')
print(y_train['Close'].mean())
X_train_bs_emb.head()

False True True
-0.0001919874370703276


Unnamed: 0,Stock,Datetime,shift_1,shift_2,shift_3,shift_4,shift_5,shift_6,shift_7,shift_8,...,emb_118,emb_119,emb_120,emb_121,emb_122,emb_123,emb_124,emb_125,emb_126,emb_127
0,AAPL,2023-10-02 17:00:00,171.56,170.64,171.18,171.89,172.45,172.37,170.67,170.72,...,-0.318562,-0.134152,-0.009302,-0.204744,-0.621309,-0.251468,-0.065641,-0.065006,0.348861,-0.299228
1,AAPL,2023-10-02 18:00:00,173.04,171.56,170.64,171.18,171.89,172.45,172.37,170.67,...,0.086067,-0.394035,-0.170114,-0.367443,-0.162809,-0.353836,-0.432603,-0.390306,0.076625,-0.175248
2,AAPL,2023-10-02 19:00:00,172.97,173.04,171.56,170.64,171.18,171.89,172.45,172.37,...,-0.498481,-0.033235,-0.038371,-0.141304,-0.384159,-0.141125,-0.21788,-0.25084,0.361273,-0.192025
3,AAPL,2023-10-02 20:00:00,173.05,172.97,173.04,171.56,170.64,171.18,171.89,172.45,...,0.13083,-0.27269,-0.143551,-0.317708,-0.253439,-0.398165,-0.272726,-0.260997,0.173373,-0.323349
4,AAPL,2023-10-02 21:00:00,173.17,173.05,172.97,173.04,171.56,170.64,171.18,171.89,...,-0.373711,-0.247307,-0.054431,-0.263667,-0.361438,-0.115238,-0.358653,-0.306788,0.31219,-0.093623


In [136]:
# Active flag combination, mean label value, and a peek at the merged features.
print(f'{use_pct_changes_X} {use_pct_changes_ts2v} {use_pct_changes_labels}')
print(y_train['Close'].mean())
X_train_bs_emb.head()

False True False
143.50392721861473


Unnamed: 0,Stock,Datetime,shift_1,shift_2,shift_3,shift_4,shift_5,shift_6,shift_7,shift_8,...,emb_118,emb_119,emb_120,emb_121,emb_122,emb_123,emb_124,emb_125,emb_126,emb_127
0,AAPL,2023-10-02 17:00:00,171.56,170.64,171.18,171.89,172.45,172.37,170.67,170.72,...,-0.32971,0.131216,-0.413908,-0.29802,-0.125435,-0.311199,0.009286,-0.128031,-0.170051,-0.057402
1,AAPL,2023-10-02 18:00:00,173.04,171.56,170.64,171.18,171.89,172.45,172.37,170.67,...,-0.139031,0.29493,0.014425,-0.189447,-0.113415,-0.041211,-0.263204,-0.335526,-0.153549,0.033206
2,AAPL,2023-10-02 19:00:00,172.97,173.04,171.56,170.64,171.18,171.89,172.45,172.37,...,-0.215189,0.151567,-0.29998,-0.336176,-0.077809,-0.330451,-0.165136,-0.015209,-0.266144,-0.229518
3,AAPL,2023-10-02 20:00:00,173.05,172.97,173.04,171.56,170.64,171.18,171.89,172.45,...,-0.149671,0.255155,-0.007788,-0.116678,-0.009159,0.073332,-0.236383,-0.223885,-0.273423,-0.081678
4,AAPL,2023-10-02 21:00:00,173.17,173.05,172.97,173.04,171.56,170.64,171.18,171.89,...,-0.350705,0.207792,-0.330913,-0.375227,-0.181509,-0.354913,-0.109204,-0.233867,-0.053369,-0.017608


In [152]:
# Active flag combination, mean label value, and a peek at the merged features.
print(f'{use_pct_changes_X} {use_pct_changes_ts2v} {use_pct_changes_labels}')
print(y_train['Close'].mean())
X_train_bs_emb.head()

False False True
-0.0001919874370703276


Unnamed: 0,Stock,Datetime,shift_1,shift_2,shift_3,shift_4,shift_5,shift_6,shift_7,shift_8,...,emb_118,emb_119,emb_120,emb_121,emb_122,emb_123,emb_124,emb_125,emb_126,emb_127
0,AAPL,2023-10-02 17:00:00,171.56,170.64,171.18,171.89,172.45,172.37,170.67,170.72,...,-1809.658203,1215.92627,-2366.711426,-5893.969727,-1228.188843,-2747.72583,-4141.385742,-992.391846,-1365.353149,811.320557
1,AAPL,2023-10-02 18:00:00,173.04,171.56,170.64,171.18,171.89,172.45,172.37,170.67,...,-225.890808,-1191.375977,-7374.98877,-9091.510742,-1578.058105,-7168.187012,-1618.022217,1063.340088,-704.336426,77.614288
2,AAPL,2023-10-02 19:00:00,172.97,173.04,171.56,170.64,171.18,171.89,172.45,172.37,...,487.206268,-2727.513184,-1206.471191,-6632.311523,-209.881989,-2240.798096,-5639.889648,-3320.949707,289.052002,-1124.070557
3,AAPL,2023-10-02 20:00:00,173.05,172.97,173.04,171.56,170.64,171.18,171.89,172.45,...,-490.065308,-3760.56543,-2679.577637,-7035.020508,-575.558533,-1352.439941,-7409.337402,-3500.990723,-1648.235229,-2299.382568
4,AAPL,2023-10-02 21:00:00,173.17,173.05,172.97,173.04,171.56,170.64,171.18,171.89,...,-11.924805,-2494.869141,113.289581,-1973.255981,-991.633911,-1914.773438,-4551.407227,-2848.47998,76.404358,-1668.231567


In [168]:
# Active flag combination, mean label value, and a peek at the merged features.
print(f'{use_pct_changes_X} {use_pct_changes_ts2v} {use_pct_changes_labels}')
print(y_train['Close'].mean())
X_train_bs_emb.head()

False False False
143.50392721861473


Unnamed: 0,Stock,Datetime,shift_1,shift_2,shift_3,shift_4,shift_5,shift_6,shift_7,shift_8,...,emb_118,emb_119,emb_120,emb_121,emb_122,emb_123,emb_124,emb_125,emb_126,emb_127
0,AAPL,2023-10-02 17:00:00,171.56,170.64,171.18,171.89,172.45,172.37,170.67,170.72,...,-3965.819092,-2178.022705,-930.884949,-1229.778687,-797.623291,-3384.263672,-2512.143799,-2415.182861,-2061.875732,-3580.733643
1,AAPL,2023-10-02 18:00:00,173.04,171.56,170.64,171.18,171.89,172.45,172.37,170.67,...,-2783.727051,-3419.083496,-3119.878418,-1873.700195,-2928.850586,-5776.874023,-3144.419434,-4144.733887,-2123.118896,-5237.150391
2,AAPL,2023-10-02 19:00:00,172.97,173.04,171.56,170.64,171.18,171.89,172.45,172.37,...,-2879.89624,-2990.200195,-708.419373,-3053.038574,-1846.677979,-4826.688477,-1883.761475,-4060.516602,-3237.503418,-2356.187988
3,AAPL,2023-10-02 20:00:00,173.05,172.97,173.04,171.56,170.64,171.18,171.89,172.45,...,-6231.982422,-2956.620117,-758.753845,-3980.380127,-1726.062988,-5056.699219,-4101.257324,-1603.067627,-3879.436768,-2980.761719
4,AAPL,2023-10-02 21:00:00,173.17,173.05,172.97,173.04,171.56,170.64,171.18,171.89,...,-3026.091064,-912.690613,1520.349243,-3122.633789,469.261505,-2282.36377,283.600098,-3014.780273,-3142.415039,-2161.932861


# Prediction 

In [33]:
# Three feature sets (embeddings, lag baseline, both) sharing the same labels.
# Every frame/series gets a fresh 0..n-1 index; features and labels are paired
# by row position. NOTE(review): nothing here checks that X and y have the same
# number of rows or the same (Stock, Datetime) order — confirm upstream.
datasets_standart = {
    ds_key: {
        'X_train': X_tr.reset_index(drop=True),
        'X_test': X_te.reset_index(drop=True),
        'y_train': y_train['Close'].reset_index(drop=True),
        'y_test': y_test['Close'].reset_index(drop=True),
    }
    for ds_key, X_tr, X_te in (
        ('emb', X_train_emb, X_test_emb),
        ('base', X_train_base, X_test_base),
        ('emb_base', X_train_bs_emb, X_test_bs_emb),
    )
}


# Fixed seed so that the stochastic estimators give the same numbers on every
# Restart & Run All; previously they were created without any seed.
RANDOM_STATE = 42

# Regressors compared on every feature set, keyed by the short name used in the
# result tables.
models = {
    'lin_reg': LinearRegression(),
    'ctb': CatBoostRegressor(random_seed=RANDOM_STATE),
    'rf': RandomForestRegressor(random_state=RANDOM_STATE),
    'knn' : KNeighborsRegressor(),
    'lasso' : Lasso(),
    'ridge' : Ridge(),
    'svr' : SVR(),
    'xgb' : XGBRegressor(random_state=RANDOM_STATE),
}

In [34]:
# Start from an empty results frame.
df_results_standart = pd.DataFrame([])

In [None]:
# Fit every model on every feature set and collect one result row per pair.
# Rows are gathered in a local list and concatenated once, so re-running this
# cell rebuilds the table instead of appending duplicates to the previous run
# (duplicates make the later pivot fail).
results_standart = []

for ds_name, data in datasets_standart.items():
    # Identifier columns are not features; drop them once per dataset.
    X_train_feats = data['X_train'].drop(columns=['Datetime', 'Stock'])
    X_test_feats = data['X_test'].drop(columns=['Datetime', 'Stock'])

    for model_name, model in models.items():

        print(ds_name, model_name)
        model.fit(X_train_feats, data['y_train'])
        y_pred = model.predict(X_test_feats)

        # estimate_result's pct_change branch rebuilds levels from the *labels*,
        # so the switch must follow the label flag, not the feature flag.
        metric = estimate_result(data['y_test'], y_pred, data['X_test'], y_start_test, metric_func=MAPE, pct_change=use_pct_changes_labels)
        print('MAPE: ', metric)

        results_standart.append(
            pd.DataFrame.from_dict({'data': [ds_name], 'model': [model_name], 'metric': [metric]})
        )

df_results_standart = pd.concat(results_standart).reset_index()

In [36]:
# Wide view of the scores: one row per model, one column per feature set.
df_results_standart.pivot(index='model', columns='data', values='metric')

data,base,emb,emb_base
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ctb,0.021955,0.7538,0.03212
knn,0.014717,0.814492,0.767451
lasso,0.003693,0.746243,0.004721
lin_reg,0.003795,0.754793,0.00532
rf,0.007307,1.014701,0.007418
ridge,0.003795,0.754567,0.00532
svr,0.032644,0.710667,0.708644


## Predictions

In [38]:
# Same three feature sets as above, rebuilt from the current notebook variables.
# NOTE(review): the definition is identical to datasets_standart, so the content
# only differs if the upstream cells were re-run with the pct flags enabled — confirm.
datasets_use_pct = {
    ds_key: {
        'X_train': X_tr.reset_index(drop=True),
        'X_test': X_te.reset_index(drop=True),
        'y_train': y_train['Close'].reset_index(drop=True),
        'y_test': y_test['Close'].reset_index(drop=True),
    }
    for ds_key, X_tr, X_te in (
        ('emb', X_train_emb, X_test_emb),
        ('base', X_train_base, X_test_base),
        ('emb_base', X_train_bs_emb, X_test_bs_emb),
    )
}

In [39]:
# Start from an empty results frame.
df_results = pd.DataFrame([])

In [None]:
# Fit every model on every feature set and collect one result row per pair.
# Rows are gathered in a local list and concatenated once, so re-running this
# cell rebuilds the table instead of appending duplicates to the previous run.
results_use_pct = []

for ds_name, data in datasets_use_pct.items():
    # Identifier columns are not features; drop them once per dataset.
    X_train_feats = data['X_train'].drop(columns=['Datetime', 'Stock'])
    X_test_feats = data['X_test'].drop(columns=['Datetime', 'Stock'])

    for model_name, model in models.items():

        print(ds_name, model_name)
        model.fit(X_train_feats, data['y_train'])

        y_pred = model.predict(X_test_feats)

        # estimate_result's pct_change branch rebuilds levels from the *labels*,
        # so the switch must follow the label flag, not the feature flag.
        metric = estimate_result(data['y_test'], y_pred, data['X_test'], y_start_test, metric_func=MAPE, pct_change=use_pct_changes_labels)

        results_use_pct.append(
            pd.DataFrame.from_dict({'data': [ds_name], 'model': [model_name], 'metric': [metric]})
        )

df_results = pd.concat(results_use_pct)
df_results_use_pct = df_results.reset_index()

In [41]:
# Wide view of the scores: one row per model, one column per feature set.
df_results_use_pct.pivot(index='model', columns='data', values='metric')

data,base,emb,emb_base
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ctb,0.030522,0.02377,0.031262
knn,0.030844,0.035533,0.035634
lasso,0.028866,0.028866,0.028866
lin_reg,0.029328,0.015617,0.01548
rf,0.030354,0.027579,0.031754
ridge,0.028951,0.031195,0.031696
svr,0.358289,0.377171,0.378243


## Hyperparameter tuning

In [None]:
from sklearn.model_selection import RandomizedSearchCV
# Candidate hyperparameter values per model, keyed like `models`.
# An empty dict means "nothing to tune" for that model.
param_grid = {
    'lin_reg': {}, 
    'ctb': {
        'depth': [4, 6, 8, 12],
        'learning_rate': [0.06, 0.1, 0.3],
        'iterations': [100, 200, 300, 600]
    },
    'rf': {
        'n_estimators': [100, 200, 300],
        'max_depth': [5, 10, 15],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    },
    'knn': {
        'n_neighbors': [3, 5, 7, 15, 30],
        'weights': ['uniform', 'distance']
    },
    'lasso': {
        'alpha': [0.1, 1.0, 10.0]
    },
    'ridge': {
        'alpha': [0.1, 1.0, 10.0]
    },
    'svr': {
        'C': [1, 10, 100],
        'gamma': ['scale', 'auto']
    },
    # `models` contains an 'xgb' entry; without a matching key here the lookup
    # param_grids[model_name] raised KeyError.
    'xgb': {
        'n_estimators': [100, 200, 300],
        'max_depth': [3, 6, 9],
        'learning_rate': [0.05, 0.1, 0.3]
    }
}

def train_and_evaluate_models(datasets, models, param_grids, pct_change=True, n_iter=5, cv=3, random_state=42):
    """Tune every model on every dataset with a randomized search and score it.

    Args:
        datasets: mapping ``name -> {'X_train', 'X_test', 'y_train', 'y_test'}``;
            the X frames must contain ``Datetime`` and ``Stock`` columns, which
            are dropped before fitting.
        models: mapping ``name -> estimator``.
        param_grids: mapping ``name -> {param: candidate values}``. A missing
            or empty entry means the model is fitted as is, without a search.
        pct_change: forwarded to ``estimate_result`` (default keeps the
            previous hard-coded ``True``).
        n_iter: maximum number of sampled candidates per search; capped at the
            grid size when all candidates are given as sized collections.
        cv: number of cross-validation folds for the search.
        random_state: seed for the candidate sampling, for reproducible runs.

    Returns:
        DataFrame with columns ``index``, ``data``, ``model``, ``metric`` (MAPE).

    Note:
        Reads the notebook-level ``y_start_test`` when scoring.
    """
    result_frames = []

    for ds_name, data in datasets.items():
        # Identifier columns are not features; drop them once per dataset.
        X_train_feats = data['X_train'].drop(columns=['Datetime', 'Stock'])
        X_test_feats = data['X_test'].drop(columns=['Datetime', 'Stock'])

        for model_name, model in models.items():
            print(ds_name, model_name)

            # .get(): a model without a grid (e.g. one added to `models` later)
            # must not abort the whole run with KeyError.
            grid = param_grids.get(model_name) or {}

            if grid:
                n_draws = n_iter
                if all(hasattr(values, '__len__') for values in grid.values()):
                    # Do not ask for more candidates than the grid contains.
                    grid_size = 1
                    for values in grid.values():
                        grid_size *= len(values)
                    n_draws = min(n_iter, grid_size)

                grid_search = RandomizedSearchCV(
                    model,
                    grid,
                    n_iter=n_draws,
                    scoring='neg_mean_absolute_percentage_error',
                    cv=cv,
                    random_state=random_state,
                )
                grid_search.fit(X_train_feats, data['y_train'])
                best_model = grid_search.best_estimator_
            else:
                # Nothing to tune: fit the estimator directly on the full training set.
                model.fit(X_train_feats, data['y_train'])
                best_model = model

            y_pred = best_model.predict(X_test_feats)

            metric = estimate_result(data['y_test'], y_pred, data['X_test'], y_start_test, metric_func=MAPE, pct_change=pct_change)
            print('MAPE:', metric)

            result_frames.append(
                pd.DataFrame.from_dict({'data': [ds_name], 'model': [model_name], 'metric': [metric]})
            )

    # Single concat instead of growing a frame inside the loop.
    df_results_use_pct = pd.concat(result_frames) if result_frames else pd.DataFrame()

    return df_results_use_pct.reset_index()


# Tuned scores for the pct-change datasets; replaces the untuned table of the same name.
df_results_use_pct = train_and_evaluate_models(datasets_use_pct, models, param_grid)