In [2]:
import os, sys

# Capture the kernel's launch directory only once per session. The previous
# relative `os.chdir('..')` climbed one more level (and appended one more
# parent to sys.path) every time this cell was re-run.
if '_notebook_dir' not in globals():
    _notebook_dir = os.path.abspath('')
dir2 = _notebook_dir            # directory the kernel was started in
dir1 = os.path.dirname(dir2)    # its parent directory

# Make the parent directory importable for the project-local imports.
if dir1 not in sys.path:
    sys.path.append(dir1)

# Absolute target: re-running the cell leaves the working directory unchanged.
os.chdir(dir1)

In [3]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import yaml
import random
import seaborn as sns
from src.data.preprocessing import read_data, data_to_np_tensor, preprocess_split
from models.t2v import *
from matplotlib import pyplot as plt
import warnings
warnings.filterwarnings("ignore")

%load_ext autoreload
%autoreload 2

# Preprocessing

# Mean stock price prediction

In [3]:
# Load the YAML config; only its top-level keys are kept, as the ticker list.
# NOTE(review): yaml.FullLoader can construct more than plain data types;
# prefer yaml.safe_load if this file may come from an untrusted source.
with open('configs/best_stocks_nans_rate.yaml') as f:
    stocks_cfg = yaml.load(f, Loader=yaml.FullLoader)
best_stocks = [ticker for ticker in stocks_cfg]

In [None]:
# Read the raw quotes, then keep only the rows whose ticker is in `best_stocks`.
df = read_data('data/all_tickers.csv')
df_best = df[df['Stock'].isin(best_stocks)]

In [5]:
# Boolean row mask: True where the row's ticker is one of `best_stocks`.
mask = df['Stock'].isin(best_stocks)

In [6]:
# Preview the first rows selected by `mask`.
df[mask].head()

Unnamed: 0,Date,Time,Open,High,Low,Close,Volume,Datetime,Stock,Day_week
61,2024-01-30,22:58:00,188.13,188.17,188.11,188.14,1500,2024-01-30 22:58:00,AAPL,Tuesday
62,2024-01-30,22:57:00,188.22,188.22,188.08,188.11,5700,2024-01-30 22:57:00,AAPL,Tuesday
63,2024-01-30,22:56:00,188.22,188.27,188.215,188.22,2600,2024-01-30 22:56:00,AAPL,Tuesday
64,2024-01-30,22:55:00,188.15,188.2,188.14,188.2,2100,2024-01-30 22:55:00,AAPL,Tuesday
65,2024-01-30,22:54:00,188.18,188.18,188.14,188.18,2013,2024-01-30 22:54:00,AAPL,Tuesday


In [7]:
# Column -> aggregation rule used when collapsing bars into a coarser bar.
dd = dict(Open='first', High='max', Low='min', Close='last', Volume='sum')

In [8]:
# Collapse the filtered bars into hourly bars per ticker, using the rules in `dd`.
hourly_keys = ['Stock', pd.Grouper(freq='h')]
df_best_h = (
    df_best
    .set_index('Datetime')
    .groupby(hourly_keys)
    .agg(dd)
)

df_best_h.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Open,High,Low,Close,Volume
Stock,Datetime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AAPL,2023-01-30 17:00:00,145.15,145.53,144.18,145.13,291457
AAPL,2023-01-30 18:00:00,143.97,145.33,143.38,144.96,348830
AAPL,2023-01-30 19:00:00,144.07,144.3,143.5,143.95,215404
AAPL,2023-01-30 20:00:00,143.38,144.24,143.18,144.17,254970
AAPL,2023-01-30 21:00:00,143.31,143.81,143.01,143.38,192383


In [9]:
# Replace the hourly values with per-ticker relative changes and flatten the
# (Stock, Datetime) index back into columns.
# NOTE(review): this rebinds `df`, which previously held the raw frame from
# read_data(); the earlier `mask` / `df[mask]` cells no longer match `df` if
# they are re-run after this cell.
# NOTE(review): pct_change leaves NaN in the first row of every group —
# confirm preprocess_split handles those rows.
df = df_best_h.groupby('Stock').pct_change().reset_index()

In [10]:
# Date windows for the two splits (ISO date strings passed to preprocess_split).
train_start, train_end = '2023-10-01', '2023-11-01'
test_start, test_end = '2023-11-01', '2023-11-07'

# Feature columns requested from preprocess_split for both splits.
feature_cols = ['Open', 'High', 'Low', 'Close', 'Volume']

# A fresh list is passed to each call, exactly as the two literals were before.
train_data = preprocess_split(
    df,
    list(feature_cols),
    start_date=train_start,
    end_date=train_end,
    tickers_save=best_stocks,
)

test_data = preprocess_split(
    df,
    list(feature_cols),
    start_date=test_start,
    end_date=test_end,
    tickers_save=best_stocks,
)

NAN in data  0
NAN in data  0
NAN in data  0
NAN in data  0
NAN in data  0
NAN in data  0
NAN in data  0
NAN in data  0
NAN in data  0
NAN in data  0


In [11]:
# Convert both splits to numpy tensors and show their shapes.
train_ts, test_ts = data_to_np_tensor(train_data), data_to_np_tensor(test_data)

train_ts.shape, test_ts.shape

((28, 132, 5), (28, 24, 5))

In [12]:

from models.t2v import ts2vec
from models.t2v.ts2vec import TS2Vec

# Build the TS2Vec encoder: input width is the last axis of the training
# tensor, embeddings are 128-dimensional, everything runs on CPU.
# NOTE(review): `model` is rebound to a different estimator in a later cell,
# so cells calling `model.encode` must run before that one.
n_features = train_ts.shape[2]
model = TS2Vec(input_dims=n_features, device='cpu', output_dims=128)

# Fit on the training tensor only; the value returned by fit is kept as loss_log.
loss_log = model.fit(train_ts, verbose=False)

In [13]:
# Encode both splits with the fitted encoder (train first, then test).
train_repr, test_repr = model.encode(train_ts), model.encode(test_ts)

train_repr.shape, test_repr.shape

((28, 132, 128), (28, 24, 128))

In [14]:
def stock_embeddigns_to_df(data_tensor: np.ndarray, stocks, dates) -> pd.DataFrame:
    """Flatten a per-stock embedding tensor into one long DataFrame.

    Args:
        data_tensor: Array indexed by stock position; ``data_tensor[i]`` is a 2-D
            slice with one row per entry of ``dates``.
        stocks: Iterable of stock labels, in the same order as the first axis
            of ``data_tensor``.
        dates: Index values assigned to the rows of every per-stock slice.

    Returns:
        DataFrame with a ``Datetime`` column, ``emb_0 .. emb_{k-1}`` columns and
        a ``Stock`` column; the per-stock frames are stacked in ``stocks`` order
        and the result has a fresh 0..N-1 index.
    """
    frames = []
    for pos, stock in enumerate(stocks):
        frame = pd.DataFrame(data_tensor[pos], index=dates)
        frame.columns = [f'emb_{j}' for j in range(frame.shape[1])]
        frame['Stock'] = stock
        frames.append(frame)

    # Concatenate once: growing a frame inside the loop re-copies all previous
    # rows on every iteration and starts from an empty frame.
    combined = pd.concat(frames) if frames else pd.DataFrame()
    return combined.reset_index(drop=False, names='Datetime')

# Stock labels and timestamps are taken from the 'Open' table of each split.
train_open, test_open = train_data['Open'], test_data['Open']

X_train = stock_embeddigns_to_df(train_repr, stocks=train_open.columns, dates=train_open.index)
X_test = stock_embeddigns_to_df(test_repr, stocks=test_open.columns, dates=test_open.index)
X_train.head()

Unnamed: 0,Datetime,emb_0,emb_1,emb_2,emb_3,emb_4,emb_5,emb_6,emb_7,emb_8,...,emb_119,emb_120,emb_121,emb_122,emb_123,emb_124,emb_125,emb_126,emb_127,Stock
0,2023-10-02 17:00:00,0.178085,-0.533866,-0.234123,-0.114266,-0.190026,-0.168763,-0.197685,0.125318,-0.221335,...,-0.279676,-0.001371,-0.058396,-0.299537,-0.144431,-0.458052,-0.044706,0.03626,0.025789,AAPL
1,2023-10-02 18:00:00,-0.538837,-0.238738,-0.110322,-0.121523,-0.094981,-0.139494,-0.295027,-0.148487,-0.006937,...,-0.227457,-0.531321,-0.186343,-0.098795,-0.36383,-0.14997,-0.247518,-0.305586,-0.526744,AAPL
2,2023-10-02 19:00:00,0.216258,-0.3356,-0.120926,-0.09015,-0.441638,-0.194169,-0.03721,0.113333,-0.401908,...,-0.055215,-0.085676,-0.163349,-0.409503,-0.101752,-0.302617,-0.230867,-0.24286,-0.203321,AAPL
3,2023-10-02 20:00:00,-0.446254,-0.335199,-0.078502,-0.163655,-0.034445,-0.062393,-0.270282,-0.144618,-0.112244,...,-0.205947,-0.45819,-0.22247,-0.25209,-0.321782,-0.251838,-0.221336,-0.076145,-0.333897,AAPL
4,2023-10-02 21:00:00,0.099308,-0.443908,-0.244715,-0.058228,-0.37264,-0.248114,-0.170003,-0.007622,-0.174488,...,-0.257884,-0.164924,-0.052361,-0.226918,-0.126118,-0.308527,-0.169839,-0.279804,-0.295615,AAPL


In [16]:
# Stack train and test embedding rows into one frame for export.
# The original row labels are kept (no ignore_index), so index values repeat.
X_emd = pd.concat([X_train, X_test])

In [15]:
# Target: mean Close per ticker and hour, with Stock/Datetime as plain columns.
y = (
    df_best
    .set_index('Datetime')
    .groupby(['Stock', pd.Grouper(freq='h')])
    .agg({'Close': 'mean'})
    .reset_index()
)
y.head()

Unnamed: 0,Stock,Datetime,Close
0,AAPL,2023-01-30 17:00:00,144.833667
1,AAPL,2023-01-30 18:00:00,144.159833
2,AAPL,2023-01-30 19:00:00,143.951083
3,AAPL,2023-01-30 20:00:00,143.681667
4,AAPL,2023-01-30 21:00:00,143.388167


In [19]:
train_start, train_end = '2023-10-01', '2023-11-01'
test_start, test_end = '2023-11-01', '2023-11-07'

# Compare on calendar dates: start is inclusive, end is exclusive.
y_day = y['Datetime'].dt.date
train_lo, train_hi = pd.Timestamp(train_start).date(), pd.Timestamp(train_end).date()
test_lo, test_hi = pd.Timestamp(test_start).date(), pd.Timestamp(test_end).date()

y_train = y[(y_day >= train_lo) & (y_day < train_hi)]

y_test = y[(y_day >= test_lo) & (y_day < test_hi)]

In [20]:
# Stack train and test targets into one frame for export (row labels are kept).
y_emd = pd.concat([y_train, y_test])

In [21]:
# Write the embedding features and their targets to CSV without the index column.
for frame, csv_path in ((X_emd, 'data/x_emb.csv'), (y_emd, 'data/y_emb.csv')):
    frame.to_csv(csv_path, index=False)

In [22]:
y_name = 'Close'
join_keys = ['Stock', 'Datetime']
non_feature_cols = [y_name, 'Datetime', 'Stock']

# Inner-join features with targets so both sides cover the same (Stock, Datetime) rows.
df_train = X_train.merge(y_train, on=join_keys)
df_test = X_test.merge(y_test, on=join_keys)

# NOTE(review): X_train / y_train / X_test / y_test are rebound here to the
# key-less matrices, so this cell cannot be re-run without first re-creating
# the keyed frames (the merge above needs the 'Stock' and 'Datetime' columns).
X_train, y_train = df_train.drop(columns=non_feature_cols), df_train[y_name]
X_test, y_test = df_test.drop(columns=non_feature_cols), df_test[y_name]

In [23]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error as MAE
from xgboost import XGBRegressor

# Gradient-boosted regressor on the embedding features only.
# NOTE(review): this rebinds `model`, which held the TS2Vec encoder until now.
model = XGBRegressor()
y_pred = model.fit(X_train, y_train).predict(X_test)

MAE(y_test, y_pred)

90.89186734483361

# Hard baseline

In [24]:
# Hourly mean Close per ticker; the lag features are derived from this frame.
X_baseline = (
    df_best
    .set_index('Datetime')
    .groupby(['Stock', pd.Grouper(freq='h')])
    .agg({'Close': 'mean'})
    .reset_index()
)
X_baseline.head()

Unnamed: 0,Stock,Datetime,Close
0,AAPL,2023-01-30 17:00:00,144.833667
1,AAPL,2023-01-30 18:00:00,144.159833
2,AAPL,2023-01-30 19:00:00,143.951083
3,AAPL,2023-01-30 20:00:00,143.681667
4,AAPL,2023-01-30 21:00:00,143.388167


In [25]:
# Number of lagged hourly values used as features.
n_shifts = 18

# Group the Close series by ticker once. The previous version regrouped the
# whole frame on every iteration and shifted every column (including the
# shift_* columns added so far) only to keep 'Close'.
close_by_stock = X_baseline['Close'].groupby(X_baseline['Stock'])
for i in range(1, n_shifts + 1):
    # shift_i = Close value i rows earlier within the same ticker.
    X_baseline[f'shift_{i}'] = close_by_stock.shift(i)

# Drop rows with an incomplete lag window, then drop the unshifted Close so
# only lagged values remain as features.
X_baseline = X_baseline.dropna()
X_baseline = X_baseline.drop(columns='Close')
X_baseline.head()

Unnamed: 0,Stock,Datetime,shift_1,shift_2,shift_3,shift_4,shift_5,shift_6,shift_7,shift_8,shift_9,shift_10,shift_11,shift_12,shift_13,shift_14,shift_15,shift_16,shift_17,shift_18
18,AAPL,2023-02-02 17:00:00,143.121271,142.1865,142.061917,142.751167,142.929167,143.026833,143.825508,143.47225,143.434917,143.335833,143.01,142.988,143.219153,143.388167,143.681667,143.951083,144.159833,144.833667
19,AAPL,2023-02-02 18:00:00,149.049167,143.121271,142.1865,142.061917,142.751167,142.929167,143.026833,143.825508,143.47225,143.434917,143.335833,143.01,142.988,143.219153,143.388167,143.681667,143.951083,144.159833
20,AAPL,2023-02-02 19:00:00,149.299833,149.049167,143.121271,142.1865,142.061917,142.751167,142.929167,143.026833,143.825508,143.47225,143.434917,143.335833,143.01,142.988,143.219153,143.388167,143.681667,143.951083
21,AAPL,2023-02-02 20:00:00,149.979417,149.299833,149.049167,143.121271,142.1865,142.061917,142.751167,142.929167,143.026833,143.825508,143.47225,143.434917,143.335833,143.01,142.988,143.219153,143.388167,143.681667
22,AAPL,2023-02-02 21:00:00,149.869333,149.979417,149.299833,149.049167,143.121271,142.1865,142.061917,142.751167,142.929167,143.026833,143.825508,143.47225,143.434917,143.335833,143.01,142.988,143.219153,143.388167


In [26]:
# Target for the lag-feature baseline: mean Close per ticker and hour.
y = (
    df_best
    .set_index('Datetime')
    .groupby(['Stock', pd.Grouper(freq='h')])
    .agg({'Close': 'mean'})
    .reset_index()
)
y.head()

Unnamed: 0,Stock,Datetime,Close
0,AAPL,2023-01-30 17:00:00,144.833667
1,AAPL,2023-01-30 18:00:00,144.159833
2,AAPL,2023-01-30 19:00:00,143.951083
3,AAPL,2023-01-30 20:00:00,143.681667
4,AAPL,2023-01-30 21:00:00,143.388167


In [27]:
train_start, train_end = '2023-10-01', '2023-11-01'
test_start, test_end = '2023-11-01', '2023-11-07'


def _date_window(frame, start, end):
    """Return rows of `frame` whose Datetime calendar date is in [start, end)."""
    day = frame['Datetime'].dt.date
    return frame[(day >= pd.Timestamp(start).date()) &
                 (day < pd.Timestamp(end).date())]


y_train_baseline = _date_window(y, train_start, train_end)

y_test_baseline = _date_window(y, test_start, test_end)

X_train_baseline = _date_window(X_baseline, train_start, train_end)

X_test_baseline = _date_window(X_baseline, test_start, test_end)

In [28]:
y_name = 'Close'
join_keys = ['Stock', 'Datetime']
non_feature_cols = [y_name, 'Datetime', 'Stock']

# Inner-join lag features with targets on (Stock, Datetime) so rows line up.
df_train = X_train_baseline.merge(y_train_baseline, on=join_keys)
df_test = X_test_baseline.merge(y_test_baseline, on=join_keys)

# Split each joined frame back into a feature matrix and a target series.
X_train_baseline, y_train_baseline = df_train.drop(columns=non_feature_cols), df_train[y_name]
X_test_baseline, y_test_baseline = df_test.drop(columns=non_feature_cols), df_test[y_name]

In [29]:
# Preview the lag-feature matrix (columns shift_1 .. shift_18).
X_train_baseline.head()

Unnamed: 0,shift_1,shift_2,shift_3,shift_4,shift_5,shift_6,shift_7,shift_8,shift_9,shift_10,shift_11,shift_12,shift_13,shift_14,shift_15,shift_16,shift_17,shift_18
0,171.311695,170.744583,171.005583,171.718583,172.111083,172.523333,170.999915,170.717667,170.963167,171.560167,170.608667,169.657333,170.279068,169.947333,169.615583,169.9375,170.6825,170.882167
1,173.344,171.311695,170.744583,171.005583,171.718583,172.111083,172.523333,170.999915,170.717667,170.963167,171.560167,170.608667,169.657333,170.279068,169.947333,169.615583,169.9375,170.6825
2,173.639167,173.344,171.311695,170.744583,171.005583,171.718583,172.111083,172.523333,170.999915,170.717667,170.963167,171.560167,170.608667,169.657333,170.279068,169.947333,169.615583,169.9375
3,173.2195,173.639167,173.344,171.311695,170.744583,171.005583,171.718583,172.111083,172.523333,170.999915,170.717667,170.963167,171.560167,170.608667,169.657333,170.279068,169.947333,169.615583
4,173.224333,173.2195,173.639167,173.344,171.311695,170.744583,171.005583,171.718583,172.111083,172.523333,170.999915,170.717667,170.963167,171.560167,170.608667,169.657333,170.279068,169.947333


In [30]:
# Write the baseline features and targets to CSV without the index column.
baseline_exports = (
    (X_train_baseline, 'data/X_train_baseline.csv'),
    (y_train_baseline, 'data/y_train_baseline.csv'),
    (X_test_baseline, 'data/X_test_baseline.csv'),
    (y_test_baseline, 'data/y_test_baseline.csv'),
)
for frame, csv_path in baseline_exports:
    frame.to_csv(csv_path, index=False)

In [31]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error as MAE

# Seed the forest so the reported MAE, and the later bootstrap comparison that
# reuses `model_baseline`, are reproducible across re-runs. An unseeded forest
# gives a different model on every execution.
model_baseline = RandomForestRegressor(random_state=42)

model_baseline.fit(X_train_baseline, y_train_baseline)
y_pred_baseline = model_baseline.predict(X_test_baseline)

MAE(y_test_baseline, y_pred_baseline)

1.0431735740973136

# Simple baseline

In [32]:
# Naive forecast: predict each hour with the previous row's mean Close (shift_1).
y_pred = X_test_baseline['shift_1']
# Score against the target split together with X_test_baseline, so the rows
# are aligned by construction. The previous code used `y_test` from the
# embedding pipeline, which only matches if both pipelines happen to produce
# identical row order.
y_test_sh = y_test_baseline

MAE(y_test_sh, y_pred)

0.5160046786728717

## Baseline + TS2Vec

In [33]:
# Alias (not a copy) of the training lag features, to be appended to the embeddings.
shifts = X_train_baseline

In [34]:
# Alias (not a copy) of the test lag features, to be appended to the embeddings.
shifts_test = X_test_baseline

In [35]:
# NOTE(review): this function is already defined earlier in the notebook;
# keep a single definition and call it from both sections.
def stock_embeddigns_to_df(data_tensor: np.ndarray, stocks, dates) -> pd.DataFrame:
    """Flatten a per-stock embedding tensor into one long DataFrame.

    Args:
        data_tensor: Array indexed by stock position; ``data_tensor[i]`` is a 2-D
            slice with one row per entry of ``dates``.
        stocks: Iterable of stock labels, in the same order as the first axis
            of ``data_tensor``.
        dates: Index values assigned to the rows of every per-stock slice.

    Returns:
        DataFrame with a ``Datetime`` column, ``emb_0 .. emb_{k-1}`` columns and
        a ``Stock`` column; the per-stock frames are stacked in ``stocks`` order
        and the result has a fresh 0..N-1 index.
    """
    frames = []
    for pos, stock in enumerate(stocks):
        frame = pd.DataFrame(data_tensor[pos], index=dates)
        frame.columns = [f'emb_{j}' for j in range(frame.shape[1])]
        frame['Stock'] = stock
        frames.append(frame)

    # Concatenate once: growing a frame inside the loop re-copies all previous
    # rows on every iteration and starts from an empty frame.
    combined = pd.concat(frames) if frames else pd.DataFrame()
    return combined.reset_index(drop=False, names='Datetime')

# Rebuild the keyed embedding frames: X_train / X_test were rebound to key-less
# feature matrices earlier, and the merge below needs Stock and Datetime.
train_open, test_open = train_data['Open'], test_data['Open']

X_train = stock_embeddigns_to_df(train_repr, stocks=train_open.columns, dates=train_open.index)
X_test = stock_embeddigns_to_df(test_repr, stocks=test_open.columns, dates=test_open.index)
X_train.head()

Unnamed: 0,Datetime,emb_0,emb_1,emb_2,emb_3,emb_4,emb_5,emb_6,emb_7,emb_8,...,emb_119,emb_120,emb_121,emb_122,emb_123,emb_124,emb_125,emb_126,emb_127,Stock
0,2023-10-02 17:00:00,0.178085,-0.533866,-0.234123,-0.114266,-0.190026,-0.168763,-0.197685,0.125318,-0.221335,...,-0.279676,-0.001371,-0.058396,-0.299537,-0.144431,-0.458052,-0.044706,0.03626,0.025789,AAPL
1,2023-10-02 18:00:00,-0.538837,-0.238738,-0.110322,-0.121523,-0.094981,-0.139494,-0.295027,-0.148487,-0.006937,...,-0.227457,-0.531321,-0.186343,-0.098795,-0.36383,-0.14997,-0.247518,-0.305586,-0.526744,AAPL
2,2023-10-02 19:00:00,0.216258,-0.3356,-0.120926,-0.09015,-0.441638,-0.194169,-0.03721,0.113333,-0.401908,...,-0.055215,-0.085676,-0.163349,-0.409503,-0.101752,-0.302617,-0.230867,-0.24286,-0.203321,AAPL
3,2023-10-02 20:00:00,-0.446254,-0.335199,-0.078502,-0.163655,-0.034445,-0.062393,-0.270282,-0.144618,-0.112244,...,-0.205947,-0.45819,-0.22247,-0.25209,-0.321782,-0.251838,-0.221336,-0.076145,-0.333897,AAPL
4,2023-10-02 21:00:00,0.099308,-0.443908,-0.244715,-0.058228,-0.37264,-0.248114,-0.170003,-0.007622,-0.174488,...,-0.257884,-0.164924,-0.052361,-0.226918,-0.126118,-0.308527,-0.169839,-0.279804,-0.295615,AAPL


In [36]:
# Target for the combined model: mean Close per ticker and hour.
y = (
    df_best
    .set_index('Datetime')
    .groupby(['Stock', pd.Grouper(freq='h')])
    .agg({'Close': 'mean'})
    .reset_index()
)
y.head()

Unnamed: 0,Stock,Datetime,Close
0,AAPL,2023-01-30 17:00:00,144.833667
1,AAPL,2023-01-30 18:00:00,144.159833
2,AAPL,2023-01-30 19:00:00,143.951083
3,AAPL,2023-01-30 20:00:00,143.681667
4,AAPL,2023-01-30 21:00:00,143.388167


In [37]:
train_start, train_end = '2023-10-01', '2023-11-01'
test_start, test_end = '2023-11-01', '2023-11-07'

# Compare on calendar dates: start is inclusive, end is exclusive.
y_day = y['Datetime'].dt.date
train_lo, train_hi = pd.Timestamp(train_start).date(), pd.Timestamp(train_end).date()
test_lo, test_hi = pd.Timestamp(test_start).date(), pd.Timestamp(test_end).date()

y_train_baseline_t2v = y[(y_day >= train_lo) & (y_day < train_hi)]

y_test_baseline_t2v = y[(y_day >= test_lo) & (y_day < test_hi)]

In [38]:
y_name = 'Close'
join_keys = ['Stock', 'Datetime']
non_feature_cols = [y_name, 'Datetime', 'Stock']

# Inner-join embedding rows with targets on (Stock, Datetime).
df_train = X_train.merge(y_train_baseline_t2v, on=join_keys)
df_test = X_test.merge(y_test_baseline_t2v, on=join_keys)

# Split each joined frame back into a feature matrix and a target series.
X_train_baseline_t2v, y_train_baseline_t2v = df_train.drop(columns=non_feature_cols), df_train[y_name]
X_test_baseline_t2v, y_test_baseline_t2v = df_test.drop(columns=non_feature_cols), df_test[y_name]

In [39]:
# pd.concat(axis=1) aligns rows on index labels. The embedding and lag frames
# come from two independent merges, so a row-count mismatch would silently
# produce NaN-padded rows; fail loudly instead.
if len(X_train_baseline_t2v) != len(shifts) or len(X_test_baseline_t2v) != len(shifts_test):
    raise ValueError("embedding and shift feature frames have different row counts")

# Dropping already-appended shift_* columns first keeps the cell idempotent:
# re-running it no longer adds duplicate columns. reset_index makes the join
# purely positional.
# NOTE(review): the Stock/Datetime keys were dropped from both frames, so row
# correspondence relies on both pipelines producing the same order — confirm.
X_train_baseline_t2v = pd.concat(
    [
        X_train_baseline_t2v.drop(columns=shifts.columns, errors='ignore').reset_index(drop=True),
        shifts.reset_index(drop=True),
    ],
    axis=1,
)
X_test_baseline_t2v = pd.concat(
    [
        X_test_baseline_t2v.drop(columns=shifts_test.columns, errors='ignore').reset_index(drop=True),
        shifts_test.reset_index(drop=True),
    ],
    axis=1,
)

In [46]:
# Write the combined (embedding + lag) features and targets to CSV, no index column.
t2v_exports = (
    (X_train_baseline_t2v, 'data/X_train_baseline_t2v.csv'),
    (y_train_baseline_t2v, 'data/y_train_baseline_t2v.csv'),
    (X_test_baseline_t2v, 'data/X_test_baseline_t2v.csv'),
    (y_test_baseline_t2v, 'data/y_test_baseline_t2v.csv'),
)
for frame, csv_path in t2v_exports:
    frame.to_csv(csv_path, index=False)

In [36]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error as MAE
from xgboost import XGBRegressor

# Seed the forest so the reported MAE, and the significance test that compares
# this model with `model_baseline`, are reproducible across re-runs.
model_baseline_t2v = RandomForestRegressor(random_state=42)

model_baseline_t2v.fit(X_train_baseline_t2v, y_train_baseline_t2v)
y_pred = model_baseline_t2v.predict(X_test_baseline_t2v)

MAE(y_test_baseline_t2v, y_pred)

0.9832571047499192

## Проверка статистической значимости результатов модели Baseline + TS2Vec

## Нулевая гипотеза: средние метрик для двух моделей статистически не различимы.

## Альтернативная гипотеза: средние метрик для двух моделей статистически различимы.

In [41]:
from scipy.stats import ttest_ind

from sklearn.metrics import mean_absolute_error

def predict_and_evaluate(model, X, y_true):
    """
    Predict with a fitted model and score the predictions with MAE.

    Args:
        model: Fitted model exposing ``predict``.
        X (array-like): Input features passed to ``model.predict``.
        y_true (array-like): Ground-truth values.

    Returns:
        float: Mean absolute error between ``y_true`` and the predictions.
    """
    return mean_absolute_error(y_true, model.predict(X))

def compare_model_performance(model1, model2, X1, X2, y_true1, y_true2, n=100, alpha=0.05,
                              random_state=None):
    """
    Compare two fitted models by bootstrapping their MAE on row-aligned test sets.

    On each of ``n`` rounds one set of row indices is drawn with replacement and
    applied to all four arrays, so both models are scored on the same resampled
    rows. The two MAE samples are then compared with ``ttest_ind`` and the
    verdict is printed.

    Args:
        model1: First fitted model.
        model2: Second fitted model.
        X1 (np.ndarray): Inputs for the first model (indexed with an integer array).
        X2 (np.ndarray): Inputs for the second model, row-aligned with ``X1``.
        y_true1 (np.ndarray): Ground truth for the first model.
        y_true2 (np.ndarray): Ground truth for the second model.
        n (int): Number of bootstrap rounds.
        alpha (float): Significance level for the printed decision.
        random_state (int | None): Seed for the resampling. ``None`` (default)
            keeps drawing from NumPy's global random state, as before.

    Returns:
        tuple: ``(mae_list1, mae_list2, p_value)`` — the per-round MAE values of
        each model and the p-value of the test.

    Raises:
        ValueError: If the four arrays do not have the same number of rows.
    """
    # One index set is applied to all four arrays, so they must be the same length.
    n_rows = len(X1)
    if not (len(X2) == len(y_true1) == len(y_true2) == n_rows):
        raise ValueError("X1, X2, y_true1 and y_true2 must have the same number of rows")

    # Same `choice` API either way; only the source of randomness differs.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    mae_list1 = []
    mae_list2 = []

    # The row count lives in its own name; the loop body used to rebind `n`.
    for _ in range(n):
        idx = rng.choice(np.arange(n_rows), size=n_rows, replace=True)

        mae1 = predict_and_evaluate(model1, X1[idx], y_true1[idx])
        mae2 = predict_and_evaluate(model2, X2[idx], y_true2[idx])
        mae_list1.append(mae1)
        mae_list2.append(mae2)

    # NOTE(review): both lists come from the same resampled indices, so the
    # samples are paired, while ttest_ind treats them as independent. A t-test
    # over bootstrap replicates also tends to shrink its p-value as `n` grows;
    # consider a paired test or a bootstrap CI of the MAE difference.
    _t_statistic, p_value = ttest_ind(mae_list1, mae_list2)
    print(f"Значение p-value: {p_value}")
    if p_value < alpha:
        print("Отвергаем нулевую гипотезу: средние значения различаются.")
    else:
        print("Не отвергаем нулевую гипотезу: нет статистически значимых различий в средних значениях.")

    return mae_list1, mae_list2, p_value

# Bootstrap-compare the lag-only forest (model 1) with the lag + embedding
# forest (model 2) on their own test matrices, passed as numpy arrays.
mae_model1, mae_model2, p_value = compare_model_performance(
    model1=model_baseline,
    model2=model_baseline_t2v,
    X1=X_test_baseline.values,
    X2=X_test_baseline_t2v.values,
    y_true1=y_test_baseline.values,
    y_true2=y_test_baseline_t2v.values,
)
print("MAE для модели 1:", mae_model1)
print("MAE для модели 2:", mae_model2)



Значение p-value: 0.022341159516581423
Отвергаем нулевую гипотезу: средние значения различаются.
MAE для модели 1: [0.9023952590085632, 0.9728520593442314, 0.972404809350284, 1.0289250022319345, 0.9662125062856178, 0.9319881267017465, 0.8954322316054356, 0.9752574060641961, 0.9727847091971681, 0.9620545940213031, 0.9448002950674278, 0.9301763289415693, 0.9425346485217652, 1.0214564461884064, 0.9341324848342946, 0.9030547101541492, 0.899864733478808, 1.023503159739135, 1.0058858834154507, 0.9040289590353249, 0.9886136402157742, 1.0334716898169465, 1.030001628798413, 0.9345515602681909, 1.0248635291455979, 0.9304452261725715, 0.9603803414883407, 0.9864541567103424, 0.9647278250865176, 0.9103886972259732, 0.926977277691047, 0.9510962857466102, 0.8667122283600726, 0.8819611638841659, 0.9839692460212058, 1.0057822199435187, 1.0358988291662925, 0.9678313728790972, 0.9948161179943695, 1.0340925793394793, 0.9348641828877623, 0.9817384647766447, 0.8656324988071834, 1.0411768410081041, 0.9169129

In [43]:
# Mean bootstrap MAE of each model (model 1 = lag-only, model 2 = lag + embeddings).
np.mean(mae_model1), np.mean(mae_model2)

(0.9590380616269778, 0.9758146502532125)