In [1]:
import os
import sys

# Make the parent of the current working directory importable, then move the
# working directory up one level.
# NOTE: the chdir is relative, so re-running this cell moves up one more level.
dir2 = os.path.abspath('')
dir1 = os.path.dirname(dir2)

if dir1 not in sys.path:
    sys.path.append(dir1)

os.chdir(os.pardir)

In [2]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import yaml

from sklearn.metrics import mean_absolute_error as MAE
from sklearn.metrics import mean_absolute_percentage_error as MAPE
from matplotlib import pyplot as plt
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

# NOTE(review): the working directory is switched to the TS2Vec source folder while
# the project modules are imported and is restored at the end of this cell --
# presumably the TS2Vec code resolves something relative to its own directory;
# confirm before removing the chdir calls.
os.chdir('src/models/ts2vec_src')

from src.data.preprocessing import read_data, data_to_np_tensor, preprocess_split
from src.models.ts2vec_src.ts2vec import TS2Vec

# Automatically reload edited modules before each cell is executed.
%load_ext autoreload
%autoreload 2

# Undo the chdir above: three levels up from src/models/ts2vec_src.
os.chdir('../../..')

  from .autonotebook import tqdm as notebook_tqdm


# Functions

In [3]:
def stock_embeddigns_to_df(data_tensor: np.ndarray, stocks, dates) -> pd.DataFrame:
    """Flatten a per-stock embedding tensor into one long DataFrame.

    Parameters
    ----------
    data_tensor : np.ndarray
        Indexed as ``data_tensor[i]``; each slice must be a 2-D block with one
        row per entry of ``dates`` and one column per embedding dimension.
    stocks : iterable
        Stock labels; the i-th label is attached to ``data_tensor[i]``.
    dates : index-like
        Row labels shared by every stock block; they end up in ``Datetime``.

    Returns
    -------
    pd.DataFrame
        Columns ``Datetime``, ``emb_0`` ... ``emb_{k-1}``, ``Stock`` with a
        fresh 0..n-1 index. Blocks appear in the order of ``stocks``. With no
        stocks the result has zero rows.
    """
    blocks = []
    for position, stock in enumerate(stocks):
        block = pd.DataFrame(data_tensor[position], index=dates)
        block.columns = [f'emb_{dim}' for dim in range(block.shape[1])]
        block['Stock'] = stock
        blocks.append(block)

    # Concatenate once at the end: growing a frame inside the loop re-copies all
    # previous rows on every iteration and needs an empty frame as a seed.
    combined = pd.concat(blocks) if blocks else pd.DataFrame()

    return combined.reset_index(drop=False, names='Datetime')



# Config

In [4]:
# The stocks to keep are the top-level keys of the YAML config.
with open('configs/best_stocks_nans_rate.yaml') as config_file:
    best_stocks = yaml.load(config_file, Loader=yaml.FullLoader)
best_stocks = [ticker for ticker in best_stocks.keys()]
# best_stocks

In [5]:
# Per-column reducers applied when bars are aggregated to hourly frequency.
col_agg_finctions = dict(
    Open='first',
    High='max',
    Low='min',
    Close='last',
    Volume='sum',
)

In [6]:
# Date windows for the two splits. The date filters written in this notebook
# treat them as start-inclusive / end-exclusive.
train_start = '2023-10-01'
train_end = '2023-11-01'

test_start = '2023-11-01'
test_end = '2023-11-07'

In [7]:
# When True, the hourly OHLCV values are replaced by per-stock percentage changes
# and estimate_result() is asked (pct_change=True) to rebuild price levels before
# scoring; when False the raw hourly values are used and scored directly.
use_pct_changes = False

In [8]:
# Passed to TS2Vec as `device` and `output_dims`.
# NOTE(review): device=2 looks like a GPU index -- confirm it exists on the target machine.
ts2vec_device, ts2vec_out_dim = 2, 128

# Number of lagged `Close` columns (shift_1 ... shift_n) in the baseline features.
n_shifts = 18

# Data loading

In [9]:
# Load every ticker, then keep only the rows whose stock is listed in the config.
df = read_data('data/all_tickers.csv')
df_best = df.loc[df['Stock'].isin(best_stocks)]
df_best.head()

Unnamed: 0,Date,Time,Open,High,Low,Close,Volume,Datetime,Stock,Day_week
61,2024-01-30,22:58:00,188.13,188.17,188.11,188.14,1500,2024-01-30 22:58:00,AAPL,Tuesday
62,2024-01-30,22:57:00,188.22,188.22,188.08,188.11,5700,2024-01-30 22:57:00,AAPL,Tuesday
63,2024-01-30,22:56:00,188.22,188.27,188.215,188.22,2600,2024-01-30 22:56:00,AAPL,Tuesday
64,2024-01-30,22:55:00,188.15,188.2,188.14,188.2,2100,2024-01-30 22:55:00,AAPL,Tuesday
65,2024-01-30,22:54:00,188.18,188.18,188.14,188.18,2013,2024-01-30 22:54:00,AAPL,Tuesday


# Preprocessing

In [10]:
# Aggregate the bars to one row per (Stock, hour) using the OHLCV reducers.
df_agg = (
    df_best
    .set_index('Datetime')
    .groupby(['Stock', pd.Grouper(freq='h')])
    .agg(col_agg_finctions)
)

df_agg

Unnamed: 0_level_0,Unnamed: 1_level_0,Open,High,Low,Close,Volume
Stock,Datetime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AAPL,2023-01-30 17:00:00,145.15,145.53,144.18,145.13,291457
AAPL,2023-01-30 18:00:00,143.97,145.33,143.38,144.96,348830
AAPL,2023-01-30 19:00:00,144.07,144.30,143.50,143.95,215404
AAPL,2023-01-30 20:00:00,143.38,144.24,143.18,144.17,254970
AAPL,2023-01-30 21:00:00,143.31,143.81,143.01,143.38,192383
...,...,...,...,...,...,...
XOM,2024-01-30 18:00:00,103.69,103.70,102.95,103.22,159389
XOM,2024-01-30 19:00:00,103.79,103.96,103.49,103.67,90110
XOM,2024-01-30 20:00:00,103.68,103.98,103.63,103.76,54009
XOM,2024-01-30 21:00:00,104.15,104.16,103.67,103.68,61363


In [11]:
# Flat copy of the hourly price levels; kept so price levels stay available
# even when the working frame holds percentage changes.
df_original = df_agg.reset_index()

# Working frame: per-stock percentage changes or the raw hourly values.
df = df_agg.groupby('Stock').pct_change().reset_index() if use_pct_changes else df_original
df.head()

Unnamed: 0,Stock,Datetime,Open,High,Low,Close,Volume
0,AAPL,2023-01-30 17:00:00,145.15,145.53,144.18,145.13,291457
1,AAPL,2023-01-30 18:00:00,143.97,145.33,143.38,144.96,348830
2,AAPL,2023-01-30 19:00:00,144.07,144.3,143.5,143.95,215404
3,AAPL,2023-01-30 20:00:00,143.38,144.24,143.18,144.17,254970
4,AAPL,2023-01-30 21:00:00,143.31,143.81,143.01,143.38,192383


In [12]:
# Target: hourly `Close` per stock. `df` is already hourly, so the mean is
# taken over a single row per (Stock, hour).
y = (
    df
    .set_index('Datetime')
    .groupby(['Stock', pd.Grouper(freq='h')])
    .agg({'Close': 'mean'})
    .reset_index()
)

# Calendar-date masks: start date inclusive, end date exclusive.
target_dates = y['Datetime'].dt.date

in_train_window = (
    (target_dates >= pd.Timestamp(train_start).date())
    & (target_dates < pd.Timestamp(train_end).date())
)
y_train = y[in_train_window]

in_test_window = (
    (target_dates >= pd.Timestamp(test_start).date())
    & (target_dates < pd.Timestamp(test_end).date())
)
y_test = y[in_test_window]

In [13]:
# Same columns and tickers for both splits; only the date window differs.
ohlcv_columns = ['Open', 'High', 'Low', 'Close', 'Volume']

train_data = preprocess_split(
    df, list(ohlcv_columns), start_date=train_start, end_date=train_end, tickers_save=best_stocks,
)

test_data = preprocess_split(
    df, list(ohlcv_columns), start_date=test_start, end_date=test_end, tickers_save=best_stocks,
)

NAN in data  0
NAN in data  0
NAN in data  0
NAN in data  0
NAN in data  0
NAN in data  0
NAN in data  0
NAN in data  0
NAN in data  0
NAN in data  0


## TS2Vec

In [14]:
# Convert both splits to numpy tensors and show their shapes.
train_ts, test_ts = (data_to_np_tensor(split) for split in (train_data, test_data))

(train_ts.shape, test_ts.shape)

((28, 132, 5), (28, 24, 5))

In [15]:
# One input channel per entry along the last axis of the training tensor.
n_input_channels = train_ts.shape[2]
model = TS2Vec(input_dims=n_input_channels, device=ts2vec_device, output_dims=ts2vec_out_dim)

# Fit on the training window only; the return value of fit() is kept as the loss log.
loss_log = model.fit(train_ts, verbose=False)

In [16]:
# Encode both windows with the fitted model and show the resulting shapes.
train_repr, test_repr = (model.encode(tensor) for tensor in (train_ts, test_ts))

(train_repr.shape, test_repr.shape)

((28, 132, 128), (28, 24, 128))

In [17]:
# Long-format embedding frames; stock labels and timestamps are taken from the
# `Open` table of each split.
X_train_emb = stock_embeddigns_to_df(
    train_repr,
    stocks=train_data['Open'].columns,
    dates=train_data['Open'].index,
)
X_test_emb = stock_embeddigns_to_df(
    test_repr,
    stocks=test_data['Open'].columns,
    dates=test_data['Open'].index,
)
X_train_emb.head()

Unnamed: 0,Datetime,emb_0,emb_1,emb_2,emb_3,emb_4,emb_5,emb_6,emb_7,emb_8,...,emb_119,emb_120,emb_121,emb_122,emb_123,emb_124,emb_125,emb_126,emb_127,Stock
0,2023-10-02 17:00:00,-1134.575439,3511.307617,-2438.772705,-4885.083984,-1966.540527,-1790.822754,-1190.49292,-867.698486,-3469.033203,...,-2025.104736,-3993.119873,-5386.396484,-6245.114258,-1504.494629,372.939087,-425.033203,-5172.709473,-2401.736084,AAPL
1,2023-10-02 18:00:00,-4611.337891,4517.751953,-8415.246094,-3623.351807,-2352.958984,320.557678,-3800.040771,-1678.055664,-7006.621582,...,-4581.333008,-6194.522461,-7788.831055,-5030.794434,-3742.253662,-3388.576416,2001.724121,-1957.170288,-2886.735107,AAPL
2,2023-10-02 19:00:00,-422.289612,2018.228027,-4399.97998,-2130.789062,-2340.783936,487.752441,-1706.715698,-2998.628906,-2990.81958,...,-2845.390137,-3337.228271,-4053.755371,-2905.787598,-1049.087402,1077.893433,677.294556,-3197.78833,-3295.083496,AAPL
3,2023-10-02 20:00:00,-1938.690308,1940.564819,-5421.805664,-3896.167969,907.119751,-209.622742,-1841.934814,-5404.314453,-4773.838867,...,-3238.296631,-6915.606445,-4028.384277,-5449.882324,-1478.38623,-1272.773438,-973.62738,-2666.458008,-4807.170898,AAPL
4,2023-10-02 21:00:00,-1296.092773,1000.00946,-3853.662598,-1414.926025,-2927.648682,-1315.63208,-3638.165527,-3610.118652,-1765.083984,...,-1672.01001,-2308.138428,-1942.726318,-2254.842529,-2312.055664,-785.482056,-2115.631348,-2081.612793,-3658.485596,AAPL


In [18]:
# Stack train and test parts and persist them for reuse outside the notebook.
X_emb = pd.concat([X_train_emb, X_test_emb])
y = pd.concat([y_train, y_test])

# Create the output folder first: to_csv does not create missing directories,
# so the export would otherwise fail on a fresh checkout.
results_dir = 'results/TS2Vec'
os.makedirs(results_dir, exist_ok=True)

X_emb.to_csv(f'{results_dir}/x_emb.csv', index=False)
y.to_csv(f'{results_dir}/y.csv', index=False)

## Baseline

In [19]:
# Starting point for the lag features: hourly `Close` per stock.
X_baseline = (
    df
    .set_index('Datetime')
    .groupby(['Stock', pd.Grouper(freq='h')])
    .agg({'Close': 'mean'})
    .reset_index()
)
X_baseline.head()

Unnamed: 0,Stock,Datetime,Close
0,AAPL,2023-01-30 17:00:00,145.13
1,AAPL,2023-01-30 18:00:00,144.96
2,AAPL,2023-01-30 19:00:00,143.95
3,AAPL,2023-01-30 20:00:00,144.17
4,AAPL,2023-01-30 21:00:00,143.38


In [20]:
# Lag features: shift_k holds the `Close` value k rows earlier within the same stock.
# The group-by is built once and only `Close` is shifted, instead of re-grouping
# and shifting every column on each iteration.
close_by_stock = X_baseline.groupby('Stock')['Close']
lag_features = pd.DataFrame(
    {f'shift_{lag}': close_by_stock.shift(lag) for lag in range(1, n_shifts + 1)}
)

# Rows without a full lag history are dropped; `Close` itself is not a feature.
# NOTE: this consumes the `Close` column, so re-run the previous cell before
# re-running this one.
X_baseline = (
    pd.concat([X_baseline, lag_features], axis=1)
    .dropna()
    .drop(columns='Close')
)
X_baseline.head()

Unnamed: 0,Stock,Datetime,shift_1,shift_2,shift_3,shift_4,shift_5,shift_6,shift_7,shift_8,shift_9,shift_10,shift_11,shift_12,shift_13,shift_14,shift_15,shift_16,shift_17,shift_18
18,AAPL,2023-02-02 17:00:00,141.93,142.15,142.4,142.59,142.89,143.87,143.67,143.46,143.29,143.22,142.44,142.46,143.32,143.38,144.17,143.95,144.96,145.13
19,AAPL,2023-02-02 18:00:00,149.35,141.93,142.15,142.4,142.59,142.89,143.87,143.67,143.46,143.29,143.22,142.44,142.46,143.32,143.38,144.17,143.95,144.96
20,AAPL,2023-02-02 19:00:00,148.78,149.35,141.93,142.15,142.4,142.59,142.89,143.87,143.67,143.46,143.29,143.22,142.44,142.46,143.32,143.38,144.17,143.95
21,AAPL,2023-02-02 20:00:00,150.02,148.78,149.35,141.93,142.15,142.4,142.59,142.89,143.87,143.67,143.46,143.29,143.22,142.44,142.46,143.32,143.38,144.17
22,AAPL,2023-02-02 21:00:00,149.81,150.02,148.78,149.35,141.93,142.15,142.4,142.59,142.89,143.87,143.67,143.46,143.29,143.22,142.44,142.46,143.32,143.38


In [21]:
# Split the lag features with the same calendar-date windows as the target:
# start date inclusive, end date exclusive.
baseline_dates = X_baseline['Datetime'].dt.date

baseline_in_train = (
    (baseline_dates >= pd.Timestamp(train_start).date())
    & (baseline_dates < pd.Timestamp(train_end).date())
)
X_train_base = X_baseline[baseline_in_train]

baseline_in_test = (
    (baseline_dates >= pd.Timestamp(test_start).date())
    & (baseline_dates < pd.Timestamp(test_end).date())
)
X_test_base = X_baseline[baseline_in_test]

In [22]:
# `df_original` already has a flat index, so this reset only adds an `index` column.
orig_vals = df_original.reset_index()

# Last training timestamp of every stock ...
last_train_date = (
    X_train_base
    .groupby(['Stock'], as_index=False)
    .last()
    .loc[:, ['Stock', 'Datetime']]
)
# ... and the original values at that timestamp; estimate_result() reads the
# `Close` column as the start price when rebuilding prices from changes.
y_start_test = pd.merge(orig_vals, last_train_date, how='inner', on=['Stock', 'Datetime'])
y_start_test.head()

Unnamed: 0,index,Stock,Datetime,Open,High,Low,Close,Volume
0,1142,AAPL,2023-10-31 22:00:00,170.85,170.9,170.385,170.62,338198
1,2649,ABBV,2023-10-31 22:00:00,141.42,141.65,139.91,140.19,86557
2,4156,ABT,2023-10-31 22:00:00,94.53,94.88,94.35,94.84,56242
3,5663,AMD,2023-10-31 22:00:00,98.49,98.8,98.1,98.44,386922
4,7170,BAC,2023-10-31 22:00:00,26.355,26.375,26.275,26.28,579334


## TS2Vec + baseline

In [23]:
# Combined feature set: lag features joined with the embeddings of the same
# (Stock, Datetime) row.
join_keys = ['Stock', 'Datetime']
X_train_bs_emb = X_train_base.merge(X_train_emb, on=join_keys, how='inner')
X_test_bs_emb = X_test_base.merge(X_test_emb, on=join_keys, how='inner')

# Prediction

In [24]:
def _align_features_and_target(features, target, keys=('Stock', 'Datetime'), target_col='Close'):
    """Pair every feature row with its target by key instead of by row position.

    The feature frames and the target frame are filtered independently, so
    matching them positionally is only correct when both happen to contain
    exactly the same (Stock, Datetime) rows in the same order. Joining on the
    keys makes the pairing explicit.

    Parameters
    ----------
    features : pd.DataFrame
        Must contain the ``keys`` columns and must not contain ``target_col``.
    target : pd.DataFrame
        Must contain the ``keys`` columns and ``target_col``.

    Returns
    -------
    (pd.DataFrame, pd.Series)
        Features and target with an identical 0..n-1 index, in the row order of
        ``features``. Feature rows without a target are dropped (with a warning).

    Raises
    ------
    pandas.errors.MergeError
        If a key occurs more than once on either side.
    """
    import warnings

    keys = list(keys)
    paired = features.merge(
        target[keys + [target_col]], on=keys, how='inner', validate='one_to_one',
    )
    dropped = len(features) - len(paired)
    if dropped:
        warnings.warn(f'{dropped} feature rows have no matching target and were dropped')
    return paired.drop(columns=target_col), paired[target_col]


def _make_dataset(X_train, X_test):
    """Build one entry of `datasets` from a train/test feature pair."""
    X_tr, y_tr = _align_features_and_target(X_train, y_train)
    X_te, y_te = _align_features_and_target(X_test, y_test)
    return {'X_train': X_tr, 'X_test': X_te, 'y_train': y_tr, 'y_test': y_te}


# Three feature sets evaluated against the same target:
# embeddings only, lag features only, and both together.
datasets = {
    'emb': _make_dataset(X_train_emb, X_test_emb),
    'base': _make_dataset(X_train_base, X_test_base),
    'emb_base': _make_dataset(X_train_bs_emb, X_test_bs_emb),
}


# Regressors fitted on every feature set (all with default hyper-parameters).
models = dict(
    lin_reg=LinearRegression(),
    xgb=XGBRegressor(),
    # rf=RandomForestRegressor(),
)

In [25]:
# Empty accumulator; the evaluation loop appends one row per (dataset, model) pair.
df_results = pd.DataFrame([])

In [26]:
def estimate_result(y_test, y_pred, X_test=None, y_start_test=None, metric_func=MAE, pct_change=True):
    """Score predictions, optionally after converting changes back to prices.

    Parameters
    ----------
    y_test, y_pred : array-like
        True and predicted targets, one value per row of ``X_test``, in the
        same row order.
    X_test : pd.DataFrame, optional
        Needs the ``Stock`` and ``Datetime`` columns; required when
        ``pct_change`` is True.
    y_start_test : pd.DataFrame, optional
        One row per stock with ``Stock`` and ``Close`` (the price the changes
        are compounded from); required when ``pct_change`` is True. Extra
        stocks and any row order are accepted.
    metric_func : callable
        Called as ``metric_func(y_true, y_pred)``.
    pct_change : bool
        If False the metric is computed on the inputs as given. If True the
        targets are treated as relative changes: ``1 + change`` is compounded
        per stock in ``Datetime`` order and multiplied by the stock's start
        price, and the metric is computed on the resulting prices.

    Returns
    -------
    Whatever ``metric_func`` returns.

    Raises
    ------
    ValueError
        If ``pct_change`` is True and ``X_test`` / ``y_start_test`` is missing,
        or a stock in ``X_test`` has no start price.
    """
    if not pct_change:
        return metric_func(y_test, y_pred)

    if X_test is None or y_start_test is None:
        raise ValueError('X_test and y_start_test are required when pct_change=True')

    # Assign by position (plain arrays) so the result does not depend on
    # X_test / y_test carrying matching index labels.
    growth = X_test[['Stock', 'Datetime']].copy()
    growth['Preds'] = np.asarray(y_pred) + 1
    growth['Close'] = np.asarray(y_test) + 1

    pred_factors = growth.pivot(index='Datetime', columns='Stock', values='Preds')
    true_factors = growth.pivot(index='Datetime', columns='Stock', values='Close')

    # Match start prices to the pivot columns by stock label rather than by
    # position, so extra or differently ordered stocks cannot be mispaired.
    starts = y_start_test.set_index('Stock')['Close'].reindex(pred_factors.columns)
    if starts.isna().any():
        missing = starts.index[starts.isna()].tolist()
        raise ValueError(f'no start price for stocks: {missing}')

    pred_close = pred_factors.cumprod() * starts
    orig_close = true_factors.cumprod() * starts

    pred_long = pred_close.reset_index().melt(id_vars=['Datetime'], var_name='Stock', value_name='Pred')
    true_long = orig_close.reset_index().melt(id_vars=['Datetime'], var_name='Stock', value_name='True')

    metric_df = pd.merge(pred_long, true_long, how='inner', on=['Stock', 'Datetime'])

    return metric_func(metric_df['True'], metric_df['Pred'])

In [27]:
# Fit every regressor on every feature set and collect one result row per pair.
# The rows are gathered in a local list and `df_results` is rebuilt from scratch,
# so re-running this cell does not duplicate rows (duplicates would break the
# pivot in the next cell). The previous stray `index` column is no longer created.
result_rows = []

for ds_name, data in datasets.items():
    # The loop variable is deliberately not called `model`: that name holds the
    # fitted TS2Vec encoder and must survive this cell.
    for model_name, regressor in models.items():

        print(ds_name, model_name)
        regressor.fit(data['X_train'].drop(columns=['Datetime', 'Stock']), data['y_train'])
        y_pred = regressor.predict(data['X_test'].drop(columns=['Datetime', 'Stock']))

        metric = estimate_result(
            data['y_test'], y_pred, data['X_test'], y_start_test,
            metric_func=MAPE, pct_change=use_pct_changes,
        )
        # The metric computed above is MAPE, so label it as such.
        print('MAPE: ', metric)

        result_rows.append({'data': ds_name, 'model': model_name, 'metric': metric})

df_results = pd.DataFrame(result_rows, columns=['data', 'model', 'metric'])

emb lin_reg
MAE:  1.163528314669423
emb xgb


MAE:  0.8408244713522076
base lin_reg
MAE:  0.003795196794834198
base xgb
MAE:  0.020051725847424068
emb_base lin_reg
MAE:  0.005916469312356028
emb_base xgb
MAE:  0.02124070982733863


In [28]:
# Results table: one row per regressor, one column per feature set.
df_results.pivot(index='model', columns='data', values='metric')

data,base,emb,emb_base
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
lin_reg,0.003795,1.163528,0.005916
xgb,0.020052,0.840824,0.021241
