In [1]:
import os, sys
dir2 = os.path.abspath('')
dir1 = os.path.dirname(dir2)

if not dir1 in sys.path:
    sys.path.append(dir1)

os.chdir('..')

In [2]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import yaml

from sklearn.metrics import mean_absolute_error as MAE
from matplotlib import pyplot as plt
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

os.chdir('src/models/ts2vec_src')

from src.data.preprocessing import read_data, data_to_np_tensor, preprocess_split
from src.models.ts2vec_src.ts2vec import TS2Vec

%load_ext autoreload
%autoreload 2

os.chdir('../../..')

  from .autonotebook import tqdm as notebook_tqdm


# Functions

In [3]:
def stock_embeddigns_to_df(data_tensor: np.ndarray, stocks, dates) -> dict:    
    res = pd.DataFrame()
    for i, stock in enumerate(stocks):
        df = pd.DataFrame(data_tensor[i], index=dates)
        df.columns = ['emb_' + str(i) for i in range(len(df.columns))]
        df['Stock'] = stock
        res = pd.concat([res, df])

    return res.reset_index(drop=False, names='Datetime')

# Config

In [4]:
with open('configs/best_stocks_nans_rate.yaml') as f:
    best_stocks = yaml.load(f, Loader=yaml.FullLoader)
best_stocks = list(best_stocks.keys())
# best_stocks

In [5]:
col_agg_finctions = {'Open': 'first', 'High': 'max', 'Low': 'min', 'Close': 'last', 'Volume': 'sum'}

In [6]:
train_start, train_end = '2023-10-01', '2023-11-01'
test_start, test_end = '2023-11-01', '2023-11-07'

In [7]:
use_pct_changes = True

In [8]:
ts2vec_device = 2
ts2vec_out_dim = 128

n_shifts = 18

# DataLoading

In [9]:
df = read_data('data/all_tickers.csv')
df_best = df.query("Stock in @best_stocks")
df_best.head()

Unnamed: 0,Date,Time,Open,High,Low,Close,Volume,Datetime,Stock,Day_week
61,2024-01-30,22:58:00,188.13,188.17,188.11,188.14,1500,2024-01-30 22:58:00,AAPL,Tuesday
62,2024-01-30,22:57:00,188.22,188.22,188.08,188.11,5700,2024-01-30 22:57:00,AAPL,Tuesday
63,2024-01-30,22:56:00,188.22,188.27,188.215,188.22,2600,2024-01-30 22:56:00,AAPL,Tuesday
64,2024-01-30,22:55:00,188.15,188.2,188.14,188.2,2100,2024-01-30 22:55:00,AAPL,Tuesday
65,2024-01-30,22:54:00,188.18,188.18,188.14,188.18,2013,2024-01-30 22:54:00,AAPL,Tuesday


# Preprocessing

In [10]:
df_agg = df_best.set_index('Datetime').groupby(
    ['Stock', pd.Grouper(freq='h')],
).agg(col_agg_finctions)

df_agg

Unnamed: 0_level_0,Unnamed: 1_level_0,Open,High,Low,Close,Volume
Stock,Datetime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AAPL,2023-01-30 17:00:00,145.15,145.53,144.18,145.13,291457
AAPL,2023-01-30 18:00:00,143.97,145.33,143.38,144.96,348830
AAPL,2023-01-30 19:00:00,144.07,144.30,143.50,143.95,215404
AAPL,2023-01-30 20:00:00,143.38,144.24,143.18,144.17,254970
AAPL,2023-01-30 21:00:00,143.31,143.81,143.01,143.38,192383
...,...,...,...,...,...,...
XOM,2024-01-30 18:00:00,103.69,103.70,102.95,103.22,159389
XOM,2024-01-30 19:00:00,103.79,103.96,103.49,103.67,90110
XOM,2024-01-30 20:00:00,103.68,103.98,103.63,103.76,54009
XOM,2024-01-30 21:00:00,104.15,104.16,103.67,103.68,61363


In [11]:
df_original = df_agg.reset_index()

if use_pct_changes:
    df = df_agg.groupby('Stock').pct_change().reset_index()
else:
    df = df_original
df.head()

Unnamed: 0,Stock,Datetime,Open,High,Low,Close,Volume
0,AAPL,2023-01-30 17:00:00,,,,,
1,AAPL,2023-01-30 18:00:00,-0.00813,-0.001374,-0.005549,-0.001171,0.196849
2,AAPL,2023-01-30 19:00:00,0.000695,-0.007087,0.000837,-0.006967,-0.382496
3,AAPL,2023-01-30 20:00:00,-0.004789,-0.000416,-0.00223,0.001528,0.183683
4,AAPL,2023-01-30 21:00:00,-0.000488,-0.002981,-0.001187,-0.00548,-0.245468


In [12]:
y = df.set_index('Datetime').groupby(
    ['Stock', pd.Grouper( freq='h')],
).agg({'Close': 'mean'}).reset_index()

y_train = y[(y['Datetime'].dt.date >= pd.Timestamp(train_start).date()) & 
             (y['Datetime'].dt.date < pd.Timestamp(train_end).date())]

y_test = y[(y['Datetime'].dt.date >= pd.Timestamp(test_start).date()) & 
             (y['Datetime'].dt.date < pd.Timestamp(test_end).date())]

In [13]:
train_data = preprocess_split(
    df, 
    ['Open', 'High', 'Low', 'Close', 'Volume'],         
    start_date = train_start,
    end_date = train_end,
    tickers_save = best_stocks
)

test_data = preprocess_split(
    df, 
    ['Open', 'High', 'Low', 'Close', 'Volume'],          
    start_date = test_start,
    end_date = test_end,
    tickers_save = best_stocks
)

NAN in data  0
NAN in data  0
NAN in data  0
NAN in data  0
NAN in data  0
NAN in data  0
NAN in data  0
NAN in data  0
NAN in data  0
NAN in data  0


## TS2Vec

In [14]:
train_ts = data_to_np_tensor(train_data)
test_ts = data_to_np_tensor(test_data)

train_ts.shape, test_ts.shape

((28, 132, 5), (28, 24, 5))

In [15]:
model = TS2Vec(
    input_dims=train_ts.shape[2],
    device=ts2vec_device,
    output_dims=ts2vec_out_dim,
)

loss_log = model.fit(
    train_ts,
    verbose=False
)

In [16]:
train_repr = model.encode(train_ts)
test_repr = model.encode(test_ts)

train_repr.shape, test_repr.shape

((28, 132, 128), (28, 24, 128))

In [17]:
X_train_emb = stock_embeddigns_to_df(train_repr, stocks=train_data['Open'].columns, dates=train_data['Open'].index)
X_test_emb = stock_embeddigns_to_df(test_repr, stocks=test_data['Open'].columns, dates=test_data['Open'].index)
X_train_emb.head()

Unnamed: 0,Datetime,emb_0,emb_1,emb_2,emb_3,emb_4,emb_5,emb_6,emb_7,emb_8,...,emb_119,emb_120,emb_121,emb_122,emb_123,emb_124,emb_125,emb_126,emb_127,Stock
0,2023-10-02 17:00:00,-0.21658,0.221275,0.00449,-0.265021,-0.070393,-0.413131,-0.115951,-0.280331,0.022587,...,-0.065027,-0.377902,-0.488676,-0.204773,-0.110465,-0.342988,-0.067913,-0.031168,-0.228472,AAPL
1,2023-10-02 18:00:00,-0.105324,0.177931,-0.35241,-0.117561,-0.171935,-0.086188,-0.335002,0.05581,-0.070756,...,-0.453445,-0.120337,-0.358686,-0.112001,-0.009213,-0.149452,-0.455057,-0.12059,-0.263173,AAPL
2,2023-10-02 19:00:00,-0.239452,0.252011,-0.12363,-0.094002,-0.078686,-0.395943,-0.186552,-0.180682,-0.022542,...,-0.213787,-0.46938,-0.305106,-0.139599,0.039231,-0.358953,-0.283657,-0.124956,-0.277024,AAPL
3,2023-10-02 20:00:00,-0.088895,0.218046,-0.192343,-0.07923,-0.058346,-0.381254,-0.516352,-0.142348,-0.006986,...,-0.238418,-0.283001,-0.317542,-0.110662,-0.051401,-0.396073,-0.228332,-0.024556,-0.126471,AAPL
4,2023-10-02 21:00:00,-0.325665,0.187346,-0.158165,-0.335352,-0.089494,-0.093007,-0.006304,-0.138567,0.007509,...,-0.247017,-0.254334,-0.418961,-0.284832,-0.149284,-0.141865,-0.272693,-0.171755,-0.329874,AAPL


In [18]:
X_emb = pd.concat([X_train_emb, X_test_emb])
y = pd.concat([y_train, y_test])

X_emb.to_csv('results/TS2Vec/x_emb.csv', index=False)
y.to_csv('results/TS2Vec/y.csv', index=False)

## baseline

In [19]:
X_baseline = df.set_index('Datetime').groupby(
    ['Stock', pd.Grouper( freq='h')],
).agg({'Close': 'mean'}).reset_index()
X_baseline.head()

Unnamed: 0,Stock,Datetime,Close
0,AAPL,2023-01-30 17:00:00,
1,AAPL,2023-01-30 18:00:00,-0.001171
2,AAPL,2023-01-30 19:00:00,-0.006967
3,AAPL,2023-01-30 20:00:00,0.001528
4,AAPL,2023-01-30 21:00:00,-0.00548


In [20]:
for i in range(1, n_shifts + 1):
    X_baseline[f'shift_{i}'] = X_baseline.groupby(by=['Stock']).shift(i)['Close']

X_baseline = X_baseline.dropna()
X_baseline = X_baseline.drop(columns='Close')
X_baseline.head()

Unnamed: 0,Stock,Datetime,shift_1,shift_2,shift_3,shift_4,shift_5,shift_6,shift_7,shift_8,shift_9,shift_10,shift_11,shift_12,shift_13,shift_14,shift_15,shift_16,shift_17,shift_18
19,AAPL,2023-02-02 18:00:00,0.052279,-0.001548,-0.001756,-0.001332,-0.0021,-0.006812,0.001392,0.001464,0.001186,0.000489,0.005476,-0.00014,-0.006001,-0.000418,-0.00548,0.001528,-0.006967,-0.001171
20,AAPL,2023-02-02 19:00:00,-0.003817,0.052279,-0.001548,-0.001756,-0.001332,-0.0021,-0.006812,0.001392,0.001464,0.001186,0.000489,0.005476,-0.00014,-0.006001,-0.000418,-0.00548,0.001528,-0.006967
21,AAPL,2023-02-02 20:00:00,0.008334,-0.003817,0.052279,-0.001548,-0.001756,-0.001332,-0.0021,-0.006812,0.001392,0.001464,0.001186,0.000489,0.005476,-0.00014,-0.006001,-0.000418,-0.00548,0.001528
22,AAPL,2023-02-02 21:00:00,-0.0014,0.008334,-0.003817,0.052279,-0.001548,-0.001756,-0.001332,-0.0021,-0.006812,0.001392,0.001464,0.001186,0.000489,0.005476,-0.00014,-0.006001,-0.000418,-0.00548
23,AAPL,2023-02-02 22:00:00,0.001602,-0.0014,0.008334,-0.003817,0.052279,-0.001548,-0.001756,-0.001332,-0.0021,-0.006812,0.001392,0.001464,0.001186,0.000489,0.005476,-0.00014,-0.006001,-0.000418


In [21]:
X_train_base = X_baseline[(X_baseline['Datetime'].dt.date >= pd.Timestamp(train_start).date()) & 
             (X_baseline['Datetime'].dt.date < pd.Timestamp(train_end).date())]

X_test_base = X_baseline[(X_baseline['Datetime'].dt.date >= pd.Timestamp(test_start).date()) & 
             (X_baseline['Datetime'].dt.date < pd.Timestamp(test_end).date())]

In [22]:
orig_vals = df_original.reset_index()

last_train_date = X_train_base.groupby(['Stock'], as_index=False).last()[['Stock', 'Datetime']]
y_start_test = orig_vals.merge(last_train_date, how='inner', on=['Stock', 'Datetime'])
y_start_test.head()

Unnamed: 0,index,Stock,Datetime,Open,High,Low,Close,Volume
0,1142,AAPL,2023-10-31 22:00:00,170.85,170.9,170.385,170.62,338198
1,2649,ABBV,2023-10-31 22:00:00,141.42,141.65,139.91,140.19,86557
2,4156,ABT,2023-10-31 22:00:00,94.53,94.88,94.35,94.84,56242
3,5663,AMD,2023-10-31 22:00:00,98.49,98.8,98.1,98.44,386922
4,7170,BAC,2023-10-31 22:00:00,26.355,26.375,26.275,26.28,579334


## ts2vec + baseline

In [23]:
X_train_bs_emb = pd.merge(X_train_base, X_train_emb, on=['Stock', 'Datetime'], how='inner')
X_test_bs_emb = pd.merge(X_test_base, X_test_emb, on=['Stock', 'Datetime'], how='inner')

# Prediction

In [24]:
datasets = {
    'emb': {
        'X_train': X_train_emb.drop(columns=['Datetime', 'Stock']), 
        'X_test': X_test_emb.drop(columns=['Datetime', 'Stock']), 
        'y_train': y_train['Close'], 
        'y_test': y_test['Close'],
    },
    'base': {
        'X_train': X_train_base.drop(columns=['Datetime', 'Stock']), 
        'X_test': X_test_base.drop(columns=['Datetime', 'Stock']), 
        'y_train': y_train['Close'], 
        'y_test': y_test['Close'],
    },
    'emb_base': {
        'X_train': X_train_bs_emb.drop(columns=['Datetime', 'Stock']), 
        'X_test': X_test_bs_emb.drop(columns=['Datetime', 'Stock']), 
        'y_train': y_train['Close'],  
        'y_test': y_test['Close'],
    },
}


models = {
    'lin_reg': LinearRegression(),
    # 'xgb': XGBRegressor(),
    # 'rf': RandomForestRegressor(),
}

In [25]:
df_results = pd.DataFrame([])

In [26]:
for ds_name, data in datasets.items():
    for model_name, model in models.items():
        print(ds_name, model_name)
        model.fit(data['X_train'], data['y_train'])
        y_pred = model.predict(data['X_test'])

        metric = MAE(data['y_test'], y_pred)
        print('MAE: ', metric)

        metrics_df = pd.DataFrame.from_dict({'data': [ds_name], 'model': [model_name], 'metric': [metric]})
        df_results = pd.concat([df_results, metrics_df])

df_results = df_results.reset_index()

emb lin_reg


MAE:  0.0037685894641700723
base lin_reg
MAE:  0.0037558855225526175
emb_base lin_reg
MAE:  0.003856938272291162


In [27]:
df_results.pivot(index='model', columns='data', values='metric')

data,base,emb,emb_base
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
lin_reg,0.003756,0.003769,0.003857


# ARXIEV: TO DO add metrics

In [28]:
df_preds = X_test[['Stock', 'Datetime']].copy()
df_preds['Preds'] = y_pred + 1
df_preds['Close'] = y_test + 1
df_preds.head()

NameError: name 'X_test' is not defined

In [None]:
starts = y_start_test.sort_values('Stock')['Close'].values

In [None]:
pred_close = df_preds.pivot(columns=['Stock'], index='Datetime', values=['Preds']).cumprod() * starts
orig_close = df_preds.pivot(columns=['Stock'], index='Datetime', values=['Close']).cumprod() * starts

pred_close = pred_close['Preds'].reset_index().melt(id_vars=['Datetime'])
orig_close = orig_close['Close'].reset_index().melt(id_vars=['Datetime'])

In [None]:
pred_close.head()

Unnamed: 0,Datetime,Stock,value
0,2023-11-01 17:00:00,AAPL,171.301292
1,2023-11-01 18:00:00,AAPL,171.240863
2,2023-11-01 19:00:00,AAPL,171.424447
3,2023-11-01 20:00:00,AAPL,171.230469
4,2023-11-01 21:00:00,AAPL,171.136094


In [None]:
orig_close.head()

Unnamed: 0,Datetime,Stock,value
0,2023-11-01 17:00:00,AAPL,171.85
1,2023-11-01 18:00:00,AAPL,171.42
2,2023-11-01 19:00:00,AAPL,171.92
3,2023-11-01 20:00:00,AAPL,171.97
4,2023-11-01 21:00:00,AAPL,172.43
