In [1]:
import os, sys
# Resolve the project root (the parent of the directory the kernel started in)
# exactly once. Guarding on `dir1` keeps this cell idempotent: the previous
# `os.chdir('..')` climbed one directory higher every time the cell was re-run.
if 'dir1' not in globals():
    dir2 = os.path.abspath('')      # directory the kernel was started in
    dir1 = os.path.dirname(dir2)    # its parent, treated as the project root

# Make the project root importable for the `src.*` imports used by this notebook.
if dir1 not in sys.path:
    sys.path.append(dir1)

# Work from the project root so relative paths like 'data/...' and 'configs/...' resolve.
os.chdir(dir1)

In [2]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import yaml

from sklearn.metrics import mean_absolute_error as MAE
from matplotlib import pyplot as plt
from xgboost import XGBRegressor

# os.chdir('src/models/ts2vec_src')
from src.data.preprocessing import read_data, data_to_np_tensor, preprocess_split
from src.models.ts2vec_src.ts2vec import TS2Vec

%load_ext autoreload
%autoreload 2

# os.chdir('../../..')

# Preprocessing

In [3]:
# data = read_data('data/all_tickers.csv')
# train_data = preprocessing(
#     data, 
#     ['Open', 'High', 'Low', 'Close', 'Volume'],         
#     start_date = '2023-12-15',
#     end_date = '2023-12-21'
# )

# test_data = preprocessing(
#     data, 
#     ['Open', 'High', 'Low', 'Close', 'Volume'],          
#     start_date = '2023-12-21',
#     end_date = '2023-12-22',
#     tickers_save = train_data['Close'].columns
# )

In [4]:
# for feature in ['Open', 'High', 'Low', 'Close', 'Volume']:
#     print(train_data['Close'].shape[1], test_data['Close'].shape[1])

In [5]:
# train_ts = data_to_np_tensor(train_data)
# test_ts = data_to_np_tensor(test_data)

In [6]:
# model = TS2Vec(
#     input_dims=train_ts.shape[2],
#     device=1,
#     output_dims=320
# )

# loss_log = model.fit(
#     train_ts,
#     verbose=False
# )

# test_repr = model.encode(test_ts)
# #test_repr = model.encode(test_data, encoding_window='full_series')

In [7]:
# test_repr = model.encode(test_ts, encoding_window='full_series')
# print(test_repr.shape)
# test_repr

# Mean stock price prediction

In [177]:
# The selected tickers are the top-level keys of the YAML config
# (presumably ticker -> NaN rate, judging by the file name).
# NOTE(review): yaml.load with FullLoader is acceptable for a trusted local file;
# prefer yaml.safe_load if this config could ever come from an untrusted source.
with open('configs/best_stocks_nans_rate.yaml') as f:
    best_stocks = list(yaml.load(f, Loader=yaml.FullLoader).keys())
# best_stocks

In [None]:
# Load the full ticker table, then keep only the rows of the selected tickers.
df = read_data('data/all_tickers.csv')
df_best = df[df['Stock'].isin(best_stocks)]

In [None]:
# Inspect the rows kept for the selected tickers.
df_best

In [None]:
# Boolean row mask: True where the row's ticker is one of the selected stocks.
# `isin` is the vectorised replacement for the per-row `x in best_stocks` lambda,
# which ran a Python-level list scan for every row.
mask = df['Stock'].isin(best_stocks)

In [None]:
# Rows of `df` selected by the boolean `mask`.
df[mask]

In [None]:
# Per-column aggregation rules used when collapsing rows into one OHLCV bar.
dd = dict(
    Open='first',
    High='max',
    Low='min',
    Close='last',
    Volume='sum',
)

In [None]:
# Collapse each stock's rows into hourly bars using the rules in `dd`.
# The result is indexed by (Stock, hourly Datetime bucket).
df_best_h = (
    df_best
    .set_index('Datetime')
    .groupby(['Stock', pd.Grouper(freq='h')])
    .agg(dd)
)

df_best_h

In [None]:
# Hour-over-hour relative change of every aggregated column, computed per stock.
# NOTE: this rebinds `df`, which previously held the table returned by read_data.
df = (
    df_best_h
    .groupby('Stock')
    .pct_change()
    .reset_index()
)

In [None]:
# Preview the first rows of `df`.
df.head()

In [None]:
# Date windows for the two splits; the train end coincides with the test start.
train_start, test_start = '2023-10-01', '2023-11-01'
train_end, test_end = '2023-11-01', '2023-11-07'

# Both splits use the same five OHLCV features and the same ticker list.
train_data = preprocess_split(
    df, ['Open', 'High', 'Low', 'Close', 'Volume'],
    start_date=train_start, end_date=train_end, tickers_save=best_stocks,
)
test_data = preprocess_split(
    df, ['Open', 'High', 'Low', 'Close', 'Volume'],
    start_date=test_start, end_date=test_end, tickers_save=best_stocks,
)

In [None]:
# Convert both splits to numpy tensors (train first, then test).
train_ts, test_ts = (data_to_np_tensor(split) for split in (train_data, test_data))

(train_ts.shape, test_ts.shape)

In [None]:
# TS2Vec encoder; input_dims is read from axis 2 of the training tensor
# (presumably the feature axis - TODO confirm against data_to_np_tensor).
model = TS2Vec(input_dims=train_ts.shape[2], device='cpu', output_dims=128)

# Fit on the training tensor only; the return value of fit is kept as loss_log.
loss_log = model.fit(train_ts, verbose=False)

In [None]:
# Encode both tensors with the fitted encoder (train first, then test).
train_repr, test_repr = (model.encode(ts) for ts in (train_ts, test_ts))

(train_repr.shape, test_repr.shape)

In [None]:
def stock_embeddigns_to_df(data_tensor: np.ndarray, stocks, dates) -> pd.DataFrame:
    """Flatten a per-stock embedding tensor into one long DataFrame.

    Parameters
    ----------
    data_tensor : np.ndarray
        Indexed as ``data_tensor[i]``; each slice must be a 2-D block with
        ``len(dates)`` rows and one column per embedding dimension.
    stocks : iterable
        Stock identifiers, ordered like the first axis of ``data_tensor``.
    dates : index-like
        Row labels shared by every stock; returned as the ``Datetime`` column.

    Returns
    -------
    pd.DataFrame
        Columns ``Datetime``, ``emb_0`` ... ``emb_{k-1}``, ``Stock``, with the
        rows of each stock stacked in the order of ``stocks``.
    """
    # Collect the per-stock frames and concatenate once; growing a DataFrame
    # with pd.concat inside the loop re-copies all earlier rows on every pass.
    frames = []
    for pos, stock in enumerate(stocks):
        frame = pd.DataFrame(data_tensor[pos], index=dates)
        frame.columns = [f'emb_{dim}' for dim in range(frame.shape[1])]
        frame['Stock'] = stock
        frames.append(frame)

    if not frames:
        # pd.concat([]) raises; keep returning an empty frame for no stocks.
        return pd.DataFrame().reset_index(drop=False, names='Datetime')

    return pd.concat(frames).reset_index(drop=False, names='Datetime')

# Long-format embedding tables; tickers and timestamps are taken from the
# column labels and index of each split's 'Open' frame.
X_train = stock_embeddigns_to_df(
    train_repr,
    stocks=train_data['Open'].columns,
    dates=train_data['Open'].index,
)
X_test = stock_embeddigns_to_df(
    test_repr,
    stocks=test_data['Open'].columns,
    dates=test_data['Open'].index,
)
X_train.head()

In [None]:
# Stack the train and test embedding tables row-wise (original row labels are kept).
X_emd = pd.concat([X_train, X_test])

In [None]:
# X_test

In [None]:
# Target table: mean 'Close' per stock and hourly bucket, back in long format.
y = (
    df_best
    .set_index('Datetime')
    .groupby(['Stock', pd.Grouper(freq='h')])
    .agg({'Close': 'mean'})
    .reset_index()
)
y.head()

In [None]:
train_start, test_start = '2023-10-01', '2023-11-01'
train_end, test_end = '2023-11-01', '2023-11-07'

# Half-open calendar-date windows: start date included, end date excluded.
y_train = y[y['Datetime'].dt.date.between(
    pd.Timestamp(train_start).date(), pd.Timestamp(train_end).date(), inclusive='left'
)]

y_test = y[y['Datetime'].dt.date.between(
    pd.Timestamp(test_start).date(), pd.Timestamp(test_end).date(), inclusive='left'
)]

In [None]:
# Stack the train and test target tables row-wise.
y_emd = pd.concat([y_train, y_test])

In [None]:
# Inspect the combined target table.
y_emd

In [None]:
# X_emd.to_csv('results/TS2Vec/x_emb.csv', index=False)
# y_emd.to_csv('results/TS2Vec/y.csv', index=False)

In [None]:
y_name = 'Close'

# Inner-join embeddings with targets on the (Stock, Datetime) keys.
df_train = X_train.merge(y_train, on=['Stock', 'Datetime'])
df_test = X_test.merge(y_test, on=['Stock', 'Datetime'])

# Split each joined frame into features (keys and target removed) and target.
# NOTE: X_train / X_test / y_train / y_test are rebound here, so this cell
# cannot be re-run without first re-running the cells that produced its inputs.
X_train = df_train.drop(columns=[y_name, 'Datetime', 'Stock'])
y_train = df_train[y_name]
X_test = df_test.drop(columns=[y_name, 'Datetime', 'Stock'])
y_test = df_test[y_name]

In [None]:
# Gradient-boosted regressor on the embedding features, default hyper-parameters.
# NOTE: rebinds `model`, which previously held the TS2Vec encoder.
model = XGBRegressor()
model.fit(X_train, y_train)

# Mean absolute error on the test split.
y_pred = model.predict(X_test)
MAE(y_true=y_test, y_pred=y_pred)

# Baseline

In [None]:
# Mean 'Close' per stock and hourly bucket, taken from `df`.
# NOTE(review): if the cells ran top to bottom, `df` is the hourly pct-change
# frame at this point (not the raw table), so 'Close' holds relative changes -
# confirm this is intended.
X_baseline = (
    df
    .set_index('Datetime')
    .groupby(['Stock', pd.Grouper(freq='h')])
    .agg({'Close': 'mean'})
    .reset_index()
)
X_baseline.head()

In [None]:
# Hourly aggregated bars with 'Stock' and 'Datetime' moved from the index into columns.
orig_vals = df_best_h.reset_index()

In [None]:
n_shifts = 18  # number of lagged 'Close' values used as features

# Lag only the 'Close' column within each stock. Selecting the column before
# shifting avoids re-shifting every previously added shift_* column on each pass.
for i in range(1, n_shifts + 1):
    X_baseline[f'shift_{i}'] = X_baseline.groupby(by=['Stock'])['Close'].shift(i)

# Drop every row containing a NaN (e.g. rows without a full lag history).
X_baseline = X_baseline.dropna()
# Target frame: all columns up to and including 'Close'.
y = X_baseline.loc[:, :'Close']

# Feature frame: everything except the target column.
X_baseline = X_baseline.drop(columns='Close')
X_baseline.head()

In [None]:
train_start, test_start = '2023-10-01', '2023-11-01'
train_end, test_end = '2023-11-01', '2023-11-07'

# Half-open calendar-date windows: start date included, end date excluded.
y_train = y[y['Datetime'].dt.date.between(
    pd.Timestamp(train_start).date(), pd.Timestamp(train_end).date(), inclusive='left'
)]['Close']

y_test = y[y['Datetime'].dt.date.between(
    pd.Timestamp(test_start).date(), pd.Timestamp(test_end).date(), inclusive='left'
)]['Close']

X_train = X_baseline[X_baseline['Datetime'].dt.date.between(
    pd.Timestamp(train_start).date(), pd.Timestamp(train_end).date(), inclusive='left'
)]

X_test = X_baseline[X_baseline['Datetime'].dt.date.between(
    pd.Timestamp(test_start).date(), pd.Timestamp(test_end).date(), inclusive='left'
)]

# Last training timestamp per stock, joined to the hourly bars in orig_vals
# to pick up the values recorded at that timestamp.
last_train_date = (
    X_train
    .groupby(['Stock'], as_index=False)
    .last()
    [['Stock', 'Datetime']]
)
y_start_test = orig_vals.merge(last_train_date, how='inner', on=['Stock', 'Datetime'])
y_start_test.head()

In [None]:
# Preview the baseline training features.
X_train.head()

In [None]:
# Baseline regressor trained on the lag features only; the 'Stock' and
# 'Datetime' key columns are removed before fitting and predicting.
model = XGBRegressor()
model.fit(X_train.drop(columns=['Stock', 'Datetime']), y_train)

y_pred = model.predict(X_test.drop(columns=['Stock', 'Datetime']))

# Mean absolute error on the test split.
MAE(y_true=y_test, y_pred=y_pred)

In [None]:
# Keys of the test rows plus predicted and actual values, each shifted by +1
# (presumably turning relative changes into growth factors for the cumulative
# product taken later - confirm).
df_preds = X_test[['Stock', 'Datetime']].assign(
    Preds=y_pred + 1,
    Close=y_test + 1,
)
df_preds.head()

In [None]:
# 'Close' values from y_start_test as a plain array, ordered by 'Stock'.
# NOTE(review): used positionally later, so the order must match the column
# order of the per-stock pivot it is multiplied with - verify.
starts = y_start_test.sort_values('Stock')['Close'].values

In [None]:
# For each series: pivot to a Datetime x Stock grid, take the running product
# down each column, scale column-wise by `starts`, then melt back to long format.
pred_close = (
    (df_preds.pivot(columns=['Stock'], index='Datetime', values=['Preds']).cumprod() * starts)
    ['Preds']
    .reset_index()
    .melt(id_vars=['Datetime'])
)
orig_close = (
    (df_preds.pivot(columns=['Stock'], index='Datetime', values=['Close']).cumprod() * starts)
    ['Close']
    .reset_index()
    .melt(id_vars=['Datetime'])
)

In [None]:
# Preview the reconstructed predictions in long format.
pred_close.head()

## Baseline + TS2Vec

In [None]:
# Keep a handle on the current X_train before the name is rebound below
# (same object, not a copy).
shifts = X_train

In [None]:
# Keep a handle on the current X_test before the name is rebound below
# (same object, not a copy).
shifts_test = X_test

In [None]:
def stock_embeddigns_to_df(data_tensor: np.ndarray, stocks, dates) -> pd.DataFrame:
    """Flatten a per-stock embedding tensor into one long DataFrame.

    NOTE: a helper with this exact name is already defined earlier in the
    notebook; this later definition shadows it.

    Parameters
    ----------
    data_tensor : np.ndarray
        Indexed as ``data_tensor[i]``; each slice must be a 2-D block with
        ``len(dates)`` rows and one column per embedding dimension.
    stocks : iterable
        Stock identifiers, ordered like the first axis of ``data_tensor``.
    dates : index-like
        Row labels shared by every stock; returned as the ``Datetime`` column.

    Returns
    -------
    pd.DataFrame
        Columns ``Datetime``, ``emb_0`` ... ``emb_{k-1}``, ``Stock``, with the
        rows of each stock stacked in the order of ``stocks``.
    """
    # Collect the per-stock frames and concatenate once; growing a DataFrame
    # with pd.concat inside the loop re-copies all earlier rows on every pass.
    frames = []
    for pos, stock in enumerate(stocks):
        frame = pd.DataFrame(data_tensor[pos], index=dates)
        frame.columns = [f'emb_{dim}' for dim in range(frame.shape[1])]
        frame['Stock'] = stock
        frames.append(frame)

    if not frames:
        # pd.concat([]) raises; keep returning an empty frame for no stocks.
        return pd.DataFrame().reset_index(drop=False, names='Datetime')

    return pd.concat(frames).reset_index(drop=False, names='Datetime')

# Rebuild the long-format embedding tables (X_train / X_test were rebound by
# earlier cells); tickers and timestamps come from each split's 'Open' frame.
X_train = stock_embeddigns_to_df(
    train_repr,
    stocks=train_data['Open'].columns,
    dates=train_data['Open'].index,
)
X_test = stock_embeddigns_to_df(
    test_repr,
    stocks=test_data['Open'].columns,
    dates=test_data['Open'].index,
)
X_train.head()

In [None]:
# Target table: mean 'Close' per stock and hourly bucket, back in long format.
y = (
    df_best
    .set_index('Datetime')
    .groupby(['Stock', pd.Grouper(freq='h')])
    .agg({'Close': 'mean'})
    .reset_index()
)
y.head()

In [None]:
train_start, test_start = '2023-10-01', '2023-11-01'
train_end, test_end = '2023-11-01', '2023-11-07'

# Half-open calendar-date windows: start date included, end date excluded.
y_train = y[y['Datetime'].dt.date.between(
    pd.Timestamp(train_start).date(), pd.Timestamp(train_end).date(), inclusive='left'
)]

y_test = y[y['Datetime'].dt.date.between(
    pd.Timestamp(test_start).date(), pd.Timestamp(test_end).date(), inclusive='left'
)]

In [None]:
y_name = 'Close'

# Inner-join embeddings with targets on the (Stock, Datetime) keys.
df_train = X_train.merge(y_train, on=['Stock', 'Datetime'])
df_test = X_test.merge(y_test, on=['Stock', 'Datetime'])

# Split each joined frame into features (keys and target removed) and target.
X_train = df_train.drop(columns=[y_name, 'Datetime', 'Stock'])
y_train = df_train[y_name]
X_test = df_test.drop(columns=[y_name, 'Datetime', 'Stock'])
y_test = df_test[y_name]

In [None]:
# Join the lag features to the embedding rows on the (Stock, Datetime) keys.
# The previous `pd.concat(..., axis=1)` aligned on the row index, but the
# embedding features carry a fresh RangeIndex from `pd.merge` while `shifts`
# keeps the row labels of the baseline frame, so rows were mismatched or padded
# with NaN, and the 'Stock' / 'Datetime' columns leaked into the feature matrix.
# Rebuilding from df_train / df_test also makes this cell safe to re-run.
join_keys = ['Stock', 'Datetime']
df_train_full = df_train.merge(shifts, on=join_keys, how='inner')
df_test_full = df_test.merge(shifts_test, on=join_keys, how='inner')

# Targets are re-derived from the joined frames so X and y stay row-aligned
# (rows lacking lag features are dropped by the inner join).
X_train, y_train = df_train_full.drop(columns=[y_name, *join_keys]), df_train_full[y_name]
X_test, y_test = df_test_full.drop(columns=[y_name, *join_keys]), df_test_full[y_name]

In [None]:
# Inspect the combined training feature matrix.
X_train

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error as MAE
from xgboost import XGBRegressor

# Regressor trained on the combined feature matrix, default hyper-parameters.
model = XGBRegressor()
model.fit(X_train, y_train)

# Mean absolute error on the test split.
y_pred = model.predict(X_test)
MAE(y_true=y_test, y_pred=y_pred)