In [1]:
import os, sys
# Put the parent of the current working directory on sys.path (presumably so
# that project packages resolve - confirm against the repo layout), then move
# the working directory up one level.
# NOTE: not idempotent - every re-run of this cell moves the cwd up once more.
dir2 = os.path.abspath('')
dir1 = os.path.dirname(dir2)

if dir1 not in sys.path:
    sys.path.append(dir1)

os.chdir('..')

In [2]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import yaml

from sklearn.metrics import mean_absolute_error as MAE
from sklearn.metrics import mean_absolute_percentage_error as MAPE
from matplotlib import pyplot as plt
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from src.data.preprocessing import read_data, data_to_np_tensor, preprocess_split
from src.models.ts2vec_src.ts2vec import TS2Vec

%load_ext autoreload
%autoreload 2

# Functions

In [3]:
def stock_embeddigns_to_df(data_tensor: np.ndarray, stocks, dates) -> pd.DataFrame:
    """Flatten a per-stock embedding tensor into one long DataFrame.

    Args:
        data_tensor: array indexed as ``data_tensor[i]`` -> 2-D block of
            embeddings for the i-th stock (rows follow ``dates``).
        stocks: iterable of stock labels, in the same order as the first
            axis of ``data_tensor``.
        dates: index values used as the row index of every per-stock block.

    Returns:
        DataFrame with a ``Datetime`` column, one ``emb_<k>`` column per
        embedding dimension and a ``Stock`` column; blocks are stacked in
        the order of ``stocks``.
    """
    frames = []
    for stock_pos, stock in enumerate(stocks):
        frame = pd.DataFrame(data_tensor[stock_pos], index=dates)
        frame.columns = [f'emb_{dim}' for dim in range(frame.shape[1])]
        frame['Stock'] = stock
        frames.append(frame)

    # Concatenate once instead of growing a frame inside the loop (quadratic);
    # pd.concat rejects an empty list, so keep the empty-input result explicit.
    res = pd.concat(frames) if frames else pd.DataFrame()

    return res.reset_index(drop=False, names='Datetime')

def estimate_result(y_test, y_pred, X_test=None, y_start_test=None, metric_func=MAE, pct_change=True):
    """Score predictions, optionally after turning relative changes back into prices.

    Args:
        y_test: true targets. With ``pct_change=True`` it must be a pandas
            Series (its index is reset before it is attached to ``X_test``).
        y_pred: predicted targets, one per row of ``X_test``.
        X_test: frame with at least ``Stock`` and ``Datetime`` columns;
            required when ``pct_change=True``.
        y_start_test: frame with ``Stock`` and ``Close`` columns giving the
            price each stock's cumulative product starts from; required when
            ``pct_change=True``.
        metric_func: callable ``metric_func(y_true, y_pred)``.
        pct_change: when False the metric is computed directly on
            ``y_test`` / ``y_pred`` and the other arguments are ignored.

    Returns:
        Whatever ``metric_func`` returns.

    Raises:
        ValueError: if ``pct_change=True`` and ``X_test`` / ``y_start_test``
            is missing, or ``y_start_test`` lacks a stock present in ``X_test``.
    """
    if not pct_change:
        return metric_func(y_test, y_pred)

    if X_test is None or y_start_test is None:
        raise ValueError("X_test and y_start_test are required when pct_change=True")

    df_preds = X_test.copy()
    # +1 turns a relative change r into the growth factor (1 + r).
    df_preds['Preds'] = y_pred + 1
    df_preds['Close'] = y_test.reset_index(drop=True) + 1

    pred_wide = df_preds.pivot(columns='Stock', index='Datetime', values='Preds')
    true_wide = df_preds.pivot(columns='Stock', index='Datetime', values='Close')

    # Align starting prices to the pivot columns by stock label rather than by
    # position, so a differently ordered y_start_test cannot silently scale a
    # stock with another stock's price.
    starts = y_start_test.set_index('Stock')['Close']
    missing = pred_wide.columns.difference(starts.index)
    if len(missing) > 0:
        raise ValueError(f"y_start_test has no starting Close for stocks: {list(missing)}")
    starts = starts.reindex(pred_wide.columns)

    # Cumulative growth times the starting price gives the price path per stock.
    pred_close = pred_wide.cumprod().mul(starts, axis='columns')
    orig_close = true_wide.cumprod().mul(starts, axis='columns')

    pred_close = pred_close.reset_index().melt(id_vars=['Datetime'], value_name='Pred')
    orig_close = orig_close.reset_index().melt(id_vars=['Datetime'], value_name='True')

    metric_df = pd.merge(pred_close, orig_close, how='inner', on=['Stock', 'Datetime'])

    return metric_func(metric_df['True'], metric_df['Pred'])

# Config

In [4]:
# Ticker universe = the top-level keys of the YAML config; the values are not used here.
# NOTE(review): yaml.FullLoader is acceptable for a trusted local file; prefer
# yaml.safe_load if this config can ever come from an untrusted source.
with open('configs//best_stocks_nans_rate.yaml') as f:
    best_stocks = list(yaml.load(f, Loader=yaml.FullLoader))

In [5]:
# Per-column aggregation used when resampling bars into hourly OHLCV candles.
col_agg_finctions = {
    'Open': 'first',
    'High': 'max',
    'Low': 'min',
    'Close': 'last',
    'Volume': 'sum',
}

In [6]:
# Date windows; the date filters in this notebook treat them as [start, end).
train_start = '2023-10-01'
train_end = '2023-11-01'
test_start = '2023-11-01'
test_end = '2023-11-07'

In [7]:
# Pct-change switches. NOTE(review): none of these flags is read by the cells
# visible in this notebook - both the raw and pct-change variants are always built.
use_pct_changes_X = use_pct_changes_ts2v = use_pct_changes_labels = True

In [8]:
# TS2Vec settings: device id and embedding width.
ts2vec_device, ts2vec_out_dim = 2, 128

# Number of lagged Close values used as baseline features.
n_shifts = 18

# Data loading

In [9]:
# Load every ticker, then keep only the rows of the configured tickers.
df = read_data('..//data//all_tickers.csv')
df_best = df[df['Stock'].isin(best_stocks)]
df_best.head()

Unnamed: 0,Date,Time,Open,High,Low,Close,Volume,Datetime,Stock,Day_week
61,2024-01-30,22:58:00,188.13,188.17,188.11,188.14,1500,2024-01-30 22:58:00,AAPL,Tuesday
62,2024-01-30,22:57:00,188.22,188.22,188.08,188.11,5700,2024-01-30 22:57:00,AAPL,Tuesday
63,2024-01-30,22:56:00,188.22,188.27,188.215,188.22,2600,2024-01-30 22:56:00,AAPL,Tuesday
64,2024-01-30,22:55:00,188.15,188.2,188.14,188.2,2100,2024-01-30 22:55:00,AAPL,Tuesday
65,2024-01-30,22:54:00,188.18,188.18,188.14,188.18,2013,2024-01-30 22:54:00,AAPL,Tuesday


# Preprocessing

In [10]:
# Resample to hourly bars per stock using the OHLCV aggregation rules.
df_agg = (
    df_best
    .set_index('Datetime')
    .groupby(['Stock', pd.Grouper(freq='h')])
    .agg(col_agg_finctions)
)

df_agg

Unnamed: 0_level_0,Unnamed: 1_level_0,Open,High,Low,Close,Volume
Stock,Datetime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AAPL,2023-01-30 17:00:00,145.15,145.53,144.18,145.13,291457
AAPL,2023-01-30 18:00:00,143.97,145.33,143.38,144.96,348830
AAPL,2023-01-30 19:00:00,144.07,144.30,143.50,143.95,215404
AAPL,2023-01-30 20:00:00,143.38,144.24,143.18,144.17,254970
AAPL,2023-01-30 21:00:00,143.31,143.81,143.01,143.38,192383
...,...,...,...,...,...,...
XOM,2024-01-30 18:00:00,103.69,103.70,102.95,103.22,159389
XOM,2024-01-30 19:00:00,103.79,103.96,103.49,103.67,90110
XOM,2024-01-30 20:00:00,103.68,103.98,103.63,103.76,54009
XOM,2024-01-30 21:00:00,104.15,104.16,103.67,103.68,61363


In [11]:
# Row-over-row relative change of every column, computed within each stock.
df_use_pct = (
    df_agg
    .groupby('Stock')
    .pct_change()
    .reset_index()
)

# Flat version of the hourly bars; `df_standart` is a second name for the
# same object, not a copy.
df_original = df_agg.reset_index()
df_standart = df_original
df_standart.head()

Unnamed: 0,Stock,Datetime,Open,High,Low,Close,Volume
0,AAPL,2023-01-30 17:00:00,145.15,145.53,144.18,145.13,291457
1,AAPL,2023-01-30 18:00:00,143.97,145.33,143.38,144.96,348830
2,AAPL,2023-01-30 19:00:00,144.07,144.3,143.5,143.95,215404
3,AAPL,2023-01-30 20:00:00,143.38,144.24,143.18,144.17,254970
4,AAPL,2023-01-30 21:00:00,143.31,143.81,143.01,143.38,192383


In [14]:
def process_time_labels(df, train_start, train_end, test_start, test_end):
    """Build hourly mean-Close labels and split them into train and test windows.

    ``df`` needs ``Datetime``, ``Stock`` and ``Close`` columns. Rows are
    averaged per (stock, hour) and then selected by calendar date, with each
    window treated as ``start <= date < end``.

    Returns:
        ``(y_train, y_test)`` frames with columns ``Stock``, ``Datetime``, ``Close``.
    """
    hourly = (
        df.set_index('Datetime')
        .groupby(['Stock', pd.Grouper(freq='h')])
        .agg({'Close': 'mean'})
        .reset_index()
    )
    row_days = hourly['Datetime'].dt.date

    def _window(start, end):
        # Compare on calendar dates so the time of day never matters.
        first_day = pd.Timestamp(start).date()
        end_day = pd.Timestamp(end).date()
        return hourly[(row_days >= first_day) & (row_days < end_day)]

    return _window(train_start, train_end), _window(test_start, test_end)

# Same train/test windows for both label variants (raw prices, then pct changes).
(y_train_standart, y_test_standart), (y_train_use_pct, y_test_use_pct) = (
    process_time_labels(frame, train_start, train_end, test_start, test_end)
    for frame in (df_standart, df_use_pct)
)


In [15]:
def preprocess_data(df, columns, start_date, end_date, tickers_save):
    """Pass-through to ``preprocess_split`` with the date and ticker arguments by keyword."""
    return preprocess_split(
        df,
        columns,
        start_date=start_date,
        end_date=end_date,
        tickers_save=tickers_save,
    )

# Columns fed to the splitter; a fresh list is passed on every call.
_OHLCV_COLUMNS = ('Open', 'High', 'Low', 'Close', 'Volume')

train_data_standart = preprocess_data(df_standart, list(_OHLCV_COLUMNS), train_start, train_end, best_stocks)
test_data_standart = preprocess_data(df_standart, list(_OHLCV_COLUMNS), test_start, test_end, best_stocks)

train_data_use_pct = preprocess_data(df_use_pct, list(_OHLCV_COLUMNS), train_start, train_end, best_stocks)
test_data_use_pct = preprocess_data(df_use_pct, list(_OHLCV_COLUMNS), test_start, test_end, best_stocks)


NAN in data  0
NAN in data  0
NAN in data  0
NAN in data  0
NAN in data  0
NAN in data  0
NAN in data  0
NAN in data  0
NAN in data  0
NAN in data  0
NAN in data  0
NAN in data  0
NAN in data  0
NAN in data  0
NAN in data  0
NAN in data  0
NAN in data  0
NAN in data  0
NAN in data  0
NAN in data  0


## TS2Vec

In [16]:
# Convert each split into the array form consumed by TS2Vec (same call order as before).
train_ts_standart, test_ts_standart, train_ts_use_pct, test_ts_use_pct = map(
    data_to_np_tensor,
    (train_data_standart, test_data_standart, train_data_use_pct, test_data_use_pct),
)
(train_ts_standart.shape, test_ts_standart.shape)

((28, 132, 5), (28, 24, 5))

In [17]:
def train_model_ts(train_data, input_dims, output_dims, device='cpu'):
    """Create a TS2Vec model, fit it on ``train_data`` and return the fitted model.

    ``input_dims``, ``output_dims`` and ``device`` are forwarded to the
    ``TS2Vec`` constructor; fitting runs with ``verbose=False``.
    """
    encoder = TS2Vec(input_dims=input_dims, output_dims=output_dims, device=device)
    # The value returned by fit() is not needed by callers and is discarded.
    encoder.fit(train_data, verbose=False)
    return encoder

# One encoder per input variant. NOTE(review): `device` is left at its 'cpu'
# default here; the `ts2vec_device` config value is never passed - confirm intent.
model_standart = train_model_ts(
    train_ts_standart,
    input_dims=train_ts_standart.shape[2],
    output_dims=ts2vec_out_dim,
)
model_use_pct = train_model_ts(
    train_ts_use_pct,
    input_dims=train_ts_use_pct.shape[2],
    output_dims=ts2vec_out_dim,
)


In [18]:
# Encode train and test tensors with the encoder fitted on the matching variant.
train_repr_standart, test_repr_standart = (
    model_standart.encode(tensor) for tensor in (train_ts_standart, test_ts_standart)
)
train_repr_use_pct, test_repr_use_pct = (
    model_use_pct.encode(tensor) for tensor in (train_ts_use_pct, test_ts_use_pct)
)
(train_repr_standart.shape, test_repr_standart.shape)

((28, 132, 128), (28, 24, 128))

In [19]:
def _embeddings_frame(repr_tensor, split_data):
    """Label an embedding tensor with the columns/index of ``split_data['Open']``."""
    return stock_embeddigns_to_df(
        repr_tensor,
        stocks=split_data['Open'].columns,
        dates=split_data['Open'].index,
    )


X_train_emb_standart = _embeddings_frame(train_repr_standart, train_data_standart)
X_test_emb_standart = _embeddings_frame(test_repr_standart, test_data_standart)

X_train_emb_use_pct = _embeddings_frame(train_repr_use_pct, train_data_use_pct)
X_test_emb_use_pct = _embeddings_frame(test_repr_use_pct, test_data_use_pct)

X_train_emb_standart.head()

Unnamed: 0,Datetime,emb_0,emb_1,emb_2,emb_3,emb_4,emb_5,emb_6,emb_7,emb_8,...,emb_119,emb_120,emb_121,emb_122,emb_123,emb_124,emb_125,emb_126,emb_127,Stock
0,2023-10-02 17:00:00,-1031.873779,-4263.572266,-3826.361328,-1953.775391,1744.000977,1010.236206,-2885.345703,-1989.422974,-4221.499512,...,-6783.26123,26.029419,-2835.438965,-5776.208984,-6703.150391,696.581055,266.898865,-6850.191406,-6024.348633,AAPL
1,2023-10-02 18:00:00,1845.30127,-9397.713867,-1426.333618,-5361.711914,-2643.597168,-151.513153,-3986.955566,-2475.558838,-9099.931641,...,-12616.529297,-2496.000732,-7276.241211,1314.555298,-5880.163574,-2607.677734,3856.184082,-3025.46167,-8529.393555,AAPL
2,2023-10-02 19:00:00,-3376.613525,-2311.700684,-4295.290527,-5942.65625,690.421265,-176.720566,-2794.658447,-1000.077026,-2048.559082,...,-7133.961914,-4560.009277,-2460.144043,-57.214111,-7890.020508,-3293.272461,-1937.693359,-2563.38916,-2453.01709,AAPL
3,2023-10-02 20:00:00,-3545.384766,-2662.816895,-12228.433594,-3531.479004,2877.416748,-731.02063,-4426.236328,206.354004,-4854.22998,...,-7000.032227,-4787.684082,-2139.066406,-3491.125,-7959.358398,-6951.423828,-2818.387207,-5730.43457,-5022.405273,AAPL
4,2023-10-02 21:00:00,-3480.712402,-833.38208,-3685.35498,-2098.428223,-1041.01416,-264.039978,-2495.63501,-1303.707886,-6130.390625,...,-9517.133789,-6043.02002,-4912.323242,-5424.580566,-7298.611816,-946.424255,1214.37146,-3034.701172,-1312.567383,AAPL


In [18]:
# Export the pct-change embeddings and labels (train followed by test) as CSV.
# The previous version referenced names that are never defined in this notebook
# (`X_train_emb_`, `X_test_emb`, `y_train`, `y_test`) and failed with NameError
# on a fresh kernel; the output file names indicate the pct-change variants.
X_emb = pd.concat([X_train_emb_use_pct, X_test_emb_use_pct])
y = pd.concat([y_train_use_pct, y_test_use_pct])

X_emb.to_csv('..//Stock_Embedding//data//TS2Vec//x_emb_pct_change.csv', index=False)
y.to_csv('..//Stock_Embedding//data//TS2Vec//y_pct_change.csv', index=False)

## Baseline

In [21]:
def calculate_mean_close(df):
    """Return the mean ``Close`` per (``Stock``, hour) as a flat frame.

    ``df`` needs ``Datetime``, ``Stock`` and ``Close`` columns; the result has
    columns ``Stock``, ``Datetime`` (hour bucket) and ``Close``.
    """
    indexed = df.set_index('Datetime')
    per_stock_hour = indexed.groupby(['Stock', pd.Grouper(freq='h')])
    return per_stock_hour.agg({'Close': 'mean'}).reset_index()

# Hourly mean Close for the raw-price frame and for the pct-change frame.
X_baseline_standart, X_baseline_use_pct = map(calculate_mean_close, (df_standart, df_use_pct))

X_baseline_use_pct.head()


Unnamed: 0,Stock,Datetime,Close
0,AAPL,2023-01-30 17:00:00,
1,AAPL,2023-01-30 18:00:00,-0.001171
2,AAPL,2023-01-30 19:00:00,-0.006967
3,AAPL,2023-01-30 20:00:00,0.001528
4,AAPL,2023-01-30 21:00:00,-0.00548


In [22]:
def add_shifts(df, n_shifts):
    """Replace ``Close`` with its previous ``n_shifts`` values per stock.

    Args:
        df: frame with ``Stock`` and ``Close`` columns, ordered in time within
            each stock. It is not modified.
        n_shifts: number of lag columns (``shift_1`` .. ``shift_<n_shifts>``).

    Returns:
        Copy of ``df`` with the lag columns added, every row containing a NaN
        removed (this includes each stock's first ``n_shifts`` rows and rows
        whose own ``Close`` is NaN), and the ``Close`` column dropped.
    """
    df_copy = df.copy()
    # Group once and shift only `Close`; the lags never use other columns, so
    # shifting the whole frame on every iteration was wasted work.
    close_by_stock = df_copy.groupby(by=['Stock'])['Close']
    for lag in range(1, n_shifts + 1):
        df_copy[f'shift_{lag}'] = close_by_stock.shift(lag)
    return df_copy.dropna().drop(columns='Close')

# Swap the mean-Close frames for their lagged-feature versions.
# NOTE: this rebinds the same names, so the cell is not re-runnable on its own -
# a second run fails because `Close` has already been dropped.
X_baseline_standart, X_baseline_use_pct = (
    add_shifts(frame, n_shifts) for frame in (X_baseline_standart, X_baseline_use_pct)
)
X_baseline_standart.head()

Unnamed: 0,Stock,Datetime,shift_1,shift_2,shift_3,shift_4,shift_5,shift_6,shift_7,shift_8,shift_9,shift_10,shift_11,shift_12,shift_13,shift_14,shift_15,shift_16,shift_17,shift_18
18,AAPL,2023-02-02 17:00:00,141.93,142.15,142.4,142.59,142.89,143.87,143.67,143.46,143.29,143.22,142.44,142.46,143.32,143.38,144.17,143.95,144.96,145.13
19,AAPL,2023-02-02 18:00:00,149.35,141.93,142.15,142.4,142.59,142.89,143.87,143.67,143.46,143.29,143.22,142.44,142.46,143.32,143.38,144.17,143.95,144.96
20,AAPL,2023-02-02 19:00:00,148.78,149.35,141.93,142.15,142.4,142.59,142.89,143.87,143.67,143.46,143.29,143.22,142.44,142.46,143.32,143.38,144.17,143.95
21,AAPL,2023-02-02 20:00:00,150.02,148.78,149.35,141.93,142.15,142.4,142.59,142.89,143.87,143.67,143.46,143.29,143.22,142.44,142.46,143.32,143.38,144.17
22,AAPL,2023-02-02 21:00:00,149.81,150.02,148.78,149.35,141.93,142.15,142.4,142.59,142.89,143.87,143.67,143.46,143.29,143.22,142.44,142.46,143.32,143.38


In [23]:
def filter_by_date_range(df, start_date, end_date):
    """Keep rows whose ``Datetime`` calendar date satisfies ``start_date <= date < end_date``."""
    first_day = pd.Timestamp(start_date).date()
    end_day = pd.Timestamp(end_date).date()
    row_days = df['Datetime'].dt.date
    in_window = (row_days >= first_day) & (row_days < end_day)
    return df[in_window]


# Cut both baseline feature frames into the train window and the test window.
X_train_base_standart, X_test_base_standart = (
    filter_by_date_range(X_baseline_standart, start, end)
    for start, end in ((train_start, train_end), (test_start, test_end))
)

X_train_base_use_pct, X_test_base_use_pct = (
    filter_by_date_range(X_baseline_use_pct, start, end)
    for start, end in ((train_start, train_end), (test_start, test_end))
)

In [24]:
def merge_data(df_original, X_train_base, column_name):
    """Return the rows of ``df_original`` at each stock's last training timestamp.

    The last ``Datetime`` per ``Stock`` is taken from ``X_train_base`` and
    inner-joined onto ``df_original`` (after ``reset_index()``, so the previous
    index shows up as an extra column). ``column_name`` is accepted but not used.
    """
    last_rows = X_train_base.groupby(['Stock'], as_index=False).last()
    last_keys = last_rows[['Stock', 'Datetime']]
    return df_original.reset_index().merge(last_keys, how='inner', on=['Stock', 'Datetime'])


# Original (un-differenced) bars at the last training timestamp of every stock;
# used as the starting price when pct-change predictions are compounded.
y_start_test_standart, y_start_test_use_pct = (
    merge_data(df_original, train_frame, 'Stock')
    for train_frame in (X_train_base_standart, X_train_base_use_pct)
)

y_start_test_standart.head()


Unnamed: 0,index,Stock,Datetime,Open,High,Low,Close,Volume
0,1142,AAPL,2023-10-31 22:00:00,170.85,170.9,170.385,170.62,338198
1,2649,ABBV,2023-10-31 22:00:00,141.42,141.65,139.91,140.19,86557
2,4156,ABT,2023-10-31 22:00:00,94.53,94.88,94.35,94.84,56242
3,5663,AMD,2023-10-31 22:00:00,98.49,98.8,98.1,98.44,386922
4,7170,BAC,2023-10-31 22:00:00,26.355,26.375,26.275,26.28,579334


## TS2Vec + baseline

In [25]:
# Combine lag features with embeddings row by row (inner join on stock and timestamp).
_JOIN_KEYS = ['Stock', 'Datetime']

X_train_bs_emb_standart = X_train_base_standart.merge(X_train_emb_standart, on=_JOIN_KEYS, how='inner')
X_test_bs_emb_standart = X_test_base_standart.merge(X_test_emb_standart, on=_JOIN_KEYS, how='inner')

X_train_bs_emb_use_pct = X_train_base_use_pct.merge(X_train_emb_use_pct, on=_JOIN_KEYS, how='inner')
X_test_bs_emb_use_pct = X_test_base_use_pct.merge(X_test_emb_use_pct, on=_JOIN_KEYS, how='inner')

# Prediction 

In [33]:
def _standart_dataset(X_train, X_test):
    """Pair a feature split with the raw-price labels, all with a fresh 0..n-1 index."""
    return {
        'X_train': X_train.reset_index(drop=True),
        'X_test': X_test.reset_index(drop=True),
        'y_train': y_train_standart['Close'].reset_index(drop=True),
        'y_test': y_test_standart['Close'].reset_index(drop=True),
    }


# Three feature sets evaluated against the same raw-price targets.
datasets_standart = {
    'emb': _standart_dataset(X_train_emb_standart, X_test_emb_standart),
    'base': _standart_dataset(X_train_base_standart, X_test_base_standart),
    'emb_base': _standart_dataset(X_train_bs_emb_standart, X_test_bs_emb_standart),
}


# Regressors compared on every feature set, all with library-default settings.
# The incomplete `'xgb': Xg` entry was removed: `Xg` is not defined anywhere,
# so the cell raised NameError, and `param_grid` has no 'xgb' grid either.
models = {
    'lin_reg': LinearRegression(),
    'ctb': CatBoostRegressor(),
    'rf': RandomForestRegressor(),
    'knn': KNeighborsRegressor(),
    'lasso': Lasso(),
    'ridge': Ridge(),
    'svr': SVR(),
}

In [34]:
# Empty accumulator for the raw-price results.
df_results_standart = pd.DataFrame(data=[])

In [35]:
# Fit every model on every raw-price feature set and record its test MAPE.
# Rows are collected in a local list and concatenated once: this avoids the
# quadratic concat-in-a-loop and makes the cell safe to re-run (appending to the
# existing global duplicated rows and broke the later pivot).
_standart_result_frames = []

for ds_name, data in datasets_standart.items():
    # The feature matrices do not depend on the model, so build them once per dataset.
    X_train_features = data['X_train'].drop(columns=['Datetime', 'Stock'])
    X_test_features = data['X_test'].drop(columns=['Datetime', 'Stock'])

    for model_name, model in models.items():

        print(ds_name, model_name)
        model.fit(X_train_features, data['y_train'])
        y_pred = model.predict(X_test_features)

        # pct_change=False: the metric is computed directly on y_test / y_pred.
        metric = estimate_result(data['y_test'], y_pred, data['X_test'], y_start_test_standart, metric_func=MAPE, pct_change=False)
        print('MAPE: ', metric)

        metrics_df = pd.DataFrame.from_dict({'data': [ds_name], 'model': [model_name], 'metric': [metric]})
        _standart_result_frames.append(metrics_df)

# reset_index() (without drop) keeps the same columns the previous version produced.
df_results_standart = (
    pd.concat(_standart_result_frames) if _standart_result_frames else pd.DataFrame()
).reset_index()

emb lin_reg
MAPE:  0.7547931663350557
emb ctb
Learning rate set to 0.050336
0:	learn: 102.8726279	total: 168ms	remaining: 2m 47s
1:	learn: 102.2252226	total: 213ms	remaining: 1m 46s
2:	learn: 101.6436108	total: 223ms	remaining: 1m 14s
3:	learn: 101.1083966	total: 233ms	remaining: 58s
4:	learn: 100.5997587	total: 243ms	remaining: 48.4s
5:	learn: 99.9697065	total: 256ms	remaining: 42.4s
6:	learn: 99.3157430	total: 265ms	remaining: 37.6s
7:	learn: 98.8355502	total: 275ms	remaining: 34.1s
8:	learn: 98.3226568	total: 286ms	remaining: 31.5s
9:	learn: 97.8277720	total: 298ms	remaining: 29.5s
10:	learn: 97.4195981	total: 310ms	remaining: 27.8s
11:	learn: 96.9836793	total: 319ms	remaining: 26.3s
12:	learn: 96.4993476	total: 327ms	remaining: 24.8s
13:	learn: 96.0161232	total: 335ms	remaining: 23.6s
14:	learn: 95.5480918	total: 343ms	remaining: 22.5s
15:	learn: 95.0934490	total: 353ms	remaining: 21.7s
16:	learn: 94.6315828	total: 361ms	remaining: 20.9s
17:	learn: 94.2497863	total: 370ms	remaining

  model = cd_fast.enet_coordinate_descent(
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


MAPE:  0.7106669928865676
base lin_reg
MAPE:  0.0037951967948342182
base ctb
Learning rate set to 0.050336
0:	learn: 98.5503899	total: 2.47ms	remaining: 2.47s
1:	learn: 93.7981900	total: 4.24ms	remaining: 2.11s
2:	learn: 89.2930999	total: 6.26ms	remaining: 2.08s
3:	learn: 85.0748620	total: 7.96ms	remaining: 1.98s
4:	learn: 80.9919011	total: 9.66ms	remaining: 1.92s
5:	learn: 77.1375725	total: 11.3ms	remaining: 1.87s
6:	learn: 73.4644262	total: 13.3ms	remaining: 1.89s
7:	learn: 69.9751295	total: 14.9ms	remaining: 1.85s
8:	learn: 66.6227426	total: 16.6ms	remaining: 1.82s
9:	learn: 63.4967159	total: 18.4ms	remaining: 1.82s
10:	learn: 60.4452955	total: 20.1ms	remaining: 1.81s
11:	learn: 57.6411359	total: 21.8ms	remaining: 1.79s
12:	learn: 54.9515079	total: 23.6ms	remaining: 1.79s
13:	learn: 52.3224563	total: 25.5ms	remaining: 1.8s
14:	learn: 49.8617311	total: 27.2ms	remaining: 1.79s
15:	learn: 47.5246993	total: 28.9ms	remaining: 1.78s
16:	learn: 45.2626764	total: 30.9ms	remaining: 1.78s
17:

In [36]:
# Models as rows, feature sets as columns, test MAPE as values.
df_results_standart.pivot(
    index='model',
    columns='data',
    values='metric',
)

data,base,emb,emb_base
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ctb,0.021955,0.7538,0.03212
knn,0.014717,0.814492,0.767451
lasso,0.003693,0.746243,0.004721
lin_reg,0.003795,0.754793,0.00532
rf,0.007307,1.014701,0.007418
ridge,0.003795,0.754567,0.00532
svr,0.032644,0.710667,0.708644


## Predictions with use_pct_changes

In [38]:
def _use_pct_dataset(X_train, X_test):
    """Pair a feature split with the pct-change labels, all with a fresh 0..n-1 index."""
    return {
        'X_train': X_train.reset_index(drop=True),
        'X_test': X_test.reset_index(drop=True),
        'y_train': y_train_use_pct['Close'].reset_index(drop=True),
        'y_test': y_test_use_pct['Close'].reset_index(drop=True),
    }


# Three feature sets evaluated against the same pct-change targets.
datasets_use_pct = {
    'emb': _use_pct_dataset(X_train_emb_use_pct, X_test_emb_use_pct),
    'base': _use_pct_dataset(X_train_base_use_pct, X_test_base_use_pct),
    'emb_base': _use_pct_dataset(X_train_bs_emb_use_pct, X_test_bs_emb_use_pct),
}

In [39]:
# Empty accumulator for the pct-change results.
df_results_use_pct = pd.DataFrame(data=[])

In [40]:
# Fit every model on every pct-change feature set; predictions are compounded
# back into prices inside estimate_result (pct_change=True) before MAPE is taken.
# Rows are collected in a local list and concatenated once: this avoids the
# quadratic concat-in-a-loop and makes the cell safe to re-run (appending to the
# existing global duplicated rows and broke the later pivot).
_use_pct_result_frames = []

for ds_name, data in datasets_use_pct.items():
    # The feature matrices do not depend on the model, so build them once per dataset.
    X_train_features = data['X_train'].drop(columns=['Datetime', 'Stock'])
    X_test_features = data['X_test'].drop(columns=['Datetime', 'Stock'])

    for model_name, model in models.items():

        print(ds_name, model_name)
        model.fit(X_train_features, data['y_train'])
        y_pred = model.predict(X_test_features)

        metric = estimate_result(data['y_test'], y_pred, data['X_test'], y_start_test_use_pct, metric_func=MAPE, pct_change=True)
        print('MAPE: ', metric)

        metrics_df = pd.DataFrame.from_dict({'data': [ds_name], 'model': [model_name], 'metric': [metric]})
        _use_pct_result_frames.append(metrics_df)

# reset_index() (without drop) keeps the same columns the previous version produced.
df_results_use_pct = (
    pd.concat(_use_pct_result_frames) if _use_pct_result_frames else pd.DataFrame()
).reset_index()

emb lin_reg
MAPE:  0.015616546928581573
emb ctb
Learning rate set to 0.050336
0:	learn: 0.0076636	total: 13.1ms	remaining: 13.1s
1:	learn: 0.0076419	total: 22ms	remaining: 11s
2:	learn: 0.0076129	total: 32ms	remaining: 10.6s
3:	learn: 0.0075856	total: 43.1ms	remaining: 10.7s
4:	learn: 0.0075578	total: 53.1ms	remaining: 10.6s
5:	learn: 0.0075335	total: 62.6ms	remaining: 10.4s
6:	learn: 0.0075007	total: 71.1ms	remaining: 10.1s
7:	learn: 0.0074693	total: 81.7ms	remaining: 10.1s
8:	learn: 0.0074411	total: 90.3ms	remaining: 9.95s
9:	learn: 0.0074049	total: 99ms	remaining: 9.8s
10:	learn: 0.0073853	total: 108ms	remaining: 9.67s
11:	learn: 0.0073768	total: 116ms	remaining: 9.52s
12:	learn: 0.0073561	total: 124ms	remaining: 9.39s
13:	learn: 0.0073350	total: 132ms	remaining: 9.3s
14:	learn: 0.0072998	total: 142ms	remaining: 9.32s
15:	learn: 0.0072690	total: 152ms	remaining: 9.32s
16:	learn: 0.0072479	total: 162ms	remaining: 9.39s
17:	learn: 0.0072028	total: 172ms	remaining: 9.38s
18:	learn: 0.0

In [41]:
# Models as rows, feature sets as columns, test MAPE (on reconstructed prices) as values.
df_results_use_pct.pivot(
    index='model',
    columns='data',
    values='metric',
)

data,base,emb,emb_base
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ctb,0.030522,0.02377,0.031262
knn,0.030844,0.035533,0.035634
lasso,0.028866,0.028866,0.028866
lin_reg,0.029328,0.015617,0.01548
rf,0.030354,0.027579,0.031754
ridge,0.028951,0.031195,0.031696
svr,0.358289,0.377171,0.378243


## Hyperparameter tuning

In [42]:
from sklearn.model_selection import RandomizedSearchCV
# Search space per model name (keys must match the `models` dict).
# An empty grid means "nothing to tune" for that model.
param_grid = dict(
    lin_reg={},
    ctb=dict(
        depth=[4, 6, 8, 12],
        learning_rate=[0.06, 0.1, 0.3],
        iterations=[100, 200, 300, 600],
    ),
    rf=dict(
        n_estimators=[100, 200, 300],
        max_depth=[5, 10, 15],
        min_samples_split=[2, 5, 10],
        min_samples_leaf=[1, 2, 4],
    ),
    knn=dict(
        n_neighbors=[3, 5, 7, 15, 30],
        weights=['uniform', 'distance'],
    ),
    lasso=dict(alpha=[0.1, 1.0, 10.0]),
    ridge=dict(alpha=[0.1, 1.0, 10.0]),
    svr=dict(
        C=[1, 10, 100],
        gamma=['scale', 'auto'],
    ),
)

def train_and_evaluate_models(datasets, models, param_grids, y_start_test=None,
                              n_iter=5, cv=3, random_state=None):
    """Tune every model on every dataset with a randomized search and score it.

    Args:
        datasets: mapping name -> dict with ``X_train``, ``X_test``,
            ``y_train``, ``y_test``; the X frames carry ``Datetime`` and
            ``Stock`` columns, which are dropped before fitting.
        models: mapping name -> estimator.
        param_grids: mapping model name -> parameter distributions for
            ``RandomizedSearchCV``; must contain every key of ``models``.
        y_start_test: starting prices handed to ``estimate_result``. Defaults
            to the notebook-level ``y_start_test_use_pct`` (previous behaviour).
        n_iter: number of sampled parameter settings per search.
        cv: cross-validation setting passed to the search.
        random_state: seed for the parameter sampling; ``None`` keeps the
            previous unseeded behaviour.

    Returns:
        DataFrame with columns ``index``, ``data``, ``model``, ``metric``
        (one row per dataset/model pair; metric is MAPE from
        ``estimate_result(..., pct_change=True)``).

    NOTE(review): the search scores MAPE on the training targets themselves;
    if those are pct changes close to zero the score can be unstable - confirm
    that this scoring is intended.
    """
    if y_start_test is None:
        y_start_test = y_start_test_use_pct

    result_frames = []

    for ds_name, data in datasets.items():
        # Feature matrices are model-independent; build them once per dataset.
        X_train_features = data['X_train'].drop(columns=['Datetime', 'Stock'])
        X_test_features = data['X_test'].drop(columns=['Datetime', 'Stock'])

        for model_name, model in models.items():
            print(ds_name, model_name)

            grid_search = RandomizedSearchCV(
                model,
                param_grids[model_name],
                n_iter=n_iter,
                scoring='neg_mean_absolute_percentage_error',
                cv=cv,
                random_state=random_state,
            )
            grid_search.fit(X_train_features, data['y_train'])

            best_model = grid_search.best_estimator_
            y_pred = best_model.predict(X_test_features)

            metric = estimate_result(data['y_test'], y_pred, data['X_test'], y_start_test, metric_func=MAPE, pct_change=True)
            print('MAPE:', metric)

            result_frames.append(
                pd.DataFrame.from_dict({'data': [ds_name], 'model': [model_name], 'metric': [metric]})
            )

    # Concatenate once (linear) instead of growing the frame inside the loops;
    # reset_index() without drop keeps the column layout of the previous version.
    results = pd.concat(result_frames) if result_frames else pd.DataFrame()
    return results.reset_index()


# Tuned results for the pct-change datasets (replaces the untuned table above).
df_results_use_pct = train_and_evaluate_models(
    datasets=datasets_use_pct,
    models=models,
    param_grids=param_grid,
)

emb lin_reg




MAPE: 0.012282441669281163
emb ctb
0:	learn: 0.0079823	total: 39.1ms	remaining: 7.78s
1:	learn: 0.0079578	total: 75.8ms	remaining: 7.51s
2:	learn: 0.0079320	total: 124ms	remaining: 8.11s
3:	learn: 0.0079145	total: 174ms	remaining: 8.52s
4:	learn: 0.0078652	total: 235ms	remaining: 9.18s
5:	learn: 0.0078344	total: 285ms	remaining: 9.22s
6:	learn: 0.0077906	total: 324ms	remaining: 8.94s
7:	learn: 0.0077577	total: 360ms	remaining: 8.65s
8:	learn: 0.0077100	total: 399ms	remaining: 8.46s
9:	learn: 0.0076906	total: 449ms	remaining: 8.53s
10:	learn: 0.0076505	total: 501ms	remaining: 8.61s
11:	learn: 0.0076083	total: 556ms	remaining: 8.72s
12:	learn: 0.0075823	total: 603ms	remaining: 8.67s
13:	learn: 0.0075340	total: 641ms	remaining: 8.52s
14:	learn: 0.0074837	total: 678ms	remaining: 8.37s
15:	learn: 0.0074401	total: 728ms	remaining: 8.37s
16:	learn: 0.0073864	total: 775ms	remaining: 8.35s
17:	learn: 0.0073541	total: 818ms	remaining: 8.27s
18:	learn: 0.0073117	total: 862ms	remaining: 8.21s
19:	

KeyboardInterrupt: 