In [1]:
import os, sys
# Make the repository root importable so that `src.*` imports resolve.
# `dir2` is the absolute path of the current working directory and `dir1`
# is its parent directory.
dir2 = os.path.abspath('')
dir1 = os.path.dirname(dir2)

if not dir1 in sys.path:
    sys.path.append(dir1)

# Relative data/config paths used below are resolved from the parent directory.
# NOTE(review): this call is not idempotent - re-running the cell without a
# kernel restart moves the working directory up one more level each time.
os.chdir('..')

In [2]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from tqdm.notebook import tqdm
import yaml


from sklearn.metrics import mean_absolute_error as MAE
from sklearn.metrics import mean_absolute_percentage_error as MAPE
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

from src.data.preprocessing import read_data, data_to_np_tensor, preprocess_split
from src.models.ts2vec_src.ts2vec import TS2Vec

%load_ext autoreload
%autoreload 2

  from .autonotebook import tqdm as notebook_tqdm


# Functions

In [3]:
# def stock_embeddigns_to_df(data_tensor: np.ndarray, stocks, dates) -> dict:    
#     res = pd.DataFrame()
#     for i, stock in enumerate(stocks):
#         df = pd.DataFrame(data_tensor[i], index=dates)
#         df.columns = ['emb_' + str(i) for i in range(len(df.columns))]
#         df['Stock'] = stock
#         res = pd.concat([res, df])

#     return res.reset_index(drop=False, names='Datetime')

# def estimate_result(y_test, y_pred, X_test=None, y_start_test=None, metric_func=MAE, pct_change=True):
#     if not pct_change:
#         return metric_func(y_test, y_pred)
    
#     df_preds = X_test.copy()
#     df_preds['Preds'] = y_pred + 1
#     df_preds['Close'] = y_test.reset_index(drop=True) + 1
    
#     starts = y_start_test.sort_values('Stock')['Close'].values

#     pred_close = df_preds.pivot(columns=['Stock'], index='Datetime', values=['Preds']).cumprod() * starts
#     orig_close = df_preds.pivot(columns=['Stock'], index='Datetime', values=['Close']).cumprod() * starts

#     pred_close = pred_close['Preds'].reset_index().melt(id_vars=['Datetime'], value_name='Pred')
#     orig_close = orig_close['Close'].reset_index().melt(id_vars=['Datetime'], value_name='True')

#     metric_df = pd.merge(pred_close, orig_close, how='inner', on=['Stock', 'Datetime'])

#     return metric_func(metric_df['True'], metric_df['Pred'])

In [5]:
# def create_label(df):
#     y = df.set_index('Datetime').groupby(
#             ['Stock', pd.Grouper(freq='h')],
#         ).agg({'Close': 'mean'}).reset_index()
#     return y

# def process_time_labels(y, train_start, train_end, test_start, test_end):

#     y_train = y[(y['Datetime'].dt.date >= pd.Timestamp(train_start).date()) & 
#                 (y['Datetime'].dt.date < pd.Timestamp(train_end).date())]

#     y_test = y[(y['Datetime'].dt.date >= pd.Timestamp(test_start).date()) & 
#                (y['Datetime'].dt.date < pd.Timestamp(test_end).date())]

#     return y_train, y_test

# def train_model_ts(train_data, input_dims, output_dims, use_pct_changes_ts2v=False, device=0):
#     model = TS2Vec(
#         input_dims=input_dims,
#         device=device,
#         output_dims=output_dims,
#     )
#     model.fit(train_data, verbose=False)
    
#     return model


# Config

In [6]:
# Per-column aggregation applied when bars are grouped by stock and time bucket
# (passed to `.agg(...)` by the preprocessing code below).
col_agg_finctions = {'Open': 'first', 'High': 'max', 'Low': 'min', 'Close': 'last', 'Volume': 'sum'}

In [7]:
# Train/test date windows. The date filters in this notebook keep rows with
# start <= date < end, so the test window begins exactly where training stops.
train_start, train_end = '2023-10-01', '2023-11-01'
test_start, test_end = '2023-11-01', '2023-11-07'

In [8]:
# Switches between raw values and per-stock percentage changes.
# `use_pct_changes_X` selects the feature frame and is also forwarded to the
# metric call in the model loop; `use_pct_changes_ts2v` selects which tensor
# is encoded by TS2Vec.
use_pct_changes_X = False
use_pct_changes_ts2v = False
# NOTE(review): not read by any live top-level cell in the visible notebook.
use_pct_changes_labels = False

In [9]:
# NOTE(review): `ts2vec_device` is not referenced by any live cell in the
# visible notebook; presumably the device index for TS2Vec - confirm.
ts2vec_device = 1
# Embedding size handed to `train_model_ts`.
ts2vec_out_dim = 128

# Number of lagged `Close` columns built for the baseline features.
n_shifts = 18

In [10]:
# The YAML file lists the stocks to keep; it used to be assigned to
# `ticker_data_path` and was immediately overwritten by the CSV path below.
best_stocks_path = 'configs/best_stocks_nans_rate.yaml'
# CSV with the raw ticker bars.
ticker_data_path = 'data/all_tickers.csv'

# DataLoading

In [15]:
from typing import Dict

def load_config(conf_path: str) -> Dict[str, str]:
    """Parse the YAML file at ``conf_path`` and return its content.

    Args:
        conf_path: Path to a YAML file.

    Returns:
        Whatever ``yaml.load`` produces for the file with ``yaml.FullLoader``.
    """
    with open(conf_path, 'r') as config_file:
        return yaml.load(config_file, Loader=yaml.FullLoader)

def data_loading(
    ticker_data_path: str, 
    best_stocks_path: str, 
    filter_best=True
) -> pd.DataFrame:
    """Read the ticker data and optionally keep only the configured stocks.

    Args:
        ticker_data_path: Path handed to ``read_data``.
        best_stocks_path: Path of the YAML config read with ``load_config``;
            only consulted when ``filter_best`` is truthy.
        filter_best: When truthy, keep only rows whose ``Stock`` value is
            contained in the loaded config.

    Returns:
        The loaded frame, filtered when requested.
    """
    df = read_data(ticker_data_path)
    if not filter_best:
        return df

    # The name `best_stocks` is referenced from inside the query string.
    best_stocks = load_config(best_stocks_path)
    return df.query("Stock in @best_stocks")

# def train_test_split_dt(df, train_start, train_end, test_start, test_end):

#     df_train = df[(df.index.get_level_values('Datetime').dt.date >= pd.Timestamp(train_start).date()) & 
#                  (df.index.get_level_values('Datetime') < pd.Timestamp(train_end).date())]

#     df_test = df[(df.index.get_level_values('Datetime') >= pd.Timestamp(test_start).date()) & 
#                 (df.index.get_level_values('Datetime') < pd.Timestamp(test_end).date())]

#     return df_train, df_test

def general_preprocessing(
    df, 
    agg_freq: str='', 
    X_col_agg_finctions={'Close': 'last'},
) -> pd.DataFrame:
    """Optionally aggregate bars into time buckets per stock.

    Args:
        df: Frame with ``Datetime`` and ``Stock`` columns plus the columns
            named in ``X_col_agg_finctions``.
        agg_freq: Frequency string for ``pd.Grouper``. An empty value skips
            aggregation and returns ``df`` untouched.
        X_col_agg_finctions: Mapping of column name to aggregation function.

    Returns:
        ``df`` itself when ``agg_freq`` is empty; otherwise the aggregated
        frame indexed by ``(Stock, Datetime)``.
    """
    if not agg_freq:
        return df

    bucketed = df.set_index('Datetime').groupby(
        ['Stock', pd.Grouper(freq=agg_freq)],
    )
    return bucketed.agg(X_col_agg_finctions)



# def ts2vec_preprocessing(
#         df: pd.DataFrame, 
#         use_pct_changes_data: bool, 
#         use_pct_changes_labels: bool,
#         features = ['Open', 'High', 'Low', 'Close', 'Volume']
# ):
#     stocks_to_save = df['Stocks'].unique()
#     y_train, y_test = process_time_labels(df, train_start, train_end, test_start, test_end)
#     train_data_use_pct = preprocess_split(df, features, train_start, train_end, stocks_to_save)
#     test_data_use_pct = preprocess_split(df, features, test_start, test_end, stocks_to_save)
#     X_train, X_test
    
#     return X_train, X_test, y_train, y_test

def pipeline_data(
    ticker_data_path: str = 'data/all_tickers.csv', 
    best_stocks_path: str = 'configs/best_stocks_nans_rate.yaml', 
    filter_best: bool = True,
    agg_freq: str = 'h',
    col_agg_finctions: Dict[str, str] = {'Close': 'last'},
):
    """Load the ticker data and aggregate it in one call.

    Args:
        ticker_data_path: Forwarded to ``data_loading``.
        best_stocks_path: Forwarded to ``data_loading``.
        filter_best: Forwarded to ``data_loading``.
        agg_freq: Forwarded to ``general_preprocessing``.
        col_agg_finctions: Column aggregations forwarded to
            ``general_preprocessing``.

    Returns:
        The result of ``general_preprocessing`` on the loaded frame.
    """
    loaded = data_loading(ticker_data_path, best_stocks_path, filter_best)
    return general_preprocessing(loaded, agg_freq, col_agg_finctions)

In [94]:
df = pipeline_data(col_agg_finctions=col_agg_finctions)

In [97]:
def my_pct_change(data):
    """Return the relative change between consecutive rows, without the first row."""
    changes = data.pct_change()
    # The first row has no predecessor, so it is sliced off.
    return changes.iloc[1:]

In [99]:
# Percentage change computed separately inside each stock; the first row of
# every stock has no predecessor and therefore comes out as NaN (see output).
df_test = df.groupby('Stock').pct_change()#.apply(my_pct_change)
# NOTE(review): with the trailing call commented out this line is a no-op.
df_test = df_test#.reset_index().isna().sum().sum()
df_test

Unnamed: 0_level_0,Unnamed: 1_level_0,Open,High,Low,Close,Volume
Stock,Datetime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AAPL,2023-01-30 17:00:00,,,,,
AAPL,2023-01-30 18:00:00,-0.008130,-0.001374,-0.005549,-0.001171,0.196849
AAPL,2023-01-30 19:00:00,0.000695,-0.007087,0.000837,-0.006967,-0.382496
AAPL,2023-01-30 20:00:00,-0.004789,-0.000416,-0.002230,0.001528,0.183683
AAPL,2023-01-30 21:00:00,-0.000488,-0.002981,-0.001187,-0.005480,-0.245468
...,...,...,...,...,...,...
XOM,2024-01-30 18:00:00,0.005430,0.003969,0.008325,0.008106,0.568944
XOM,2024-01-30 19:00:00,0.000964,0.002507,0.005245,0.004360,-0.434654
XOM,2024-01-30 20:00:00,-0.001060,0.000192,0.001353,0.000868,-0.400633
XOM,2024-01-30 21:00:00,0.004533,0.001731,0.000386,-0.000771,0.136162


In [73]:
df.index.names

FrozenList(['Stock', 'Datetime'])

In [83]:
from abc import ABC, abstractmethod
class AbcExperiment(ABC):
    """Template for a train/test forecasting experiment on per-stock series.

    Frames handled here are expected to carry an index with the levels
    ``'Stock'`` and ``'Datetime'`` (``pipeline`` asserts this). Subclasses
    implement ``prepare_data``, ``fit_model`` and ``predict``; ``pipeline``
    chains them and scores the predictions.
    """

    def __init__(
        self, 
        train_start, 
        train_end, 
        test_start, 
        test_end,
        label_name: str = 'Close',
        use_pct_changes_data: bool = False,
        use_pct_changes_labels: bool = False,
    ):
        """Store split boundaries and preprocessing switches.

        Args:
            train_start, train_end: Training window; rows with
                ``train_start <= date < train_end`` are used.
            test_start, test_end: Test window with the same convention.
            label_name: Column that is predicted.
            use_pct_changes_data: Subclasses turn features into relative
                changes when set.
            use_pct_changes_labels: Subclasses turn labels into relative
                changes when set; ``estimate_results`` then rebuilds price
                levels before scoring.
        """
        self.train_start = train_start
        self.train_end = train_end
        self.test_start = test_start
        self.test_end = test_end
        self.label_name = label_name
        self.use_pct_changes_data = use_pct_changes_data
        self.use_pct_changes_labels = use_pct_changes_labels

    @abstractmethod
    def prepare_data(self, df):
        """Build ``(X_train, X_test, y_train, y_test)`` from ``df``."""
        pass
    
    @abstractmethod
    def fit_model(self, X_train, y_train):
        """Fit the experiment's model on the training split."""
        pass

    @abstractmethod
    def predict(self, X_test):
        """Return predictions for ``X_test``."""
        pass

    @staticmethod
    def _index_dates(frame):
        """Return the calendar date of each row, read from the 'Datetime' index level."""
        # `DatetimeIndex.date` needs no `datetime` import and yields plain
        # `datetime.date` objects that compare element-wise with a date.
        return frame.index.get_level_values('Datetime').date

    def get_y_start_test(self, y):
        """Store, per stock, the last label row dated before ``train_end``.

        The result is kept in ``self.y_start_test`` and is used by
        ``estimate_results`` as the starting level when relative changes are
        compounded back into prices.
        """
        y_train = y[self._index_dates(y) < pd.Timestamp(self.train_end).date()]
        last_train_date = (
            y_train
            .reset_index()
            .groupby(['Stock'])['Datetime']
            .last()
            .reset_index()
        )

        self.y_start_test = (
            y
            .reset_index()
            .merge(last_train_date, how='inner', on=['Stock', 'Datetime'])
            .set_index(['Stock', 'Datetime'])
        )

    def train_test_split_dt(self, df):
        """Split ``df`` by date into the configured train and test windows.

        Both windows include their start date and exclude their end date.

        Returns:
            ``(df_train, df_test)``.
        """
        date_array = self._index_dates(df)
        in_train = (
            (date_array >= pd.Timestamp(self.train_start).date())
            & (date_array < pd.Timestamp(self.train_end).date())
        )
        in_test = (
            (date_array >= pd.Timestamp(self.test_start).date())
            & (date_array < pd.Timestamp(self.test_end).date())
        )
        return df[in_train], df[in_test]
    
    def data_labels_split(self, df):
        """Separate the label column from the remaining feature columns.

        Returns:
            ``(X, y)`` where ``y`` is the ``label_name`` column and ``X`` is
            everything else.
        """
        X = df.drop(self.label_name, axis=1)
        y = df[self.label_name]
        return X, y

    def estimate_results(
        self,
        y_test, 
        y_pred, 
        X_test=None, 
        metric_func=MAE,         
    ):
        """Score predictions, on price level when labels are relative changes.

        Args:
            y_test: True labels for the test split.
            y_pred: Predicted labels, positionally aligned with ``y_test``.
            X_test: Test features indexed by ``(Stock, Datetime)``; required
                when ``use_pct_changes_labels`` is set.
            metric_func: Callable ``metric_func(y_true, y_pred)``.

        Returns:
            The value returned by ``metric_func``.

        Raises:
            ValueError: If labels are relative changes and ``X_test`` is missing.
        """
        if not self.use_pct_changes_labels:
            return metric_func(y_test, y_pred)

        if X_test is None:
            raise ValueError('X_test is required when use_pct_changes_labels is True')
        
        df_preds = X_test.reset_index().copy()
        # Growth factors (1 + relative change) for predictions and truth.
        df_preds['Preds'] = y_pred + 1
        df_preds['Close'] = y_test.reset_index(drop=True) + 1
        
        # Last known level per stock, in the same (sorted) stock order as the
        # pivoted columns below. The stored frame holds the label column, which
        # is not necessarily named 'Close'.
        starts = self.y_start_test.sort_values('Stock')[self.label_name].values

        pred_close = df_preds.pivot(columns=['Stock'], index='Datetime', values=['Preds']).cumprod() * starts
        orig_close = df_preds.pivot(columns=['Stock'], index='Datetime', values=['Close']).cumprod() * starts

        pred_close = pred_close['Preds'].reset_index().melt(id_vars=['Datetime'], value_name='Pred')
        orig_close = orig_close['Close'].reset_index().melt(id_vars=['Datetime'], value_name='True')

        metric_df = pd.merge(pred_close, orig_close, how='inner', on=['Stock', 'Datetime'])

        return metric_func(metric_df['True'], metric_df['Pred'])

    def pipeline(self, df, metric_func=MAPE):
        """Run the experiment end to end.

        Args:
            df: Input frame handed to ``prepare_data``.
            metric_func: Metric forwarded to ``estimate_results``.

        Returns:
            ``(results, preds)``: the metric value and the raw predictions.
        """
        X_train, X_test, y_train, y_test = self.prepare_data(df)

        assert len(X_train) == len(y_train)
        assert len(X_test) == len(y_test)
        for part in (X_train, X_test, y_train, y_test):
            assert 'Datetime' in part.index.names and 'Stock' in part.index.names

        if self.use_pct_changes_labels:
            # Relative changes sit near zero; a mean of 2 or more indicates
            # that raw price levels slipped through. The check only applies
            # when labels are supposed to be relative changes.
            assert np.all(np.asarray(y_train.mean()) < 2)
            assert np.all(np.asarray(y_test.mean()) < 2)

        self.fit_model(X_train, y_train)
        preds = self.predict(X_test)
        results = self.estimate_results(y_test, preds, X_test, metric_func)
        return results, preds
    
        
# class ConstantBaselineExperint(AbcExperiment):
#     def __init__(self, constant_method, **kwargs):
#         super().__init__(**kwargs)
        
class LagModelExperint(AbcExperiment):
    """Experiment that predicts the label from its own lagged values per stock."""

    def __init__(self, lag_model, window_size=20, **kwargs):
        """
        Args:
            lag_model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
            window_size: Number of lag columns (``shift_1`` .. ``shift_N``).
            **kwargs: Forwarded to ``AbcExperiment.__init__``.
        """
        super().__init__(**kwargs)
        self.model = lag_model
        self.window_size = window_size

    def add_shifts(self, data_for_shifts):
        """Build lag features and matching labels.

        Args:
            data_for_shifts: Frame indexed by ``(Stock, Datetime)`` that
                contains the label column.

        Returns:
            ``(X, y)``: ``X`` holds the ``shift_i`` columns, ``y`` the label,
            both restricted to rows where every lag is available.
        """
        y = data_for_shifts.copy()

        if self.use_pct_changes_labels:
            self.get_y_start_test(y)
            # The grouped transform keeps the (Stock, Datetime) index; the
            # first row of each stock has no predecessor and is dropped.
            y = y.groupby(level='Stock').pct_change().dropna()

        X = data_for_shifts.copy()

        if self.use_pct_changes_data:
            # Per stock, so that one stock's first bar is never compared with
            # another stock's last bar. Leading NaNs are removed by dropna below.
            X = X.groupby(level='Stock').pct_change()

        # Shift only the label column instead of the whole, growing frame.
        label_by_stock = X[self.label_name].groupby(level='Stock')
        for i in range(1, self.window_size + 1):
            X[f'shift_{i}'] = label_by_stock.shift(i)

        X = X.dropna().drop(self.label_name, axis=1)
        Xy = X.join(y, how='inner')
        X, y = self.data_labels_split(Xy)
        return X, y

    def prepare_data(self, df):
        """Return ``(X_train, X_test, y_train, y_test)`` built from the label column of ``df``."""
        data_for_shifts = df[[self.label_name]]
        X, y = self.add_shifts(data_for_shifts)
        X_train, X_test = self.train_test_split_dt(X)
        y_train, y_test = self.train_test_split_dt(y)
        return X_train, X_test, y_train, y_test
    
    def fit_model(self, X_train, y_train, model=None):
        """Fit ``model`` (default: the experiment's own model) on the training split."""
        # `is None` rather than truthiness: estimators may define `__len__`,
        # which makes `not model` unreliable or even raising before fitting.
        if model is None:
            model = self.model
        X, y = X_train.reset_index(drop=True), y_train.reset_index(drop=True)
        model.fit(X, y)

    def predict(self, X_test, model=None):
        """Return predictions of ``model`` (default: the experiment's own model)."""
        if model is None:
            model = self.model
        # Same index treatment as in fit_model; the result used to be discarded.
        X = X_test.reset_index(drop=True)
        return model.predict(X)
    

class SelfSupervisedExperint(AbcExperiment):
    """Experiment that fits a head model on the non-label columns of the input.

    ``emb_model`` is stored but not used by any method of this class yet.
    """

    def __init__(self, head_model, emb_model, **kwargs):
        """
        Args:
            head_model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
            emb_model: Embedding model; kept on the instance only.
            **kwargs: Forwarded to ``AbcExperiment.__init__``.
        """
        super().__init__(**kwargs)
        self.model = head_model
        self.emb_model = emb_model

    @staticmethod
    def _pct_change_per_stock(frame):
        """Relative change within each stock, without each stock's first row."""
        by_stock = frame.groupby(level='Stock')
        has_predecessor = by_stock.cumcount().to_numpy() > 0
        return by_stock.pct_change()[has_predecessor]

    def prepare_data(self, df):
        """Return ``(X_train, X_test, y_train, y_test)`` built from ``df``.

        Features are all columns except the label; both sides can be turned
        into per-stock relative changes via the constructor switches.
        """
        # TODO: DATA LEAK (flagged by the original author) - features come
        # from the same timestamp as the label they are used to predict.
        # The label column is removed up front so the join below does not
        # collide on an overlapping column name.
        X = df.drop(columns=self.label_name)

        if self.use_pct_changes_data:
            X = self._pct_change_per_stock(X)

        y = df[[self.label_name]]

        if self.use_pct_changes_labels:
            self.get_y_start_test(y)
            y = self._pct_change_per_stock(y)

        Xy = X.join(y, how='inner')
        X, y = self.data_labels_split(Xy)

        # A leftover `self.add_shifts(data_for_shifts)` call was removed here:
        # neither the method nor the variable exists in this class.
        X_train, X_test = self.train_test_split_dt(X)
        y_train, y_test = self.train_test_split_dt(y)
        return X_train, X_test, y_train, y_test
    
    def fit_model(self, X_train, y_train, model=None):
        """Fit ``model`` (default: the head model) on the training split."""
        # `is None` rather than truthiness: estimators may define `__len__`.
        if model is None:
            model = self.model
        X, y = X_train.reset_index(drop=True), y_train.reset_index(drop=True)
        model.fit(X, y)

    def predict(self, X_test, model=None):
        """Return predictions of ``model`` (default: the head model)."""
        if model is None:
            model = self.model
        # Same index treatment as in fit_model; the result used to be discarded.
        X = X_test.reset_index(drop=True)
        return model.predict(X)
        
        

In [84]:
# Lag-feature experiment: linear regression on lagged `Close` values with the
# labels expressed as relative changes. `window_size` keeps its default.
lag_model_exp = LagModelExperint(
    lag_model = LinearRegression(),
    train_start = train_start, 
    train_end = train_end, 
    test_start = test_start, 
    test_end = test_end,
    label_name = 'Close',
    use_pct_changes_data = False,
    use_pct_changes_labels = True,
)

# `pipeline` scores with its default metric (MAPE) and also returns the raw
# predictions.
results, preds = lag_model_exp.pipeline(df)
results

0.028808576759016834

In [57]:
results

0.003796133474699634

In [None]:
# df = read_data('data/all_tickers.csv')
# df_best = df.query("Stock in @best_stocks")
# df_best.head()

# Preprocessing

In [None]:
# Hourly aggregation per stock using the configured column aggregations.
# NOTE(review): the only visible assignment of `df_best` is in a commented-out
# cell, so this cell fails on a fresh kernel.
df_agg = df_best.set_index('Datetime').groupby(
    ['Stock', pd.Grouper(freq='h')],
).agg(col_agg_finctions)

df_agg

Unnamed: 0_level_0,Unnamed: 1_level_0,Open,High,Low,Close,Volume
Stock,Datetime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AAPL,2023-01-30 17:00:00,145.15,145.53,144.18,145.13,291457
AAPL,2023-01-30 18:00:00,143.97,145.33,143.38,144.96,348830
AAPL,2023-01-30 19:00:00,144.07,144.30,143.50,143.95,215404
AAPL,2023-01-30 20:00:00,143.38,144.24,143.18,144.17,254970
AAPL,2023-01-30 21:00:00,143.31,143.81,143.01,143.38,192383
...,...,...,...,...,...,...
XOM,2024-01-30 18:00:00,103.69,103.70,102.95,103.22,159389
XOM,2024-01-30 19:00:00,103.79,103.96,103.49,103.67,90110
XOM,2024-01-30 20:00:00,103.68,103.98,103.63,103.76,54009
XOM,2024-01-30 21:00:00,104.15,104.16,103.67,103.68,61363


In [11]:
# Flat (non-indexed) copy of the aggregated bars.
df_original = df_agg.reset_index()
# NOTE(review): this rebinds `df`, which an earlier cell assigns from
# `pipeline_data(...)`; cells run afterwards see the flat frame instead.
if use_pct_changes_X:
    df = df_agg.groupby('Stock').pct_change().reset_index()
else:
    df = df_original
df.head()

Unnamed: 0,Stock,Datetime,Open,High,Low,Close,Volume
0,AAPL,2023-01-30 17:00:00,145.15,145.53,144.18,145.13,291457
1,AAPL,2023-01-30 18:00:00,143.97,145.33,143.38,144.96,348830
2,AAPL,2023-01-30 19:00:00,144.07,144.3,143.5,143.95,215404
3,AAPL,2023-01-30 20:00:00,143.38,144.24,143.18,144.17,254970
4,AAPL,2023-01-30 21:00:00,143.31,143.81,143.01,143.38,192383


In [12]:
df_use_pct = df_agg.groupby('Stock').pct_change().reset_index()
df_standart = df_original
df_standart.head()

Unnamed: 0,Stock,Datetime,Open,High,Low,Close,Volume
0,AAPL,2023-01-30 17:00:00,145.15,145.53,144.18,145.13,291457
1,AAPL,2023-01-30 18:00:00,143.97,145.33,143.38,144.96,348830
2,AAPL,2023-01-30 19:00:00,144.07,144.3,143.5,143.95,215404
3,AAPL,2023-01-30 20:00:00,143.38,144.24,143.18,144.17,254970
4,AAPL,2023-01-30 21:00:00,143.31,143.81,143.01,143.38,192383


In [13]:
# NOTE(review): the only visible definition of `process_time_labels` is
# commented out, and that definition takes the label frame `y` as its first
# argument, which is not passed here.
y_train, y_test = process_time_labels(train_start, train_end, test_start, test_end)

In [14]:
# Build train/test splits for both the percentage-change and the raw frame.
# NOTE(review): `best_stocks` is only ever bound as a local variable inside
# functions in the visible notebook; at top level it relies on hidden state.
train_data_use_pct = preprocess_split(df_use_pct, ['Open', 'High', 'Low', 'Close', 'Volume'], train_start, train_end, best_stocks.keys())
test_data_use_pct = preprocess_split(df_use_pct, ['Open', 'High', 'Low', 'Close', 'Volume'], test_start, test_end, best_stocks.keys())

train_data_standart = preprocess_split(df_standart, ['Open', 'High', 'Low', 'Close', 'Volume'], train_start, train_end, best_stocks.keys())
test_data_standart = preprocess_split(df_standart, ['Open', 'High', 'Low', 'Close', 'Volume'], test_start, test_end, best_stocks.keys())

NAN in data  0
NAN in data  0
NAN in data  0
NAN in data  0
NAN in data  0
NAN in data  0
NAN in data  0
NAN in data  0
NAN in data  0
NAN in data  0
NAN in data  0
NAN in data  0
NAN in data  0
NAN in data  0
NAN in data  0
NAN in data  0
NAN in data  0
NAN in data  0
NAN in data  0
NAN in data  0


## TS2Vec

In [15]:
train_ts_use_pct = data_to_np_tensor(train_data_use_pct)
test_ts_use_pct = data_to_np_tensor(test_data_use_pct)

train_ts_standart = data_to_np_tensor(train_data_standart)
test_ts_standart = data_to_np_tensor(test_data_standart)
train_ts_use_pct.shape, test_ts_use_pct.shape

((28, 132, 5), (28, 24, 5))

In [16]:
# NOTE(review): the only visible definition of `train_model_ts` is commented
# out and expects the training tensor as its first argument; this call passes
# (input_dims, output_dims, flag) only.
model = train_model_ts(train_ts_standart.shape[2], ts2vec_out_dim, use_pct_changes_ts2v)


In [17]:
# Pick the tensors that match the TS2Vec input mode (relative changes or raw).
train_data = train_ts_use_pct if use_pct_changes_ts2v else train_ts_standart
test_data = test_ts_use_pct if use_pct_changes_ts2v else test_ts_standart

# Encode both splits with the trained model; the shapes are shown as output.
train_repr = model.encode(train_data)
test_repr = model.encode(test_data)
train_repr.shape, test_repr.shape

((28, 132, 128), (28, 24, 128))

In [18]:
# Turn the embedding tensors into frames keyed by stock and timestamp.
# NOTE(review): the only visible definition of `stock_embeddigns_to_df` is
# commented out.
X_train_emb= stock_embeddigns_to_df(train_repr, stocks=train_data_standart['Open'].columns, dates=train_data_standart['Open'].index)
X_test_emb = stock_embeddigns_to_df(test_repr, stocks=test_data_standart['Open'].columns, dates=test_data_standart['Open'].index)


In [20]:
# Stack train and test parts and persist them for later use.
X_emb = pd.concat([X_train_emb, X_test_emb])
y = pd.concat([y_train, y_test])

# NOTE(review): the file names say "pct_change" regardless of the
# `use_pct_changes_*` switches - confirm they describe what is written.
X_emb.to_csv('data/TS2Vec/x_emb_pct_change.csv', index=False)
y.to_csv('data/TS2Vec/y_pct_change.csv', index=False)

## baseline

In [21]:
def calculate_mean_close(df):
    """Average ``Close`` per stock and hour.

    Args:
        df: Frame with ``Datetime``, ``Stock`` and ``Close`` columns.

    Returns:
        Flat frame with one row per stock and hour that has data.
    """
    hourly_by_stock = ['Stock', pd.Grouper(freq='h')]
    mean_close = (
        df.set_index('Datetime')
        .groupby(hourly_by_stock)
        .agg({'Close': 'mean'})
    )
    return mean_close.reset_index()

X_baseline = calculate_mean_close(df)

In [22]:
def add_shifts(df, n_shifts):
    """Replace ``Close`` with its previous ``n_shifts`` values per stock.

    Args:
        df: Frame with ``Stock`` and ``Close`` columns; not modified.
        n_shifts: Number of lag columns (``shift_1`` .. ``shift_n``) to add.

    Returns:
        New frame without ``Close`` and without rows that contain any NaN
        (which includes the first ``n_shifts`` rows of every stock).
    """
    # Shift only the `Close` column instead of re-shifting the whole frame,
    # including previously added lags, on every iteration.
    close_by_stock = df['Close'].groupby(df['Stock'])
    lags = {f'shift_{i}': close_by_stock.shift(i) for i in range(1, n_shifts + 1)}
    # `assign` returns a new frame and adds all lag columns in one step.
    return df.assign(**lags).dropna().drop(columns='Close')

X_baseline= add_shifts(X_baseline, n_shifts)
X_baseline.head()  

Unnamed: 0,Stock,Datetime,shift_1,shift_2,shift_3,shift_4,shift_5,shift_6,shift_7,shift_8,shift_9,shift_10,shift_11,shift_12,shift_13,shift_14,shift_15,shift_16,shift_17,shift_18
18,AAPL,2023-02-02 17:00:00,141.93,142.15,142.4,142.59,142.89,143.87,143.67,143.46,143.29,143.22,142.44,142.46,143.32,143.38,144.17,143.95,144.96,145.13
19,AAPL,2023-02-02 18:00:00,149.35,141.93,142.15,142.4,142.59,142.89,143.87,143.67,143.46,143.29,143.22,142.44,142.46,143.32,143.38,144.17,143.95,144.96
20,AAPL,2023-02-02 19:00:00,148.78,149.35,141.93,142.15,142.4,142.59,142.89,143.87,143.67,143.46,143.29,143.22,142.44,142.46,143.32,143.38,144.17,143.95
21,AAPL,2023-02-02 20:00:00,150.02,148.78,149.35,141.93,142.15,142.4,142.59,142.89,143.87,143.67,143.46,143.29,143.22,142.44,142.46,143.32,143.38,144.17
22,AAPL,2023-02-02 21:00:00,149.81,150.02,148.78,149.35,141.93,142.15,142.4,142.59,142.89,143.87,143.67,143.46,143.29,143.22,142.44,142.46,143.32,143.38


In [23]:
def filter_by_date_range(df, start_date, end_date):
    """Keep rows whose ``Datetime`` date lies in ``[start_date, end_date)``.

    Args:
        df: Frame with a datetime-typed ``Datetime`` column.
        start_date: First date to keep (anything ``pd.Timestamp`` accepts).
        end_date: First date to exclude.

    Returns:
        The matching rows of ``df``.
    """
    first_day = pd.Timestamp(start_date).date()
    stop_day = pd.Timestamp(end_date).date()
    row_days = df['Datetime'].dt.date
    in_window = row_days.ge(first_day) & row_days.lt(stop_day)
    return df[in_window]


X_train_base = filter_by_date_range(X_baseline, train_start, train_end)
X_test_base= filter_by_date_range(X_baseline, test_start, test_end)

In [24]:
def merge_data(df_original, X_train_base, column_name):
    """Pick, per stock, the original row at that stock's last training timestamp.

    Args:
        df_original: Flat frame with ``Stock`` and ``Datetime`` columns.
        X_train_base: Training features with ``Stock`` and ``Datetime``
            columns; the last row of each stock supplies the timestamp.
        column_name: Not used by the function body.

    Returns:
        Rows of ``df_original`` (with its former index as an extra column)
        matching each stock's last training timestamp.
    """
    last_rows = X_train_base.groupby(['Stock'], as_index=False).last()
    anchors = last_rows[['Stock', 'Datetime']]
    return df_original.reset_index().merge(
        anchors, how='inner', on=['Stock', 'Datetime']
    )


y_start_test = merge_data(df_original, X_train_base, 'Stock')


y_start_test.head()


Unnamed: 0,index,Stock,Datetime,Open,High,Low,Close,Volume
0,1142,AAPL,2023-10-31 22:00:00,170.85,170.9,170.385,170.62,338198
1,2649,ABBV,2023-10-31 22:00:00,141.42,141.65,139.91,140.19,86557
2,4156,ABT,2023-10-31 22:00:00,94.53,94.88,94.35,94.84,56242
3,5663,AMD,2023-10-31 22:00:00,98.49,98.8,98.1,98.44,386922
4,7170,BAC,2023-10-31 22:00:00,26.355,26.375,26.275,26.28,579334


## ts2vec + baseline

In [25]:
X_train_bs_emb = pd.merge(X_train_base, X_train_emb, on=['Stock', 'Datetime'], how='inner')
X_test_bs_emb = pd.merge(X_test_base, X_test_emb, on=['Stock', 'Datetime'], how='inner')

## Проверка корректности пайплайна

# Prediction 

In [34]:
datasets_standart = {
    'emb': {
        'X_train': X_train_emb.reset_index(drop=True), 
        'X_test': X_test_emb.reset_index(drop=True), 
        'y_train': y_train['Close'].reset_index(drop=True), 
        'y_test': y_test['Close'].reset_index(drop=True),
    },
    'base': {
        'X_train': X_train_base.reset_index(drop=True), 
        'X_test': X_test_base.reset_index(drop=True), 
        'y_train': y_train['Close'].reset_index(drop=True), 
        'y_test': y_test['Close'].reset_index(drop=True),
    },
    'emb_base': {
        'X_train': X_train_bs_emb.reset_index(drop=True), 
        'X_test': X_test_bs_emb.reset_index(drop=True), 
        'y_train': y_train['Close'].reset_index(drop=True),  
        'y_test': y_test['Close'].reset_index(drop=True),
    },
}


models = {
    'lin_reg': LinearRegression(),
    'ctb': CatBoostRegressor(),
    'rf': RandomForestRegressor(),
    'knn' : KNeighborsRegressor(),
    'lasso' : Lasso(),
    'ridge' : Ridge(),
    'svr' : SVR(),
    'xgb' : XGBRegressor()
    
}

In [35]:
df_results_standart = pd.DataFrame([])

In [36]:
# Fit every model on every feature set and collect one metric row per pair.
for ds_name, data in datasets_standart.items():
    for model_name, model in models.items():
        
        print(ds_name, model_name)
        # The identifier columns are not features, so they are dropped first.
        model.fit(data['X_train'].drop(columns=['Datetime', 'Stock']), data['y_train'])
        y_pred = model.predict(data['X_test'].drop(columns=['Datetime', 'Stock']))

        # NOTE(review): the only visible definition of `estimate_result` is
        # commented out, so this call relies on hidden kernel state.
        metric = estimate_result(data['y_test'], y_pred, data['X_test'], y_start_test, metric_func=MAPE, pct_change=use_pct_changes_X)
        print('MAPE: ', metric)

        metrics_df = pd.DataFrame.from_dict({'data': [ds_name], 'model': [model_name], 'metric': [metric]})
        # NOTE(review): concatenating inside the loop re-copies the frame on
        # every step; collecting rows in a list and building once is cheaper.
        df_results_standart = pd.concat([df_results_standart, metrics_df])

# Without drop=True the old (all-zero) index becomes an `index` column.
df_results_standart = df_results_standart.reset_index()

emb lin_reg
MAPE:  0.8423631072518977
emb ctb
Learning rate set to 0.050336
0:	learn: 102.7607409	total: 54.1ms	remaining: 54.1s
1:	learn: 102.0395169	total: 60ms	remaining: 29.9s
2:	learn: 101.3674726	total: 64.8ms	remaining: 21.5s
3:	learn: 100.7562772	total: 69.8ms	remaining: 17.4s
4:	learn: 100.0901157	total: 74.8ms	remaining: 14.9s
5:	learn: 99.5123524	total: 79.7ms	remaining: 13.2s
6:	learn: 98.8221710	total: 84.3ms	remaining: 12s
7:	learn: 98.2751948	total: 88.5ms	remaining: 11s
8:	learn: 97.7082104	total: 92.6ms	remaining: 10.2s
9:	learn: 97.0423270	total: 95.7ms	remaining: 9.47s
10:	learn: 96.4701505	total: 98.7ms	remaining: 8.87s
11:	learn: 95.8306665	total: 102ms	remaining: 8.37s
12:	learn: 95.1257429	total: 105ms	remaining: 7.95s
13:	learn: 94.6182171	total: 108ms	remaining: 7.58s
14:	learn: 94.0516788	total: 111ms	remaining: 7.26s
15:	learn: 93.5396427	total: 113ms	remaining: 6.98s
16:	learn: 93.0549367	total: 116ms	remaining: 6.72s
17:	learn: 92.6810995	total: 119ms	remai

  model = cd_fast.enet_coordinate_descent(
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


MAPE:  0.8413745030539419
emb svr
MAPE:  0.6976836023746099
emb xgb
MAPE:  0.8316184991057112
base lin_reg
MAPE:  0.003795196794834199
base ctb
Learning rate set to 0.050336
0:	learn: 98.5503899	total: 3.06ms	remaining: 3.05s
1:	learn: 93.7981900	total: 5.11ms	remaining: 2.55s
2:	learn: 89.2930999	total: 7.13ms	remaining: 2.37s
3:	learn: 85.0748620	total: 9.09ms	remaining: 2.26s
4:	learn: 80.9919011	total: 11.1ms	remaining: 2.21s
5:	learn: 77.1375725	total: 90.9ms	remaining: 15.1s
6:	learn: 73.4644262	total: 92.5ms	remaining: 13.1s
7:	learn: 69.9751295	total: 94.2ms	remaining: 11.7s
8:	learn: 66.6227426	total: 95.8ms	remaining: 10.5s
9:	learn: 63.4967159	total: 97.4ms	remaining: 9.64s
10:	learn: 60.4452955	total: 99ms	remaining: 8.9s
11:	learn: 57.6411359	total: 101ms	remaining: 8.29s
12:	learn: 54.9515079	total: 102ms	remaining: 7.77s
13:	learn: 52.3224563	total: 104ms	remaining: 7.31s
14:	learn: 49.8617311	total: 105ms	remaining: 6.93s
15:	learn: 47.5246993	total: 107ms	remaining: 6.

In [37]:
df_results_standart.pivot(index='model', columns='data', values='metric')

data,base,emb,emb_base
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ctb,0.021955,0.636055,0.030643
knn,0.014717,0.807946,0.737928
lasso,0.003693,0.781221,0.0046
lin_reg,0.003795,0.842363,0.004784
rf,0.007087,0.930502,0.007604
ridge,0.003795,0.841375,0.004784
svr,0.032644,0.697684,0.694835
xgb,0.020052,0.831618,0.0206


## Подбор гиперпараметров

In [None]:
# from sklearn.model_selection import RandomizedSearchCV
# param_grid = {
#     'lin_reg': {}, 
#     'ctb': {
#         'depth': [4, 6, 8, 12],
#         'learning_rate': [0.06, 0.1, 0.3],
#         'iterations': [100, 200, 300, 600]
#     },
#     'rf': {
#         'n_estimators': [100, 200, 300],
#         'max_depth': [5, 10, 15],
#         'min_samples_split': [2, 5, 10],
#         'min_samples_leaf': [1, 2, 4]
#     },
#     'knn': {
#         'n_neighbors': [3, 5, 7, 15, 30],
#         'weights': ['uniform', 'distance']
#     },
#     'lasso': {
#         'alpha': [0.1, 1.0, 10.0]
#     },
#     'ridge': {
#         'alpha': [0.1, 1.0, 10.0]
#     },
#     'svr': {
#         'C': [1, 10, 100],
#         'gamma': ['scale', 'auto']
#     }
# }

# def train_and_evaluate_models(datasets, models, param_grids):
#     df_results_use_pct = pd.DataFrame()

#     for ds_name, data in datasets.items():
#         for model_name, model in models.items():
#             print(ds_name, model_name)
            
#             grid_search = RandomizedSearchCV(model, param_grids[model_name], n_iter=5, scoring='neg_mean_absolute_percentage_error', cv=3)
#             grid_search.fit(data['X_train'].drop(columns=['Datetime', 'Stock']), data['y_train'])
            
#             best_model = grid_search.best_estimator_
#             y_pred = best_model.predict(data['X_test'].drop(columns=['Datetime', 'Stock']))

#             metric = estimate_result(data['y_test'], y_pred, data['X_test'], y_start_test, metric_func=MAPE, pct_change=True)
#             print('MAPE:', metric)

#             metrics_df = pd.DataFrame.from_dict({'data': [ds_name], 'model': [model_name], 'metric': [metric]})
#             df_results_use_pct = pd.concat([df_results_use_pct, metrics_df])

#     df_results_use_pct = df_results_use_pct.reset_index()

#     return df_results_use_pct


# df_results_use_pct = train_and_evaluate_models(datasets_use_pct, models, param_grid)