In [1]:
#!pip3 install torch torchvision torchaudio

In [3]:
#!python3 -m pip install tensorflow

In [5]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import random
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import os

from sklearn.metrics import roc_auc_score

import pickle
def dump_pkl(data, filename):
  with open(filename, 'wb') as handle:
    pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)

def load_pkl(filename):
  with open(filename, 'rb') as handle:
    data = pickle.load(handle)
  return data

### 1. Load data

In [9]:
dfs_15min = []
dfs_1hour = []
dfs_1day = []

stocks = [elem.split('.')[0] for elem in os.listdir('./data/preproc/1hour/')]

for stock in tqdm(stocks):

    df_15min = load_pkl(f"./data/preproc/15min/{stock}.pkl")
    df_1hour = load_pkl(f"./data/preproc/1hour/{stock}.pkl")
    df_1day = load_pkl(f"./data/preproc/1day/{stock}.pkl")
    
    dfs_15min += [df_15min.copy()]
    dfs_1hour += [df_1hour.copy()]
    dfs_1day += [df_1day.copy()]
    



df_15min = pd.concat(dfs_15min)
df_1day = pd.concat(dfs_1day)
df_1hour = pd.concat(dfs_1hour)

df_15min.reset_index(drop=True, inplace=True)
df_1hour.reset_index(drop=True, inplace=True)
df_1day.reset_index(drop=True, inplace=True)

df_15min.shape, df_1hour.shape, df_1day.shape

100%|████████████████████████████████████████████████████████████████████████| 67/67 [00:00<00:00, 368.90it/s]


((2362117, 5), (594232, 5), (46870, 5))

In [11]:
df_15min.head()

Unnamed: 0,time,close,volume,ticker,tmos_close
0,2022-05-04 10:00:00,31.52,165423.0,AFLT,4.3
1,2022-05-04 10:15:00,31.92,99825.0,AFLT,4.32
2,2022-05-04 10:30:00,32.4,171391.0,AFLT,4.29
3,2022-05-04 10:45:00,32.1,66741.0,AFLT,4.27
4,2022-05-04 11:00:00,31.94,42552.0,AFLT,4.25


In [13]:
df_1hour.head()

Unnamed: 0,time,close,volume,ticker,tmos_close
0,2022-05-04 10:00:00,32.1,503380.0,AFLT,4.27
1,2022-05-04 11:00:00,31.38,191664.0,AFLT,4.2
2,2022-05-04 12:00:00,31.8,89256.0,AFLT,4.22
3,2022-05-04 13:00:00,31.36,87929.0,AFLT,4.21
4,2022-05-04 14:00:00,31.54,69671.0,AFLT,4.21


In [15]:
df_1day.head()

Unnamed: 0,time,close,volume,ticker,tmos_close
0,2022-05-04 03:00:00,31.2,1138130.0,AFLT,4.18
1,2022-05-05 03:00:00,30.7,664101.0,AFLT,4.22
2,2022-05-06 03:00:00,30.1,306164.0,AFLT,4.19
3,2022-05-11 03:00:00,29.92,383481.0,AFLT,4.22
4,2022-05-12 03:00:00,29.32,315954.0,AFLT,4.06


### 2. Preproc data

#### 2.1 Make target

In [21]:
def get_target(df_all, ind, val_first, val_second, points_to_wait):
    #цель - достичь val_second, не достигнув val_first

    ind_end = min(ind+points_to_wait+1, df_all.shape[0])
    
    df = df_all.iloc[ind:ind_end].copy()
    
    mask_stock = np.array(df['ticker'] == df['ticker'].iloc[0])
    df = df.loc[mask_stock, :]

    start_price = df['close'].iloc[0]
    
    if val_first < val_second:                                 
        mask_val_first = np.array(df['close'] < val_first)
        mask_val_second = np.array(df['close'] > val_second)
    else: #short game
        mask_val_first = np.array(df['close'] > val_first)
        mask_val_second = np.array(df['close'] < val_second)

    
    if (mask_val_first.sum() == 0) & (mask_val_second.sum() == 0):

        ind_end = df.shape[0]-1
        delta_time = df['time'].iloc[ind_end] - df['time'].iloc[0]
        res_price = df['close'].iloc[ind_end]
        if val_first < val_second:
            income_rate = res_price/start_price
        else: #short game
            income_rate = (2*start_price-res_price)/start_price
        return 'DNF', delta_time, income_rate, res_price, ind_end+ind
        
    if (mask_val_first.sum() == 0) & (mask_val_second.sum() != 0):
        ind_val_second = np.argwhere(mask_val_second).ravel()[0]
        delta_time = df['time'].iloc[ind_val_second] - df['time'].iloc[0]
        res_price = df['close'].iloc[ind_val_second]
        if val_first < val_second:
            income_rate = res_price/start_price
        else: #short game
            income_rate = (2*start_price-res_price)/start_price
        return 'WIN', delta_time, income_rate, res_price, ind_val_second+ind
        
    if (mask_val_first.sum() != 0) & (mask_val_second.sum() == 0):
        ind_val_first = np.argwhere(mask_val_first).ravel()[0]
        delta_time = df['time'].iloc[ind_val_first] - df['time'].iloc[0]
        res_price = df['close'].iloc[ind_val_first]
        if val_first < val_second:
            income_rate = res_price/start_price
        else: #short game
            income_rate = (2*start_price-res_price)/start_price
        return 'LOSE', delta_time, income_rate, res_price, ind_val_first+ind

    if (mask_val_first.sum() != 0) & (mask_val_second.sum() != 0):
        ind_val_first = np.argwhere(mask_val_first).ravel()[0]
        ind_val_second = np.argwhere(mask_val_second).ravel()[0]
        if ind_val_first < ind_val_second:
            delta_time = df['time'].iloc[ind_val_first] - df['time'].iloc[0]
            res_price = df['close'].iloc[ind_val_first]
            if val_first < val_second:
                income_rate = res_price/start_price
            else: #short game
                income_rate = (2*start_price-res_price)/start_price
            return 'LOSE', delta_time, income_rate, res_price, ind_val_first+ind
        if ind_val_first > ind_val_second:
            delta_time = df['time'].iloc[ind_val_second] - df['time'].iloc[0]
            res_price = df['close'].iloc[ind_val_second]
            if val_first < val_second:
                income_rate = res_price/start_price
            else:  #short game
                income_rate = (2*start_price-res_price)/start_price
            return 'WIN', delta_time, income_rate, res_price, ind_val_second+ind


def get_df_target(df, indx, percent_first=None, percent_second=None, points_to_wait=None):
    times = []
    results = []
    delta_times = []
    income_rates = []
    closes = []
    tickers = []
    res_prices = []
    res_inds = []
    
    #for ind in indx:
    for ind in tqdm(indx):
        time = df['time'].iloc[ind]
        close = df['close'].iloc[ind]
        ticker = df['ticker'].iloc[ind]
        
        val_first = df['close'].iloc[ind] * percent_first
        val_second = df['close'].iloc[ind] * percent_second
        result, delta_time, income_rate, res_price, res_ind = get_target(df, ind, val_first, val_second, points_to_wait)
        
        times += [time]
        closes += [close]
        tickers += [ticker]
        results += [result]
        delta_times += [delta_time]
        income_rates += [income_rate]
        res_prices += [res_price]
        res_inds += [res_ind]
        

    df_result = pd.DataFrame({'ind' : indx,
                              'time' : times,
                              'close' : closes,
                              'result' : results,
                              'ticker' : tickers, 
                              'delta_time' : delta_times,
                              'income_rate' : income_rates,
                              'res_price' : res_prices,
                              'res_ind' : res_inds
                             })

    #макс выигрыш и потери
    if percent_first < percent_second:  
        df_result["income_rate"] = np.maximum(df_result["income_rate"], percent_first) #макс потери
        df_result["income_rate"] = np.minimum(df_result["income_rate"], percent_second) #макс выигрыш
    else:
        df_result["income_rate"] = np.maximum(df_result["income_rate"], 2-percent_first) #макс потери
        df_result["income_rate"] = np.minimum(df_result["income_rate"], 2-percent_second) #макс выигрыш
    
    df_result['income_rate'] -= 0.001
    
    return df_result    

In [25]:
inds = np.arange(df_15min.shape[0])
inds.shape

(2362117,)

In [27]:
df_result = get_df_target(df_15min, inds, percent_first=1.005, percent_second=0.985, points_to_wait=4*4*1)

100%|█████████████████████████████████████████████████████████████| 2362117/2362117 [09:21<00:00, 4203.61it/s]


In [28]:
df_result

Unnamed: 0,ind,time,close,result,ticker,delta_time,income_rate,res_price,res_ind
0,0,2022-05-04 10:00:00,31.52,LOSE,AFLT,0 days 00:15:00,0.994000,31.92,1
1,1,2022-05-04 10:15:00,31.92,LOSE,AFLT,0 days 00:15:00,0.994000,32.40,2
2,2,2022-05-04 10:30:00,32.40,WIN,AFLT,0 days 00:45:00,1.014000,31.30,5
3,3,2022-05-04 10:45:00,32.10,WIN,AFLT,0 days 00:30:00,1.014000,31.30,5
4,4,2022-05-04 11:00:00,31.94,WIN,AFLT,0 days 00:15:00,1.014000,31.30,5
...,...,...,...,...,...,...,...,...,...
2362112,2362112,2025-02-17 22:45:00,396.18,DNF,BSPB,0 days 01:00:00,0.995517,397.56,2362116
2362113,2362113,2025-02-17 23:00:00,397.09,DNF,BSPB,0 days 00:45:00,0.997816,397.56,2362116
2362114,2362114,2025-02-17 23:15:00,397.52,DNF,BSPB,0 days 00:30:00,0.998899,397.56,2362116
2362115,2362115,2025-02-17 23:30:00,396.48,DNF,BSPB,0 days 00:15:00,0.996276,397.56,2362116


In [29]:
df_result['result'].value_counts(normalize=True)

result
DNF     0.487526
LOSE    0.407979
WIN     0.104495
Name: proportion, dtype: float64

In [31]:
df_result['income_rate'].quantile(q=[0, 0.01]+np.arange(0.1, 1, 0.1).tolist()+[0.99, 1])

0.00    0.994000
0.01    0.994000
0.10    0.994000
0.20    0.994000
0.30    0.994000
0.40    0.994000
0.50    0.997773
0.60    0.999747
0.70    1.001901
0.80    1.005115
0.90    1.014000
0.99    1.014000
1.00    1.014000
Name: income_rate, dtype: float64

In [33]:
#распределение времени в часах
(df_result['delta_time']//pd.Timedelta('1 hour')).quantile(q=[0, 0.01]+np.arange(0.1, 1, 0.1).tolist()+[0.99, 1])



0.00       0.0
0.01       0.0
0.10       0.0
0.20       1.0
0.30       2.0
0.40       4.0
0.50       4.0
0.60       4.0
0.70       4.0
0.80      13.0
0.90      16.0
0.99      65.0
1.00    6734.0
Name: delta_time, dtype: float64

In [34]:
#остановки в торгах
# mask = (df_result['delta_time']//pd.Timedelta('1 hour')) > 1000
# df_result.loc[mask]

In [37]:
(df_result['ind'] == df_result.index).all(), 

(True,)

In [41]:
!mkdir data/feat_engin

mkdir: data/feat_engin: File exists


In [42]:
!mkdir data/feat_engin/lgbm

mkdir: data/feat_engin/lgbm: File exists


In [43]:
dump_pkl(df_result, './data/feat_engin/lgbm/df_result_+0.5_-1.5_4hour.pkl')

#### 2.1.2 Загрузим датасет с таргетами

In [44]:
df_result = load_pkl('./data/feat_engin/lgbm/df_result_+0.5_-1.5_4hour.pkl')

In [45]:
df_result

Unnamed: 0,ind,time,close,result,ticker,delta_time,income_rate,res_price,res_ind
0,0,2022-05-04 10:00:00,31.52,LOSE,AFLT,0 days 00:15:00,0.994000,31.92,1
1,1,2022-05-04 10:15:00,31.92,LOSE,AFLT,0 days 00:15:00,0.994000,32.40,2
2,2,2022-05-04 10:30:00,32.40,WIN,AFLT,0 days 00:45:00,1.014000,31.30,5
3,3,2022-05-04 10:45:00,32.10,WIN,AFLT,0 days 00:30:00,1.014000,31.30,5
4,4,2022-05-04 11:00:00,31.94,WIN,AFLT,0 days 00:15:00,1.014000,31.30,5
...,...,...,...,...,...,...,...,...,...
2362112,2362112,2025-02-17 22:45:00,396.18,DNF,BSPB,0 days 01:00:00,0.995517,397.56,2362116
2362113,2362113,2025-02-17 23:00:00,397.09,DNF,BSPB,0 days 00:45:00,0.997816,397.56,2362116
2362114,2362114,2025-02-17 23:15:00,397.52,DNF,BSPB,0 days 00:30:00,0.998899,397.56,2362116
2362115,2362115,2025-02-17 23:30:00,396.48,DNF,BSPB,0 days 00:15:00,0.996276,397.56,2362116


In [46]:
df_result['result'].value_counts(normalize=True)

result
DNF     0.487526
LOSE    0.407979
WIN     0.104495
Name: proportion, dtype: float64

#### 2.2 Link data of different time-period

In [49]:
dfs_15min = []
dfs_1hour = []
dfs_1day = []

stocks = [elem.split('.')[0] for elem in os.listdir('./data/preproc/1hour/')]

for stock in tqdm(stocks):
    df_15min = load_pkl(f"./data/preproc/15min/{stock}.pkl")
    df_1hour = load_pkl(f"./data/preproc/1hour/{stock}.pkl")
    df_1day = load_pkl(f"./data/preproc/1day/{stock}.pkl")


    
    #time_index 1hour
    df_1hour['date_hour_index'] = df_1hour['time'].dt.floor('h')
    
    #time_index 1day
    df_1day['date_index'] = df_1day['time'].dt.date


    
    #time_index 15min
    df_15min['date_hour'] = df_15min['time'].dt.floor('h')
    df_15min['date'] = df_15min['time'].dt.date

    #date_hour (1hour)
    df_join_date_hour = pd.DataFrame({'date_hour' : df_15min['time'].dt.floor('h').drop_duplicates(keep='first')})
    df_join_date_hour['date_hour_index'] = df_join_date_hour['date_hour'].shift(1)
    df_15min = df_15min.merge(df_join_date_hour, how='left', on='date_hour')

    #date (1day)
    df_join_date = pd.DataFrame({'date' : df_15min['time'].dt.date.drop_duplicates(keep='first')})
    df_join_date['date_index'] = df_join_date['date'].shift(1)
    df_15min = df_15min.merge(df_join_date, how='left', on='date')



    
    dfs_15min += [df_15min.copy()]
    dfs_1hour += [df_1hour.copy()]
    dfs_1day += [df_1day.copy()]
    

df_15min = pd.concat(dfs_15min)
df_1day = pd.concat(dfs_1day)
df_1hour = pd.concat(dfs_1hour)

df_15min.reset_index(drop=True, inplace=True)
df_1hour.reset_index(drop=True, inplace=True)
df_1day.reset_index(drop=True, inplace=True)


#порпавим индексы в 10:45 (начало свечи) - у же известна инфа о окончании часовой счеви 10:00
mask = df_15min['time'].dt.minute == 45
df_15min.loc[mask, 'date_hour_index'] = df_15min.loc[mask, 'time'].dt.floor('h')

#аналогично с концом дня
mask = (df_15min['time'].dt.hour == 23) & (df_15min['time'].dt.minute == 45)
df_15min.loc[mask, 'date_index'] = df_15min.loc[mask, 'time'].dt.date
#спец. корректировка для 18:45 (акции без вечерних торгов)
mask = (df_15min['time'].dt.hour == 18) & (df_15min['time'].dt.minute == 45) & (df_15min['time'].dt.day.diff(-1) != 0)
df_15min.loc[mask, 'date_index'] = df_15min.loc[mask, 'time'].dt.date

df_15min.shape, df_1hour.shape, df_1day.shape

100%|█████████████████████████████████████████████████████████████████████████| 67/67 [00:01<00:00, 65.84it/s]


((2362117, 9), (594232, 6), (46870, 6))

In [52]:
df_15min.head()

Unnamed: 0,time,close,volume,ticker,tmos_close,date_hour,date,date_hour_index,date_index
0,2022-05-04 10:00:00,31.52,165423.0,AFLT,4.3,2022-05-04 10:00:00,2022-05-04,NaT,
1,2022-05-04 10:15:00,31.92,99825.0,AFLT,4.32,2022-05-04 10:00:00,2022-05-04,NaT,
2,2022-05-04 10:30:00,32.4,171391.0,AFLT,4.29,2022-05-04 10:00:00,2022-05-04,NaT,
3,2022-05-04 10:45:00,32.1,66741.0,AFLT,4.27,2022-05-04 10:00:00,2022-05-04,2022-05-04 10:00:00,
4,2022-05-04 11:00:00,31.94,42552.0,AFLT,4.25,2022-05-04 11:00:00,2022-05-04,2022-05-04 10:00:00,


In [53]:
df_1hour.head()

Unnamed: 0,time,close,volume,ticker,tmos_close,date_hour_index
0,2022-05-04 10:00:00,32.1,503380.0,AFLT,4.27,2022-05-04 10:00:00
1,2022-05-04 11:00:00,31.38,191664.0,AFLT,4.2,2022-05-04 11:00:00
2,2022-05-04 12:00:00,31.8,89256.0,AFLT,4.22,2022-05-04 12:00:00
3,2022-05-04 13:00:00,31.36,87929.0,AFLT,4.21,2022-05-04 13:00:00
4,2022-05-04 14:00:00,31.54,69671.0,AFLT,4.21,2022-05-04 14:00:00


In [54]:
df_1day.head()

Unnamed: 0,time,close,volume,ticker,tmos_close,date_index
0,2022-05-04 03:00:00,31.2,1138130.0,AFLT,4.18,2022-05-04
1,2022-05-05 03:00:00,30.7,664101.0,AFLT,4.22,2022-05-05
2,2022-05-06 03:00:00,30.1,306164.0,AFLT,4.19,2022-05-06
3,2022-05-11 03:00:00,29.92,383481.0,AFLT,4.22,2022-05-11
4,2022-05-12 03:00:00,29.32,315954.0,AFLT,4.06,2022-05-12


In [55]:
#link different time-step date

df_1hour.reset_index(inplace=True)
df_1hour = df_1hour.rename(columns={col : col+'_1hour' for col in df_1hour.columns if col not in ['date_hour_index', 'ticker']})
df = df_15min.merge(df_1hour, on=['date_hour_index', 'ticker'], how='left')

df_1day.reset_index(inplace=True)
df_1day = df_1day.rename(columns={col : col+'_1day' for col in df_1day.columns if col not in ['date_index', 'ticker']})
df = df.merge(df_1day, on=['date_index', 'ticker'], how='left')

#Проверка что не наджоинилось лишнего
assert df_15min.shape[0] == df.shape[0], 'Error: with join dimensions'

In [56]:
df

Unnamed: 0,time,close,volume,ticker,tmos_close,date_hour,date,date_hour_index,date_index,index_1hour,time_1hour,close_1hour,volume_1hour,tmos_close_1hour,index_1day,time_1day,close_1day,volume_1day,tmos_close_1day
0,2022-05-04 10:00:00,31.52,165423.0,AFLT,4.30,2022-05-04 10:00:00,2022-05-04,NaT,,,NaT,,,,,NaT,,,
1,2022-05-04 10:15:00,31.92,99825.0,AFLT,4.32,2022-05-04 10:00:00,2022-05-04,NaT,,,NaT,,,,,NaT,,,
2,2022-05-04 10:30:00,32.40,171391.0,AFLT,4.29,2022-05-04 10:00:00,2022-05-04,NaT,,,NaT,,,,,NaT,,,
3,2022-05-04 10:45:00,32.10,66741.0,AFLT,4.27,2022-05-04 10:00:00,2022-05-04,2022-05-04 10:00:00,,0.0,2022-05-04 10:00:00,32.10,503380.0,4.27,,NaT,,,
4,2022-05-04 11:00:00,31.94,42552.0,AFLT,4.25,2022-05-04 11:00:00,2022-05-04,2022-05-04 10:00:00,,0.0,2022-05-04 10:00:00,32.10,503380.0,4.27,,NaT,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2362112,2025-02-17 22:45:00,396.18,3796.0,BSPB,7.30,2025-02-17 22:00:00,2025-02-17,2025-02-17 22:00:00,2025-02-14,594230.0,2025-02-17 22:00:00,396.18,10624.0,7.30,46868.0,2025-02-14 03:00:00,390.30,246080.0,7.02
2362113,2025-02-17 23:00:00,397.09,1388.0,BSPB,7.30,2025-02-17 23:00:00,2025-02-17,2025-02-17 22:00:00,2025-02-14,594230.0,2025-02-17 22:00:00,396.18,10624.0,7.30,46868.0,2025-02-14 03:00:00,390.30,246080.0,7.02
2362114,2025-02-17 23:15:00,397.52,779.0,BSPB,7.31,2025-02-17 23:00:00,2025-02-17,2025-02-17 22:00:00,2025-02-14,594230.0,2025-02-17 22:00:00,396.18,10624.0,7.30,46868.0,2025-02-14 03:00:00,390.30,246080.0,7.02
2362115,2025-02-17 23:30:00,396.48,2099.0,BSPB,7.33,2025-02-17 23:00:00,2025-02-17,2025-02-17 22:00:00,2025-02-14,594230.0,2025-02-17 22:00:00,396.18,10624.0,7.30,46868.0,2025-02-14 03:00:00,390.30,246080.0,7.02


In [57]:
#Проверка, что данные все данные подтянулись
print(f'Не подтянулсиь пар (ticker, time_1hour) к 15min (от 1hour): {df.loc[df['close_1hour'].isnull(), ['ticker', 'date_hour']].groupby(['ticker', 'date_hour']).count().shape[0]}  акций-часов')

#Проверка, что данные все данные подтянулись
print(f'Не подтянулсиь пар (ticker, time_1day) к 15min (от 1day): {df.loc[df['close_1day'].isnull(), ['ticker', 'date']].groupby(['ticker', 'date']).count().shape[0]}  акций-дней')



Не подтянулсиь пар (ticker, time_1hour) к 15min (от 1hour): 67  акций-часов
Не подтянулсиь пар (ticker, time_1day) к 15min (от 1day): 1748  акций-дней


In [58]:
df['close_1hour'].isnull().mean(), df['close_1day'].isnull().mean()

(8.466981102121529e-05, 0.022581438599358118)

In [60]:
df['ticker'].nunique()

67

In [61]:
1748 / 67 # в среднем на акцию пропущено дней

26.08955223880597

In [62]:
df.loc[df['close_1day'].isnull(), 'time'].dt.date.value_counts()

time
2025-01-08    3517
2025-01-03    3511
2024-11-05    3506
2025-01-02    3474
2024-05-10    3470
2024-05-02    3440
2024-06-13    3433
2024-03-11    3085
2024-02-26    3085
2024-01-03    2910
2023-06-13    2595
2023-03-09    2500
2023-05-02    2450
2023-05-10    2395
2023-02-24    2309
2023-01-03    2298
2022-05-04    2273
2022-11-07    1820
2022-06-14     525
2024-03-08      59
2024-02-23      58
2024-06-12      57
2024-12-31      55
2025-01-07      53
2024-11-04      48
2024-05-01      47
2023-06-12      46
2023-05-01      45
2023-03-08      44
2024-05-09      42
2023-05-09      42
2023-02-23      42
2023-01-02      41
2022-11-04      36
2022-12-14      15
2022-06-13      14
Name: count, dtype: int64

In [63]:
#праздники

In [67]:
#Это происходит из-за пропущенных дней в df_1day (праздников), поэтому протянем их ffil
for stock in tqdm(stocks):
    mask_stock = df['ticker'] == stock
    cols_ffil = ['index_1day', 'index_1hour']
    df.loc[mask_stock, cols_ffil] = df.loc[mask_stock, cols_ffil].ffill()

100%|█████████████████████████████████████████████████████████████████████████| 67/67 [00:03<00:00, 17.00it/s]


In [68]:
#Проверка, что данные все данные подтянулись
print(f'Не подтянулсиь пар (ticker, time_1hour) к 15min (от 1hour): {df.loc[df['index_1hour'].isnull(), ['ticker', 'date_hour']].groupby(['ticker', 'date_hour']).count().shape[0]}  акций-часов')

#Проверка, что данные все данные подтянулись
print(f'Не подтянулсиь пар (ticker, time_1day) к 15min (от 1day): {df.loc[df['index_1day'].isnull(), ['ticker', 'date']].groupby(['ticker', 'date']).count().shape[0]}  акций-дней')



Не подтянулсиь пар (ticker, time_1hour) к 15min (от 1hour): 67  акций-часов
Не подтянулсиь пар (ticker, time_1day) к 15min (от 1day): 67  акций-дней


In [69]:
df['ticker'].nunique()

67

In [71]:
#ручные проверки
df.iloc[-10035:]

Unnamed: 0,time,close,volume,ticker,tmos_close,date_hour,date,date_hour_index,date_index,index_1hour,time_1hour,close_1hour,volume_1hour,tmos_close_1hour,index_1day,time_1day,close_1day,volume_1day,tmos_close_1day
2352082,2024-06-07 23:30:00,371.35,2242.0,BSPB,6.62,2024-06-07 23:00:00,2024-06-07,2024-06-07 22:00:00,2024-06-06,591715.0,2024-06-07 22:00:00,369.90,2053.0,6.63,46693.0,2024-06-06 03:00:00,353.69,171632.0,6.59
2352083,2024-06-07 23:45:00,371.14,1253.0,BSPB,6.64,2024-06-07 23:00:00,2024-06-07,2024-06-07 23:00:00,2024-06-07,591716.0,2024-06-07 23:00:00,371.14,4951.0,6.64,46694.0,2024-06-07 03:00:00,371.14,285594.0,6.64
2352084,2024-06-10 10:00:00,374.55,35116.0,BSPB,6.65,2024-06-10 10:00:00,2024-06-10,2024-06-07 23:00:00,2024-06-07,591716.0,2024-06-07 23:00:00,371.14,4951.0,6.64,46694.0,2024-06-07 03:00:00,371.14,285594.0,6.64
2352085,2024-06-10 10:15:00,376.28,18947.0,BSPB,6.67,2024-06-10 10:00:00,2024-06-10,2024-06-07 23:00:00,2024-06-07,591716.0,2024-06-07 23:00:00,371.14,4951.0,6.64,46694.0,2024-06-07 03:00:00,371.14,285594.0,6.64
2352086,2024-06-10 10:30:00,375.89,7599.0,BSPB,6.66,2024-06-10 10:00:00,2024-06-10,2024-06-07 23:00:00,2024-06-07,591716.0,2024-06-07 23:00:00,371.14,4951.0,6.64,46694.0,2024-06-07 03:00:00,371.14,285594.0,6.64
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2362112,2025-02-17 22:45:00,396.18,3796.0,BSPB,7.30,2025-02-17 22:00:00,2025-02-17,2025-02-17 22:00:00,2025-02-14,594230.0,2025-02-17 22:00:00,396.18,10624.0,7.30,46868.0,2025-02-14 03:00:00,390.30,246080.0,7.02
2362113,2025-02-17 23:00:00,397.09,1388.0,BSPB,7.30,2025-02-17 23:00:00,2025-02-17,2025-02-17 22:00:00,2025-02-14,594230.0,2025-02-17 22:00:00,396.18,10624.0,7.30,46868.0,2025-02-14 03:00:00,390.30,246080.0,7.02
2362114,2025-02-17 23:15:00,397.52,779.0,BSPB,7.31,2025-02-17 23:00:00,2025-02-17,2025-02-17 22:00:00,2025-02-14,594230.0,2025-02-17 22:00:00,396.18,10624.0,7.30,46868.0,2025-02-14 03:00:00,390.30,246080.0,7.02
2362115,2025-02-17 23:30:00,396.48,2099.0,BSPB,7.33,2025-02-17 23:00:00,2025-02-17,2025-02-17 22:00:00,2025-02-14,594230.0,2025-02-17 22:00:00,396.18,10624.0,7.30,46868.0,2025-02-14 03:00:00,390.30,246080.0,7.02


In [73]:
df.columns

Index(['time', 'close', 'volume', 'ticker', 'tmos_close', 'date_hour', 'date',
       'date_hour_index', 'date_index', 'index_1hour', 'time_1hour',
       'close_1hour', 'volume_1hour', 'tmos_close_1hour', 'index_1day',
       'time_1day', 'close_1day', 'volume_1day', 'tmos_close_1day'],
      dtype='object')

In [74]:
df = df[['time', 'close', 'volume', 'ticker', 'tmos_close', 'index_1hour', 'index_1day']]
df

Unnamed: 0,time,close,volume,ticker,tmos_close,index_1hour,index_1day
0,2022-05-04 10:00:00,31.52,165423.0,AFLT,4.30,,
1,2022-05-04 10:15:00,31.92,99825.0,AFLT,4.32,,
2,2022-05-04 10:30:00,32.40,171391.0,AFLT,4.29,,
3,2022-05-04 10:45:00,32.10,66741.0,AFLT,4.27,0.0,
4,2022-05-04 11:00:00,31.94,42552.0,AFLT,4.25,0.0,
...,...,...,...,...,...,...,...
2362112,2025-02-17 22:45:00,396.18,3796.0,BSPB,7.30,594230.0,46868.0
2362113,2025-02-17 23:00:00,397.09,1388.0,BSPB,7.30,594230.0,46868.0
2362114,2025-02-17 23:15:00,397.52,779.0,BSPB,7.31,594230.0,46868.0
2362115,2025-02-17 23:30:00,396.48,2099.0,BSPB,7.33,594230.0,46868.0


#### 2.4 Union target and features. Make data_file to train

In [75]:
(df_result['time'] == df['time']).all(), (df_result['close'] == df['close']).all()

(True, True)

In [76]:
(df.index.values == df_result.index.values).all()

True

In [77]:
df_result.head()

Unnamed: 0,ind,time,close,result,ticker,delta_time,income_rate,res_price,res_ind
0,0,2022-05-04 10:00:00,31.52,LOSE,AFLT,0 days 00:15:00,0.994,31.92,1
1,1,2022-05-04 10:15:00,31.92,LOSE,AFLT,0 days 00:15:00,0.994,32.4,2
2,2,2022-05-04 10:30:00,32.4,WIN,AFLT,0 days 00:45:00,1.014,31.3,5
3,3,2022-05-04 10:45:00,32.1,WIN,AFLT,0 days 00:30:00,1.014,31.3,5
4,4,2022-05-04 11:00:00,31.94,WIN,AFLT,0 days 00:15:00,1.014,31.3,5


In [78]:
df_result.columns.tolist()

['ind',
 'time',
 'close',
 'result',
 'ticker',
 'delta_time',
 'income_rate',
 'res_price',
 'res_ind']

In [79]:
#union
df = pd.concat([df.reset_index(drop=True), df_result[['result', 'delta_time', 'income_rate', 'res_price', 'res_ind']].reset_index(drop=True)], axis=1)
df

Unnamed: 0,time,close,volume,ticker,tmos_close,index_1hour,index_1day,result,delta_time,income_rate,res_price,res_ind
0,2022-05-04 10:00:00,31.52,165423.0,AFLT,4.30,,,LOSE,0 days 00:15:00,0.994000,31.92,1
1,2022-05-04 10:15:00,31.92,99825.0,AFLT,4.32,,,LOSE,0 days 00:15:00,0.994000,32.40,2
2,2022-05-04 10:30:00,32.40,171391.0,AFLT,4.29,,,WIN,0 days 00:45:00,1.014000,31.30,5
3,2022-05-04 10:45:00,32.10,66741.0,AFLT,4.27,0.0,,WIN,0 days 00:30:00,1.014000,31.30,5
4,2022-05-04 11:00:00,31.94,42552.0,AFLT,4.25,0.0,,WIN,0 days 00:15:00,1.014000,31.30,5
...,...,...,...,...,...,...,...,...,...,...,...,...
2362112,2025-02-17 22:45:00,396.18,3796.0,BSPB,7.30,594230.0,46868.0,DNF,0 days 01:00:00,0.995517,397.56,2362116
2362113,2025-02-17 23:00:00,397.09,1388.0,BSPB,7.30,594230.0,46868.0,DNF,0 days 00:45:00,0.997816,397.56,2362116
2362114,2025-02-17 23:15:00,397.52,779.0,BSPB,7.31,594230.0,46868.0,DNF,0 days 00:30:00,0.998899,397.56,2362116
2362115,2025-02-17 23:30:00,396.48,2099.0,BSPB,7.33,594230.0,46868.0,DNF,0 days 00:15:00,0.996276,397.56,2362116


### 2.5 Feature engineering

In [120]:
df

Unnamed: 0,time,close,volume,ticker,tmos_close,index_1hour,index_1day,result,delta_time,income_rate,res_price,res_ind
0,2022-05-04 10:00:00,31.52,165423.0,AFLT,4.30,,,LOSE,0 days 00:15:00,0.994000,31.92,1
1,2022-05-04 10:15:00,31.92,99825.0,AFLT,4.32,,,LOSE,0 days 00:15:00,0.994000,32.40,2
2,2022-05-04 10:30:00,32.40,171391.0,AFLT,4.29,,,WIN,0 days 00:45:00,1.014000,31.30,5
3,2022-05-04 10:45:00,32.10,66741.0,AFLT,4.27,0.0,,WIN,0 days 00:30:00,1.014000,31.30,5
4,2022-05-04 11:00:00,31.94,42552.0,AFLT,4.25,0.0,,WIN,0 days 00:15:00,1.014000,31.30,5
...,...,...,...,...,...,...,...,...,...,...,...,...
2362112,2025-02-17 22:45:00,396.18,3796.0,BSPB,7.30,594230.0,46868.0,DNF,0 days 01:00:00,0.995517,397.56,2362116
2362113,2025-02-17 23:00:00,397.09,1388.0,BSPB,7.30,594230.0,46868.0,DNF,0 days 00:45:00,0.997816,397.56,2362116
2362114,2025-02-17 23:15:00,397.52,779.0,BSPB,7.31,594230.0,46868.0,DNF,0 days 00:30:00,0.998899,397.56,2362116
2362115,2025-02-17 23:30:00,396.48,2099.0,BSPB,7.33,594230.0,46868.0,DNF,0 days 00:15:00,0.996276,397.56,2362116


In [122]:
df_1hour.head()

Unnamed: 0,index_1hour,time_1hour,close_1hour,volume_1hour,ticker,tmos_close_1hour,date_hour_index
0,0,2022-05-04 10:00:00,32.1,503380.0,AFLT,4.27,2022-05-04 10:00:00
1,1,2022-05-04 11:00:00,31.38,191664.0,AFLT,4.2,2022-05-04 11:00:00
2,2,2022-05-04 12:00:00,31.8,89256.0,AFLT,4.22,2022-05-04 12:00:00
3,3,2022-05-04 13:00:00,31.36,87929.0,AFLT,4.21,2022-05-04 13:00:00
4,4,2022-05-04 14:00:00,31.54,69671.0,AFLT,4.21,2022-05-04 14:00:00


In [124]:
df_1day.head()

Unnamed: 0,index_1day,time_1day,close_1day,volume_1day,ticker,tmos_close_1day,date_index
0,0,2022-05-04 03:00:00,31.2,1138130.0,AFLT,4.18,2022-05-04
1,1,2022-05-05 03:00:00,30.7,664101.0,AFLT,4.22,2022-05-05
2,2,2022-05-06 03:00:00,30.1,306164.0,AFLT,4.19,2022-05-06
3,3,2022-05-11 03:00:00,29.92,383481.0,AFLT,4.22,2022-05-11
4,4,2022-05-12 03:00:00,29.32,315954.0,AFLT,4.06,2022-05-12


In [128]:
from sklearn.linear_model import LinearRegression

def calculate_exp_ma(data, window):
    return data.ewm(span=window, min_periods=window).mean().squeeze().values


def calculate_bollinger_bands(data, window):
    """Calculate Bollinger Bands"""
    rolling_mean = data.rolling(window=window, min_periods=1).mean().values
    rolling_std = data.rolling(window=window, min_periods=1).std().values

    norm_rolling_std = rolling_std/rolling_mean

    num_of_std = 2
    lower_band_2std = rolling_mean - (rolling_std * num_of_std)
    upper_band_2std = rolling_mean + (rolling_std * num_of_std)
    
    num_of_std = 3
    lower_band_3std = rolling_mean - (rolling_std * num_of_std)
    upper_band_3std = rolling_mean + (rolling_std * num_of_std)
    
    
    return rolling_mean, rolling_std, norm_rolling_std, lower_band_2std, upper_band_2std, lower_band_3std, upper_band_3std

def calculate_rsi(data, window):
    """Calculate Relative Strength Index"""
    delta = data.diff()
    gain = delta.clip(lower=0)
    loss = -delta.clip(upper=0)
    avg_gain = gain.rolling(window=window, min_periods=1).mean()
    avg_loss = loss.rolling(window=window, min_periods=1).mean()
    rs = avg_gain / avg_loss
    rsi = 100 - (100 / (1 + rs))

    mask = avg_loss == 0
    rsi[mask] = 100
    
    return rsi.values

def calculate_roc(data, periods):
    """Calculate Rate of Change."""
    roc = ((data - data.shift(periods)) / data.shift(periods))
    return roc.values



def calc_stats(data, window=None, feat_name=None):
    #mean, std
    rolling_mean, rolling_std, norm_rolling_std,\
    lower_band_2std, upper_band_2std, lower_band_3std, upper_band_3std = calculate_bollinger_bands(data, window)

    #mean_abs_pct
    mean_abs_pct = data.pct_change(periods=1).rolling(window=window, min_periods=1).apply(lambda x: x.abs().mean())
        
    #alpha
    alpha = data.rolling(window=window, min_periods=2).apply(lambda x: LinearRegression().fit(x.values.reshape(-1, 1), np.arange(x.shape[0])).coef_[0])

    #min, max
    rolling_min = data.rolling(window=window, min_periods=1).min().values
    rolling_max = data.rolling(window=window, min_periods=1).max().values
    
    #rsi
    rsi = calculate_rsi(data, window)
    
    #roc
    roc = calculate_roc(data, window)
    diff = data.diff(window).values

    #exp_ma
    exp_ma = calculate_exp_ma(data, window)
    
    df_features = pd.DataFrame({f'{feat_name}_ma' : rolling_mean,
                        f'{feat_name}_std' : rolling_std,
                        f'{feat_name}_norm_std' : norm_rolling_std,
                        f'{feat_name}_ma_low_2std' : lower_band_2std,
                        f'{feat_name}_ma_up_2std' : upper_band_2std,
                        f'{feat_name}_ma_low_3std' : lower_band_3std,
                        f'{feat_name}_ma_up_3std' : upper_band_3std, 

                        f'{feat_name}_mean_abs_pct' : mean_abs_pct,
                            
                        f'{feat_name}_alpha' : alpha,
                            
                        f'{feat_name}_min' : rolling_min,
                        f'{feat_name}_max' : rolling_max,
                        f'{feat_name}_rsi' : rsi,
                        f'{feat_name}_roc' : roc,
                        f'{feat_name}_diff' : diff,
                        f'{feat_name}_expma' : exp_ma,
                        })
    return df_features


def calc_stats_diff_1(data, feat_name=None):
    return pd.DataFrame({f'{feat_name}_roc' : data.pct_change(periods=1).values,
                        f'{feat_name}_diff' : data.diff(1).values,
                        })

def calc_levels(data, window=None, levels=None, feat_name=None):
    
    #уровни
    data_levels = []
    column_names = []
    for i in range(1, len(levels)):
        level_low = levels[i-1]
        level_high = levels[i]
        data_levels += [data.rolling(window=window, min_periods=1).apply(lambda x: (((1+level_low)*x.values[-1] < x.values) & (x.values <= (1+level_high)*x.values[-1])).sum()).values]
        data_levels += [data.rolling(window=window, min_periods=1).apply(lambda x: (((1-level_high)*x.values[-1] <= x.values) & (x.values < (1-level_low)*x.values[-1])).sum()).values]

        column_names += [f"{feat_name}_lvl_{1+level_low}-{1+level_high}"]
        column_names += [f"{feat_name}_lvl_-{1-level_high}-{1-level_low}"]
    df_levels = pd.DataFrame({column_names[i]:data_levels[i] for i in range(len(column_names))})
    return df_levels


In [133]:
def calculate_features(df_ticker, postfix=None):
    dfs = [df_ticker]
    
    levels =      [0, 0.005, 0.01, 0.02, 0.03, 0.04, 0.05, 0.07]
    levels_tmos = [0, 0.005, 0.01, 0.015, 0.02, 0.03, 0.04, 0.05]


    #w1
    df_close = calc_stats_diff_1(df_ticker[f'close{postfix}'], feat_name=f'close{postfix}_w1')
    df_volume = calc_stats_diff_1(df_ticker[f'volume{postfix}'], feat_name=f'volume{postfix}_w1')
    df_tmos_close = calc_stats_diff_1(df_ticker[f'tmos_close{postfix}'], feat_name=f'tmos_close{postfix}_w1')
    assert df_ticker.shape[0] == df_close.shape[0] == df_volume.shape[0] == df_tmos_close.shape[0], 'Error w1'
    dfs += [df_close.copy(), df_volume.copy(), df_tmos_close.copy()]

    #w5
    window = 5
    df_close = calc_stats(df_ticker[f'close{postfix}'], window=window, feat_name=f'close{postfix}_w{window}')
    df_volume = calc_stats(df_ticker[f'volume{postfix}'], window=window, feat_name=f'volume{postfix}_w{window}')
    df_tmos_close = calc_stats(df_ticker[f'tmos_close{postfix}'], window=window, feat_name=f'tmos_close{postfix}_w{window}')
    assert df_ticker.shape[0] == df_close.shape[0] == df_volume.shape[0] == df_tmos_close.shape[0], f'Error w{window}'
    dfs += [df_close.copy(), df_volume.copy(), df_tmos_close.copy()]
    
    #w10
    window = 10
    df_close = calc_stats(df_ticker[f'close{postfix}'], window=window, feat_name=f'close{postfix}_w{window}')
    df_volume = calc_stats(df_ticker[f'volume{postfix}'], window=window, feat_name=f'volume{postfix}_w{window}')
    df_tmos_close = calc_stats(df_ticker[f'tmos_close{postfix}'], window=window, feat_name=f'tmos_close{postfix}_w{window}')
    assert df_ticker.shape[0] == df_close.shape[0] == df_volume.shape[0] == df_tmos_close.shape[0], f'Error w{window}'
    dfs += [df_close.copy(), df_volume.copy(), df_tmos_close.copy()]
    
    #w20
    window = 20
    df_close = calc_stats(df_ticker[f'close{postfix}'], window=window, feat_name=f'close{postfix}_w{window}')
    df_volume = calc_stats(df_ticker[f'volume{postfix}'], window=window, feat_name=f'volume{postfix}_w{window}')
    df_tmos_close = calc_stats(df_ticker[f'tmos_close{postfix}'], window=window, feat_name=f'tmos_close{postfix}_w{window}')
    assert df_ticker.shape[0] == df_close.shape[0] == df_volume.shape[0] == df_tmos_close.shape[0], f'Error w{window}'
    dfs += [df_close.copy(), df_volume.copy(), df_tmos_close.copy()]
    
    #w30
    window = 30
    df_close = calc_stats(df_ticker[f'close{postfix}'], window=window, feat_name=f'close{postfix}_w{window}')
    #df_volume = calc_stats(df_ticker[f'volume{postfix}'], window=window, feat_name=f'volume{postfix}_w{window}')
    df_tmos_close = calc_stats(df_ticker[f'tmos_close{postfix}'], window=window, feat_name=f'tmos_close{postfix}_w{window}')
    df_close_levels = calc_levels(df_ticker[f'close{postfix}'], window=window, levels=levels, feat_name=f'close{postfix}_w{window}')
    df_tmos_close_levels = calc_levels(df_ticker[f'tmos_close{postfix}'], window=window, levels=levels_tmos, feat_name=f'tmos_close{postfix}_w{window}')
    assert df_ticker.shape[0] == df_close.shape[0] == df_tmos_close.shape[0] == df_close_levels.shape[0] == df_tmos_close_levels.shape[0], f'Error w{window}'
    dfs += [df_close.copy(), df_tmos_close.copy(), df_close_levels.copy(), df_tmos_close_levels.copy()]
    
    #w60
    window = 60
    df_close = calc_stats(df_ticker[f'close{postfix}'], window=window, feat_name=f'close{postfix}_w{window}')
    #df_volume = calc_stats(df_ticker[f'volume{postfix}'], window=window, feat_name=f'volume{postfix}_w{window}')
    df_tmos_close = calc_stats(df_ticker[f'tmos_close{postfix}'], window=window, feat_name=f'tmos_close{postfix}_w{window}')
    assert df_ticker.shape[0] == df_close.shape[0] == df_tmos_close.shape[0], f'Error w{window}'
    dfs += [df_close.copy(), df_tmos_close.copy()]
    
    #w120
    window = 120
    df_close = calc_stats(df_ticker[f'close{postfix}'], window=window, feat_name=f'close{postfix}_w{window}')
    #df_volume = calc_stats(df_ticker[f'volume{postfix}'], window=window, feat_name=f'volume{postfix}_w{window}')
    df_tmos_close = calc_stats(df_ticker[f'tmos_close{postfix}'], window=window, feat_name=f'tmos_close{postfix}_w{window}')
    df_close_levels = calc_levels(df_ticker[f'close{postfix}'], window=window, levels=levels, feat_name=f'close{postfix}_w{window}')
    df_tmos_close_levels = calc_levels(df_ticker[f'tmos_close{postfix}'], window=window, levels=levels_tmos, feat_name=f'tmos_close{postfix}_w{window}')
    assert df_ticker.shape[0] == df_close.shape[0] == df_tmos_close.shape[0] == df_close_levels.shape[0] == df_tmos_close_levels.shape[0], f'Error w{window}'
    dfs += [df_close.copy(), df_tmos_close.copy(), df_close_levels.copy(), df_tmos_close_levels.copy()]

   
    df = pd.concat(dfs, axis=1)
    assert (df_ticker.shape[0] == df.shape[0]) and (df.shape[1] == sum([elem.shape[1] for elem in dfs])), 'Error concat'

    return df


#### 15min

In [138]:
dfs = []
for ticker in tqdm(df['ticker'].unique()):
    mask = np.array(df['ticker'] == ticker)
    df_ticker = df.loc[mask].copy().reset_index()

    df_ticker_fe = calculate_features(df_ticker, postfix='')
    
    dfs += [df_ticker_fe.copy()]

df_fe = pd.concat(dfs).set_index('index')

100%|███████████████████████████████████████████████████████████████████████| 67/67 [1:37:13<00:00, 87.06s/it]


In [139]:
(df_fe['close'] == df['close']).all()

True

In [140]:
df_fe.shape

(2362117, 299)

#### 1hour

In [143]:
dfs = []
for ticker in tqdm(df_1hour['ticker'].unique()):
    mask = np.array(df_1hour['ticker'] == ticker)
    df_ticker = df_1hour.loc[mask].copy().reset_index()

    df_ticker_fe = calculate_features(df_ticker, postfix='_1hour')
    
    dfs += [df_ticker_fe.copy()]

df_1hour_fe = pd.concat(dfs).set_index('index')

100%|█████████████████████████████████████████████████████████████████████████| 67/67 [24:26<00:00, 21.89s/it]


In [144]:
(df_1hour_fe['close_1hour'] == df_1hour['close_1hour']).all(), (df_1hour_fe.index.values == df_1hour_fe['index_1hour'].values).all()

(True, True)

In [145]:
df_1hour_fe.shape

(594232, 294)

#### 1day

In [149]:
dfs = []
for ticker in tqdm(df_1day['ticker'].unique()):
    mask = np.array(df_1day['ticker'] == ticker)
    df_ticker = df_1day.loc[mask].copy().reset_index()

    df_ticker_fe = calculate_features(df_ticker, postfix='_1day')
    
    dfs += [df_ticker_fe.copy()]

df_1day_fe = pd.concat(dfs).set_index('index')

100%|█████████████████████████████████████████████████████████████████████████| 67/67 [01:54<00:00,  1.71s/it]


In [150]:
(df_1day_fe['close_1day'] == df_1day['close_1day']).all(), (df_1day_fe.index.values == df_1day_fe['index_1day'].values).all()

(True, True)

In [151]:
df_1day_fe.shape

(46870, 294)

In [154]:
dump_pkl(df_fe, './data/feat_engin/df_fe.pkl')

In [155]:
dump_pkl(df_1hour_fe, './data/feat_engin/df_1hour_fe.pkl')

In [156]:
dump_pkl(df_1day_fe, './data/feat_engin/df_1day_fe.pkl')

# Load data

In [3]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import random
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import os

from sklearn.metrics import roc_auc_score

import pickle
def dump_pkl(data, filename):
  with open(filename, 'wb') as handle:
    pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)

def load_pkl(filename):
  with open(filename, 'rb') as handle:
    data = pickle.load(handle)
  return data

In [5]:
df_fe = load_pkl('./data/feat_engin/df_fe.pkl')
df_1hour_fe = load_pkl('./data/feat_engin/df_1hour_fe.pkl')
df_1day_fe = load_pkl('./data/feat_engin/df_1day_fe.pkl')

df_fe.shape, df_1hour_fe.shape, df_1day_fe.shape, 

((2362117, 299), (594232, 294), (46870, 294))

### time features

In [10]:
#hour
df_fe['hour'] = df_fe['time'].dt.hour

#day
df_fe['day'] = df_fe['time'].dt.day

#day_of_week
df_fe['weekday'] = np.minimum(df_fe['time'].dt.dayofweek, 4) / 4

#month
#df_fe['month'] = df_fe['time'].dt.month


#hour
# time_cyclic = (df_fe['time'] - pd.to_datetime(df_fe['time'].dt.date) - pd.Timedelta('10:00:00')) / pd.Timedelta('13:00:00')
# df_fe['sin_time_hour'] = np.sin(time_cyclic * 2 * np.pi)
# df_fe['cos_time_hour'] = np.cos(time_cyclic * 2 * np.pi)

#day of week
# day_of_week_cyclic = np.minimum(df_fe['time'].dt.dayofweek, 4) / 4
# df_fe['sin_time_weekday'] = np.sin(day_of_week_cyclic * 2 * np.pi)
# df_fe['cos_time_weekday'] = np.cos(day_of_week_cyclic * 2 * np.pi)

#day of month
# day_of_month_cyclic = df_1day_fe['time'].dt.day / 30
# df_1day_fe['sin_time_monthday'] = np.sin(day_of_month_cyclic * 2 * np.pi)
# df_1day_fe['cos_time_monthday'] = np.cos(day_of_month_cyclic * 2 * np.pi)


In [13]:
df_fe.head()

Unnamed: 0_level_0,time,close,volume,ticker,tmos_close,index_1hour,index_1day,result,delta_time,income_rate,...,tmos_close_w120_lvl_-0.98-0.985,tmos_close_w120_lvl_1.02-1.03,tmos_close_w120_lvl_-0.97-0.98,tmos_close_w120_lvl_1.03-1.04,tmos_close_w120_lvl_-0.96-0.97,tmos_close_w120_lvl_1.04-1.05,tmos_close_w120_lvl_-0.95-0.96,hour,day,weekday
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,2022-05-04 10:00:00,31.52,165423.0,AFLT,4.3,,,LOSE,0 days 00:15:00,0.994,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10,4,0.5
1,2022-05-04 10:15:00,31.92,99825.0,AFLT,4.32,,,LOSE,0 days 00:15:00,0.994,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10,4,0.5
2,2022-05-04 10:30:00,32.4,171391.0,AFLT,4.29,,,WIN,0 days 00:45:00,1.014,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10,4,0.5
3,2022-05-04 10:45:00,32.1,66741.0,AFLT,4.27,0.0,,WIN,0 days 00:30:00,1.014,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10,4,0.5
4,2022-05-04 11:00:00,31.94,42552.0,AFLT,4.25,0.0,,WIN,0 days 00:15:00,1.014,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11,4,0.5


In [15]:
df_1hour_fe.head()

Unnamed: 0_level_0,index_1hour,time_1hour,close_1hour,volume_1hour,ticker,tmos_close_1hour,date_hour_index,close_1hour_w1_roc,close_1hour_w1_diff,volume_1hour_w1_roc,...,tmos_close_1hour_w120_lvl_1.01-1.015,tmos_close_1hour_w120_lvl_-0.985-0.99,tmos_close_1hour_w120_lvl_1.015-1.02,tmos_close_1hour_w120_lvl_-0.98-0.985,tmos_close_1hour_w120_lvl_1.02-1.03,tmos_close_1hour_w120_lvl_-0.97-0.98,tmos_close_1hour_w120_lvl_1.03-1.04,tmos_close_1hour_w120_lvl_-0.96-0.97,tmos_close_1hour_w120_lvl_1.04-1.05,tmos_close_1hour_w120_lvl_-0.95-0.96
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,2022-05-04 10:00:00,32.1,503380.0,AFLT,4.27,2022-05-04 10:00:00,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,2022-05-04 11:00:00,31.38,191664.0,AFLT,4.2,2022-05-04 11:00:00,-0.02243,-0.72,-0.619246,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,2022-05-04 12:00:00,31.8,89256.0,AFLT,4.22,2022-05-04 12:00:00,0.013384,0.42,-0.53431,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,2022-05-04 13:00:00,31.36,87929.0,AFLT,4.21,2022-05-04 13:00:00,-0.013836,-0.44,-0.014867,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,2022-05-04 14:00:00,31.54,69671.0,AFLT,4.21,2022-05-04 14:00:00,0.00574,0.18,-0.207645,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
df_1day_fe.head()

Unnamed: 0_level_0,index_1day,time_1day,close_1day,volume_1day,ticker,tmos_close_1day,date_index,close_1day_w1_roc,close_1day_w1_diff,volume_1day_w1_roc,...,tmos_close_1day_w120_lvl_1.01-1.015,tmos_close_1day_w120_lvl_-0.985-0.99,tmos_close_1day_w120_lvl_1.015-1.02,tmos_close_1day_w120_lvl_-0.98-0.985,tmos_close_1day_w120_lvl_1.02-1.03,tmos_close_1day_w120_lvl_-0.97-0.98,tmos_close_1day_w120_lvl_1.03-1.04,tmos_close_1day_w120_lvl_-0.96-0.97,tmos_close_1day_w120_lvl_1.04-1.05,tmos_close_1day_w120_lvl_-0.95-0.96
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,2022-05-04 03:00:00,31.2,1138130.0,AFLT,4.18,2022-05-04,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,2022-05-05 03:00:00,30.7,664101.0,AFLT,4.22,2022-05-05,-0.016026,-0.5,-0.416498,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,2022-05-06 03:00:00,30.1,306164.0,AFLT,4.19,2022-05-06,-0.019544,-0.6,-0.53898,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,2022-05-11 03:00:00,29.92,383481.0,AFLT,4.22,2022-05-11,-0.00598,-0.18,0.252535,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,2022-05-12 03:00:00,29.32,315954.0,AFLT,4.06,2022-05-12,-0.020053,-0.6,-0.17609,...,0.0,0.0,0.0,0.0,1.0,0.0,3.0,0.0,0.0,0.0


### resize memory

In [23]:
[elem for elem in df_fe.columns if 'ind' in elem]

['index_1hour', 'index_1day', 'res_ind']

In [25]:
cols = [elem for elem in df_fe.columns if not 'ind' in elem]

for col in tqdm(cols):
    try:
        df_fe[col] = df_fe[col].astype(np.float32)
    except:
        print(col)


  8%|█████▋                                                                 | 24/299 [00:00<00:01, 228.06it/s]

time
ticker
result
delta_time


100%|██████████████████████████████████████████████████████████████████████| 299/299 [00:01<00:00, 156.68it/s]


In [28]:
[elem for elem in df_1hour_fe.columns if 'ind' in elem]

['index_1hour', 'date_hour_index']

In [30]:
cols = [elem for elem in df_1hour_fe.columns if not 'ind' in elem]

for col in tqdm(cols):
    try:
        df_1hour_fe[col] = df_1hour_fe[col].astype(np.float32)
    except:
        print(col)

 21%|██████████████▌                                                        | 60/292 [00:00<00:00, 316.19it/s]

time_1hour
ticker


100%|██████████████████████████████████████████████████████████████████████| 292/292 [00:00<00:00, 451.29it/s]


In [33]:
[elem for elem in df_1day_fe.columns if 'ind' in elem]

['index_1day', 'date_index']

In [35]:
cols = [elem for elem in df_1day_fe.columns if not 'ind' in elem]

for col in tqdm(cols):
    try:
        df_1day_fe[col] = df_1day_fe[col].astype(np.float32)
    except:
        print(col)

100%|█████████████████████████████████████████████████████████████████████| 292/292 [00:00<00:00, 4519.71it/s]

time_1day
ticker





### Delete useless (dublicated) columns


In [39]:
del df_1hour_fe['date_hour_index']

In [41]:
del df_1day_fe['date_index']

### Absolute value columns

In [45]:
def flag_delete(col_name):
    if 'norm_std' in col_name:
        return False

    # if col_name in ['volume', 'tmos_close', 'close_1day', 'volume_1day', 'tmos_close_1day']: #'close', 
    #     return True
    
    for stop_word in ['ma', 'std', 'diff', 'min', 'max']:
        if stop_word in col_name:
            return True
    return False



In [47]:
cols_del_15min = [elem for elem in df_fe.columns if flag_delete(elem)]
len(cols_del_15min), cols_del_15min

(153,
 ['close_w1_diff',
  'volume_w1_diff',
  'tmos_close_w1_diff',
  'close_w5_ma',
  'close_w5_std',
  'close_w5_ma_low_2std',
  'close_w5_ma_up_2std',
  'close_w5_ma_low_3std',
  'close_w5_ma_up_3std',
  'close_w5_min',
  'close_w5_max',
  'close_w5_diff',
  'close_w5_expma',
  'volume_w5_ma',
  'volume_w5_std',
  'volume_w5_ma_low_2std',
  'volume_w5_ma_up_2std',
  'volume_w5_ma_low_3std',
  'volume_w5_ma_up_3std',
  'volume_w5_min',
  'volume_w5_max',
  'volume_w5_diff',
  'volume_w5_expma',
  'tmos_close_w5_ma',
  'tmos_close_w5_std',
  'tmos_close_w5_ma_low_2std',
  'tmos_close_w5_ma_up_2std',
  'tmos_close_w5_ma_low_3std',
  'tmos_close_w5_ma_up_3std',
  'tmos_close_w5_min',
  'tmos_close_w5_max',
  'tmos_close_w5_diff',
  'tmos_close_w5_expma',
  'close_w10_ma',
  'close_w10_std',
  'close_w10_ma_low_2std',
  'close_w10_ma_up_2std',
  'close_w10_ma_low_3std',
  'close_w10_ma_up_3std',
  'close_w10_min',
  'close_w10_max',
  'close_w10_diff',
  'close_w10_expma',
  'volume_w10

In [49]:
cols_del_1hour = [elem for elem in df_1hour_fe.columns if flag_delete(elem)]
len(cols_del_1hour), cols_del_1hour

(153,
 ['close_1hour_w1_diff',
  'volume_1hour_w1_diff',
  'tmos_close_1hour_w1_diff',
  'close_1hour_w5_ma',
  'close_1hour_w5_std',
  'close_1hour_w5_ma_low_2std',
  'close_1hour_w5_ma_up_2std',
  'close_1hour_w5_ma_low_3std',
  'close_1hour_w5_ma_up_3std',
  'close_1hour_w5_min',
  'close_1hour_w5_max',
  'close_1hour_w5_diff',
  'close_1hour_w5_expma',
  'volume_1hour_w5_ma',
  'volume_1hour_w5_std',
  'volume_1hour_w5_ma_low_2std',
  'volume_1hour_w5_ma_up_2std',
  'volume_1hour_w5_ma_low_3std',
  'volume_1hour_w5_ma_up_3std',
  'volume_1hour_w5_min',
  'volume_1hour_w5_max',
  'volume_1hour_w5_diff',
  'volume_1hour_w5_expma',
  'tmos_close_1hour_w5_ma',
  'tmos_close_1hour_w5_std',
  'tmos_close_1hour_w5_ma_low_2std',
  'tmos_close_1hour_w5_ma_up_2std',
  'tmos_close_1hour_w5_ma_low_3std',
  'tmos_close_1hour_w5_ma_up_3std',
  'tmos_close_1hour_w5_min',
  'tmos_close_1hour_w5_max',
  'tmos_close_1hour_w5_diff',
  'tmos_close_1hour_w5_expma',
  'close_1hour_w10_ma',
  'close_1hou

In [51]:
cols_del_1day = [elem for elem in df_1day_fe.columns if flag_delete(elem)]
len(cols_del_1day), cols_del_1day

(153,
 ['close_1day_w1_diff',
  'volume_1day_w1_diff',
  'tmos_close_1day_w1_diff',
  'close_1day_w5_ma',
  'close_1day_w5_std',
  'close_1day_w5_ma_low_2std',
  'close_1day_w5_ma_up_2std',
  'close_1day_w5_ma_low_3std',
  'close_1day_w5_ma_up_3std',
  'close_1day_w5_min',
  'close_1day_w5_max',
  'close_1day_w5_diff',
  'close_1day_w5_expma',
  'volume_1day_w5_ma',
  'volume_1day_w5_std',
  'volume_1day_w5_ma_low_2std',
  'volume_1day_w5_ma_up_2std',
  'volume_1day_w5_ma_low_3std',
  'volume_1day_w5_ma_up_3std',
  'volume_1day_w5_min',
  'volume_1day_w5_max',
  'volume_1day_w5_diff',
  'volume_1day_w5_expma',
  'tmos_close_1day_w5_ma',
  'tmos_close_1day_w5_std',
  'tmos_close_1day_w5_ma_low_2std',
  'tmos_close_1day_w5_ma_up_2std',
  'tmos_close_1day_w5_ma_low_3std',
  'tmos_close_1day_w5_ma_up_3std',
  'tmos_close_1day_w5_min',
  'tmos_close_1day_w5_max',
  'tmos_close_1day_w5_diff',
  'tmos_close_1day_w5_expma',
  'close_1day_w10_ma',
  'close_1day_w10_std',
  'close_1day_w10_ma_lo

## Concat

In [57]:
df_fe.columns.tolist()

['time',
 'close',
 'volume',
 'ticker',
 'tmos_close',
 'index_1hour',
 'index_1day',
 'result',
 'delta_time',
 'income_rate',
 'res_price',
 'res_ind',
 'close_w1_roc',
 'close_w1_diff',
 'volume_w1_roc',
 'volume_w1_diff',
 'tmos_close_w1_roc',
 'tmos_close_w1_diff',
 'close_w5_ma',
 'close_w5_std',
 'close_w5_norm_std',
 'close_w5_ma_low_2std',
 'close_w5_ma_up_2std',
 'close_w5_ma_low_3std',
 'close_w5_ma_up_3std',
 'close_w5_mean_abs_pct',
 'close_w5_alpha',
 'close_w5_min',
 'close_w5_max',
 'close_w5_rsi',
 'close_w5_roc',
 'close_w5_diff',
 'close_w5_expma',
 'volume_w5_ma',
 'volume_w5_std',
 'volume_w5_norm_std',
 'volume_w5_ma_low_2std',
 'volume_w5_ma_up_2std',
 'volume_w5_ma_low_3std',
 'volume_w5_ma_up_3std',
 'volume_w5_mean_abs_pct',
 'volume_w5_alpha',
 'volume_w5_min',
 'volume_w5_max',
 'volume_w5_rsi',
 'volume_w5_roc',
 'volume_w5_diff',
 'volume_w5_expma',
 'tmos_close_w5_ma',
 'tmos_close_w5_std',
 'tmos_close_w5_norm_std',
 'tmos_close_w5_ma_low_2std',
 'tmos_

In [59]:
df_1hour_fe.columns.tolist()

['index_1hour',
 'time_1hour',
 'close_1hour',
 'volume_1hour',
 'ticker',
 'tmos_close_1hour',
 'close_1hour_w1_roc',
 'close_1hour_w1_diff',
 'volume_1hour_w1_roc',
 'volume_1hour_w1_diff',
 'tmos_close_1hour_w1_roc',
 'tmos_close_1hour_w1_diff',
 'close_1hour_w5_ma',
 'close_1hour_w5_std',
 'close_1hour_w5_norm_std',
 'close_1hour_w5_ma_low_2std',
 'close_1hour_w5_ma_up_2std',
 'close_1hour_w5_ma_low_3std',
 'close_1hour_w5_ma_up_3std',
 'close_1hour_w5_mean_abs_pct',
 'close_1hour_w5_alpha',
 'close_1hour_w5_min',
 'close_1hour_w5_max',
 'close_1hour_w5_rsi',
 'close_1hour_w5_roc',
 'close_1hour_w5_diff',
 'close_1hour_w5_expma',
 'volume_1hour_w5_ma',
 'volume_1hour_w5_std',
 'volume_1hour_w5_norm_std',
 'volume_1hour_w5_ma_low_2std',
 'volume_1hour_w5_ma_up_2std',
 'volume_1hour_w5_ma_low_3std',
 'volume_1hour_w5_ma_up_3std',
 'volume_1hour_w5_mean_abs_pct',
 'volume_1hour_w5_alpha',
 'volume_1hour_w5_min',
 'volume_1hour_w5_max',
 'volume_1hour_w5_rsi',
 'volume_1hour_w5_roc',
 

In [61]:
df_1day_fe.columns.tolist()

['index_1day',
 'time_1day',
 'close_1day',
 'volume_1day',
 'ticker',
 'tmos_close_1day',
 'close_1day_w1_roc',
 'close_1day_w1_diff',
 'volume_1day_w1_roc',
 'volume_1day_w1_diff',
 'tmos_close_1day_w1_roc',
 'tmos_close_1day_w1_diff',
 'close_1day_w5_ma',
 'close_1day_w5_std',
 'close_1day_w5_norm_std',
 'close_1day_w5_ma_low_2std',
 'close_1day_w5_ma_up_2std',
 'close_1day_w5_ma_low_3std',
 'close_1day_w5_ma_up_3std',
 'close_1day_w5_mean_abs_pct',
 'close_1day_w5_alpha',
 'close_1day_w5_min',
 'close_1day_w5_max',
 'close_1day_w5_rsi',
 'close_1day_w5_roc',
 'close_1day_w5_diff',
 'close_1day_w5_expma',
 'volume_1day_w5_ma',
 'volume_1day_w5_std',
 'volume_1day_w5_norm_std',
 'volume_1day_w5_ma_low_2std',
 'volume_1day_w5_ma_up_2std',
 'volume_1day_w5_ma_low_3std',
 'volume_1day_w5_ma_up_3std',
 'volume_1day_w5_mean_abs_pct',
 'volume_1day_w5_alpha',
 'volume_1day_w5_min',
 'volume_1day_w5_max',
 'volume_1day_w5_rsi',
 'volume_1day_w5_roc',
 'volume_1day_w5_diff',
 'volume_1day_w5

In [64]:
df_fe.shape, df_1hour_fe.shape, df_1day_fe.shape

((2362117, 302), (594232, 293), (46870, 293))

In [66]:
df = df_fe.merge(df_1hour_fe, on=['index_1hour', 'ticker'], how='left')
df.shape

(2362117, 593)

In [67]:
df = df.merge(df_1day_fe, on=['index_1day', 'ticker'], how='left')
df.shape

(2362117, 884)

In [70]:
df

Unnamed: 0,time,close,volume,ticker,tmos_close,index_1hour,index_1day,result,delta_time,income_rate,...,tmos_close_1day_w120_lvl_1.01-1.015,tmos_close_1day_w120_lvl_-0.985-0.99,tmos_close_1day_w120_lvl_1.015-1.02,tmos_close_1day_w120_lvl_-0.98-0.985,tmos_close_1day_w120_lvl_1.02-1.03,tmos_close_1day_w120_lvl_-0.97-0.98,tmos_close_1day_w120_lvl_1.03-1.04,tmos_close_1day_w120_lvl_-0.96-0.97,tmos_close_1day_w120_lvl_1.04-1.05,tmos_close_1day_w120_lvl_-0.95-0.96
0,2022-05-04 10:00:00,31.520000,165423.0,AFLT,4.30,,,LOSE,0 days 00:15:00,0.994000,...,,,,,,,,,,
1,2022-05-04 10:15:00,31.920000,99825.0,AFLT,4.32,,,LOSE,0 days 00:15:00,0.994000,...,,,,,,,,,,
2,2022-05-04 10:30:00,32.400002,171391.0,AFLT,4.29,,,WIN,0 days 00:45:00,1.014000,...,,,,,,,,,,
3,2022-05-04 10:45:00,32.099998,66741.0,AFLT,4.27,0.0,,WIN,0 days 00:30:00,1.014000,...,,,,,,,,,,
4,2022-05-04 11:00:00,31.940001,42552.0,AFLT,4.25,0.0,,WIN,0 days 00:15:00,1.014000,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2362112,2025-02-17 22:45:00,396.179993,3796.0,BSPB,7.30,594230.0,46868.0,DNF,0 days 01:00:00,0.995517,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2362113,2025-02-17 23:00:00,397.089996,1388.0,BSPB,7.30,594230.0,46868.0,DNF,0 days 00:45:00,0.997816,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2362114,2025-02-17 23:15:00,397.519989,779.0,BSPB,7.31,594230.0,46868.0,DNF,0 days 00:30:00,0.998899,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2362115,2025-02-17 23:30:00,396.480011,2099.0,BSPB,7.33,594230.0,46868.0,DNF,0 days 00:15:00,0.996276,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [73]:
df.loc[df['index_1hour'].isnull(), 'time'].dt.date.unique()

array([datetime.date(2022, 5, 4), datetime.date(2022, 12, 14)],
      dtype=object)

In [75]:
df.loc[df['index_1day'].isnull(), 'time'].dt.date.unique()

array([datetime.date(2022, 5, 4), datetime.date(2022, 12, 14)],
      dtype=object)

In [77]:
#первый день торгов, там нет инфы по торгам предыдущего дня

In [79]:
mask = df['index_1hour'].isnull() & (df['time'].dt.date == datetime.date(2022, 12, 14))

df.loc[mask, 'ticker'].unique()

array(['WUSH'], dtype=object)

In [81]:
# старт торгов акций

### Relative features

In [85]:
groups_15min = [
        #windows: 
        #close: w1, w5, w10, w20, w30, w60, w120
        #volume: w1, w5, w10, w20
        #tmos_close: w1, w5, w10, w20, w30, w60, w120
         ['close_w1_roc', 'close_w5_alpha', 'close_w10_alpha', 'close_w20_alpha', 'close_w30_alpha', 'close_w60_alpha', 'close_w120_alpha'],
         ['volume_w1_roc', 'volume_w5_alpha', 'volume_w10_alpha', 'volume_w20_alpha'],
         ['tmos_close_w1_roc', 'tmos_close_w5_alpha', 'tmos_close_w10_alpha', 'tmos_close_w20_alpha', 'tmos_close_w30_alpha', 'tmos_close_w60_alpha', 'tmos_close_w120_alpha'],
    
    
         ['close_w1_roc', 'close_w5_roc', 'close_w10_roc', 'close_w20_roc', 'close_w30_roc', 'close_w60_roc', 'close_w120_roc'],
         ['volume_w1_roc', 'volume_w5_roc', 'volume_w10_roc', 'volume_w20_roc'],
         ['tmos_close_w1_roc', 'tmos_close_w5_roc', 'tmos_close_w10_roc', 'tmos_close_w20_roc', 'tmos_close_w30_roc', 'tmos_close_w60_roc', 'tmos_close_w120_roc'],

    
         ['close_w5_mean_abs_pct', 'close_w10_mean_abs_pct', 'close_w20_mean_abs_pct', 'close_w30_mean_abs_pct', 'close_w60_mean_abs_pct', 'close_w120_mean_abs_pct'],
         ['volume_w5_mean_abs_pct', 'volume_w10_mean_abs_pct', 'volume_w20_mean_abs_pct'],
         ['tmos_close_w5_mean_abs_pct', 'tmos_close_w10_mean_abs_pct', 'tmos_close_w20_mean_abs_pct', 'tmos_close_w30_mean_abs_pct', 'tmos_close_w60_mean_abs_pct', 'tmos_close_w120_mean_abs_pct'],


         ['close_w5_std', 'close_w10_std', 'close_w20_std', 'close_w30_std', 'close_w60_std', 'close_w120_std'],
         ['volume_w5_std', 'volume_w10_std', 'volume_w20_std'],
         ['tmos_close_w5_std', 'tmos_close_w10_std', 'tmos_close_w20_std', 'tmos_close_w30_std', 'tmos_close_w60_std', 'tmos_close_w120_std'],


         ['close_w5_norm_std', 'close_w10_norm_std', 'close_w20_norm_std', 'close_w30_norm_std', 'close_w60_norm_std', 'close_w120_norm_std'],
         ['volume_w5_norm_std', 'volume_w10_norm_std', 'volume_w20_norm_std'],
         ['tmos_close_w5_norm_std', 'tmos_close_w10_norm_std', 'tmos_close_w20_norm_std', 'tmos_close_w30_norm_std', 'tmos_close_w60_norm_std', 'tmos_close_w120_norm_std'],


         ['close_w5_rsi', 'close_w10_rsi', 'close_w20_rsi', 'close_w30_rsi', 'close_w60_rsi', 'close_w120_rsi'],
         ['volume_w5_rsi', 'volume_w10_rsi', 'volume_w20_rsi'],
         ['tmos_close_w5_rsi', 'tmos_close_w10_rsi', 'tmos_close_w20_rsi', 'tmos_close_w30_rsi', 'tmos_close_w60_rsi', 'tmos_close_w120_rsi'],


         ['close', 'close_w5_ma', 'close_w10_ma', 'close_w20_ma', 'close_w30_ma', 'close_w60_ma', 'close_w120_ma'],
         ['volume', 'volume_w5_ma', 'volume_w10_ma', 'volume_w20_ma'],
         ['tmos_close', 'tmos_close_w5_ma', 'tmos_close_w10_ma', 'tmos_close_w20_ma', 'tmos_close_w30_ma', 'tmos_close_w60_ma', 'tmos_close_w120_ma'],


         ['close', 'close_w5_expma', 'close_w10_expma', 'close_w20_expma', 'close_w30_expma', 'close_w60_expma', 'close_w120_expma'],
         ['volume', 'volume_w5_expma', 'volume_w10_expma', 'volume_w20_expma'],
         ['tmos_close', 'tmos_close_w5_expma', 'tmos_close_w10_expma', 'tmos_close_w20_expma', 'tmos_close_w30_expma', 'tmos_close_w60_expma', 'tmos_close_w120_expma'],


         {'close' : ['close_w5_min', 'close_w10_min', 'close_w20_min', 'close_w30_min', 'close_w60_min', 'close_w120_min']},
         {'volume' : ['volume_w5_min', 'volume_w10_min', 'volume_w20_min']},
         {'tmos_close' : ['tmos_close_w5_min', 'tmos_close_w10_min', 'tmos_close_w20_min', 'tmos_close_w30_min', 'tmos_close_w60_min', 'tmos_close_w120_min']},

    
         {'close' : ['close_w5_max', 'close_w10_max', 'close_w20_max', 'close_w30_max', 'close_w60_max', 'close_w120_max']},
         {'volume' : ['volume_w5_max', 'volume_w10_max', 'volume_w20_max']},
         {'tmos_close' : ['tmos_close_w5_max', 'tmos_close_w10_max', 'tmos_close_w20_max', 'tmos_close_w30_max', 'tmos_close_w60_max', 'tmos_close_w120_max']},



        #w5
         {'close' : [  'close_w5_ma_low_2std', 'close_w5_ma_up_2std', 'close_w5_ma_low_3std', 'close_w5_ma_up_3std']},
         {'volume' : [  'volume_w5_ma_low_2std', 'volume_w5_ma_up_2std', 'volume_w5_ma_low_3std', 'volume_w5_ma_up_3std']},
         {'tmos_close' : [  'tmos_close_w5_ma_low_2std', 'tmos_close_w5_ma_up_2std', 'tmos_close_w5_ma_low_3std', 'tmos_close_w5_ma_up_3std']},
        #w10
         {'close' : [  'close_w10_ma_low_2std', 'close_w10_ma_up_2std', 'close_w10_ma_low_3std', 'close_w10_ma_up_3std']},
         {'volume' : [  'volume_w10_ma_low_2std', 'volume_w10_ma_up_2std', 'volume_w10_ma_low_3std', 'volume_w10_ma_up_3std']},
         {'tmos_close' : [  'tmos_close_w10_ma_low_2std', 'tmos_close_w10_ma_up_2std', 'tmos_close_w10_ma_low_3std', 'tmos_close_w10_ma_up_3std']},
        #w20
         {'close' : [  'close_w20_ma_low_2std', 'close_w20_ma_up_2std', 'close_w20_ma_low_3std', 'close_w20_ma_up_3std']},
         {'volume' : [  'volume_w20_ma_low_2std', 'volume_w20_ma_up_2std', 'volume_w20_ma_low_3std', 'volume_w20_ma_up_3std']},
         {'tmos_close' : [  'tmos_close_w20_ma_low_2std', 'tmos_close_w20_ma_up_2std', 'tmos_close_w20_ma_low_3std', 'tmos_close_w20_ma_up_3std']},
        #w30
         {'close' : [  'close_w30_ma_low_2std', 'close_w30_ma_up_2std', 'close_w30_ma_low_3std', 'close_w30_ma_up_3std']},
         {'tmos_close' : [  'tmos_close_w30_ma_low_2std', 'tmos_close_w30_ma_up_2std', 'tmos_close_w30_ma_low_3std', 'tmos_close_w30_ma_up_3std']},
        #w60
         {'close' : [  'close_w60_ma_low_2std', 'close_w60_ma_up_2std', 'close_w60_ma_low_3std', 'close_w60_ma_up_3std']},
         {'tmos_close' : [  'tmos_close_w60_ma_low_2std', 'tmos_close_w60_ma_up_2std', 'tmos_close_w60_ma_low_3std', 'tmos_close_w60_ma_up_3std']},
        #w120
         {'close' : [  'close_w120_ma_low_2std', 'close_w120_ma_up_2std', 'close_w120_ma_low_3std', 'close_w120_ma_up_3std']},
         {'tmos_close' : [  'tmos_close_w120_ma_low_2std', 'tmos_close_w120_ma_up_2std', 'tmos_close_w120_ma_low_3std', 'tmos_close_w120_ma_up_3std']},


        #w5
        ['close_w5_min', 'close_w5_max'],
        ['volume_w5_min', 'volume_w5_max'],
        ['tmos_close_w5_min', 'tmos_close_w5_max'],
        #w10
        ['close_w10_min', 'close_w10_max'],
        ['volume_w10_min', 'volume_w10_max'],
        ['tmos_close_w10_min', 'tmos_close_w10_max'],
        #w20
        ['close_w20_min', 'close_w20_max'],
        ['volume_w20_min', 'volume_w20_max'],
        ['tmos_close_w20_min', 'tmos_close_w20_max'],
        #w30
        ['close_w30_min', 'close_w30_max'],
        ['tmos_close_w30_min', 'tmos_close_w30_max'],
        #w60
        ['close_w60_min', 'close_w60_max'],
        ['tmos_close_w60_min', 'tmos_close_w60_max'],
        #w120
        ['close_w120_min', 'close_w120_max'],
        ['tmos_close_w120_min', 'tmos_close_w120_max'],
]

In [87]:
groups_1hour = [
        #windows: 
        #close: w1, w5, w10, w20, w30, w60, w120
        #volume: w1, w5, w10, w20
        #tmos_close: w1, w5, w10, w20, w30, w60, w120
         ['close_1hour_w1_roc', 'close_1hour_w5_alpha', 'close_1hour_w10_alpha', 'close_1hour_w20_alpha', 'close_1hour_w30_alpha', 'close_1hour_w60_alpha', 'close_1hour_w120_alpha'],
         ['volume_1hour_w1_roc', 'volume_1hour_w5_alpha', 'volume_1hour_w10_alpha', 'volume_1hour_w20_alpha'],
         ['tmos_close_1hour_w1_roc', 'tmos_close_1hour_w5_alpha', 'tmos_close_1hour_w10_alpha', 'tmos_close_1hour_w20_alpha', 'tmos_close_1hour_w30_alpha', 'tmos_close_1hour_w60_alpha', 'tmos_close_1hour_w120_alpha'],
    
         ['close_1hour_w1_roc', 'close_1hour_w5_roc', 'close_1hour_w10_roc', 'close_1hour_w20_roc', 'close_1hour_w30_roc', 'close_1hour_w60_roc', 'close_1hour_w120_roc'],
         ['volume_1hour_w1_roc', 'volume_1hour_w5_roc', 'volume_1hour_w10_roc', 'volume_1hour_w20_roc'],
         ['tmos_close_1hour_w1_roc', 'tmos_close_1hour_w5_roc', 'tmos_close_1hour_w10_roc', 'tmos_close_1hour_w20_roc', 'tmos_close_1hour_w30_roc', 'tmos_close_1hour_w60_roc', 'tmos_close_1hour_w120_roc'],

         ['close_1hour_w5_mean_abs_pct', 'close_1hour_w10_mean_abs_pct', 'close_1hour_w20_mean_abs_pct', 'close_1hour_w30_mean_abs_pct', 'close_1hour_w60_mean_abs_pct', 'close_1hour_w120_mean_abs_pct'],
         ['volume_1hour_w5_mean_abs_pct', 'volume_1hour_w10_mean_abs_pct', 'volume_1hour_w20_mean_abs_pct'],
         ['tmos_close_1hour_w5_mean_abs_pct', 'tmos_close_1hour_w10_mean_abs_pct', 'tmos_close_1hour_w20_mean_abs_pct', 'tmos_close_1hour_w30_mean_abs_pct', 'tmos_close_1hour_w60_mean_abs_pct', 'tmos_close_1hour_w120_mean_abs_pct'],


         ['close_1hour_w5_std', 'close_1hour_w10_std', 'close_1hour_w20_std', 'close_1hour_w30_std', 'close_1hour_w60_std', 'close_1hour_w120_std'],
         ['volume_1hour_w5_std', 'volume_1hour_w10_std', 'volume_1hour_w20_std'],
         ['tmos_close_1hour_w5_std', 'tmos_close_1hour_w10_std', 'tmos_close_1hour_w20_std', 'tmos_close_1hour_w30_std', 'tmos_close_1hour_w60_std', 'tmos_close_1hour_w120_std'],


         ['close_1hour_w5_norm_std', 'close_1hour_w10_norm_std', 'close_1hour_w20_norm_std', 'close_1hour_w30_norm_std', 'close_1hour_w60_norm_std', 'close_1hour_w120_norm_std'],
         ['volume_1hour_w5_norm_std', 'volume_1hour_w10_norm_std', 'volume_1hour_w20_norm_std'],
         ['tmos_close_1hour_w5_norm_std', 'tmos_close_1hour_w10_norm_std', 'tmos_close_1hour_w20_norm_std', 'tmos_close_1hour_w30_norm_std', 'tmos_close_1hour_w60_norm_std', 'tmos_close_1hour_w120_norm_std'],


         ['close_1hour_w5_rsi', 'close_1hour_w10_rsi', 'close_1hour_w20_rsi', 'close_1hour_w30_rsi', 'close_1hour_w60_rsi', 'close_1hour_w120_rsi'],
         ['volume_1hour_w5_rsi', 'volume_1hour_w10_rsi', 'volume_1hour_w20_rsi'],
         ['tmos_close_1hour_w5_rsi', 'tmos_close_1hour_w10_rsi', 'tmos_close_1hour_w20_rsi', 'tmos_close_1hour_w30_rsi', 'tmos_close_1hour_w60_rsi', 'tmos_close_1hour_w120_rsi'],


         ['close', 'close_1hour_w5_ma', 'close_1hour_w10_ma', 'close_1hour_w20_ma', 'close_1hour_w30_ma', 'close_1hour_w60_ma', 'close_1hour_w120_ma'],
         ['volume', 'volume_1hour_w5_ma', 'volume_1hour_w10_ma', 'volume_1hour_w20_ma'],
         ['tmos_close', 'tmos_close_1hour_w5_ma', 'tmos_close_1hour_w10_ma', 'tmos_close_1hour_w20_ma', 'tmos_close_1hour_w30_ma', 'tmos_close_1hour_w60_ma', 'tmos_close_1hour_w120_ma'],


         ['close', 'close_1hour_w5_expma', 'close_1hour_w10_expma', 'close_1hour_w20_expma', 'close_1hour_w30_expma', 'close_1hour_w60_expma', 'close_1hour_w120_expma'],
         ['volume', 'volume_1hour_w5_expma', 'volume_1hour_w10_expma', 'volume_1hour_w20_expma'],
         ['tmos_close', 'tmos_close_1hour_w5_expma', 'tmos_close_1hour_w10_expma', 'tmos_close_1hour_w20_expma', 'tmos_close_1hour_w30_expma', 'tmos_close_1hour_w60_expma', 'tmos_close_1hour_w120_expma'],


         {'close' : ['close_1hour_w5_min', 'close_1hour_w10_min', 'close_1hour_w20_min', 'close_1hour_w30_min', 'close_1hour_w60_min', 'close_1hour_w120_min']},
         {'volume' : ['volume_1hour_w5_min', 'volume_1hour_w10_min', 'volume_1hour_w20_min']},
         {'tmos_close' : ['tmos_close_1hour_w5_min', 'tmos_close_1hour_w10_min', 'tmos_close_1hour_w20_min', 'tmos_close_1hour_w30_min', 'tmos_close_1hour_w60_min', 'tmos_close_1hour_w120_min']},

    
         {'close' : ['close_1hour_w5_max', 'close_1hour_w10_max', 'close_1hour_w20_max', 'close_1hour_w30_max', 'close_1hour_w60_max', 'close_1hour_w120_max']},
         {'volume' : ['volume_1hour_w5_max', 'volume_1hour_w10_max', 'volume_1hour_w20_max']},
         {'tmos_close' : ['tmos_close_1hour_w5_max', 'tmos_close_1hour_w10_max', 'tmos_close_1hour_w20_max', 'tmos_close_1hour_w30_max', 'tmos_close_1hour_w60_max', 'tmos_close_1hour_w120_max']},



        #w5
         {'close' : [  'close_1hour_w5_ma_low_2std', 'close_1hour_w5_ma_up_2std', 'close_1hour_w5_ma_low_3std', 'close_1hour_w5_ma_up_3std']},
         {'volume' : [  'volume_1hour_w5_ma_low_2std', 'volume_1hour_w5_ma_up_2std', 'volume_1hour_w5_ma_low_3std', 'volume_1hour_w5_ma_up_3std']},
         {'tmos_close' : [  'tmos_close_1hour_w5_ma_low_2std', 'tmos_close_1hour_w5_ma_up_2std', 'tmos_close_1hour_w5_ma_low_3std', 'tmos_close_1hour_w5_ma_up_3std']},
        #w10
         {'close' : [  'close_1hour_w10_ma_low_2std', 'close_1hour_w10_ma_up_2std', 'close_1hour_w10_ma_low_3std', 'close_1hour_w10_ma_up_3std']},
         {'volume' : [  'volume_1hour_w10_ma_low_2std', 'volume_1hour_w10_ma_up_2std', 'volume_1hour_w10_ma_low_3std', 'volume_1hour_w10_ma_up_3std']},
         {'tmos_close' : [  'tmos_close_1hour_w10_ma_low_2std', 'tmos_close_1hour_w10_ma_up_2std', 'tmos_close_1hour_w10_ma_low_3std', 'tmos_close_1hour_w10_ma_up_3std']},
        #w20
         {'close' : [  'close_1hour_w20_ma_low_2std', 'close_1hour_w20_ma_up_2std', 'close_1hour_w20_ma_low_3std', 'close_1hour_w20_ma_up_3std']},
         {'volume' : [  'volume_1hour_w20_ma_low_2std', 'volume_1hour_w20_ma_up_2std', 'volume_1hour_w20_ma_low_3std', 'volume_1hour_w20_ma_up_3std']},
         {'tmos_close' : [  'tmos_close_1hour_w20_ma_low_2std', 'tmos_close_1hour_w20_ma_up_2std', 'tmos_close_1hour_w20_ma_low_3std', 'tmos_close_1hour_w20_ma_up_3std']},
        #w30
         {'close' : [  'close_1hour_w30_ma_low_2std', 'close_1hour_w30_ma_up_2std', 'close_1hour_w30_ma_low_3std', 'close_1hour_w30_ma_up_3std']},
         {'tmos_close' : [  'tmos_close_1hour_w30_ma_low_2std', 'tmos_close_1hour_w30_ma_up_2std', 'tmos_close_1hour_w30_ma_low_3std', 'tmos_close_1hour_w30_ma_up_3std']},
        #w60
         {'close' : [  'close_1hour_w60_ma_low_2std', 'close_1hour_w60_ma_up_2std', 'close_1hour_w60_ma_low_3std', 'close_1hour_w60_ma_up_3std']},
         {'tmos_close' : [  'tmos_close_1hour_w60_ma_low_2std', 'tmos_close_1hour_w60_ma_up_2std', 'tmos_close_1hour_w60_ma_low_3std', 'tmos_close_1hour_w60_ma_up_3std']},
        #w120
         {'close' : [  'close_1hour_w120_ma_low_2std', 'close_1hour_w120_ma_up_2std', 'close_1hour_w120_ma_low_3std', 'close_1hour_w120_ma_up_3std']},
         {'tmos_close' : [  'tmos_close_1hour_w120_ma_low_2std', 'tmos_close_1hour_w120_ma_up_2std', 'tmos_close_1hour_w120_ma_low_3std', 'tmos_close_1hour_w120_ma_up_3std']},


        #w5
        ['close_1hour_w5_min', 'close_1hour_w5_max'],
        ['volume_1hour_w5_min', 'volume_1hour_w5_max'],
        ['tmos_close_1hour_w5_min', 'tmos_close_1hour_w5_max'],
        #w10
        ['close_1hour_w10_min', 'close_1hour_w10_max'],
        ['volume_1hour_w10_min', 'volume_1hour_w10_max'],
        ['tmos_close_1hour_w10_min', 'tmos_close_1hour_w10_max'],
        #w20
        ['close_1hour_w20_min', 'close_1hour_w20_max'],
        ['volume_1hour_w20_min', 'volume_1hour_w20_max'],
        ['tmos_close_1hour_w20_min', 'tmos_close_1hour_w20_max'],
        #w30
        ['close_1hour_w30_min', 'close_1hour_w30_max'],
        ['tmos_close_1hour_w30_min', 'tmos_close_1hour_w30_max'],
        #w60
        ['close_1hour_w60_min', 'close_1hour_w60_max'],
        ['tmos_close_1hour_w60_min', 'tmos_close_1hour_w60_max'],
        #w120
        ['close_1hour_w120_min', 'close_1hour_w120_max'],
        ['tmos_close_1hour_w120_min', 'tmos_close_1hour_w120_max'],
]

In [89]:
groups_1day= [
        #windows: 
        #close: w1, w5, w10, w20, w30, w60, w120
        #volume: w1, w5, w10, w20
        #tmos_close: w1, w5, w10, w20, w30, w60, w120
         ['close_1day_w1_roc', 'close_1day_w5_alpha', 'close_1day_w10_alpha', 'close_1day_w20_alpha', 'close_1day_w30_alpha', 'close_1day_w60_alpha', 'close_1day_w120_alpha'],
         ['volume_1day_w1_roc', 'volume_1day_w5_alpha', 'volume_1day_w10_alpha', 'volume_1day_w20_alpha'],
         ['tmos_close_1day_w1_roc', 'tmos_close_1day_w5_alpha', 'tmos_close_1day_w10_alpha', 'tmos_close_1day_w20_alpha', 'tmos_close_1day_w30_alpha', 'tmos_close_1day_w60_alpha', 'tmos_close_1day_w120_alpha'],
    
         ['close_1day_w1_roc', 'close_1day_w5_roc', 'close_1day_w10_roc', 'close_1day_w20_roc', 'close_1day_w30_roc', 'close_1day_w60_roc', 'close_1day_w120_roc'],
         ['volume_1day_w1_roc', 'volume_1day_w5_roc', 'volume_1day_w10_roc', 'volume_1day_w20_roc'],
         ['tmos_close_1day_w1_roc', 'tmos_close_1day_w5_roc', 'tmos_close_1day_w10_roc', 'tmos_close_1day_w20_roc', 'tmos_close_1day_w30_roc', 'tmos_close_1day_w60_roc', 'tmos_close_1day_w120_roc'],

         ['close_1day_w5_mean_abs_pct', 'close_1day_w10_mean_abs_pct', 'close_1day_w20_mean_abs_pct', 'close_1day_w30_mean_abs_pct', 'close_1day_w60_mean_abs_pct', 'close_1day_w120_mean_abs_pct'],
         ['volume_1day_w5_mean_abs_pct', 'volume_1day_w10_mean_abs_pct', 'volume_1day_w20_mean_abs_pct'],
         ['tmos_close_1day_w5_mean_abs_pct', 'tmos_close_1day_w10_mean_abs_pct', 'tmos_close_1day_w20_mean_abs_pct', 'tmos_close_1day_w30_mean_abs_pct', 'tmos_close_1day_w60_mean_abs_pct', 'tmos_close_1day_w120_mean_abs_pct'],


         ['close_1day_w5_std', 'close_1day_w10_std', 'close_1day_w20_std', 'close_1day_w30_std', 'close_1day_w60_std', 'close_1day_w120_std'],
         ['volume_1day_w5_std', 'volume_1day_w10_std', 'volume_1day_w20_std'],
         ['tmos_close_1day_w5_std', 'tmos_close_1day_w10_std', 'tmos_close_1day_w20_std', 'tmos_close_1day_w30_std', 'tmos_close_1day_w60_std', 'tmos_close_1day_w120_std'],


         ['close_1day_w5_norm_std', 'close_1day_w10_norm_std', 'close_1day_w20_norm_std', 'close_1day_w30_norm_std', 'close_1day_w60_norm_std', 'close_1day_w120_norm_std'],
         ['volume_1day_w5_norm_std', 'volume_1day_w10_norm_std', 'volume_1day_w20_norm_std'],
         ['tmos_close_1day_w5_norm_std', 'tmos_close_1day_w10_norm_std', 'tmos_close_1day_w20_norm_std', 'tmos_close_1day_w30_norm_std', 'tmos_close_1day_w60_norm_std', 'tmos_close_1day_w120_norm_std'],


         ['close_1day_w5_rsi', 'close_1day_w10_rsi', 'close_1day_w20_rsi', 'close_1day_w30_rsi', 'close_1day_w60_rsi', 'close_1day_w120_rsi'],
         ['volume_1day_w5_rsi', 'volume_1day_w10_rsi', 'volume_1day_w20_rsi'],
         ['tmos_close_1day_w5_rsi', 'tmos_close_1day_w10_rsi', 'tmos_close_1day_w20_rsi', 'tmos_close_1day_w30_rsi', 'tmos_close_1day_w60_rsi', 'tmos_close_1day_w120_rsi'],


         ['close', 'close_1day_w5_ma', 'close_1day_w10_ma', 'close_1day_w20_ma', 'close_1day_w30_ma', 'close_1day_w60_ma', 'close_1day_w120_ma'],
         ['volume', 'volume_1day_w5_ma', 'volume_1day_w10_ma', 'volume_1day_w20_ma'],
         ['tmos_close', 'tmos_close_1day_w5_ma', 'tmos_close_1day_w10_ma', 'tmos_close_1day_w20_ma', 'tmos_close_1day_w30_ma', 'tmos_close_1day_w60_ma', 'tmos_close_1day_w120_ma'],


         ['close', 'close_1day_w5_expma', 'close_1day_w10_expma', 'close_1day_w20_expma', 'close_1day_w30_expma', 'close_1day_w60_expma', 'close_1day_w120_expma'],
         ['volume', 'volume_1day_w5_expma', 'volume_1day_w10_expma', 'volume_1day_w20_expma'],
         ['tmos_close', 'tmos_close_1day_w5_expma', 'tmos_close_1day_w10_expma', 'tmos_close_1day_w20_expma', 'tmos_close_1day_w30_expma', 'tmos_close_1day_w60_expma', 'tmos_close_1day_w120_expma'],


         {'close' : ['close_1day_w5_min', 'close_1day_w10_min', 'close_1day_w20_min', 'close_1day_w30_min', 'close_1day_w60_min', 'close_1day_w120_min']},
         {'volume' : ['volume_1day_w5_min', 'volume_1day_w10_min', 'volume_1day_w20_min']},
         {'tmos_close' : ['tmos_close_1day_w5_min', 'tmos_close_1day_w10_min', 'tmos_close_1day_w20_min', 'tmos_close_1day_w30_min', 'tmos_close_1day_w60_min', 'tmos_close_1day_w120_min']},

    
         {'close' : ['close_1day_w5_max', 'close_1day_w10_max', 'close_1day_w20_max', 'close_1day_w30_max', 'close_1day_w60_max', 'close_1day_w120_max']},
         {'volume' : ['volume_1day_w5_max', 'volume_1day_w10_max', 'volume_1day_w20_max']},
         {'tmos_close' : ['tmos_close_1day_w5_max', 'tmos_close_1day_w10_max', 'tmos_close_1day_w20_max', 'tmos_close_1day_w30_max', 'tmos_close_1day_w60_max', 'tmos_close_1day_w120_max']},



        #w5
         {'close' : [  'close_1day_w5_ma_low_2std', 'close_1day_w5_ma_up_2std', 'close_1day_w5_ma_low_3std', 'close_1day_w5_ma_up_3std']},
         {'volume' : [  'volume_1day_w5_ma_low_2std', 'volume_1day_w5_ma_up_2std', 'volume_1day_w5_ma_low_3std', 'volume_1day_w5_ma_up_3std']},
         {'tmos_close' : [  'tmos_close_1day_w5_ma_low_2std', 'tmos_close_1day_w5_ma_up_2std', 'tmos_close_1day_w5_ma_low_3std', 'tmos_close_1day_w5_ma_up_3std']},
        #w10
         {'close' : [  'close_1day_w10_ma_low_2std', 'close_1day_w10_ma_up_2std', 'close_1day_w10_ma_low_3std', 'close_1day_w10_ma_up_3std']},
         {'volume' : [  'volume_1day_w10_ma_low_2std', 'volume_1day_w10_ma_up_2std', 'volume_1day_w10_ma_low_3std', 'volume_1day_w10_ma_up_3std']},
         {'tmos_close' : [  'tmos_close_1day_w10_ma_low_2std', 'tmos_close_1day_w10_ma_up_2std', 'tmos_close_1day_w10_ma_low_3std', 'tmos_close_1day_w10_ma_up_3std']},
        #w20
         {'close' : [  'close_1day_w20_ma_low_2std', 'close_1day_w20_ma_up_2std', 'close_1day_w20_ma_low_3std', 'close_1day_w20_ma_up_3std']},
         {'volume' : [  'volume_1day_w20_ma_low_2std', 'volume_1day_w20_ma_up_2std', 'volume_1day_w20_ma_low_3std', 'volume_1day_w20_ma_up_3std']},
         {'tmos_close' : [  'tmos_close_1day_w20_ma_low_2std', 'tmos_close_1day_w20_ma_up_2std', 'tmos_close_1day_w20_ma_low_3std', 'tmos_close_1day_w20_ma_up_3std']},
        #w30
         {'close' : [  'close_1day_w30_ma_low_2std', 'close_1day_w30_ma_up_2std', 'close_1day_w30_ma_low_3std', 'close_1day_w30_ma_up_3std']},
         {'tmos_close' : [  'tmos_close_1day_w30_ma_low_2std', 'tmos_close_1day_w30_ma_up_2std', 'tmos_close_1day_w30_ma_low_3std', 'tmos_close_1day_w30_ma_up_3std']},
        #w60
         {'close' : [  'close_1day_w60_ma_low_2std', 'close_1day_w60_ma_up_2std', 'close_1day_w60_ma_low_3std', 'close_1day_w60_ma_up_3std']},
         {'tmos_close' : [  'tmos_close_1day_w60_ma_low_2std', 'tmos_close_1day_w60_ma_up_2std', 'tmos_close_1day_w60_ma_low_3std', 'tmos_close_1day_w60_ma_up_3std']},
        #w120
         {'close' : [  'close_1day_w120_ma_low_2std', 'close_1day_w120_ma_up_2std', 'close_1day_w120_ma_low_3std', 'close_1day_w120_ma_up_3std']},
         {'tmos_close' : [  'tmos_close_1day_w120_ma_low_2std', 'tmos_close_1day_w120_ma_up_2std', 'tmos_close_1day_w120_ma_low_3std', 'tmos_close_1day_w120_ma_up_3std']},


        #w5
        ['close_1day_w5_min', 'close_1day_w5_max'],
        ['volume_1day_w5_min', 'volume_1day_w5_max'],
        ['tmos_close_1day_w5_min', 'tmos_close_1day_w5_max'],
        #w10
        ['close_1day_w10_min', 'close_1day_w10_max'],
        ['volume_1day_w10_min', 'volume_1day_w10_max'],
        ['tmos_close_1day_w10_min', 'tmos_close_1day_w10_max'],
        #w20
        ['close_1day_w20_min', 'close_1day_w20_max'],
        ['volume_1day_w20_min', 'volume_1day_w20_max'],
        ['tmos_close_1day_w20_min', 'tmos_close_1day_w20_max'],
        #w30
        ['close_1day_w30_min', 'close_1day_w30_max'],
        ['tmos_close_1day_w30_min', 'tmos_close_1day_w30_max'],
        #w60
        ['close_1day_w60_min', 'close_1day_w60_max'],
        ['tmos_close_1day_w60_min', 'tmos_close_1day_w60_max'],
        #w120
        ['close_1day_w120_min', 'close_1day_w120_max'],
        ['tmos_close_1day_w120_min', 'tmos_close_1day_w120_max'],
]

In [92]:
def uniq_pairs(cols):
    pairs = []
    for i in range(len(cols)-1):
        for j in range(i+1, len(cols)):
            pairs += [(cols[i], cols[j])]
    return pairs

def calc_relative_features(df, groups):
    for group in tqdm(groups):
        if type(group) == list:
            pairs = uniq_pairs(group)
            for pair in pairs:
                new_col = f'{pair[0]}/{pair[1]}'
                df[new_col] = df[pair[0]] / (df[pair[1]] + np.finfo(np.float32).eps)

        if type(group) == dict:
            pair1 = list(group.keys())[0]
            for pair0 in group[pair1]:
                new_col = f'{pair0}/{pair1}'
                df[new_col] = df[pair0] / (df[pair1] + np.finfo(np.float32).eps)

    return df

In [94]:
import warnings
warnings.filterwarnings('ignore')

In [97]:
print(df.shape)
calc_relative_features(df, groups_15min)
df.shape

(2362117, 884)


100%|█████████████████████████████████████████████████████████████████████████| 60/60 [00:03<00:00, 19.28it/s]


(2362117, 1313)

In [99]:
1313-884

429

In [101]:
print(df.shape)
calc_relative_features(df, groups_1hour)
df.shape

(2362117, 1313)


100%|█████████████████████████████████████████████████████████████████████████| 60/60 [00:04<00:00, 14.68it/s]


(2362117, 1742)

In [103]:
1742-1313

429

In [104]:
print(df.shape)
calc_relative_features(df, groups_1day)
df.shape

(2362117, 1742)


100%|█████████████████████████████████████████████████████████████████████████| 60/60 [00:03<00:00, 18.77it/s]


(2362117, 2171)

In [107]:
2171-1742

429

### Delete absolute valu columns

In [111]:
len(cols_del_15min), len(cols_del_1hour), len(cols_del_1day)

(153, 153, 153)

In [113]:
print(df.shape)
for col in cols_del_15min+cols_del_1hour+cols_del_1day:
    del df[col]
df.shape

(2362117, 2171)


(2362117, 1712)

## 2.6 Save data

In [120]:
!mkdir data/feat_engin/lgbm

mkdir: data/feat_engin/lgbm: File exists


In [122]:
#save
dump_pkl(df, 'data/feat_engin/lgbm/data_15min_1hour_1day.pkl')

In [123]:
df.shape

(2362117, 1712)

In [126]:
df.head()

Unnamed: 0,time,close,volume,ticker,tmos_close,index_1hour,index_1day,result,delta_time,income_rate,...,tmos_close_1day_w10_min/tmos_close_1day_w10_max,close_1day_w20_min/close_1day_w20_max,volume_1day_w20_min/volume_1day_w20_max,tmos_close_1day_w20_min/tmos_close_1day_w20_max,close_1day_w30_min/close_1day_w30_max,tmos_close_1day_w30_min/tmos_close_1day_w30_max,close_1day_w60_min/close_1day_w60_max,tmos_close_1day_w60_min/tmos_close_1day_w60_max,close_1day_w120_min/close_1day_w120_max,tmos_close_1day_w120_min/tmos_close_1day_w120_max
0,2022-05-04 10:00:00,31.52,165423.0,AFLT,4.3,,,LOSE,0 days 00:15:00,0.994,...,,,,,,,,,,
1,2022-05-04 10:15:00,31.92,99825.0,AFLT,4.32,,,LOSE,0 days 00:15:00,0.994,...,,,,,,,,,,
2,2022-05-04 10:30:00,32.400002,171391.0,AFLT,4.29,,,WIN,0 days 00:45:00,1.014,...,,,,,,,,,,
3,2022-05-04 10:45:00,32.099998,66741.0,AFLT,4.27,0.0,,WIN,0 days 00:30:00,1.014,...,,,,,,,,,,
4,2022-05-04 11:00:00,31.940001,42552.0,AFLT,4.25,0.0,,WIN,0 days 00:15:00,1.014,...,,,,,,,,,,


In [129]:
df.columns.tolist()

['time',
 'close',
 'volume',
 'ticker',
 'tmos_close',
 'index_1hour',
 'index_1day',
 'result',
 'delta_time',
 'income_rate',
 'res_price',
 'res_ind',
 'close_w1_roc',
 'volume_w1_roc',
 'tmos_close_w1_roc',
 'close_w5_norm_std',
 'close_w5_mean_abs_pct',
 'close_w5_alpha',
 'close_w5_rsi',
 'close_w5_roc',
 'volume_w5_norm_std',
 'volume_w5_mean_abs_pct',
 'volume_w5_alpha',
 'volume_w5_rsi',
 'volume_w5_roc',
 'tmos_close_w5_norm_std',
 'tmos_close_w5_mean_abs_pct',
 'tmos_close_w5_alpha',
 'tmos_close_w5_rsi',
 'tmos_close_w5_roc',
 'close_w10_norm_std',
 'close_w10_mean_abs_pct',
 'close_w10_alpha',
 'close_w10_rsi',
 'close_w10_roc',
 'volume_w10_norm_std',
 'volume_w10_mean_abs_pct',
 'volume_w10_alpha',
 'volume_w10_rsi',
 'volume_w10_roc',
 'tmos_close_w10_norm_std',
 'tmos_close_w10_mean_abs_pct',
 'tmos_close_w10_alpha',
 'tmos_close_w10_rsi',
 'tmos_close_w10_roc',
 'close_w20_norm_std',
 'close_w20_mean_abs_pct',
 'close_w20_alpha',
 'close_w20_rsi',
 'close_w20_roc',
 

In [133]:
no_features = [
 'time',
 'close',
 'volume',
 'tmos_close',

 'ticker',

 'index_1hour',
 'index_1day',
 'result',
 'delta_time',
 'income_rate',
 'res_price',
 'res_ind',

'time_1hour',
 'close_1hour',
 'volume_1hour',
 'tmos_close_1hour',

'time_1day',
 'close_1day',
 'volume_1day',
 'tmos_close_1day',]


### 2.7 Save data to NN

In [128]:
#don't use NN

In [131]:
# df_fe.head()

In [133]:
# df_1day_fe.head()

In [135]:
# df_fe.shape, df_1day_fe.shape

In [137]:
# dump_pkl(df_fe, 'data/feat_engin/lgbm/data_1hour.pkl')
# dump_pkl(df_1day_fe, 'data/feat_engin/lgbm/data_1day.pkl')

# TMP

In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import random
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import os

from sklearn.metrics import roc_auc_score

import pickle
def dump_pkl(data, filename):
  with open(filename, 'wb') as handle:
    pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)

def load_pkl(filename):
  with open(filename, 'rb') as handle:
    data = pickle.load(handle)
  return data

In [2]:
df = load_pkl('data/feat_engin/lgbm/data_15min_1hour_1day.pkl')

In [7]:
df

Unnamed: 0,time,close,volume,ticker,tmos_close,index_1hour,index_1day,result,delta_time,income_rate,...,tmos_close_1day_w10_min/tmos_close_1day_w10_max,close_1day_w20_min/close_1day_w20_max,volume_1day_w20_min/volume_1day_w20_max,tmos_close_1day_w20_min/tmos_close_1day_w20_max,close_1day_w30_min/close_1day_w30_max,tmos_close_1day_w30_min/tmos_close_1day_w30_max,close_1day_w60_min/close_1day_w60_max,tmos_close_1day_w60_min/tmos_close_1day_w60_max,close_1day_w120_min/close_1day_w120_max,tmos_close_1day_w120_min/tmos_close_1day_w120_max
0,2022-05-04 10:00:00,31.520000,165423.0,AFLT,4.30,,,LOSE,0 days 00:15:00,0.994000,...,,,,,,,,,,
1,2022-05-04 10:15:00,31.920000,99825.0,AFLT,4.32,,,LOSE,0 days 00:15:00,0.994000,...,,,,,,,,,,
2,2022-05-04 10:30:00,32.400002,171391.0,AFLT,4.29,,,WIN,0 days 00:30:00,1.009000,...,,,,,,,,,,
3,2022-05-04 10:45:00,32.099998,66741.0,AFLT,4.27,0.0,,WIN,0 days 00:30:00,1.009000,...,,,,,,,,,,
4,2022-05-04 11:00:00,31.940001,42552.0,AFLT,4.25,0.0,,WIN,0 days 00:15:00,1.009000,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2340434,2025-02-07 22:45:00,375.739990,82.0,BSPB,6.49,588791.0,46466.0,DNF,0 days 01:00:00,0.999106,...,0.970859,0.957469,0.186567,0.949386,0.869913,0.898773,0.777189,0.800613,0.750184,0.800613
2340435,2025-02-07 23:00:00,375.709991,109.0,BSPB,6.49,588791.0,46466.0,DNF,0 days 00:45:00,0.999027,...,0.970859,0.957469,0.186567,0.949386,0.869913,0.898773,0.777189,0.800613,0.750184,0.800613
2340436,2025-02-07 23:15:00,375.540009,497.0,BSPB,6.48,588791.0,46466.0,DNF,0 days 00:30:00,0.998574,...,0.970859,0.957469,0.186567,0.949386,0.869913,0.898773,0.777189,0.800613,0.750184,0.800613
2340437,2025-02-07 23:30:00,375.700012,61.0,BSPB,6.48,588791.0,46466.0,DNF,0 days 00:15:00,0.999000,...,0.970859,0.957469,0.186567,0.949386,0.869913,0.898773,0.777189,0.800613,0.750184,0.800613


#### uniq_1

In [135]:
no_analyze = [
 'time',
 # 'close',
 #'volume',
 #'tmos_close',

 'ticker',

 #'index_1hour',
 #'index_1day',
 'result',
 'delta_time',
 #'income_rate',
 #'res_price',
 #'res_ind',

 'time_1hour',
 #'close_1hour',
 #'volume_1hour',
 #'tmos_close_1hour',

 'time_1day',
 #'close_1day',
 #'volume_1day',
 #'tmos_close_1day',
]

In [138]:
# df_tmp = df[df.columns[~df.columns.isin(no_analyze)]].mean().reset_index()

df_tmp = []
for col in tqdm(df.columns[~df.columns.isin(no_analyze)]):
    df_tmp += [df[col].mean()]

df_tmp = pd.DataFrame({'index' : df.columns[~df.columns.isin(no_analyze)],
                      0 : df_tmp})

df_tmp

100%|█████████████████████████████████████████████████████████████████████| 1706/1706 [01:13<00:00, 23.06it/s]


Unnamed: 0,index,0
0,close,1411.895996
1,volume,32411.751953
2,tmos_close,5.565387
3,index_1hour,297417.044551
4,index_1day,23563.002936
...,...,...
1701,tmos_close_1day_w30_min/tmos_close_1day_w30_max,0.905194
1702,close_1day_w60_min/close_1day_w60_max,0.770984
1703,tmos_close_1day_w60_min/tmos_close_1day_w60_max,0.863854
1704,close_1day_w120_min/close_1day_w120_max,0.678099


In [139]:
df_tmp[df_tmp[0].isnull()]

Unnamed: 0,index,0


In [140]:
df_tmp.groupby(0).index.unique()[df_tmp.groupby(0).index.nunique()>1]

Series([], Name: index, dtype: object)

In [144]:
#гуд

#### uniq_2

In [148]:
#df_tmp = df[df.columns[~df.columns.isin(no_analyze)]].nunique().reset_index()

df_tmp = []
for col in tqdm(df.columns[~df.columns.isin(no_analyze)]):
    df_tmp += [df[col].nunique()]

df_tmp = pd.DataFrame({'index' : df.columns[~df.columns.isin(no_analyze)],
                      0 : df_tmp})

df_tmp

100%|█████████████████████████████████████████████████████████████████████| 1706/1706 [02:07<00:00, 13.37it/s]


Unnamed: 0,index,0
0,close,122551
1,volume,203435
2,tmos_close,390
3,index_1hour,594110
4,index_1day,46870
...,...,...
1701,tmos_close_1day_w30_min/tmos_close_1day_w30_max,367
1702,close_1day_w60_min/close_1day_w60_max,13408
1703,tmos_close_1day_w60_min/tmos_close_1day_w60_max,337
1704,close_1day_w120_min/close_1day_w120_max,10521


In [149]:
df_tmp[df_tmp[0].isnull()]

Unnamed: 0,index,0


In [150]:
pd.set_option('display.max_rows', 120)
df_tmp.groupby(0).index.unique()[df_tmp.groupby(0).index.nunique()>1].reset_index()

Unnamed: 0,0,index
0,10,"[tmos_close_1day_w30_lvl_1.01-1.015, tmos_clos..."
1,11,"[tmos_close_1day_w30_lvl_-0.995-1, tmos_close_..."
2,13,"[tmos_close_1day_w30_lvl_1-1.005, tmos_close_1..."
3,14,"[hour, close_1day_w30_lvl_1.005-1.01, close_1d..."
4,15,"[tmos_close_1day_w30_lvl_-0.96-0.97, tmos_clos..."
...,...,...
119,2354994,"[volume_1hour_w10_ma_up_2std/volume, volume_1h..."
120,2358164,"[close_w5_expma/close_w60_expma, close_w10_exp..."
121,2360842,"[close_w10_alpha/close_w20_alpha, close_w10_ex..."
122,2360844,"[volume_w5_expma/volume_w20_expma, volume_w10_..."


In [152]:
i = 43
df_tmp.groupby(0).index.unique()[df_tmp.groupby(0).index.nunique()>1].index[i], df_tmp.groupby(0).index.unique()[df_tmp.groupby(0).index.nunique()>1].iloc[i]

(1112,
 array(['tmos_close_1day_w30_mean_abs_pct',
        'tmos_close_1day_w10_ma/tmos_close_1day_w30_ma'], dtype=object))

In [50]:
np.sort(df['close_1day_w120_mean_abs_pct'].unique())

array([0.        , 0.0003125 , 0.00034014, ..., 0.18763515, 0.26517856,
              nan], dtype=float32)