In [7]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import random
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import os

from sklearn.metrics import roc_auc_score

import pickle
def dump_pkl(data, filename):
  with open(filename, 'wb') as handle:
    pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)

def load_pkl(filename):
  with open(filename, 'rb') as handle:
    data = pickle.load(handle)
  return data

In [8]:
stocks = ['ABIO',
 'AFKS',
 'AFLT',
 'ALRS',
 'APTK',
 'AQUA',
 'BANE',
 'BANEP',
 'BELU',
 'BSPB',
 'CBOM',
 'CHMF',
 'ENPG',
 'FEES',
 'FESH',
 'FLOT',
 'GAZP',
 'GMKN',
 'GTRK',
 'HEAD',
 'HYDR',
 'IRAO',
 'IRKT',
 'KMAZ',
 'LENT',
 'LIFE',
 'LKOH',
 'LSRG',
 'MAGN',
 'MDMG',
 'MGNT',
 'MOEX',
 'MTLR',
 'MTLRP',
 'MTSS',
 'MVID',
 'NLMK',
 'NMTP',
 'NVTK',
 'OGKB',
 'PHOR',
 'PIKK',
 'PLZL',
 'POSI',
 'RASP',
 'RENI',
 'RNFT',
 'ROSN',
 'RTKM',
 'RTKMP',
 'RUAL',
 'SBER',
 'SBERP',
 'SELG',
 'SFIN',
 'SGZH',
 'SIBN',
 'SMLT',
 'SNGS',
 'SNGSP',
 'SPBE',
 'SVAV',
 'T',
 'TATN',
 'TATNP',
 'TGKN',
 'TRMK',
 'TRNFP',
 'UNAC',
 'UPRO',
 'UWGN',
 'VKCO',
 'VSMO',
 'VTBR',
 'WUSH',
 'YDEX',


#'TMOS',
]

len(stocks)

76

### 1. Load data

In [13]:
dfs_5min = []
dfs_1hour = []
dfs_1day = []

#stocks = [elem.split('.')[0] for elem in os.listdir('./data/preproc/1hour/')]

for stock in tqdm(stocks):

    df_5min = load_pkl(f"./data/preproc/5min/{stock}.pkl")
    df_1hour = load_pkl(f"./data/preproc/1hour/{stock}.pkl")
    df_1day = load_pkl(f"./data/preproc/1day/{stock}.pkl")
    
    dfs_5min += [df_5min.copy()]
    dfs_1hour += [df_1hour.copy()]
    dfs_1day += [df_1day.copy()]
    



df_5min = pd.concat(dfs_5min)
df_1day = pd.concat(dfs_1day)
df_1hour = pd.concat(dfs_1hour)

df_5min.reset_index(drop=True, inplace=True)
df_1hour.reset_index(drop=True, inplace=True)
df_1day.reset_index(drop=True, inplace=True)

df_5min.shape, df_1hour.shape, df_1day.shape

100%|████████████████████████████████████████████████████████████████████████| 76/76 [00:00<00:00, 234.97it/s]


((7397129, 5), (693950, 5), (50955, 5))

In [15]:
df_5min.head()

Unnamed: 0,time,close,volume,ticker,tmos_close
0,2022-07-01 09:55:00,57.78,33.0,ABIO,4.0
1,2022-07-01 10:00:00,57.02,747.0,ABIO,3.87
2,2022-07-01 10:05:00,56.5,3630.0,ABIO,3.89
3,2022-07-01 10:10:00,56.22,1551.0,ABIO,3.91
4,2022-07-01 10:15:00,56.46,1938.0,ABIO,3.93


In [17]:
df_1hour.head()

Unnamed: 0,time,close,volume,ticker,tmos_close
0,2022-07-01 09:00:00,57.78,33.0,ABIO,4.0
1,2022-07-01 10:00:00,56.4,13083.0,ABIO,3.93
2,2022-07-01 11:00:00,56.36,6195.0,ABIO,3.92
3,2022-07-01 12:00:00,56.92,7632.0,ABIO,3.97
4,2022-07-01 13:00:00,56.18,8748.0,ABIO,3.99


In [19]:
df_1day.head()

Unnamed: 0,time,close,volume,ticker,tmos_close
0,2022-06-30 03:00:00,57.82,111300.0,ABIO,4.02
1,2022-07-01 03:00:00,56.28,48126.0,ABIO,3.96
2,2022-07-04 03:00:00,56.9,58944.0,ABIO,3.92
3,2022-07-05 03:00:00,56.38,39756.0,ABIO,3.97
4,2022-07-06 03:00:00,60.68,275700.0,ABIO,3.96


### 2. Preproc data

#### 2.1 Make target

In [61]:
def get_target(df_all, ind, val_first, val_second, points_to_wait):
    #цель - достичь val_second, не достигнув val_first

    ind_end = min(ind+points_to_wait+1, df_all.shape[0])
    
    df = df_all.iloc[ind:ind_end].copy()
    
    mask_stock = np.array(df['ticker'] == df['ticker'].iloc[0])
    df = df.loc[mask_stock, :]

    start_price = df['close'].iloc[0]
    
    if val_first < val_second:                                 
        mask_val_first = np.array(df['close'] < val_first)
        mask_val_second = np.array(df['close'] > val_second)
    else: #short game
        mask_val_first = np.array(df['close'] > val_first)
        mask_val_second = np.array(df['close'] < val_second)

    
    if (mask_val_first.sum() == 0) & (mask_val_second.sum() == 0):

        ind_end = df.shape[0]-1
        delta_time = df['time'].iloc[ind_end] - df['time'].iloc[0]
        res_price = df['close'].iloc[ind_end]
        if val_first < val_second:
            income_rate = res_price/start_price
        else: #short game
            income_rate = (2*start_price-res_price)/start_price
        return 'DNF', delta_time, income_rate, res_price, ind_end+ind
        
    if (mask_val_first.sum() == 0) & (mask_val_second.sum() != 0):
        ind_val_second = np.argwhere(mask_val_second).ravel()[0]
        delta_time = df['time'].iloc[ind_val_second] - df['time'].iloc[0]
        res_price = df['close'].iloc[ind_val_second]
        if val_first < val_second:
            income_rate = res_price/start_price
        else: #short game
            income_rate = (2*start_price-res_price)/start_price
        return 'WIN', delta_time, income_rate, res_price, ind_val_second+ind
        
    if (mask_val_first.sum() != 0) & (mask_val_second.sum() == 0):
        ind_val_first = np.argwhere(mask_val_first).ravel()[0]
        delta_time = df['time'].iloc[ind_val_first] - df['time'].iloc[0]
        res_price = df['close'].iloc[ind_val_first]
        if val_first < val_second:
            income_rate = res_price/start_price
        else: #short game
            income_rate = (2*start_price-res_price)/start_price
        return 'LOSE', delta_time, income_rate, res_price, ind_val_first+ind

    if (mask_val_first.sum() != 0) & (mask_val_second.sum() != 0):
        ind_val_first = np.argwhere(mask_val_first).ravel()[0]
        ind_val_second = np.argwhere(mask_val_second).ravel()[0]
        if ind_val_first < ind_val_second:
            delta_time = df['time'].iloc[ind_val_first] - df['time'].iloc[0]
            res_price = df['close'].iloc[ind_val_first]
            if val_first < val_second:
                income_rate = res_price/start_price
            else: #short game
                income_rate = (2*start_price-res_price)/start_price
            return 'LOSE', delta_time, income_rate, res_price, ind_val_first+ind
        if ind_val_first > ind_val_second:
            delta_time = df['time'].iloc[ind_val_second] - df['time'].iloc[0]
            res_price = df['close'].iloc[ind_val_second]
            if val_first < val_second:
                income_rate = res_price/start_price
            else:  #short game
                income_rate = (2*start_price-res_price)/start_price
            return 'WIN', delta_time, income_rate, res_price, ind_val_second+ind


def get_df_target(df, indx, percent_first=None, percent_second=None, points_to_wait=None):
    times = []
    results = []
    delta_times = []
    income_rates = []
    closes = []
    tickers = []
    res_prices = []
    res_inds = []
    
    #for ind in indx:
    for ind in tqdm(indx):
        time = df['time'].iloc[ind]
        close = df['close'].iloc[ind]
        ticker = df['ticker'].iloc[ind]
        
        val_first = df['close'].iloc[ind] * percent_first
        val_second = df['close'].iloc[ind] * percent_second
        result, delta_time, income_rate, res_price, res_ind = get_target(df, ind, val_first, val_second, points_to_wait)
        
        times += [time]
        closes += [close]
        tickers += [ticker]
        results += [result]
        delta_times += [delta_time]
        income_rates += [income_rate]
        res_prices += [res_price]
        res_inds += [res_ind]
        

    df_result = pd.DataFrame({'ind' : indx,
                              'time' : times,
                              'close' : closes,
                              'result' : results,
                              'ticker' : tickers, 
                              'delta_time' : delta_times,
                              'income_rate' : income_rates,
                              'res_price' : res_prices,
                              'res_ind' : res_inds
                             })

    #макс выигрыш и потери
    if percent_first < percent_second:  
        df_result["income_rate"] = np.maximum(df_result["income_rate"], percent_first) #макс потери
        df_result["income_rate"] = np.minimum(df_result["income_rate"], percent_second) #макс выигрыш
    else:
        df_result["income_rate"] = np.maximum(df_result["income_rate"], 2-percent_first) #макс потери
        df_result["income_rate"] = np.minimum(df_result["income_rate"], 2-percent_second) #макс выигрыш
    
    df_result['income_rate'] -= 0.001
    
    return df_result    

In [65]:
inds = np.arange(df_5min.shape[0])
inds.shape

(7397129,)

In [67]:
df_result = get_df_target(df_5min, inds, percent_first=1.005, percent_second=0.987, points_to_wait=4*12*1)

100%|█████████████████████████████████████████████████████████████| 7397129/7397129 [30:40<00:00, 4018.58it/s]


In [68]:
df_result

Unnamed: 0,ind,time,close,result,ticker,delta_time,income_rate,res_price,res_ind
0,0,2022-07-01 09:55:00,57.78,WIN,ABIO,0 days 00:05:00,1.012000,57.02,1
1,1,2022-07-01 10:00:00,57.02,WIN,ABIO,0 days 00:10:00,1.012000,56.22,3
2,2,2022-07-01 10:05:00,56.50,LOSE,ABIO,0 days 02:20:00,0.994000,57.00,30
3,3,2022-07-01 10:10:00,56.22,LOSE,ABIO,0 days 02:10:00,0.994000,56.74,29
4,4,2022-07-01 10:15:00,56.46,LOSE,ABIO,0 days 02:10:00,0.994000,57.00,30
...,...,...,...,...,...,...,...,...,...
7397124,7397124,2025-02-24 23:25:00,4668.50,DNF,YDEX,0 days 00:20:00,0.998464,4671.00,7397128
7397125,7397125,2025-02-24 23:30:00,4668.50,DNF,YDEX,0 days 00:15:00,0.998464,4671.00,7397128
7397126,7397126,2025-02-24 23:35:00,4669.00,DNF,YDEX,0 days 00:10:00,0.998572,4671.00,7397128
7397127,7397127,2025-02-24 23:40:00,4664.50,DNF,YDEX,0 days 00:05:00,0.997606,4671.00,7397128


In [69]:
df_result['result'].value_counts(normalize=True)

result
LOSE    0.458019
DNF     0.402689
WIN     0.139292
Name: proportion, dtype: float64

In [71]:
df_result['income_rate'].quantile(q=[0, 0.01]+np.arange(0.1, 1, 0.1).tolist()+[0.99, 1])

0.00    0.994000
0.01    0.994000
0.10    0.994000
0.20    0.994000
0.30    0.994000
0.40    0.994000
0.50    0.996819
0.60    0.999334
0.70    1.001665
0.80    1.005250
0.90    1.012000
0.99    1.012000
1.00    1.012000
Name: income_rate, dtype: float64

In [73]:
#распределение времени в часах
(df_result['delta_time']/pd.Timedelta('1 hour')).quantile(q=[0, 0.01]+np.arange(0.1, 1, 0.1).tolist()+[0.99, 1])



0.00       0.000000
0.01       0.083333
0.10       0.416667
0.20       1.000000
0.30       1.833333
0.40       3.250000
0.50       4.000000
0.60       4.000000
0.70       4.250000
0.80      12.833333
0.90      15.750000
0.99      64.750000
1.00    1146.333333
Name: delta_time, dtype: float64

In [75]:
(df_result['ind'] == df_result.index).all(), 

(True,)

In [79]:
!mkdir data/feat_engin

mkdir: data/feat_engin: File exists


In [80]:
!mkdir data/feat_engin/lgbm

mkdir: data/feat_engin/lgbm: File exists


In [81]:
dump_pkl(df_result, './data/feat_engin/lgbm/df_result_+0.5_-1.3_4hour.pkl')

#### 2.1.2 Загрузим датасет с таргетами

In [102]:
df_result = load_pkl('./data/feat_engin/lgbm/df_result_+0.5_-1.3_4hour.pkl')

In [104]:
df_result

Unnamed: 0,ind,time,close,result,ticker,delta_time,income_rate,res_price,res_ind
0,0,2022-07-01 09:55:00,57.78,WIN,ABIO,0 days 00:05:00,1.012000,57.02,1
1,1,2022-07-01 10:00:00,57.02,WIN,ABIO,0 days 00:10:00,1.012000,56.22,3
2,2,2022-07-01 10:05:00,56.50,LOSE,ABIO,0 days 02:20:00,0.994000,57.00,30
3,3,2022-07-01 10:10:00,56.22,LOSE,ABIO,0 days 02:10:00,0.994000,56.74,29
4,4,2022-07-01 10:15:00,56.46,LOSE,ABIO,0 days 02:10:00,0.994000,57.00,30
...,...,...,...,...,...,...,...,...,...
7397124,7397124,2025-02-24 23:25:00,4668.50,DNF,YDEX,0 days 00:20:00,0.998464,4671.00,7397128
7397125,7397125,2025-02-24 23:30:00,4668.50,DNF,YDEX,0 days 00:15:00,0.998464,4671.00,7397128
7397126,7397126,2025-02-24 23:35:00,4669.00,DNF,YDEX,0 days 00:10:00,0.998572,4671.00,7397128
7397127,7397127,2025-02-24 23:40:00,4664.50,DNF,YDEX,0 days 00:05:00,0.997606,4671.00,7397128


In [106]:
df_result['result'].value_counts(normalize=True)

result
LOSE    0.458019
DNF     0.402689
WIN     0.139292
Name: proportion, dtype: float64

#### 2.2 Link data of different time-period

In [203]:
dfs_5min = []
dfs_1hour = []
dfs_1day = []

#stocks = [elem.split('.')[0] for elem in os.listdir('./data/preproc/1hour/')]

for stock in tqdm(stocks):
    df_5min = load_pkl(f"./data/preproc/5min/{stock}.pkl")
    df_1hour = load_pkl(f"./data/preproc/1hour/{stock}.pkl")
    df_1day = load_pkl(f"./data/preproc/1day/{stock}.pkl")


    
    #time_index 1hour
    df_1hour['date_hour_index'] = df_1hour['time'].dt.floor('h')
    
    #time_index 1day
    df_1day['date_day_index'] = df_1day['time'].dt.date


    
    #time_index 5min
    df_5min['date_hour'] = df_5min['time'].dt.floor('h')
    df_5min['date'] = df_5min['time'].dt.date

    #date_hour (1hour)
    df_join_date_hour = pd.DataFrame({'date_hour' : df_5min['time'].dt.floor('h').drop_duplicates(keep='first')})
    df_join_date_hour['date_hour_index'] = df_join_date_hour['date_hour'].shift(1)
    df_5min = df_5min.merge(df_join_date_hour, how='left', on='date_hour')

    #date (1day)
    df_join_date = pd.DataFrame({'date' : df_5min['time'].dt.date.drop_duplicates(keep='first')})
    df_join_date['date_day_index'] = df_join_date['date'].shift(1)
    df_5min = df_5min.merge(df_join_date, how='left', on='date')



    
    dfs_5min += [df_5min.copy()]
    dfs_1hour += [df_1hour.copy()]
    dfs_1day += [df_1day.copy()]
    

df_5min = pd.concat(dfs_5min)
df_1day = pd.concat(dfs_1day)
df_1hour = pd.concat(dfs_1hour)

df_5min.reset_index(drop=True, inplace=True)
df_1hour.reset_index(drop=True, inplace=True)
df_1day.reset_index(drop=True, inplace=True)

#date_hour_index
#порпавим индексы в 10:55 (начало свечи) - у же известна инфа о окончании часовой свечи 10:00
mask = df_5min['time'].dt.minute == 55
df_5min.loc[mask, 'date_hour_index'] = df_5min.loc[mask, 'time'].dt.floor('h')

#date_day_index
#аналогично с концом дня
mask = (df_5min['time'].dt.hour == 23) & (df_5min['time'].dt.minute == 45)
df_5min.loc[mask, 'date_day_index'] = df_5min.loc[mask, 'time'].dt.date
df_5min.loc[mask, 'date_hour_index'] = df_5min.loc[mask, 'time'].dt.floor('h')
#спец. корректировка для 18:45 (акции без вечерних торгов)
mask = (df_5min['time'].dt.hour == 18) & (df_5min['time'].dt.minute == 45) & (df_5min['time'].dt.day.diff(-1) != 0)
df_5min.loc[mask, 'date_day_index'] = df_5min.loc[mask, 'time'].dt.date
df_5min.loc[mask, 'date_hour_index'] = df_5min.loc[mask, 'time'].dt.floor('h')


df_5min.shape, df_1hour.shape, df_1day.shape

100%|█████████████████████████████████████████████████████████████████████████| 76/76 [00:02<00:00, 27.91it/s]


((7397129, 9), (693950, 6), (50955, 6))

In [206]:
#проверка глазами
i = 123542 + 12*6
df_5min.iloc[i:i+20]

Unnamed: 0,time,close,volume,ticker,tmos_close,date_hour,date,date_hour_index,date_day_index
123614,2023-07-25 17:15:00,17.858,4020.0,AFKS,5.89,2023-07-25 17:00:00,2023-07-25,2023-07-25 16:00:00,2023-07-24
123615,2023-07-25 17:20:00,17.855,3313.0,AFKS,5.89,2023-07-25 17:00:00,2023-07-25,2023-07-25 16:00:00,2023-07-24
123616,2023-07-25 17:25:00,17.862,2757.0,AFKS,5.89,2023-07-25 17:00:00,2023-07-25,2023-07-25 16:00:00,2023-07-24
123617,2023-07-25 17:30:00,17.87,4527.0,AFKS,5.88,2023-07-25 17:00:00,2023-07-25,2023-07-25 16:00:00,2023-07-24
123618,2023-07-25 17:35:00,17.869,3151.0,AFKS,5.89,2023-07-25 17:00:00,2023-07-25,2023-07-25 16:00:00,2023-07-24
123619,2023-07-25 17:40:00,17.881,5027.0,AFKS,5.89,2023-07-25 17:00:00,2023-07-25,2023-07-25 16:00:00,2023-07-24
123620,2023-07-25 17:45:00,17.891,4956.0,AFKS,5.88,2023-07-25 17:00:00,2023-07-25,2023-07-25 16:00:00,2023-07-24
123621,2023-07-25 17:50:00,17.896,3152.0,AFKS,5.89,2023-07-25 17:00:00,2023-07-25,2023-07-25 16:00:00,2023-07-24
123622,2023-07-25 17:55:00,17.9,5665.0,AFKS,5.9,2023-07-25 17:00:00,2023-07-25,2023-07-25 17:00:00,2023-07-24
123623,2023-07-25 18:00:00,17.922,8182.0,AFKS,5.89,2023-07-25 18:00:00,2023-07-25,2023-07-25 17:00:00,2023-07-24


In [208]:
df_1hour.head()

Unnamed: 0,time,close,volume,ticker,tmos_close,date_hour_index
0,2022-07-01 09:00:00,57.78,33.0,ABIO,4.0,2022-07-01 09:00:00
1,2022-07-01 10:00:00,56.4,13083.0,ABIO,3.93,2022-07-01 10:00:00
2,2022-07-01 11:00:00,56.36,6195.0,ABIO,3.92,2022-07-01 11:00:00
3,2022-07-01 12:00:00,56.92,7632.0,ABIO,3.97,2022-07-01 12:00:00
4,2022-07-01 13:00:00,56.18,8748.0,ABIO,3.99,2022-07-01 13:00:00


In [210]:
df_1day.head()

Unnamed: 0,time,close,volume,ticker,tmos_close,date_day_index
0,2022-06-30 03:00:00,57.82,111300.0,ABIO,4.02,2022-06-30
1,2022-07-01 03:00:00,56.28,48126.0,ABIO,3.96,2022-07-01
2,2022-07-04 03:00:00,56.9,58944.0,ABIO,3.92,2022-07-04
3,2022-07-05 03:00:00,56.38,39756.0,ABIO,3.97,2022-07-05
4,2022-07-06 03:00:00,60.68,275700.0,ABIO,3.96,2022-07-06


In [214]:
#link different time-step date

df_1hour.reset_index(inplace=True)
df_1hour = df_1hour.rename(columns={col : col+'_1hour' for col in df_1hour.columns if col not in ['date_hour_index', 'ticker']})
df = df_5min.merge(df_1hour, on=['date_hour_index', 'ticker'], how='left')

df_1day.reset_index(inplace=True)
df_1day = df_1day.rename(columns={col : col+'_1day' for col in df_1day.columns if col not in ['date_day_index', 'ticker']})
df = df.merge(df_1day, on=['date_day_index', 'ticker'], how='left')

#Проверка что не наджоинилось лишнего
assert df_5min.shape[0] == df.shape[0], 'Error: with join dimensions'

In [215]:
df

Unnamed: 0,time,close,volume,ticker,tmos_close,date_hour,date,date_hour_index,date_day_index,index_1hour,time_1hour,close_1hour,volume_1hour,tmos_close_1hour,index_1day,time_1day,close_1day,volume_1day,tmos_close_1day
0,2022-07-01 09:55:00,57.78,33.0,ABIO,4.00,2022-07-01 09:00:00,2022-07-01,2022-07-01 09:00:00,,0.0,2022-07-01 09:00:00,57.78,33.0,4.00,,NaT,,,
1,2022-07-01 10:00:00,57.02,747.0,ABIO,3.87,2022-07-01 10:00:00,2022-07-01,2022-07-01 09:00:00,,0.0,2022-07-01 09:00:00,57.78,33.0,4.00,,NaT,,,
2,2022-07-01 10:05:00,56.50,3630.0,ABIO,3.89,2022-07-01 10:00:00,2022-07-01,2022-07-01 09:00:00,,0.0,2022-07-01 09:00:00,57.78,33.0,4.00,,NaT,,,
3,2022-07-01 10:10:00,56.22,1551.0,ABIO,3.91,2022-07-01 10:00:00,2022-07-01,2022-07-01 09:00:00,,0.0,2022-07-01 09:00:00,57.78,33.0,4.00,,NaT,,,
4,2022-07-01 10:15:00,56.46,1938.0,ABIO,3.93,2022-07-01 10:00:00,2022-07-01,2022-07-01 09:00:00,,0.0,2022-07-01 09:00:00,57.78,33.0,4.00,,NaT,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7397124,2025-02-24 23:25:00,4668.50,511.0,YDEX,7.30,2025-02-24 23:00:00,2025-02-24,2025-02-24 22:00:00,2025-02-21,693948.0,2025-02-24 22:00:00,4666.00,13530.0,7.29,50953.0,2025-02-21 03:00:00,4667.0,685047.0,7.21
7397125,2025-02-24 23:30:00,4668.50,1154.0,YDEX,7.31,2025-02-24 23:00:00,2025-02-24,2025-02-24 22:00:00,2025-02-21,693948.0,2025-02-24 22:00:00,4666.00,13530.0,7.29,50953.0,2025-02-21 03:00:00,4667.0,685047.0,7.21
7397126,2025-02-24 23:35:00,4669.00,1561.0,YDEX,7.32,2025-02-24 23:00:00,2025-02-24,2025-02-24 22:00:00,2025-02-21,693948.0,2025-02-24 22:00:00,4666.00,13530.0,7.29,50953.0,2025-02-21 03:00:00,4667.0,685047.0,7.21
7397127,2025-02-24 23:40:00,4664.50,1349.0,YDEX,7.33,2025-02-24 23:00:00,2025-02-24,2025-02-24 22:00:00,2025-02-21,693948.0,2025-02-24 22:00:00,4666.00,13530.0,7.29,50953.0,2025-02-21 03:00:00,4667.0,685047.0,7.21


In [220]:
#Проверка, что данные все данные подтянулись
print(f'Не подтянулсиь пар (ticker, time_1hour) к 15min (от 1hour): {df.loc[df['close_1hour'].isnull(), ['ticker', 'date_hour']].groupby(['ticker', 'date_hour']).count().shape[0]}  акций-часов')

#Проверка, что данные все данные подтянулись
print(f'Не подтянулсиь пар (ticker, time_1day) к 15min (от 1day): {df.loc[df['close_1day'].isnull(), ['ticker', 'date']].groupby(['ticker', 'date']).count().shape[0]}  акций-дней')



Не подтянулсиь пар (ticker, time_1hour) к 15min (от 1hour): 11  акций-часов
Не подтянулсиь пар (ticker, time_1day) к 15min (от 1day): 1458  акций-дней


In [222]:
df['close_1hour'].isnull().mean(), df['close_1day'].isnull().mean()

(1.4059508763467557e-05, 0.0223448854278464)

In [226]:
df['ticker'].nunique()

76

In [228]:
1027 / 77 # в среднем на акцию пропущено дней

13.337662337662337

In [232]:
#Прикол в том, что в эти праздничные дни 5-мин, 1-час данные есть, а 1-дневных нет
df.loc[df['close_1day'].isnull(), 'time'].dt.date.value_counts()

time
2025-01-08    11876
2025-01-03    11762
2024-11-05    11722
2024-05-02    11508
2024-06-13    11458
2024-05-10    11450
2025-01-02    10321
2024-03-11     9567
2024-02-26     9565
2024-01-03     9028
2023-06-13     7907
2022-07-01     7566
2023-03-09     7450
2023-05-02     7353
2023-05-10     7197
2023-02-24     6973
2023-01-03     6959
2022-11-07     5227
2024-06-12       64
2024-12-31       61
2025-01-07       60
2024-11-04       54
2024-05-01       49
2024-05-09       46
2022-12-14       44
2023-06-12        4
2023-05-09        3
2023-03-08        3
2023-05-01        3
2023-01-02        2
2024-03-08        2
2023-02-23        2
2024-02-23        1
2022-11-04        1
Name: count, dtype: int64

In [235]:
#Это происходит из-за пропущенных дней в df_1day (праздников), поэтому протянем их ffill
for stock in tqdm(stocks):
    mask_stock = df['ticker'] == stock
    cols_ffil = ['index_1day', 'index_1hour']
    df.loc[mask_stock, cols_ffil] = df.loc[mask_stock, cols_ffil].ffill()

100%|█████████████████████████████████████████████████████████████████████████| 76/76 [00:13<00:00,  5.46it/s]


In [237]:
#Проверка, что данные все данные подтянулись
print(f'Не подтянулсиь пар (ticker, time_1hour) к 15min (от 1hour): {df.loc[df['index_1hour'].isnull(), ['ticker', 'date_hour']].groupby(['ticker', 'date_hour']).count().shape[0]}  акций-часов')

#Проверка, что данные все данные подтянулись
print(f'Не подтянулсиь пар (ticker, time_1day) к 15min (от 1day): {df.loc[df['index_1day'].isnull(), ['ticker', 'date']].groupby(['ticker', 'date']).count().shape[0]}  акций-дней')



Не подтянулсиь пар (ticker, time_1hour) к 15min (от 1hour): 11  акций-часов
Не подтянулсиь пар (ticker, time_1day) к 15min (от 1day): 76  акций-дней


In [242]:
df

Unnamed: 0,time,close,volume,ticker,tmos_close,date_hour,date,date_hour_index,date_day_index,index_1hour,time_1hour,close_1hour,volume_1hour,tmos_close_1hour,index_1day,time_1day,close_1day,volume_1day,tmos_close_1day
0,2022-07-01 09:55:00,57.78,33.0,ABIO,4.00,2022-07-01 09:00:00,2022-07-01,2022-07-01 09:00:00,,0.0,2022-07-01 09:00:00,57.78,33.0,4.00,,NaT,,,
1,2022-07-01 10:00:00,57.02,747.0,ABIO,3.87,2022-07-01 10:00:00,2022-07-01,2022-07-01 09:00:00,,0.0,2022-07-01 09:00:00,57.78,33.0,4.00,,NaT,,,
2,2022-07-01 10:05:00,56.50,3630.0,ABIO,3.89,2022-07-01 10:00:00,2022-07-01,2022-07-01 09:00:00,,0.0,2022-07-01 09:00:00,57.78,33.0,4.00,,NaT,,,
3,2022-07-01 10:10:00,56.22,1551.0,ABIO,3.91,2022-07-01 10:00:00,2022-07-01,2022-07-01 09:00:00,,0.0,2022-07-01 09:00:00,57.78,33.0,4.00,,NaT,,,
4,2022-07-01 10:15:00,56.46,1938.0,ABIO,3.93,2022-07-01 10:00:00,2022-07-01,2022-07-01 09:00:00,,0.0,2022-07-01 09:00:00,57.78,33.0,4.00,,NaT,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7397124,2025-02-24 23:25:00,4668.50,511.0,YDEX,7.30,2025-02-24 23:00:00,2025-02-24,2025-02-24 22:00:00,2025-02-21,693948.0,2025-02-24 22:00:00,4666.00,13530.0,7.29,50953.0,2025-02-21 03:00:00,4667.0,685047.0,7.21
7397125,2025-02-24 23:30:00,4668.50,1154.0,YDEX,7.31,2025-02-24 23:00:00,2025-02-24,2025-02-24 22:00:00,2025-02-21,693948.0,2025-02-24 22:00:00,4666.00,13530.0,7.29,50953.0,2025-02-21 03:00:00,4667.0,685047.0,7.21
7397126,2025-02-24 23:35:00,4669.00,1561.0,YDEX,7.32,2025-02-24 23:00:00,2025-02-24,2025-02-24 22:00:00,2025-02-21,693948.0,2025-02-24 22:00:00,4666.00,13530.0,7.29,50953.0,2025-02-21 03:00:00,4667.0,685047.0,7.21
7397127,2025-02-24 23:40:00,4664.50,1349.0,YDEX,7.33,2025-02-24 23:00:00,2025-02-24,2025-02-24 22:00:00,2025-02-21,693948.0,2025-02-24 22:00:00,4666.00,13530.0,7.29,50953.0,2025-02-21 03:00:00,4667.0,685047.0,7.21


In [245]:
df = df[['time', 'close', 'volume', 'ticker', 'tmos_close', 'index_1hour', 'index_1day']]
df

Unnamed: 0,time,close,volume,ticker,tmos_close,index_1hour,index_1day
0,2022-07-01 09:55:00,57.78,33.0,ABIO,4.00,0.0,
1,2022-07-01 10:00:00,57.02,747.0,ABIO,3.87,0.0,
2,2022-07-01 10:05:00,56.50,3630.0,ABIO,3.89,0.0,
3,2022-07-01 10:10:00,56.22,1551.0,ABIO,3.91,0.0,
4,2022-07-01 10:15:00,56.46,1938.0,ABIO,3.93,0.0,
...,...,...,...,...,...,...,...
7397124,2025-02-24 23:25:00,4668.50,511.0,YDEX,7.30,693948.0,50953.0
7397125,2025-02-24 23:30:00,4668.50,1154.0,YDEX,7.31,693948.0,50953.0
7397126,2025-02-24 23:35:00,4669.00,1561.0,YDEX,7.32,693948.0,50953.0
7397127,2025-02-24 23:40:00,4664.50,1349.0,YDEX,7.33,693948.0,50953.0


#### 2.4 Union target and features. Make data_file to train

In [251]:
(df_result['time'] == df['time']).all(), (df_result['close'] == df['close']).all()

(True, True)

In [252]:
(df.index.values == df_result.index.values).all()

True

In [255]:
df_result.head()

Unnamed: 0,ind,time,close,result,ticker,delta_time,income_rate,res_price,res_ind
0,0,2022-07-01 09:55:00,57.78,WIN,ABIO,0 days 00:05:00,1.012,57.02,1
1,1,2022-07-01 10:00:00,57.02,WIN,ABIO,0 days 00:10:00,1.012,56.22,3
2,2,2022-07-01 10:05:00,56.5,LOSE,ABIO,0 days 02:20:00,0.994,57.0,30
3,3,2022-07-01 10:10:00,56.22,LOSE,ABIO,0 days 02:10:00,0.994,56.74,29
4,4,2022-07-01 10:15:00,56.46,LOSE,ABIO,0 days 02:10:00,0.994,57.0,30


In [257]:
df_result.columns.tolist()

['ind',
 'time',
 'close',
 'result',
 'ticker',
 'delta_time',
 'income_rate',
 'res_price',
 'res_ind']

In [259]:
#union
df = pd.concat([df.reset_index(drop=True), df_result[['result', 'delta_time', 'income_rate', 'res_price', 'res_ind']].reset_index(drop=True)], axis=1)
df

Unnamed: 0,time,close,volume,ticker,tmos_close,index_1hour,index_1day,result,delta_time,income_rate,res_price,res_ind
0,2022-07-01 09:55:00,57.78,33.0,ABIO,4.00,0.0,,WIN,0 days 00:05:00,1.012000,57.02,1
1,2022-07-01 10:00:00,57.02,747.0,ABIO,3.87,0.0,,WIN,0 days 00:10:00,1.012000,56.22,3
2,2022-07-01 10:05:00,56.50,3630.0,ABIO,3.89,0.0,,LOSE,0 days 02:20:00,0.994000,57.00,30
3,2022-07-01 10:10:00,56.22,1551.0,ABIO,3.91,0.0,,LOSE,0 days 02:10:00,0.994000,56.74,29
4,2022-07-01 10:15:00,56.46,1938.0,ABIO,3.93,0.0,,LOSE,0 days 02:10:00,0.994000,57.00,30
...,...,...,...,...,...,...,...,...,...,...,...,...
7397124,2025-02-24 23:25:00,4668.50,511.0,YDEX,7.30,693948.0,50953.0,DNF,0 days 00:20:00,0.998464,4671.00,7397128
7397125,2025-02-24 23:30:00,4668.50,1154.0,YDEX,7.31,693948.0,50953.0,DNF,0 days 00:15:00,0.998464,4671.00,7397128
7397126,2025-02-24 23:35:00,4669.00,1561.0,YDEX,7.32,693948.0,50953.0,DNF,0 days 00:10:00,0.998572,4671.00,7397128
7397127,2025-02-24 23:40:00,4664.50,1349.0,YDEX,7.33,693948.0,50953.0,DNF,0 days 00:05:00,0.997606,4671.00,7397128


### 2.5 Feature engineering

In [265]:
df

Unnamed: 0,time,close,volume,ticker,tmos_close,index_1hour,index_1day,result,delta_time,income_rate,res_price,res_ind
0,2022-07-01 09:55:00,57.78,33.0,ABIO,4.00,0.0,,WIN,0 days 00:05:00,1.012000,57.02,1
1,2022-07-01 10:00:00,57.02,747.0,ABIO,3.87,0.0,,WIN,0 days 00:10:00,1.012000,56.22,3
2,2022-07-01 10:05:00,56.50,3630.0,ABIO,3.89,0.0,,LOSE,0 days 02:20:00,0.994000,57.00,30
3,2022-07-01 10:10:00,56.22,1551.0,ABIO,3.91,0.0,,LOSE,0 days 02:10:00,0.994000,56.74,29
4,2022-07-01 10:15:00,56.46,1938.0,ABIO,3.93,0.0,,LOSE,0 days 02:10:00,0.994000,57.00,30
...,...,...,...,...,...,...,...,...,...,...,...,...
7397124,2025-02-24 23:25:00,4668.50,511.0,YDEX,7.30,693948.0,50953.0,DNF,0 days 00:20:00,0.998464,4671.00,7397128
7397125,2025-02-24 23:30:00,4668.50,1154.0,YDEX,7.31,693948.0,50953.0,DNF,0 days 00:15:00,0.998464,4671.00,7397128
7397126,2025-02-24 23:35:00,4669.00,1561.0,YDEX,7.32,693948.0,50953.0,DNF,0 days 00:10:00,0.998572,4671.00,7397128
7397127,2025-02-24 23:40:00,4664.50,1349.0,YDEX,7.33,693948.0,50953.0,DNF,0 days 00:05:00,0.997606,4671.00,7397128


In [267]:
df_1hour.head()

Unnamed: 0,index_1hour,time_1hour,close_1hour,volume_1hour,ticker,tmos_close_1hour,date_hour_index
0,0,2022-07-01 09:00:00,57.78,33.0,ABIO,4.0,2022-07-01 09:00:00
1,1,2022-07-01 10:00:00,56.4,13083.0,ABIO,3.93,2022-07-01 10:00:00
2,2,2022-07-01 11:00:00,56.36,6195.0,ABIO,3.92,2022-07-01 11:00:00
3,3,2022-07-01 12:00:00,56.92,7632.0,ABIO,3.97,2022-07-01 12:00:00
4,4,2022-07-01 13:00:00,56.18,8748.0,ABIO,3.99,2022-07-01 13:00:00


In [269]:
df_1day.head()

Unnamed: 0,index_1day,time_1day,close_1day,volume_1day,ticker,tmos_close_1day,date_day_index
0,0,2022-06-30 03:00:00,57.82,111300.0,ABIO,4.02,2022-06-30
1,1,2022-07-01 03:00:00,56.28,48126.0,ABIO,3.96,2022-07-01
2,2,2022-07-04 03:00:00,56.9,58944.0,ABIO,3.92,2022-07-04
3,3,2022-07-05 03:00:00,56.38,39756.0,ABIO,3.97,2022-07-05
4,4,2022-07-06 03:00:00,60.68,275700.0,ABIO,3.96,2022-07-06


In [273]:
from sklearn.linear_model import LinearRegression

def calculate_exp_ma(data, window):
    alpha = 2 / (window + 1)
    coeffs = ((1 - alpha)**(np.arange(window)[::-1])) * (alpha)
    coeffs[0] /= alpha
    
    return data.rolling(window=window, min_periods=window).apply(lambda x: (x*coeffs).sum()).values


def calculate_bollinger_bands(data, window):
    """Calculate Bollinger Bands"""
    rolling_mean = data.rolling(window=window, min_periods=window).mean().values
    rolling_std = data.rolling(window=window, min_periods=window).std().values
    norm_rolling_std = rolling_std / (rolling_mean + np.finfo(np.float32).eps)

    num_of_std = 2
    lower_band_2std = rolling_mean - (rolling_std * num_of_std)
    upper_band_2std = rolling_mean + (rolling_std * num_of_std)
    
    num_of_std = 3
    lower_band_3std = rolling_mean - (rolling_std * num_of_std)
    upper_band_3std = rolling_mean + (rolling_std * num_of_std)
    
    
    return rolling_mean, rolling_std, norm_rolling_std, lower_band_2std, upper_band_2std, lower_band_3std, upper_band_3std

def calculate_rsi(data, window):
    """Calculate Relative Strength Index"""
    delta = data.diff()
    gain = delta.clip(lower=0)
    loss = -delta.clip(upper=0)
    avg_gain = gain.rolling(window=window, min_periods=window).mean()
    avg_loss = loss.rolling(window=window, min_periods=window).mean()
    rs = avg_gain / avg_loss
    rsi = 100 - (100 / (1 + rs))

    mask = avg_loss == 0
    rsi[mask] = 100
    
    return rsi.values

def calculate_roc(data, periods):
    """Calculate Rate of Change"""
    roc = (data - data.shift(periods)) / (data.shift(periods)+np.finfo(np.float32).eps)
    return roc



def calc_stats(data, window=None, feat_name=None):
    #mean, std
    rolling_mean, rolling_std, norm_rolling_std,\
    lower_band_2std, upper_band_2std, lower_band_3std, upper_band_3std = calculate_bollinger_bands(data, window)

    #mean_abs_pct
    mean_abs_pct = calculate_roc(data, 1).rolling(window=window, min_periods=window).apply(lambda x: x.abs().mean()).values
        
    #alpha
    alpha = data.rolling(window=window, min_periods=window).apply(lambda x: LinearRegression().fit(x.values.reshape(-1, 1), np.arange(x.shape[0])).coef_[0]).values

    #min, max
    rolling_min = data.rolling(window=window, min_periods=window).min().values
    rolling_max = data.rolling(window=window, min_periods=window).max().values
    
    #rsi
    rsi = calculate_rsi(data, window)
    
    #roc
    roc = calculate_roc(data, window).values
    # diff = data.diff(window).values

    #exp_ma
    exp_ma = calculate_exp_ma(data, window)
    
    df_features = pd.DataFrame({f'{feat_name}_ma' : rolling_mean,
                        f'{feat_name}_std' : rolling_std,
                        f'{feat_name}_norm_std' : norm_rolling_std,
                        f'{feat_name}_ma_low_2std' : lower_band_2std,
                        f'{feat_name}_ma_up_2std' : upper_band_2std,
                        f'{feat_name}_ma_low_3std' : lower_band_3std,
                        f'{feat_name}_ma_up_3std' : upper_band_3std, 

                        f'{feat_name}_mean_abs_pct' : mean_abs_pct,
                            
                        f'{feat_name}_alpha' : alpha,
                            
                        f'{feat_name}_min' : rolling_min,
                        f'{feat_name}_max' : rolling_max,
                        f'{feat_name}_rsi' : rsi,
                        f'{feat_name}_roc' : roc,
                        # f'{feat_name}_diff' : diff,
                        f'{feat_name}_expma' : exp_ma,
                        }).astype(np.float32)
    return df_features


def calc_stats_diff_1(data, feat_name=None):
    return pd.DataFrame({f'{feat_name}_roc' : calculate_roc(data, 1).values,
                        # f'{feat_name}_diff' : data.diff(1).values,
                        }).astype(np.float32)

def calc_levels(data, window=None, levels=None, feat_name=None):
    
    #уровни
    data_levels = []
    column_names = []
    for i in range(1, len(levels)):
        level_low = levels[i-1]
        level_high = levels[i]
        data_levels += [data.rolling(window=window, min_periods=window).apply(lambda x: (((1+level_low)*x.values[-1] < x.values) & (x.values <= (1+level_high)*x.values[-1])).sum()).values]
        data_levels += [data.rolling(window=window, min_periods=window).apply(lambda x: (((1-level_high)*x.values[-1] <= x.values) & (x.values < (1-level_low)*x.values[-1])).sum()).values]

        column_names += [f"{feat_name}_lvl_{1+level_low}-{1+level_high}"]
        column_names += [f"{feat_name}_lvl_-{1-level_high}-{1-level_low}"]
    df_levels = pd.DataFrame({column_names[i]:data_levels[i] for i in range(len(column_names))}).astype(np.float32)
    return df_levels


In [276]:
def calculate_features(df_ticker, postfix=None):
    dfs = [df_ticker]
    
    levels =      [0, 0.01, 0.02, 0.03, 0.04, 0.05, 0.07]
    levels_tmos =  [0, 0.01, 0.02, 0.03, 0.04, 0.05, 0.07]


    #w1
    df_close = calc_stats_diff_1(df_ticker[f'close{postfix}'], feat_name=f'close{postfix}_w1')
    df_volume = calc_stats_diff_1(df_ticker[f'volume{postfix}'], feat_name=f'volume{postfix}_w1')
    df_tmos_close = calc_stats_diff_1(df_ticker[f'tmos_close{postfix}'], feat_name=f'tmos_close{postfix}_w1')
    assert df_ticker.shape[0] == df_close.shape[0] == df_volume.shape[0] == df_tmos_close.shape[0], 'Error w1'
    dfs += [df_close.copy(), df_volume.copy(), df_tmos_close.copy()]

    #w5
    window = 5
    df_close = calc_stats(df_ticker[f'close{postfix}'], window=window, feat_name=f'close{postfix}_w{window}')
    df_volume = calc_stats(df_ticker[f'volume{postfix}'], window=window, feat_name=f'volume{postfix}_w{window}')
    df_tmos_close = calc_stats(df_ticker[f'tmos_close{postfix}'], window=window, feat_name=f'tmos_close{postfix}_w{window}')
    assert df_ticker.shape[0] == df_close.shape[0] == df_volume.shape[0] == df_tmos_close.shape[0], f'Error w{window}'
    dfs += [df_close.copy(), df_volume.copy(), df_tmos_close.copy()]
    
    #w10
    window = 10
    df_close = calc_stats(df_ticker[f'close{postfix}'], window=window, feat_name=f'close{postfix}_w{window}')
    df_volume = calc_stats(df_ticker[f'volume{postfix}'], window=window, feat_name=f'volume{postfix}_w{window}')
    df_tmos_close = calc_stats(df_ticker[f'tmos_close{postfix}'], window=window, feat_name=f'tmos_close{postfix}_w{window}')
    assert df_ticker.shape[0] == df_close.shape[0] == df_volume.shape[0] == df_tmos_close.shape[0], f'Error w{window}'
    dfs += [df_close.copy(), df_volume.copy(), df_tmos_close.copy()]
    
    #w20
    window = 20
    df_close = calc_stats(df_ticker[f'close{postfix}'], window=window, feat_name=f'close{postfix}_w{window}')
    #df_volume = calc_stats(df_ticker[f'volume{postfix}'], window=window, feat_name=f'volume{postfix}_w{window}')
    df_tmos_close = calc_stats(df_ticker[f'tmos_close{postfix}'], window=window, feat_name=f'tmos_close{postfix}_w{window}')
    assert df_ticker.shape[0] == df_close.shape[0] == df_tmos_close.shape[0], f'Error w{window}'
    dfs += [df_close.copy(), df_tmos_close.copy()]
    
    #w30
    window = 30
    df_close = calc_stats(df_ticker[f'close{postfix}'], window=window, feat_name=f'close{postfix}_w{window}')
    #df_volume = calc_stats(df_ticker[f'volume{postfix}'], window=window, feat_name=f'volume{postfix}_w{window}')
    df_tmos_close = calc_stats(df_ticker[f'tmos_close{postfix}'], window=window, feat_name=f'tmos_close{postfix}_w{window}')
    df_close_levels = calc_levels(df_ticker[f'close{postfix}'], window=window, levels=levels, feat_name=f'close{postfix}_w{window}')
    df_tmos_close_levels = calc_levels(df_ticker[f'tmos_close{postfix}'], window=window, levels=levels_tmos, feat_name=f'tmos_close{postfix}_w{window}')
    assert df_ticker.shape[0] == df_close.shape[0] == df_tmos_close.shape[0] == df_close_levels.shape[0] == df_tmos_close_levels.shape[0], f'Error w{window}'
    dfs += [df_close.copy(), df_tmos_close.copy(), df_close_levels.copy(), df_tmos_close_levels.copy()]
    
    #w60
    window = 60
    df_close = calc_stats(df_ticker[f'close{postfix}'], window=window, feat_name=f'close{postfix}_w{window}')
    #df_volume = calc_stats(df_ticker[f'volume{postfix}'], window=window, feat_name=f'volume{postfix}_w{window}')
    df_tmos_close = calc_stats(df_ticker[f'tmos_close{postfix}'], window=window, feat_name=f'tmos_close{postfix}_w{window}')
    assert df_ticker.shape[0] == df_close.shape[0] == df_tmos_close.shape[0], f'Error w{window}'
    dfs += [df_close.copy(), df_tmos_close.copy()]
    
    #w120
    window = 120
    df_close = calc_stats(df_ticker[f'close{postfix}'], window=window, feat_name=f'close{postfix}_w{window}')
    #df_volume = calc_stats(df_ticker[f'volume{postfix}'], window=window, feat_name=f'volume{postfix}_w{window}')
    df_tmos_close = calc_stats(df_ticker[f'tmos_close{postfix}'], window=window, feat_name=f'tmos_close{postfix}_w{window}')
    df_close_levels = calc_levels(df_ticker[f'close{postfix}'], window=window, levels=levels, feat_name=f'close{postfix}_w{window}')
    df_tmos_close_levels = calc_levels(df_ticker[f'tmos_close{postfix}'], window=window, levels=levels_tmos, feat_name=f'tmos_close{postfix}_w{window}')
    assert df_ticker.shape[0] == df_close.shape[0] == df_tmos_close.shape[0] == df_close_levels.shape[0] == df_tmos_close_levels.shape[0], f'Error w{window}'
    dfs += [df_close.copy(), df_tmos_close.copy(), df_close_levels.copy(), df_tmos_close_levels.copy()]

   
    df = pd.concat(dfs, axis=1)
    assert (df_ticker.shape[0] == df.shape[0]) and (df.shape[1] == sum([elem.shape[1] for elem in dfs])), 'Error concat'

    return df


In [280]:
np.mean(np.abs(((df['volume'].astype(np.float32) - df['volume']) / df['volume'])) < 0.00_00_01)

1.0

In [282]:
np.mean(np.abs(((df['close'].astype(np.float32) - df['close']) / df['close'])) < 0.00_00_01)

1.0

In [284]:
np.mean(np.abs(((df['tmos_close'].astype(np.float32) - df['tmos_close']) / df['tmos_close'])) < 0.00_00_01)

1.0

In [287]:
np.mean(np.abs(((df_1day['volume_1day'].astype(np.float32) - df_1day['volume_1day']) / df_1day['volume_1day'])) < 0.00_00_01)

1.0

#### 5min

In [324]:
df['close'] = df['close'].astype(np.float32)
df['volume'] = df['volume'].astype(np.float32)
df['tmos_close'] = df['tmos_close'].astype(np.float32)

In [96]:
# dfs = []
# for ticker in tqdm(df['ticker'].unique()):
#     mask = np.array(df['ticker'] == ticker)
#     df_ticker = df.loc[mask].copy().reset_index()

#     df_ticker_fe = calculate_features(df_ticker, postfix='')
#
#     dump_pkl(df_ticker_fe.copy(), f'tmp/{ticker}.pkl')
    
#     dfs += [df_ticker_fe.copy()]

# df_fe = pd.concat(dfs).set_index('index')

100%|██████████████████████████████████████████████████████████████████████| 20/20 [3:46:42<00:00, 680.12s/it]


In [334]:
from joblib import Parallel, delayed
import multiprocessing

def parallel_worker(df_ticker):
    df_ticker_fe = calculate_features(df_ticker, postfix='')
    
    ticker = df_ticker_fe['ticker'].iloc[0]
    dump_pkl(df_ticker_fe, f'tmp/{ticker}.pkl')
    
    return True

num_cores = 6
res = Parallel(n_jobs=num_cores)(delayed(parallel_worker)(df.loc[df['ticker'] == ticker].reset_index()) for ticker in tqdm(df['ticker'].unique()))
res



  0%|                                                                                  | 0/76 [00:00<?, ?it/s][A[A

  1%|▉                                                                         | 1/76 [00:00<00:12,  6.06it/s][A[A

  3%|█▉                                                                        | 2/76 [00:00<00:12,  6.07it/s][A[A

  4%|██▉                                                                       | 3/76 [00:00<00:11,  6.09it/s][A[A

  5%|███▉                                                                      | 4/76 [00:00<00:11,  6.11it/s][A[A

  7%|████▊                                                                     | 5/76 [00:00<00:11,  6.15it/s][A[A

  8%|█████▊                                                                    | 6/76 [00:01<00:11,  5.85it/s][A[A

  9%|██████▊                                                                   | 7/76 [00:01<00:13,  5.22it/s][A[A

 11%|███████▊                                         

[True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True]

In [336]:
load_pkl(f'tmp/YDEX.pkl')

Unnamed: 0,index,time,close,volume,ticker,tmos_close,index_1hour,index_1day,result,delta_time,...,tmos_close_w120_lvl_1.01-1.02,tmos_close_w120_lvl_-0.98-0.99,tmos_close_w120_lvl_1.02-1.03,tmos_close_w120_lvl_-0.97-0.98,tmos_close_w120_lvl_1.03-1.04,tmos_close_w120_lvl_-0.96-0.97,tmos_close_w120_lvl_1.04-1.05,tmos_close_w120_lvl_-0.95-0.96,tmos_close_w120_lvl_1.05-1.07,tmos_close_w120_lvl_-0.9299999999999999-0.95
0,7292332,2022-07-01 09:55:00,1595.0,253.0,YDEX,4.00,684352.0,,WIN,0 days 00:05:00,...,,,,,,,,,,
1,7292333,2022-07-01 10:00:00,1563.0,7070.0,YDEX,3.87,684352.0,,LOSE,0 days 00:05:00,...,,,,,,,,,,
2,7292334,2022-07-01 10:05:00,1572.0,5934.0,YDEX,3.89,684352.0,,LOSE,0 days 00:05:00,...,,,,,,,,,,
3,7292335,2022-07-01 10:10:00,1582.5,5248.0,YDEX,3.91,684352.0,,WIN,0 days 00:25:00,...,,,,,,,,,,
4,7292336,2022-07-01 10:15:00,1582.5,6031.0,YDEX,3.93,684352.0,,WIN,0 days 00:20:00,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104792,7397124,2025-02-24 23:25:00,4668.5,511.0,YDEX,7.30,693948.0,50953.0,DNF,0 days 00:20:00,...,0.0,41.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
104793,7397125,2025-02-24 23:30:00,4668.5,1154.0,YDEX,7.31,693948.0,50953.0,DNF,0 days 00:15:00,...,0.0,51.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
104794,7397126,2025-02-24 23:35:00,4669.0,1561.0,YDEX,7.32,693948.0,50953.0,DNF,0 days 00:10:00,...,0.0,67.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
104795,7397127,2025-02-24 23:40:00,4664.5,1349.0,YDEX,7.33,693948.0,50953.0,DNF,0 days 00:05:00,...,0.0,84.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# 7.4 млн 5-минутных данных

#### #subsample

In [None]:
Переделать index

In [358]:
coeff_subsample = 0.4

NEED_POINTS = 125 #чтоб точно хватило

dfs = []
for ticker in stocks:
    df_ticker = load_pkl(f'tmp/{ticker}.pkl') ####################### вот здесь переделать index
    print(ticker, df_ticker.shape[0], '->', end=' ')
    
    #1. date available
    count_days = df_ticker['time'].dt.date.nunique()
    mask_avbl = np.zeros(df_ticker.shape[0]).astype(bool)
    if count_days > NEED_POINTS:
        date_first_avbl =  np.sort(df_ticker['time'].dt.date.unique())[NEED_POINTS]
        #print(ticker, date_first_avbl)
        mask_avbl = np.array(df_ticker["time"] >= pd.to_datetime(date_first_avbl))


    #2. subsample
    inds = np.arange(df_ticker.shape[0]).astype(int)[mask_avbl]
    inds_subsample = np.random.RandomState(seed=42).permutation(inds)[:int(len(inds)*coeff_subsample)]

    mask_subsample = np.zeros(df_ticker.shape[0]).astype(bool)
    mask_subsample[inds_subsample] = True
    print(mask_subsample.sum())

    dfs += [df_ticker[mask_subsample].copy()]
print('\nUNION_SHAPE: ', sum([elem.shape[0] for elem in dfs]))

ABIO 81646 -> 27483
AFKS 109101 -> 36673
AFLT 109441 -> 36792
ALRS 109425 -> 36786
APTK 79342 -> 26530
AQUA 86241 -> 29237
BANE 80456 -> 27394
BANEP 85776 -> 29040
BELU 100448 -> 34892
BSPB 90887 -> 31122
CBOM 104207 -> 34962
CHMF 109416 -> 36782
ENPG 108063 -> 36283
FEES 105518 -> 35299
FESH 100809 -> 35073
FLOT 101446 -> 35280
GAZP 109453 -> 36801
GMKN 108576 -> 36452
GTRK 85086 -> 30243
HEAD 78155 -> 27044
HYDR 108482 -> 36401
IRAO 109328 -> 36716
IRKT 82515 -> 28214
KMAZ 85141 -> 28867
LENT 78593 -> 26324
LIFE 75277 -> 25160
LKOH 109455 -> 36798
LSRG 78847 -> 26289
MAGN 109443 -> 36797
MDMG 77828 -> 25868
MGNT 109240 -> 36718
MOEX 109432 -> 36790
MTLR 102225 -> 35602
MTLRP 100882 -> 35080
MTSS 109319 -> 36751
MVID 102097 -> 35550
NLMK 109382 -> 36770
NMTP 90072 -> 30795
NVTK 109349 -> 36755
OGKB 99997 -> 34699
PHOR 108683 -> 36493
PIKK 109078 -> 36654
PLZL 109002 -> 36618
POSI 106187 -> 36646
RASP 84722 -> 28617
RENI 76262 -> 26035
RNFT 91991 -> 31514
ROSN 109464 -> 36801
RTKM 1088

In [360]:
df = pd.concat(dfs)
df.reset_index(inplace=True, drop=True)

In [362]:
df.shape

(2514148, 260)

In [None]:
Почему интересно 12939 - c такого высокого index начинается???

In [366]:
df.head()

Unnamed: 0,index,time,close,volume,ticker,tmos_close,index_1hour,index_1day,result,delta_time,...,tmos_close_w120_lvl_1.01-1.02,tmos_close_w120_lvl_-0.98-0.99,tmos_close_w120_lvl_1.02-1.03,tmos_close_w120_lvl_-0.97-0.98,tmos_close_w120_lvl_1.03-1.04,tmos_close_w120_lvl_-0.96-0.97,tmos_close_w120_lvl_1.04-1.05,tmos_close_w120_lvl_-0.95-0.96,tmos_close_w120_lvl_1.05-1.07,tmos_close_w120_lvl_-0.9299999999999999-0.95
0,12939,2022-12-26 10:00:00,63.02,4791.0,ABIO,4.07,1239.0,125.0,WIN,0 days 01:05:00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,12941,2022-12-26 10:10:00,62.68,411.0,ABIO,4.07,1239.0,125.0,LOSE,0 days 01:10:00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,12942,2022-12-26 10:15:00,62.5,2205.0,ABIO,4.07,1239.0,125.0,LOSE,0 days 01:05:00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,12944,2022-12-26 10:25:00,62.740002,786.0,ABIO,4.07,1239.0,125.0,LOSE,0 days 00:55:00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,12945,2022-12-26 10:30:00,62.52,2166.0,ABIO,4.07,1239.0,125.0,LOSE,0 days 00:50:00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [372]:

df['ticker'].nunique(), df['ticker'].value_counts()

(76,
 ticker
 ROSN    36801
 GAZP    36801
 LKOH    36798
 TATN    36798
 MAGN    36797
         ...  
 SFIN    25275
 LIFE    25160
 TGKN    24508
 WUSH    24286
 VSMO    21443
 Name: count, Length: 76, dtype: int64)

In [374]:
dump_pkl(df, './data/feat_engin/df_fe.pkl')

In [5]:
for col in tqdm(df.columns):
    assert df[col].isnull().sum() == 0, f'Nulls {col}'

100%|██████████████████████████████████████████████████████████████████████| 260/260 [00:00<00:00, 759.21it/s]


#### 1hour

In [293]:
df_1hour['close_1hour'] = df_1hour['close_1hour'].astype(np.float32)
df_1hour['volume_1hour'] = df_1hour['volume_1hour'].astype(np.float32)
df_1hour['tmos_close_1hour'] = df_1hour['tmos_close_1hour'].astype(np.float32)

In [295]:
dfs = []
for ticker in tqdm(df_1hour['ticker'].unique()):
    mask = np.array(df_1hour['ticker'] == ticker)
    df_ticker = df_1hour.loc[mask].copy().reset_index()

    df_ticker_fe = calculate_features(df_ticker, postfix='_1hour')
    
    dfs += [df_ticker_fe.copy()]

df_1hour_fe = pd.concat(dfs).set_index('index')

100%|█████████████████████████████████████████████████████████████████████████| 76/76 [29:28<00:00, 23.27s/it]


In [296]:
(df_1hour_fe['close_1hour'] == df_1hour['close_1hour']).all(), (df_1hour_fe.index.values == df_1hour_fe['index_1hour'].values).all()

(True, True)

In [297]:
df_1hour_fe

Unnamed: 0_level_0,index_1hour,time_1hour,close_1hour,volume_1hour,ticker,tmos_close_1hour,date_hour_index,close_1hour_w1_roc,volume_1hour_w1_roc,tmos_close_1hour_w1_roc,...,tmos_close_1hour_w120_lvl_1.01-1.02,tmos_close_1hour_w120_lvl_-0.98-0.99,tmos_close_1hour_w120_lvl_1.02-1.03,tmos_close_1hour_w120_lvl_-0.97-0.98,tmos_close_1hour_w120_lvl_1.03-1.04,tmos_close_1hour_w120_lvl_-0.96-0.97,tmos_close_1hour_w120_lvl_1.04-1.05,tmos_close_1hour_w120_lvl_-0.95-0.96,tmos_close_1hour_w120_lvl_1.05-1.07,tmos_close_1hour_w120_lvl_-0.9299999999999999-0.95
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,2022-07-01 09:00:00,57.779999,33.0,ABIO,4.00,2022-07-01 09:00:00,,,,...,,,,,,,,,,
1,1,2022-07-01 10:00:00,56.400002,13083.0,ABIO,3.93,2022-07-01 10:00:00,-0.023884,395.454559,-0.017500,...,,,,,,,,,,
2,2,2022-07-01 11:00:00,56.360001,6195.0,ABIO,3.92,2022-07-01 11:00:00,-0.000709,-0.526485,-0.002545,...,,,,,,,,,,
3,3,2022-07-01 12:00:00,56.919998,7632.0,ABIO,3.97,2022-07-01 12:00:00,0.009936,0.231961,0.012755,...,,,,,,,,,,
4,4,2022-07-01 13:00:00,56.180000,8748.0,ABIO,3.99,2022-07-01 13:00:00,-0.013001,0.146226,0.005038,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
693945,693945,2025-02-24 19:00:00,4645.500000,26140.0,YDEX,7.25,2025-02-24 19:00:00,-0.003112,-0.552075,0.001381,...,1.0,24.0,0.0,6.0,0.0,19.0,0.0,4.0,0.0,0.0
693946,693946,2025-02-24 20:00:00,4658.000000,19307.0,YDEX,7.27,2025-02-24 20:00:00,0.002691,-0.261400,0.002759,...,0.0,30.0,0.0,9.0,0.0,18.0,0.0,4.0,0.0,2.0
693947,693947,2025-02-24 21:00:00,4660.000000,27796.0,YDEX,7.29,2025-02-24 21:00:00,0.000429,0.439685,0.002751,...,0.0,40.0,0.0,10.0,0.0,16.0,0.0,8.0,0.0,2.0
693948,693948,2025-02-24 22:00:00,4666.000000,13530.0,YDEX,7.29,2025-02-24 22:00:00,0.001288,-0.513239,0.000000,...,0.0,40.0,0.0,10.0,0.0,16.0,0.0,8.0,0.0,2.0


In [298]:
# pd.set_option('display.max_rows', 300)
# df_1hour_fe.dtypes

In [339]:
dump_pkl(df_1hour_fe, './data/feat_engin/df_1hour_fe.pkl')

#### 1day

In [301]:
df_1day['close_1day'] = df_1day['close_1day'].astype(np.float32)
df_1day['volume_1day'] = df_1day['volume_1day'].astype(np.float32)
df_1day['tmos_close_1day'] = df_1day['tmos_close_1day'].astype(np.float32)

In [302]:
dfs = []
for ticker in tqdm(df_1day['ticker'].unique()):
    mask = np.array(df_1day['ticker'] == ticker)
    df_ticker = df_1day.loc[mask].copy().reset_index()

    df_ticker_fe = calculate_features(df_ticker, postfix='_1day')
    
    dfs += [df_ticker_fe.copy()]

df_1day_fe = pd.concat(dfs).set_index('index')

100%|█████████████████████████████████████████████████████████████████████████| 76/76 [02:03<00:00,  1.62s/it]


In [303]:
(df_1day_fe['close_1day'] == df_1day['close_1day']).all(), (df_1day_fe.index.values == df_1day_fe['index_1day'].values).all()

(True, True)

In [304]:
df_1day_fe

Unnamed: 0_level_0,index_1day,time_1day,close_1day,volume_1day,ticker,tmos_close_1day,date_day_index,close_1day_w1_roc,volume_1day_w1_roc,tmos_close_1day_w1_roc,...,tmos_close_1day_w120_lvl_1.01-1.02,tmos_close_1day_w120_lvl_-0.98-0.99,tmos_close_1day_w120_lvl_1.02-1.03,tmos_close_1day_w120_lvl_-0.97-0.98,tmos_close_1day_w120_lvl_1.03-1.04,tmos_close_1day_w120_lvl_-0.96-0.97,tmos_close_1day_w120_lvl_1.04-1.05,tmos_close_1day_w120_lvl_-0.95-0.96,tmos_close_1day_w120_lvl_1.05-1.07,tmos_close_1day_w120_lvl_-0.9299999999999999-0.95
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,2022-06-30 03:00:00,57.820000,111300.0,ABIO,4.02,2022-06-30,,,,...,,,,,,,,,,
1,1,2022-07-01 03:00:00,56.279999,48126.0,ABIO,3.96,2022-07-01,-0.026634,-0.567601,-0.014925,...,,,,,,,,,,
2,2,2022-07-04 03:00:00,56.900002,58944.0,ABIO,3.92,2022-07-04,0.011016,0.224785,-0.010101,...,,,,,,,,,,
3,3,2022-07-05 03:00:00,56.380001,39756.0,ABIO,3.97,2022-07-05,-0.009139,-0.325529,0.012755,...,,,,,,,,,,
4,4,2022-07-06 03:00:00,60.680000,275700.0,ABIO,3.96,2022-07-06,0.076268,5.934803,-0.002519,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50950,50950,2025-02-18 03:00:00,4550.000000,1392270.0,YDEX,7.12,2025-02-18,-0.033765,0.150435,-0.029973,...,0.0,3.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
50951,50951,2025-02-19 03:00:00,4646.000000,1127973.0,YDEX,7.22,2025-02-19,0.021099,-0.189832,0.014045,...,1.0,1.0,0.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0
50952,50952,2025-02-20 03:00:00,4649.000000,1692660.0,YDEX,7.22,2025-02-20,0.000646,0.500621,0.000000,...,1.0,1.0,0.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0
50953,50953,2025-02-21 03:00:00,4667.000000,685047.0,YDEX,7.21,2025-02-21,0.003872,-0.595284,-0.001385,...,1.0,1.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0


In [321]:
dump_pkl(df_1day_fe, './data/feat_engin/df_1day_fe.pkl')

# #Load data

In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import random
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import os
import gc

from sklearn.metrics import roc_auc_score

import pickle
def dump_pkl(data, filename):
  with open(filename, 'wb') as handle:
    pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)

def load_pkl(filename):
  with open(filename, 'rb') as handle:
    data = pickle.load(handle)
  return data

In [2]:
df_fe = load_pkl('./data/feat_engin/df_fe.pkl')
df_1hour_fe = load_pkl('./data/feat_engin/df_1hour_fe.pkl')
df_1day_fe = load_pkl('./data/feat_engin/df_1day_fe.pkl')

df_fe.shape, df_1hour_fe.shape, df_1day_fe.shape, 

((2514148, 260), (693950, 254), (50955, 254))

### time features

In [7]:
#hour
df_fe['hour'] = df_fe['time'].dt.hour

#day
df_fe['day'] = df_fe['time'].dt.day

#day_of_week
df_fe['weekday'] = np.minimum(df_fe['time'].dt.dayofweek, 4) / 4

#month
#df_fe['month'] = df_fe['time'].dt.month


#hour
# time_cyclic = (df_fe['time'] - pd.to_datetime(df_fe['time'].dt.date) - pd.Timedelta('10:00:00')) / pd.Timedelta('13:00:00')
# df_fe['sin_time_hour'] = np.sin(time_cyclic * 2 * np.pi)
# df_fe['cos_time_hour'] = np.cos(time_cyclic * 2 * np.pi)

#day of week
# day_of_week_cyclic = np.minimum(df_fe['time'].dt.dayofweek, 4) / 4
# df_fe['sin_time_weekday'] = np.sin(day_of_week_cyclic * 2 * np.pi)
# df_fe['cos_time_weekday'] = np.cos(day_of_week_cyclic * 2 * np.pi)

#day of month
# day_of_month_cyclic = df_1day_fe['time'].dt.day / 30
# df_1day_fe['sin_time_monthday'] = np.sin(day_of_month_cyclic * 2 * np.pi)
# df_1day_fe['cos_time_monthday'] = np.cos(day_of_month_cyclic * 2 * np.pi)


In [10]:
df_fe.head()

Unnamed: 0,index,time,close,volume,ticker,tmos_close,index_1hour,index_1day,result,delta_time,...,tmos_close_w120_lvl_-0.97-0.98,tmos_close_w120_lvl_1.03-1.04,tmos_close_w120_lvl_-0.96-0.97,tmos_close_w120_lvl_1.04-1.05,tmos_close_w120_lvl_-0.95-0.96,tmos_close_w120_lvl_1.05-1.07,tmos_close_w120_lvl_-0.9299999999999999-0.95,hour,day,weekday
0,12939,2022-12-26 10:00:00,63.02,4791.0,ABIO,4.07,1239.0,125.0,WIN,0 days 01:05:00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10,26,0.0
1,12941,2022-12-26 10:10:00,62.68,411.0,ABIO,4.07,1239.0,125.0,LOSE,0 days 01:10:00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10,26,0.0
2,12942,2022-12-26 10:15:00,62.5,2205.0,ABIO,4.07,1239.0,125.0,LOSE,0 days 01:05:00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10,26,0.0
3,12944,2022-12-26 10:25:00,62.740002,786.0,ABIO,4.07,1239.0,125.0,LOSE,0 days 00:55:00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10,26,0.0
4,12945,2022-12-26 10:30:00,62.52,2166.0,ABIO,4.07,1239.0,125.0,LOSE,0 days 00:50:00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10,26,0.0


In [12]:
df_1hour_fe.head()

Unnamed: 0_level_0,index_1hour,time_1hour,close_1hour,volume_1hour,ticker,tmos_close_1hour,date_hour_index,close_1hour_w1_roc,volume_1hour_w1_roc,tmos_close_1hour_w1_roc,...,tmos_close_1hour_w120_lvl_1.01-1.02,tmos_close_1hour_w120_lvl_-0.98-0.99,tmos_close_1hour_w120_lvl_1.02-1.03,tmos_close_1hour_w120_lvl_-0.97-0.98,tmos_close_1hour_w120_lvl_1.03-1.04,tmos_close_1hour_w120_lvl_-0.96-0.97,tmos_close_1hour_w120_lvl_1.04-1.05,tmos_close_1hour_w120_lvl_-0.95-0.96,tmos_close_1hour_w120_lvl_1.05-1.07,tmos_close_1hour_w120_lvl_-0.9299999999999999-0.95
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,2022-07-01 09:00:00,57.779999,33.0,ABIO,4.0,2022-07-01 09:00:00,,,,...,,,,,,,,,,
1,1,2022-07-01 10:00:00,56.400002,13083.0,ABIO,3.93,2022-07-01 10:00:00,-0.023884,395.454559,-0.0175,...,,,,,,,,,,
2,2,2022-07-01 11:00:00,56.360001,6195.0,ABIO,3.92,2022-07-01 11:00:00,-0.000709,-0.526485,-0.002545,...,,,,,,,,,,
3,3,2022-07-01 12:00:00,56.919998,7632.0,ABIO,3.97,2022-07-01 12:00:00,0.009936,0.231961,0.012755,...,,,,,,,,,,
4,4,2022-07-01 13:00:00,56.18,8748.0,ABIO,3.99,2022-07-01 13:00:00,-0.013001,0.146226,0.005038,...,,,,,,,,,,


In [15]:
df_1day_fe.head()

Unnamed: 0_level_0,index_1day,time_1day,close_1day,volume_1day,ticker,tmos_close_1day,date_day_index,close_1day_w1_roc,volume_1day_w1_roc,tmos_close_1day_w1_roc,...,tmos_close_1day_w120_lvl_1.01-1.02,tmos_close_1day_w120_lvl_-0.98-0.99,tmos_close_1day_w120_lvl_1.02-1.03,tmos_close_1day_w120_lvl_-0.97-0.98,tmos_close_1day_w120_lvl_1.03-1.04,tmos_close_1day_w120_lvl_-0.96-0.97,tmos_close_1day_w120_lvl_1.04-1.05,tmos_close_1day_w120_lvl_-0.95-0.96,tmos_close_1day_w120_lvl_1.05-1.07,tmos_close_1day_w120_lvl_-0.9299999999999999-0.95
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,2022-06-30 03:00:00,57.82,111300.0,ABIO,4.02,2022-06-30,,,,...,,,,,,,,,,
1,1,2022-07-01 03:00:00,56.279999,48126.0,ABIO,3.96,2022-07-01,-0.026634,-0.567601,-0.014925,...,,,,,,,,,,
2,2,2022-07-04 03:00:00,56.900002,58944.0,ABIO,3.92,2022-07-04,0.011016,0.224785,-0.010101,...,,,,,,,,,,
3,3,2022-07-05 03:00:00,56.380001,39756.0,ABIO,3.97,2022-07-05,-0.009139,-0.325529,0.012755,...,,,,,,,,,,
4,4,2022-07-06 03:00:00,60.68,275700.0,ABIO,3.96,2022-07-06,0.076268,5.934803,-0.002519,...,,,,,,,,,,


### resize memory

#### #resize dtype

In [21]:
[elem for elem in df_fe.columns if 'ind' in elem]

['index', 'index_1hour', 'index_1day', 'res_ind']

In [23]:
cols = [elem for elem in df_fe.columns if not 'ind' in elem]

for col in tqdm(cols):
    try:
        df_fe[col] = df_fe[col].astype(np.float32)
    except:
        print(col)

 68%|███████████████████████████████████████████████▎                      | 175/259 [00:00<00:00, 874.54it/s]

time
ticker
result
delta_time


100%|██████████████████████████████████████████████████████████████████████| 259/259 [00:00<00:00, 882.93it/s]


In [27]:
[elem for elem in df_1hour_fe.columns if 'ind' in elem]

['index_1hour', 'date_hour_index']

In [29]:
cols = [elem for elem in df_1hour_fe.columns if not 'ind' in elem]

for col in tqdm(cols):
    try:
        df_1hour_fe[col] = df_1hour_fe[col].astype(np.float32)
    except:
        print(col)

100%|█████████████████████████████████████████████████████████████████████| 252/252 [00:00<00:00, 2111.91it/s]

time_1hour
ticker





In [32]:
[elem for elem in df_1day_fe.columns if 'ind' in elem]

['index_1day', 'date_day_index']

In [34]:
cols = [elem for elem in df_1day_fe.columns if not 'ind' in elem]

for col in tqdm(cols):
    try:
        df_1day_fe[col] = df_1day_fe[col].astype(np.float32)
    except:
        print(col)

100%|█████████████████████████████████████████████████████████████████████| 252/252 [00:00<00:00, 7785.71it/s]

time_1day
ticker





### Delete useless (dublicated) columns


In [39]:
#del df_1hour_fe['date_hour_index']

In [41]:
#del df_1day_fe['date_day_index']

### Absolute value columns

In [36]:
def flag_delete(col_name):
    if 'norm_std' in col_name:
        return False
    
    for stop_word in ['ma', 'std', 'diff', 'min', 'max']:
        if stop_word in col_name:
            return True
    return False



In [38]:
cols_del_5min = [elem for elem in df_fe.columns if flag_delete(elem)]
len(cols_del_5min), cols_del_5min

(126,
 ['close_w5_ma',
  'close_w5_std',
  'close_w5_ma_low_2std',
  'close_w5_ma_up_2std',
  'close_w5_ma_low_3std',
  'close_w5_ma_up_3std',
  'close_w5_min',
  'close_w5_max',
  'close_w5_expma',
  'volume_w5_ma',
  'volume_w5_std',
  'volume_w5_ma_low_2std',
  'volume_w5_ma_up_2std',
  'volume_w5_ma_low_3std',
  'volume_w5_ma_up_3std',
  'volume_w5_min',
  'volume_w5_max',
  'volume_w5_expma',
  'tmos_close_w5_ma',
  'tmos_close_w5_std',
  'tmos_close_w5_ma_low_2std',
  'tmos_close_w5_ma_up_2std',
  'tmos_close_w5_ma_low_3std',
  'tmos_close_w5_ma_up_3std',
  'tmos_close_w5_min',
  'tmos_close_w5_max',
  'tmos_close_w5_expma',
  'close_w10_ma',
  'close_w10_std',
  'close_w10_ma_low_2std',
  'close_w10_ma_up_2std',
  'close_w10_ma_low_3std',
  'close_w10_ma_up_3std',
  'close_w10_min',
  'close_w10_max',
  'close_w10_expma',
  'volume_w10_ma',
  'volume_w10_std',
  'volume_w10_ma_low_2std',
  'volume_w10_ma_up_2std',
  'volume_w10_ma_low_3std',
  'volume_w10_ma_up_3std',
  'volume_

In [40]:
cols_del_1hour = [elem for elem in df_1hour_fe.columns if flag_delete(elem)]
len(cols_del_1hour), cols_del_1hour

(126,
 ['close_1hour_w5_ma',
  'close_1hour_w5_std',
  'close_1hour_w5_ma_low_2std',
  'close_1hour_w5_ma_up_2std',
  'close_1hour_w5_ma_low_3std',
  'close_1hour_w5_ma_up_3std',
  'close_1hour_w5_min',
  'close_1hour_w5_max',
  'close_1hour_w5_expma',
  'volume_1hour_w5_ma',
  'volume_1hour_w5_std',
  'volume_1hour_w5_ma_low_2std',
  'volume_1hour_w5_ma_up_2std',
  'volume_1hour_w5_ma_low_3std',
  'volume_1hour_w5_ma_up_3std',
  'volume_1hour_w5_min',
  'volume_1hour_w5_max',
  'volume_1hour_w5_expma',
  'tmos_close_1hour_w5_ma',
  'tmos_close_1hour_w5_std',
  'tmos_close_1hour_w5_ma_low_2std',
  'tmos_close_1hour_w5_ma_up_2std',
  'tmos_close_1hour_w5_ma_low_3std',
  'tmos_close_1hour_w5_ma_up_3std',
  'tmos_close_1hour_w5_min',
  'tmos_close_1hour_w5_max',
  'tmos_close_1hour_w5_expma',
  'close_1hour_w10_ma',
  'close_1hour_w10_std',
  'close_1hour_w10_ma_low_2std',
  'close_1hour_w10_ma_up_2std',
  'close_1hour_w10_ma_low_3std',
  'close_1hour_w10_ma_up_3std',
  'close_1hour_w10_m

In [42]:
cols_del_1day = [elem for elem in df_1day_fe.columns if flag_delete(elem)]
len(cols_del_1day), cols_del_1day

(126,
 ['close_1day_w5_ma',
  'close_1day_w5_std',
  'close_1day_w5_ma_low_2std',
  'close_1day_w5_ma_up_2std',
  'close_1day_w5_ma_low_3std',
  'close_1day_w5_ma_up_3std',
  'close_1day_w5_min',
  'close_1day_w5_max',
  'close_1day_w5_expma',
  'volume_1day_w5_ma',
  'volume_1day_w5_std',
  'volume_1day_w5_ma_low_2std',
  'volume_1day_w5_ma_up_2std',
  'volume_1day_w5_ma_low_3std',
  'volume_1day_w5_ma_up_3std',
  'volume_1day_w5_min',
  'volume_1day_w5_max',
  'volume_1day_w5_expma',
  'tmos_close_1day_w5_ma',
  'tmos_close_1day_w5_std',
  'tmos_close_1day_w5_ma_low_2std',
  'tmos_close_1day_w5_ma_up_2std',
  'tmos_close_1day_w5_ma_low_3std',
  'tmos_close_1day_w5_ma_up_3std',
  'tmos_close_1day_w5_min',
  'tmos_close_1day_w5_max',
  'tmos_close_1day_w5_expma',
  'close_1day_w10_ma',
  'close_1day_w10_std',
  'close_1day_w10_ma_low_2std',
  'close_1day_w10_ma_up_2std',
  'close_1day_w10_ma_low_3std',
  'close_1day_w10_ma_up_3std',
  'close_1day_w10_min',
  'close_1day_w10_max',
  'cl

## Concat

In [48]:
df_fe.columns.tolist()

['index',
 'time',
 'close',
 'volume',
 'ticker',
 'tmos_close',
 'index_1hour',
 'index_1day',
 'result',
 'delta_time',
 'income_rate',
 'res_price',
 'res_ind',
 'close_w1_roc',
 'volume_w1_roc',
 'tmos_close_w1_roc',
 'close_w5_ma',
 'close_w5_std',
 'close_w5_norm_std',
 'close_w5_ma_low_2std',
 'close_w5_ma_up_2std',
 'close_w5_ma_low_3std',
 'close_w5_ma_up_3std',
 'close_w5_mean_abs_pct',
 'close_w5_alpha',
 'close_w5_min',
 'close_w5_max',
 'close_w5_rsi',
 'close_w5_roc',
 'close_w5_expma',
 'volume_w5_ma',
 'volume_w5_std',
 'volume_w5_norm_std',
 'volume_w5_ma_low_2std',
 'volume_w5_ma_up_2std',
 'volume_w5_ma_low_3std',
 'volume_w5_ma_up_3std',
 'volume_w5_mean_abs_pct',
 'volume_w5_alpha',
 'volume_w5_min',
 'volume_w5_max',
 'volume_w5_rsi',
 'volume_w5_roc',
 'volume_w5_expma',
 'tmos_close_w5_ma',
 'tmos_close_w5_std',
 'tmos_close_w5_norm_std',
 'tmos_close_w5_ma_low_2std',
 'tmos_close_w5_ma_up_2std',
 'tmos_close_w5_ma_low_3std',
 'tmos_close_w5_ma_up_3std',
 'tmos

In [50]:
df_1hour_fe.columns.tolist()

['index_1hour',
 'time_1hour',
 'close_1hour',
 'volume_1hour',
 'ticker',
 'tmos_close_1hour',
 'date_hour_index',
 'close_1hour_w1_roc',
 'volume_1hour_w1_roc',
 'tmos_close_1hour_w1_roc',
 'close_1hour_w5_ma',
 'close_1hour_w5_std',
 'close_1hour_w5_norm_std',
 'close_1hour_w5_ma_low_2std',
 'close_1hour_w5_ma_up_2std',
 'close_1hour_w5_ma_low_3std',
 'close_1hour_w5_ma_up_3std',
 'close_1hour_w5_mean_abs_pct',
 'close_1hour_w5_alpha',
 'close_1hour_w5_min',
 'close_1hour_w5_max',
 'close_1hour_w5_rsi',
 'close_1hour_w5_roc',
 'close_1hour_w5_expma',
 'volume_1hour_w5_ma',
 'volume_1hour_w5_std',
 'volume_1hour_w5_norm_std',
 'volume_1hour_w5_ma_low_2std',
 'volume_1hour_w5_ma_up_2std',
 'volume_1hour_w5_ma_low_3std',
 'volume_1hour_w5_ma_up_3std',
 'volume_1hour_w5_mean_abs_pct',
 'volume_1hour_w5_alpha',
 'volume_1hour_w5_min',
 'volume_1hour_w5_max',
 'volume_1hour_w5_rsi',
 'volume_1hour_w5_roc',
 'volume_1hour_w5_expma',
 'tmos_close_1hour_w5_ma',
 'tmos_close_1hour_w5_std',
 '

In [52]:
df_1day_fe.columns.tolist()

['index_1day',
 'time_1day',
 'close_1day',
 'volume_1day',
 'ticker',
 'tmos_close_1day',
 'date_day_index',
 'close_1day_w1_roc',
 'volume_1day_w1_roc',
 'tmos_close_1day_w1_roc',
 'close_1day_w5_ma',
 'close_1day_w5_std',
 'close_1day_w5_norm_std',
 'close_1day_w5_ma_low_2std',
 'close_1day_w5_ma_up_2std',
 'close_1day_w5_ma_low_3std',
 'close_1day_w5_ma_up_3std',
 'close_1day_w5_mean_abs_pct',
 'close_1day_w5_alpha',
 'close_1day_w5_min',
 'close_1day_w5_max',
 'close_1day_w5_rsi',
 'close_1day_w5_roc',
 'close_1day_w5_expma',
 'volume_1day_w5_ma',
 'volume_1day_w5_std',
 'volume_1day_w5_norm_std',
 'volume_1day_w5_ma_low_2std',
 'volume_1day_w5_ma_up_2std',
 'volume_1day_w5_ma_low_3std',
 'volume_1day_w5_ma_up_3std',
 'volume_1day_w5_mean_abs_pct',
 'volume_1day_w5_alpha',
 'volume_1day_w5_min',
 'volume_1day_w5_max',
 'volume_1day_w5_rsi',
 'volume_1day_w5_roc',
 'volume_1day_w5_expma',
 'tmos_close_1day_w5_ma',
 'tmos_close_1day_w5_std',
 'tmos_close_1day_w5_norm_std',
 'tmos_cl

In [55]:
df_fe.shape, df_1hour_fe.shape, df_1day_fe.shape

((2514148, 263), (693950, 254), (50955, 254))

In [57]:
df = df_fe.merge(df_1hour_fe, on=['index_1hour', 'ticker'], how='left')
df.shape

(2514148, 515)

In [58]:
df = df.merge(df_1day_fe, on=['index_1day', 'ticker'], how='left')
df.shape

(2514148, 767)

In [59]:
df.head()

Unnamed: 0,index,time,close,volume,ticker,tmos_close,index_1hour,index_1day,result,delta_time,...,tmos_close_1day_w120_lvl_1.01-1.02,tmos_close_1day_w120_lvl_-0.98-0.99,tmos_close_1day_w120_lvl_1.02-1.03,tmos_close_1day_w120_lvl_-0.97-0.98,tmos_close_1day_w120_lvl_1.03-1.04,tmos_close_1day_w120_lvl_-0.96-0.97,tmos_close_1day_w120_lvl_1.04-1.05,tmos_close_1day_w120_lvl_-0.95-0.96,tmos_close_1day_w120_lvl_1.05-1.07,tmos_close_1day_w120_lvl_-0.9299999999999999-0.95
0,12939,2022-12-26 10:00:00,63.02,4791.0,ABIO,4.07,1239.0,125.0,WIN,0 days 01:05:00,...,5.0,5.0,0.0,8.0,0.0,6.0,0.0,4.0,5.0,13.0
1,12941,2022-12-26 10:10:00,62.68,411.0,ABIO,4.07,1239.0,125.0,LOSE,0 days 01:10:00,...,5.0,5.0,0.0,8.0,0.0,6.0,0.0,4.0,5.0,13.0
2,12942,2022-12-26 10:15:00,62.5,2205.0,ABIO,4.07,1239.0,125.0,LOSE,0 days 01:05:00,...,5.0,5.0,0.0,8.0,0.0,6.0,0.0,4.0,5.0,13.0
3,12944,2022-12-26 10:25:00,62.740002,786.0,ABIO,4.07,1239.0,125.0,LOSE,0 days 00:55:00,...,5.0,5.0,0.0,8.0,0.0,6.0,0.0,4.0,5.0,13.0
4,12945,2022-12-26 10:30:00,62.52,2166.0,ABIO,4.07,1239.0,125.0,LOSE,0 days 00:50:00,...,5.0,5.0,0.0,8.0,0.0,6.0,0.0,4.0,5.0,13.0


In [62]:
for col in tqdm(df.columns):
    assert df[col].isnull().sum() == 0, f'Nulls {col}'
    # if df[col].isnull().sum() != 0:
    #     print(col, df[col].isnull().sum())

100%|██████████████████████████████████████████████████████████████████████| 767/767 [00:01<00:00, 441.81it/s]


### Relative features

In [66]:
groups_5min = [
        #windows: 
        #close: w1, w5, w10, w20, w30, w60, w120
        #volume: w1, w5, w10, w20
        #tmos_close: w1, w5, w10, w20, w30, w60, w120
         ['close_w1_roc', 'close_w5_alpha', 'close_w10_alpha', 'close_w20_alpha', 'close_w30_alpha', 'close_w60_alpha', 'close_w120_alpha'],
         ['volume_w1_roc', 'volume_w5_alpha', 'volume_w10_alpha'],
         ['tmos_close_w1_roc', 'tmos_close_w5_alpha', 'tmos_close_w10_alpha', 'tmos_close_w20_alpha', 'tmos_close_w30_alpha', 'tmos_close_w60_alpha', 'tmos_close_w120_alpha'],
    
    
         ['close_w1_roc', 'close_w5_roc', 'close_w10_roc', 'close_w20_roc', 'close_w30_roc', 'close_w60_roc', 'close_w120_roc'],
         ['volume_w1_roc', 'volume_w5_roc', 'volume_w10_roc'],
         ['tmos_close_w1_roc', 'tmos_close_w5_roc', 'tmos_close_w10_roc', 'tmos_close_w20_roc', 'tmos_close_w30_roc', 'tmos_close_w60_roc', 'tmos_close_w120_roc'],

    
         ['close_w5_mean_abs_pct', 'close_w10_mean_abs_pct', 'close_w20_mean_abs_pct', 'close_w30_mean_abs_pct', 'close_w60_mean_abs_pct', 'close_w120_mean_abs_pct'],
         ['volume_w5_mean_abs_pct', 'volume_w10_mean_abs_pct'],
         ['tmos_close_w5_mean_abs_pct', 'tmos_close_w10_mean_abs_pct', 'tmos_close_w20_mean_abs_pct', 'tmos_close_w30_mean_abs_pct', 'tmos_close_w60_mean_abs_pct', 'tmos_close_w120_mean_abs_pct'],


         ['close_w5_std', 'close_w10_std', 'close_w20_std', 'close_w30_std', 'close_w60_std', 'close_w120_std'],
         ['volume_w5_std', 'volume_w10_std'],
         ['tmos_close_w5_std', 'tmos_close_w10_std', 'tmos_close_w20_std', 'tmos_close_w30_std', 'tmos_close_w60_std', 'tmos_close_w120_std'],


         ['close_w5_norm_std', 'close_w10_norm_std', 'close_w20_norm_std', 'close_w30_norm_std', 'close_w60_norm_std', 'close_w120_norm_std'],
         ['volume_w5_norm_std', 'volume_w10_norm_std'],
         ['tmos_close_w5_norm_std', 'tmos_close_w10_norm_std', 'tmos_close_w20_norm_std', 'tmos_close_w30_norm_std', 'tmos_close_w60_norm_std', 'tmos_close_w120_norm_std'],


         ['close_w5_rsi', 'close_w10_rsi', 'close_w20_rsi', 'close_w30_rsi', 'close_w60_rsi', 'close_w120_rsi'],
         ['volume_w5_rsi', 'volume_w10_rsi'],
         ['tmos_close_w5_rsi', 'tmos_close_w10_rsi', 'tmos_close_w20_rsi', 'tmos_close_w30_rsi', 'tmos_close_w60_rsi', 'tmos_close_w120_rsi'],


         ['close', 'close_w5_ma', 'close_w10_ma', 'close_w20_ma', 'close_w30_ma', 'close_w60_ma', 'close_w120_ma'],
         ['volume', 'volume_w5_ma', 'volume_w10_ma'],
         ['tmos_close', 'tmos_close_w5_ma', 'tmos_close_w10_ma', 'tmos_close_w20_ma', 'tmos_close_w30_ma', 'tmos_close_w60_ma', 'tmos_close_w120_ma'],


         ['close', 'close_w5_expma', 'close_w10_expma', 'close_w20_expma', 'close_w30_expma', 'close_w60_expma', 'close_w120_expma'],
         ['volume', 'volume_w5_expma', 'volume_w10_expma'],
         ['tmos_close', 'tmos_close_w5_expma', 'tmos_close_w10_expma', 'tmos_close_w20_expma', 'tmos_close_w30_expma', 'tmos_close_w60_expma', 'tmos_close_w120_expma'],


         {'close' : ['close_w5_min', 'close_w10_min', 'close_w20_min', 'close_w30_min', 'close_w60_min', 'close_w120_min']},
         {'volume' : ['volume_w5_min', 'volume_w10_min']},
         {'tmos_close' : ['tmos_close_w5_min', 'tmos_close_w10_min', 'tmos_close_w20_min', 'tmos_close_w30_min', 'tmos_close_w60_min', 'tmos_close_w120_min']},

    
         {'close' : ['close_w5_max', 'close_w10_max', 'close_w20_max', 'close_w30_max', 'close_w60_max', 'close_w120_max']},
         {'volume' : ['volume_w5_max', 'volume_w10_max']},
         {'tmos_close' : ['tmos_close_w5_max', 'tmos_close_w10_max', 'tmos_close_w20_max', 'tmos_close_w30_max', 'tmos_close_w60_max', 'tmos_close_w120_max']},



        #w5
         {'close' : [  'close_w5_ma_low_2std', 'close_w5_ma_up_2std', 'close_w5_ma_low_3std', 'close_w5_ma_up_3std']},
         {'volume' : [  'volume_w5_ma_low_2std', 'volume_w5_ma_up_2std', 'volume_w5_ma_low_3std', 'volume_w5_ma_up_3std']},
         {'tmos_close' : [  'tmos_close_w5_ma_low_2std', 'tmos_close_w5_ma_up_2std', 'tmos_close_w5_ma_low_3std', 'tmos_close_w5_ma_up_3std']},
        #w10
         {'close' : [  'close_w10_ma_low_2std', 'close_w10_ma_up_2std', 'close_w10_ma_low_3std', 'close_w10_ma_up_3std']},
         {'volume' : [  'volume_w10_ma_low_2std', 'volume_w10_ma_up_2std', 'volume_w10_ma_low_3std', 'volume_w10_ma_up_3std']},
         {'tmos_close' : [  'tmos_close_w10_ma_low_2std', 'tmos_close_w10_ma_up_2std', 'tmos_close_w10_ma_low_3std', 'tmos_close_w10_ma_up_3std']},
        #w20
         {'close' : [  'close_w20_ma_low_2std', 'close_w20_ma_up_2std', 'close_w20_ma_low_3std', 'close_w20_ma_up_3std']},
         {'tmos_close' : [  'tmos_close_w20_ma_low_2std', 'tmos_close_w20_ma_up_2std', 'tmos_close_w20_ma_low_3std', 'tmos_close_w20_ma_up_3std']},
        #w30
         {'close' : [  'close_w30_ma_low_2std', 'close_w30_ma_up_2std', 'close_w30_ma_low_3std', 'close_w30_ma_up_3std']},
         {'tmos_close' : [  'tmos_close_w30_ma_low_2std', 'tmos_close_w30_ma_up_2std', 'tmos_close_w30_ma_low_3std', 'tmos_close_w30_ma_up_3std']},
        #w60
         {'close' : [  'close_w60_ma_low_2std', 'close_w60_ma_up_2std', 'close_w60_ma_low_3std', 'close_w60_ma_up_3std']},
         {'tmos_close' : [  'tmos_close_w60_ma_low_2std', 'tmos_close_w60_ma_up_2std', 'tmos_close_w60_ma_low_3std', 'tmos_close_w60_ma_up_3std']},
        #w120
         {'close' : [  'close_w120_ma_low_2std', 'close_w120_ma_up_2std', 'close_w120_ma_low_3std', 'close_w120_ma_up_3std']},
         {'tmos_close' : [  'tmos_close_w120_ma_low_2std', 'tmos_close_w120_ma_up_2std', 'tmos_close_w120_ma_low_3std', 'tmos_close_w120_ma_up_3std']},


        #w5
        ['close_w5_min', 'close_w5_max'],
        ['volume_w5_min', 'volume_w5_max'],
        ['tmos_close_w5_min', 'tmos_close_w5_max'],
        #w10
        ['close_w10_min', 'close_w10_max'],
        ['volume_w10_min', 'volume_w10_max'],
        ['tmos_close_w10_min', 'tmos_close_w10_max'],
        #w20
        ['close_w20_min', 'close_w20_max'],
        ['tmos_close_w20_min', 'tmos_close_w20_max'],
        #w30
        ['close_w30_min', 'close_w30_max'],
        ['tmos_close_w30_min', 'tmos_close_w30_max'],
        #w60
        ['close_w60_min', 'close_w60_max'],
        ['tmos_close_w60_min', 'tmos_close_w60_max'],
        #w120
        ['close_w120_min', 'close_w120_max'],
        ['tmos_close_w120_min', 'tmos_close_w120_max'],
]

In [67]:
groups_1hour = [
        #windows: 
        #close: w1, w5, w10, w20, w30, w60, w120
        #volume: w1, w5, w10, w20
        #tmos_close: w1, w5, w10, w20, w30, w60, w120
         ['close_1hour_w1_roc', 'close_1hour_w5_alpha', 'close_1hour_w10_alpha', 'close_1hour_w20_alpha', 'close_1hour_w30_alpha', 'close_1hour_w60_alpha', 'close_1hour_w120_alpha'],
         ['volume_1hour_w1_roc', 'volume_1hour_w5_alpha', 'volume_1hour_w10_alpha'],
         ['tmos_close_1hour_w1_roc', 'tmos_close_1hour_w5_alpha', 'tmos_close_1hour_w10_alpha', 'tmos_close_1hour_w20_alpha', 'tmos_close_1hour_w30_alpha', 'tmos_close_1hour_w60_alpha', 'tmos_close_1hour_w120_alpha'],
    
         ['close_1hour_w1_roc', 'close_1hour_w5_roc', 'close_1hour_w10_roc', 'close_1hour_w20_roc', 'close_1hour_w30_roc', 'close_1hour_w60_roc', 'close_1hour_w120_roc'],
         ['volume_1hour_w1_roc', 'volume_1hour_w5_roc', 'volume_1hour_w10_roc'],
         ['tmos_close_1hour_w1_roc', 'tmos_close_1hour_w5_roc', 'tmos_close_1hour_w10_roc', 'tmos_close_1hour_w20_roc', 'tmos_close_1hour_w30_roc', 'tmos_close_1hour_w60_roc', 'tmos_close_1hour_w120_roc'],

         ['close_1hour_w5_mean_abs_pct', 'close_1hour_w10_mean_abs_pct', 'close_1hour_w20_mean_abs_pct', 'close_1hour_w30_mean_abs_pct', 'close_1hour_w60_mean_abs_pct', 'close_1hour_w120_mean_abs_pct'],
         ['volume_1hour_w5_mean_abs_pct', 'volume_1hour_w10_mean_abs_pct'],
         ['tmos_close_1hour_w5_mean_abs_pct', 'tmos_close_1hour_w10_mean_abs_pct', 'tmos_close_1hour_w20_mean_abs_pct', 'tmos_close_1hour_w30_mean_abs_pct', 'tmos_close_1hour_w60_mean_abs_pct', 'tmos_close_1hour_w120_mean_abs_pct'],


         ['close_1hour_w5_std', 'close_1hour_w10_std', 'close_1hour_w20_std', 'close_1hour_w30_std', 'close_1hour_w60_std', 'close_1hour_w120_std'],
         ['volume_1hour_w5_std', 'volume_1hour_w10_std'],
         ['tmos_close_1hour_w5_std', 'tmos_close_1hour_w10_std', 'tmos_close_1hour_w20_std', 'tmos_close_1hour_w30_std', 'tmos_close_1hour_w60_std', 'tmos_close_1hour_w120_std'],


         ['close_1hour_w5_norm_std', 'close_1hour_w10_norm_std', 'close_1hour_w20_norm_std', 'close_1hour_w30_norm_std', 'close_1hour_w60_norm_std', 'close_1hour_w120_norm_std'],
         ['volume_1hour_w5_norm_std', 'volume_1hour_w10_norm_std'],
         ['tmos_close_1hour_w5_norm_std', 'tmos_close_1hour_w10_norm_std', 'tmos_close_1hour_w20_norm_std', 'tmos_close_1hour_w30_norm_std', 'tmos_close_1hour_w60_norm_std', 'tmos_close_1hour_w120_norm_std'],


         ['close_1hour_w5_rsi', 'close_1hour_w10_rsi', 'close_1hour_w20_rsi', 'close_1hour_w30_rsi', 'close_1hour_w60_rsi', 'close_1hour_w120_rsi'],
         ['volume_1hour_w5_rsi', 'volume_1hour_w10_rsi'],
         ['tmos_close_1hour_w5_rsi', 'tmos_close_1hour_w10_rsi', 'tmos_close_1hour_w20_rsi', 'tmos_close_1hour_w30_rsi', 'tmos_close_1hour_w60_rsi', 'tmos_close_1hour_w120_rsi'],


         ['close', 'close_1hour_w5_ma', 'close_1hour_w10_ma', 'close_1hour_w20_ma', 'close_1hour_w30_ma', 'close_1hour_w60_ma', 'close_1hour_w120_ma'],
         ['volume', 'volume_1hour_w5_ma', 'volume_1hour_w10_ma'],
         ['tmos_close', 'tmos_close_1hour_w5_ma', 'tmos_close_1hour_w10_ma', 'tmos_close_1hour_w20_ma', 'tmos_close_1hour_w30_ma', 'tmos_close_1hour_w60_ma', 'tmos_close_1hour_w120_ma'],


         ['close', 'close_1hour_w5_expma', 'close_1hour_w10_expma', 'close_1hour_w20_expma', 'close_1hour_w30_expma', 'close_1hour_w60_expma', 'close_1hour_w120_expma'],
         ['volume', 'volume_1hour_w5_expma', 'volume_1hour_w10_expma'],
         ['tmos_close', 'tmos_close_1hour_w5_expma', 'tmos_close_1hour_w10_expma', 'tmos_close_1hour_w20_expma', 'tmos_close_1hour_w30_expma', 'tmos_close_1hour_w60_expma', 'tmos_close_1hour_w120_expma'],


         {'close' : ['close_1hour_w5_min', 'close_1hour_w10_min', 'close_1hour_w20_min', 'close_1hour_w30_min', 'close_1hour_w60_min', 'close_1hour_w120_min']},
         {'volume' : ['volume_1hour_w5_min', 'volume_1hour_w10_min']},
         {'tmos_close' : ['tmos_close_1hour_w5_min', 'tmos_close_1hour_w10_min', 'tmos_close_1hour_w20_min', 'tmos_close_1hour_w30_min', 'tmos_close_1hour_w60_min', 'tmos_close_1hour_w120_min']},

    
         {'close' : ['close_1hour_w5_max', 'close_1hour_w10_max', 'close_1hour_w20_max', 'close_1hour_w30_max', 'close_1hour_w60_max', 'close_1hour_w120_max']},
         {'volume' : ['volume_1hour_w5_max', 'volume_1hour_w10_max']},
         {'tmos_close' : ['tmos_close_1hour_w5_max', 'tmos_close_1hour_w10_max', 'tmos_close_1hour_w20_max', 'tmos_close_1hour_w30_max', 'tmos_close_1hour_w60_max', 'tmos_close_1hour_w120_max']},



        #w5
         {'close' : [  'close_1hour_w5_ma_low_2std', 'close_1hour_w5_ma_up_2std', 'close_1hour_w5_ma_low_3std', 'close_1hour_w5_ma_up_3std']},
         {'volume' : [  'volume_1hour_w5_ma_low_2std', 'volume_1hour_w5_ma_up_2std', 'volume_1hour_w5_ma_low_3std', 'volume_1hour_w5_ma_up_3std']},
         {'tmos_close' : [  'tmos_close_1hour_w5_ma_low_2std', 'tmos_close_1hour_w5_ma_up_2std', 'tmos_close_1hour_w5_ma_low_3std', 'tmos_close_1hour_w5_ma_up_3std']},
        #w10
         {'close' : [  'close_1hour_w10_ma_low_2std', 'close_1hour_w10_ma_up_2std', 'close_1hour_w10_ma_low_3std', 'close_1hour_w10_ma_up_3std']},
         {'volume' : [  'volume_1hour_w10_ma_low_2std', 'volume_1hour_w10_ma_up_2std', 'volume_1hour_w10_ma_low_3std', 'volume_1hour_w10_ma_up_3std']},
         {'tmos_close' : [  'tmos_close_1hour_w10_ma_low_2std', 'tmos_close_1hour_w10_ma_up_2std', 'tmos_close_1hour_w10_ma_low_3std', 'tmos_close_1hour_w10_ma_up_3std']},
        #w20
         {'close' : [  'close_1hour_w20_ma_low_2std', 'close_1hour_w20_ma_up_2std', 'close_1hour_w20_ma_low_3std', 'close_1hour_w20_ma_up_3std']},
         {'tmos_close' : [  'tmos_close_1hour_w20_ma_low_2std', 'tmos_close_1hour_w20_ma_up_2std', 'tmos_close_1hour_w20_ma_low_3std', 'tmos_close_1hour_w20_ma_up_3std']},
        #w30
         {'close' : [  'close_1hour_w30_ma_low_2std', 'close_1hour_w30_ma_up_2std', 'close_1hour_w30_ma_low_3std', 'close_1hour_w30_ma_up_3std']},
         {'tmos_close' : [  'tmos_close_1hour_w30_ma_low_2std', 'tmos_close_1hour_w30_ma_up_2std', 'tmos_close_1hour_w30_ma_low_3std', 'tmos_close_1hour_w30_ma_up_3std']},
        #w60
         {'close' : [  'close_1hour_w60_ma_low_2std', 'close_1hour_w60_ma_up_2std', 'close_1hour_w60_ma_low_3std', 'close_1hour_w60_ma_up_3std']},
         {'tmos_close' : [  'tmos_close_1hour_w60_ma_low_2std', 'tmos_close_1hour_w60_ma_up_2std', 'tmos_close_1hour_w60_ma_low_3std', 'tmos_close_1hour_w60_ma_up_3std']},
        #w120
         {'close' : [  'close_1hour_w120_ma_low_2std', 'close_1hour_w120_ma_up_2std', 'close_1hour_w120_ma_low_3std', 'close_1hour_w120_ma_up_3std']},
         {'tmos_close' : [  'tmos_close_1hour_w120_ma_low_2std', 'tmos_close_1hour_w120_ma_up_2std', 'tmos_close_1hour_w120_ma_low_3std', 'tmos_close_1hour_w120_ma_up_3std']},


        #w5
        ['close_1hour_w5_min', 'close_1hour_w5_max'],
        ['volume_1hour_w5_min', 'volume_1hour_w5_max'],
        ['tmos_close_1hour_w5_min', 'tmos_close_1hour_w5_max'],
        #w10
        ['close_1hour_w10_min', 'close_1hour_w10_max'],
        ['volume_1hour_w10_min', 'volume_1hour_w10_max'],
        ['tmos_close_1hour_w10_min', 'tmos_close_1hour_w10_max'],
        #w20
        ['close_1hour_w20_min', 'close_1hour_w20_max'],
        ['tmos_close_1hour_w20_min', 'tmos_close_1hour_w20_max'],
        #w30
        ['close_1hour_w30_min', 'close_1hour_w30_max'],
        ['tmos_close_1hour_w30_min', 'tmos_close_1hour_w30_max'],
        #w60
        ['close_1hour_w60_min', 'close_1hour_w60_max'],
        ['tmos_close_1hour_w60_min', 'tmos_close_1hour_w60_max'],
        #w120
        ['close_1hour_w120_min', 'close_1hour_w120_max'],
        ['tmos_close_1hour_w120_min', 'tmos_close_1hour_w120_max'],
]

In [68]:
groups_1day= [
        #windows: 
        #close: w1, w5, w10, w20, w30, w60, w120
        #volume: w1, w5, w10, w20
        #tmos_close: w1, w5, w10, w20, w30, w60, w120
         ['close_1day_w1_roc', 'close_1day_w5_alpha', 'close_1day_w10_alpha', 'close_1day_w20_alpha', 'close_1day_w30_alpha', 'close_1day_w60_alpha', 'close_1day_w120_alpha'],
         ['volume_1day_w1_roc', 'volume_1day_w5_alpha', 'volume_1day_w10_alpha'],
         ['tmos_close_1day_w1_roc', 'tmos_close_1day_w5_alpha', 'tmos_close_1day_w10_alpha', 'tmos_close_1day_w20_alpha', 'tmos_close_1day_w30_alpha', 'tmos_close_1day_w60_alpha', 'tmos_close_1day_w120_alpha'],
    
         ['close_1day_w1_roc', 'close_1day_w5_roc', 'close_1day_w10_roc', 'close_1day_w20_roc', 'close_1day_w30_roc', 'close_1day_w60_roc', 'close_1day_w120_roc'],
         ['volume_1day_w1_roc', 'volume_1day_w5_roc', 'volume_1day_w10_roc'],
         ['tmos_close_1day_w1_roc', 'tmos_close_1day_w5_roc', 'tmos_close_1day_w10_roc', 'tmos_close_1day_w20_roc', 'tmos_close_1day_w30_roc', 'tmos_close_1day_w60_roc', 'tmos_close_1day_w120_roc'],

         ['close_1day_w5_mean_abs_pct', 'close_1day_w10_mean_abs_pct', 'close_1day_w20_mean_abs_pct', 'close_1day_w30_mean_abs_pct', 'close_1day_w60_mean_abs_pct', 'close_1day_w120_mean_abs_pct'],
         ['volume_1day_w5_mean_abs_pct', 'volume_1day_w10_mean_abs_pct'],
         ['tmos_close_1day_w5_mean_abs_pct', 'tmos_close_1day_w10_mean_abs_pct', 'tmos_close_1day_w20_mean_abs_pct', 'tmos_close_1day_w30_mean_abs_pct', 'tmos_close_1day_w60_mean_abs_pct', 'tmos_close_1day_w120_mean_abs_pct'],


         ['close_1day_w5_std', 'close_1day_w10_std', 'close_1day_w20_std', 'close_1day_w30_std', 'close_1day_w60_std', 'close_1day_w120_std'],
         ['volume_1day_w5_std', 'volume_1day_w10_std'],
         ['tmos_close_1day_w5_std', 'tmos_close_1day_w10_std', 'tmos_close_1day_w20_std', 'tmos_close_1day_w30_std', 'tmos_close_1day_w60_std', 'tmos_close_1day_w120_std'],


         ['close_1day_w5_norm_std', 'close_1day_w10_norm_std', 'close_1day_w20_norm_std', 'close_1day_w30_norm_std', 'close_1day_w60_norm_std', 'close_1day_w120_norm_std'],
         ['volume_1day_w5_norm_std', 'volume_1day_w10_norm_std'],
         ['tmos_close_1day_w5_norm_std', 'tmos_close_1day_w10_norm_std', 'tmos_close_1day_w20_norm_std', 'tmos_close_1day_w30_norm_std', 'tmos_close_1day_w60_norm_std', 'tmos_close_1day_w120_norm_std'],


         ['close_1day_w5_rsi', 'close_1day_w10_rsi', 'close_1day_w20_rsi', 'close_1day_w30_rsi', 'close_1day_w60_rsi', 'close_1day_w120_rsi'],
         ['volume_1day_w5_rsi', 'volume_1day_w10_rsi'],
         ['tmos_close_1day_w5_rsi', 'tmos_close_1day_w10_rsi', 'tmos_close_1day_w20_rsi', 'tmos_close_1day_w30_rsi', 'tmos_close_1day_w60_rsi', 'tmos_close_1day_w120_rsi'],


         ['close', 'close_1day_w5_ma', 'close_1day_w10_ma', 'close_1day_w20_ma', 'close_1day_w30_ma', 'close_1day_w60_ma', 'close_1day_w120_ma'],
         ['volume', 'volume_1day_w5_ma', 'volume_1day_w10_ma'],
         ['tmos_close', 'tmos_close_1day_w5_ma', 'tmos_close_1day_w10_ma', 'tmos_close_1day_w20_ma', 'tmos_close_1day_w30_ma', 'tmos_close_1day_w60_ma', 'tmos_close_1day_w120_ma'],


         ['close', 'close_1day_w5_expma', 'close_1day_w10_expma', 'close_1day_w20_expma', 'close_1day_w30_expma', 'close_1day_w60_expma', 'close_1day_w120_expma'],
         ['volume', 'volume_1day_w5_expma', 'volume_1day_w10_expma',],
         ['tmos_close', 'tmos_close_1day_w5_expma', 'tmos_close_1day_w10_expma', 'tmos_close_1day_w20_expma', 'tmos_close_1day_w30_expma', 'tmos_close_1day_w60_expma', 'tmos_close_1day_w120_expma'],


         {'close' : ['close_1day_w5_min', 'close_1day_w10_min', 'close_1day_w20_min', 'close_1day_w30_min', 'close_1day_w60_min', 'close_1day_w120_min']},
         {'volume' : ['volume_1day_w5_min', 'volume_1day_w10_min',]},
         {'tmos_close' : ['tmos_close_1day_w5_min', 'tmos_close_1day_w10_min', 'tmos_close_1day_w20_min', 'tmos_close_1day_w30_min', 'tmos_close_1day_w60_min', 'tmos_close_1day_w120_min']},

    
         {'close' : ['close_1day_w5_max', 'close_1day_w10_max', 'close_1day_w20_max', 'close_1day_w30_max', 'close_1day_w60_max', 'close_1day_w120_max']},
         {'volume' : ['volume_1day_w5_max', 'volume_1day_w10_max',]},
         {'tmos_close' : ['tmos_close_1day_w5_max', 'tmos_close_1day_w10_max', 'tmos_close_1day_w20_max', 'tmos_close_1day_w30_max', 'tmos_close_1day_w60_max', 'tmos_close_1day_w120_max']},



        #w5
         {'close' : [  'close_1day_w5_ma_low_2std', 'close_1day_w5_ma_up_2std', 'close_1day_w5_ma_low_3std', 'close_1day_w5_ma_up_3std']},
         {'volume' : [  'volume_1day_w5_ma_low_2std', 'volume_1day_w5_ma_up_2std', 'volume_1day_w5_ma_low_3std', 'volume_1day_w5_ma_up_3std']},
         {'tmos_close' : [  'tmos_close_1day_w5_ma_low_2std', 'tmos_close_1day_w5_ma_up_2std', 'tmos_close_1day_w5_ma_low_3std', 'tmos_close_1day_w5_ma_up_3std']},
        #w10
         {'close' : [  'close_1day_w10_ma_low_2std', 'close_1day_w10_ma_up_2std', 'close_1day_w10_ma_low_3std', 'close_1day_w10_ma_up_3std']},
         {'volume' : [  'volume_1day_w10_ma_low_2std', 'volume_1day_w10_ma_up_2std', 'volume_1day_w10_ma_low_3std', 'volume_1day_w10_ma_up_3std']},
         {'tmos_close' : [  'tmos_close_1day_w10_ma_low_2std', 'tmos_close_1day_w10_ma_up_2std', 'tmos_close_1day_w10_ma_low_3std', 'tmos_close_1day_w10_ma_up_3std']},
        #w20
         {'close' : [  'close_1day_w20_ma_low_2std', 'close_1day_w20_ma_up_2std', 'close_1day_w20_ma_low_3std', 'close_1day_w20_ma_up_3std']},
         {'tmos_close' : [  'tmos_close_1day_w20_ma_low_2std', 'tmos_close_1day_w20_ma_up_2std', 'tmos_close_1day_w20_ma_low_3std', 'tmos_close_1day_w20_ma_up_3std']},
        #w30
         {'close' : [  'close_1day_w30_ma_low_2std', 'close_1day_w30_ma_up_2std', 'close_1day_w30_ma_low_3std', 'close_1day_w30_ma_up_3std']},
         {'tmos_close' : [  'tmos_close_1day_w30_ma_low_2std', 'tmos_close_1day_w30_ma_up_2std', 'tmos_close_1day_w30_ma_low_3std', 'tmos_close_1day_w30_ma_up_3std']},
        #w60
         {'close' : [  'close_1day_w60_ma_low_2std', 'close_1day_w60_ma_up_2std', 'close_1day_w60_ma_low_3std', 'close_1day_w60_ma_up_3std']},
         {'tmos_close' : [  'tmos_close_1day_w60_ma_low_2std', 'tmos_close_1day_w60_ma_up_2std', 'tmos_close_1day_w60_ma_low_3std', 'tmos_close_1day_w60_ma_up_3std']},
        #w120
         {'close' : [  'close_1day_w120_ma_low_2std', 'close_1day_w120_ma_up_2std', 'close_1day_w120_ma_low_3std', 'close_1day_w120_ma_up_3std']},
         {'tmos_close' : [  'tmos_close_1day_w120_ma_low_2std', 'tmos_close_1day_w120_ma_up_2std', 'tmos_close_1day_w120_ma_low_3std', 'tmos_close_1day_w120_ma_up_3std']},


        #w5
        ['close_1day_w5_min', 'close_1day_w5_max'],
        ['volume_1day_w5_min', 'volume_1day_w5_max'],
        ['tmos_close_1day_w5_min', 'tmos_close_1day_w5_max'],
        #w10
        ['close_1day_w10_min', 'close_1day_w10_max'],
        ['volume_1day_w10_min', 'volume_1day_w10_max'],
        ['tmos_close_1day_w10_min', 'tmos_close_1day_w10_max'],
        #w20
        ['close_1day_w20_min', 'close_1day_w20_max'],
        ['tmos_close_1day_w20_min', 'tmos_close_1day_w20_max'],
        #w30
        ['close_1day_w30_min', 'close_1day_w30_max'],
        ['tmos_close_1day_w30_min', 'tmos_close_1day_w30_max'],
        #w60
        ['close_1day_w60_min', 'close_1day_w60_max'],
        ['tmos_close_1day_w60_min', 'tmos_close_1day_w60_max'],
        #w120
        ['close_1day_w120_min', 'close_1day_w120_max'],
        ['tmos_close_1day_w120_min', 'tmos_close_1day_w120_max'],
]

In [76]:
def uniq_pairs(cols):
    pairs = []
    for i in range(len(cols)-1):
        for j in range(i+1, len(cols)):
            pairs += [(cols[i], cols[j])]
    return pairs

def calc_relative_features(df, groups):
    for group in tqdm(groups):
        if type(group) == list:
            pairs = uniq_pairs(group)
            for pair in pairs:
                new_col = f'{pair[0]}/{pair[1]}'
                df[new_col] = (df[pair[0]] / (df[pair[1]] + np.finfo(np.float32).eps)).astype(np.float32)

        if type(group) == dict:
            pair1 = list(group.keys())[0]
            for pair0 in group[pair1]:
                new_col = f'{pair0}/{pair1}'
                df[new_col] = (df[pair0] / (df[pair1] + np.finfo(np.float32).eps)).astype(np.float32)
                

    return df

In [78]:
import warnings
warnings.filterwarnings('ignore')

In [81]:
print(df.shape)
calc_relative_features(df, groups_5min)
df.shape

(2514148, 767)


100%|█████████████████████████████████████████████████████████████████████████| 58/58 [00:02<00:00, 28.75it/s]


(2514148, 1169)

In [86]:
print(df.shape)
calc_relative_features(df, groups_1hour)
df.shape

(2514148, 1169)


100%|█████████████████████████████████████████████████████████████████████████| 58/58 [00:02<00:00, 26.11it/s]


(2514148, 1571)

In [89]:
print(df.shape)
calc_relative_features(df, groups_1day)
df.shape

(2514148, 1571)


100%|█████████████████████████████████████████████████████████████████████████| 58/58 [00:02<00:00, 23.23it/s]


(2514148, 1973)

In [92]:
for col in tqdm(df.columns):
    assert df[col].isnull().sum() == 0, f'Nulls {col}'

100%|████████████████████████████████████████████████████████████████████| 1973/1973 [00:15<00:00, 128.53it/s]


### Delete absolute valu columns

In [97]:
len(cols_del_5min), len(cols_del_1hour), len(cols_del_1day)

(126, 126, 126)

In [99]:
print(df.shape)
for col in cols_del_5min+cols_del_1hour+cols_del_1day:
    del df[col]
df.shape

(2514148, 1973)


(2514148, 1595)

In [105]:
# pd.set_option('display.max_rows', 2000)
# df.dtypes

## 2.6 Save data

In [107]:
!mkdir data/feat_engin/lgbm

mkdir: data/feat_engin/lgbm: File exists


In [109]:
#save
dump_pkl(df, 'data/feat_engin/lgbm/data_5min_1hour_1day.pkl')

In [111]:
df.shape

(2514148, 1595)

In [113]:
df.head()

Unnamed: 0,index,time,close,volume,ticker,tmos_close,index_1hour,index_1day,result,delta_time,...,volume_1day_w10_min/volume_1day_w10_max,tmos_close_1day_w10_min/tmos_close_1day_w10_max,close_1day_w20_min/close_1day_w20_max,tmos_close_1day_w20_min/tmos_close_1day_w20_max,close_1day_w30_min/close_1day_w30_max,tmos_close_1day_w30_min/tmos_close_1day_w30_max,close_1day_w60_min/close_1day_w60_max,tmos_close_1day_w60_min/tmos_close_1day_w60_max,close_1day_w120_min/close_1day_w120_max,tmos_close_1day_w120_min/tmos_close_1day_w120_max
0,12939,2022-12-26 10:00:00,63.02,4791.0,ABIO,4.07,1239.0,125.0,WIN,0 days 01:05:00,...,0.189064,0.972973,0.909531,0.968215,0.892783,0.956522,0.724484,0.835749,0.577882,0.765101
1,12941,2022-12-26 10:10:00,62.68,411.0,ABIO,4.07,1239.0,125.0,LOSE,0 days 01:10:00,...,0.189064,0.972973,0.909531,0.968215,0.892783,0.956522,0.724484,0.835749,0.577882,0.765101
2,12942,2022-12-26 10:15:00,62.5,2205.0,ABIO,4.07,1239.0,125.0,LOSE,0 days 01:05:00,...,0.189064,0.972973,0.909531,0.968215,0.892783,0.956522,0.724484,0.835749,0.577882,0.765101
3,12944,2022-12-26 10:25:00,62.740002,786.0,ABIO,4.07,1239.0,125.0,LOSE,0 days 00:55:00,...,0.189064,0.972973,0.909531,0.968215,0.892783,0.956522,0.724484,0.835749,0.577882,0.765101
4,12945,2022-12-26 10:30:00,62.52,2166.0,ABIO,4.07,1239.0,125.0,LOSE,0 days 00:50:00,...,0.189064,0.972973,0.909531,0.968215,0.892783,0.956522,0.724484,0.835749,0.577882,0.765101


In [None]:
#df.columns.tolist()

In [115]:
no_features = [
 'index',
 'time',
 'close',
 'volume',
 'tmos_close',

 'ticker',

 'index_1hour',
 'index_1day',
 'result',
 'delta_time',
 'income_rate',
 'res_price',
 'res_ind',

 'date_hour_index',
 'time_1hour',
 'close_1hour',
 'volume_1hour',
 'tmos_close_1hour',

 'date_day_index',
 'time_1day',
 'close_1day',
 'volume_1day',
 'tmos_close_1day',]


### 2.7 Save data to NN

In [128]:
#don't use NN

In [131]:
# df_fe.head()

In [133]:
# df_1day_fe.head()

In [135]:
# df_fe.shape, df_1day_fe.shape

In [137]:
# dump_pkl(df_fe, 'data/feat_engin/lgbm/data_1hour.pkl')
# dump_pkl(df_1day_fe, 'data/feat_engin/lgbm/data_1day.pkl')

# TMP

In [328]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import random
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import os

from sklearn.metrics import roc_auc_score

import pickle
def dump_pkl(data, filename):
  with open(filename, 'wb') as handle:
    pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)

def load_pkl(filename):
  with open(filename, 'rb') as handle:
    data = pickle.load(handle)
  return data

In [330]:
df = load_pkl('data/feat_engin/lgbm/data_5min_1hour_1day.pkl')

In [None]:
df

#### uniq_1

In [117]:
no_analyze = [
 'time',
 # 'close',
 #'volume',
 #'tmos_close',

 'ticker',

 #'index_1hour',
 #'index_1day',
 'result',
 'delta_time',
 #'income_rate',
 #'res_price',
 #'res_ind',

 'time_1hour',
 #'close_1hour',
 #'volume_1hour',
 #'tmos_close_1hour',

 'time_1day',
 #'close_1day',
 #'volume_1day',
 #'tmos_close_1day',
    'date_hour_index',
    'date_day_index'
]

In [120]:
# df_tmp = df[df.columns[~df.columns.isin(no_analyze)]].mean().reset_index()

df_tmp = []
for col in tqdm(df.columns[~df.columns.isin(no_analyze)]):
    df_tmp += [df[col].mean()]

df_tmp = pd.DataFrame({'index' : df.columns[~df.columns.isin(no_analyze)],
                      0 : df_tmp})

df_tmp

100%|████████████████████████████████████████████████████████████████████| 1587/1587 [00:06<00:00, 231.24it/s]


Unnamed: 0,index,0
0,index,3711932.0
1,close,1238.373
2,volume,11552.46
3,tmos_close,5.946699
4,index_1hour,347549.6
5,index_1day,25583.1
6,income_rate,0.9993594
7,res_price,1238.633
8,res_ind,3711964.0
9,close_w1_roc,7.71343e-06


In [122]:
df_tmp[df_tmp[0].isnull()]

Unnamed: 0,index,0


In [124]:
df_tmp.groupby(0).index.unique()[df_tmp.groupby(0).index.nunique()>1]

0
1.000036    [tmos_close_w10_ma/tmos_close_w20_ma, tmos_clo...
1.000105    [close_w20_ma/close_w60_ma, tmos_close/tmos_cl...
Name: index, dtype: object

In [126]:
df_tmp.groupby(0).index.unique()[df_tmp.groupby(0).index.nunique()>1].values

array([array(['tmos_close_w10_ma/tmos_close_w20_ma',
              'tmos_close_w20_ma/tmos_close_w30_ma'], dtype=object),
       array(['close_w20_ma/close_w60_ma', 'tmos_close/tmos_close_w30_ma'],
             dtype=object)                                                 ],
      dtype=object)

In [None]:
#Это странно, но числа в разном порядке идут

In [134]:
# df[['tmos_close_w10_ma/tmos_close_w20_ma',
#               'tmos_close_w20_ma/tmos_close_w30_ma']]

df[['close_w20_ma/close_w60_ma', 'tmos_close/tmos_close_w30_ma']]

Unnamed: 0,close_w20_ma/close_w60_ma,tmos_close/tmos_close_w30_ma
0,0.998811,1.001066
1,1.000269,1.000902
2,1.000962,1.000820
3,1.002758,1.000656
4,1.003327,1.000574
...,...,...
2514143,1.002847,1.002111
2514144,1.002107,1.000962
2514145,1.001763,1.002747
2514146,1.001519,1.003615


In [139]:
#df[['volume/volume_1hour_w5_ma', 'volume/volume_1hour_w5_expma']]

#### uniq_2

In [145]:
#df_tmp = df[df.columns[~df.columns.isin(no_analyze)]].nunique().reset_index()

df_tmp = []
for col in tqdm(df.columns[~df.columns.isin(no_analyze)]):
    df_tmp += [df[col].nunique()]

df_tmp = pd.DataFrame({'index' : df.columns[~df.columns.isin(no_analyze)],
                      0 : df_tmp})

df_tmp

100%|█████████████████████████████████████████████████████████████████████| 1587/1587 [00:37<00:00, 41.84it/s]


Unnamed: 0,index,0
0,index,2514148
1,close,125931
2,volume,123289
3,tmos_close,328
4,index_1hour,558758
5,index_1day,41468
6,income_rate,171244
7,res_price,116639
8,res_ind,1486178
9,close_w1_roc,923605


In [147]:
df_tmp[df_tmp[0].isnull()]

Unnamed: 0,index,0


In [149]:
pd.set_option('display.max_rows', 120)
df_tmp.groupby(0).index.unique()[df_tmp.groupby(0).index.nunique()>1].reset_index()

Unnamed: 0,0,index
0,13,"[tmos_close_1day_w30_lvl_1.02-1.03, tmos_close..."
1,15,"[hour, tmos_close_1day_w30_lvl_1-1.01, tmos_cl..."
2,16,"[tmos_close_w30_lvl_-0.95-0.96, tmos_close_1da..."
3,17,"[close_1day_w30_lvl_1.03-1.04, tmos_close_1day..."
4,18,"[close_1day_w30_lvl_1.01-1.02, tmos_close_1day..."
5,19,"[tmos_close_1hour_w30_lvl_1.04-1.05, tmos_clos..."
6,20,"[tmos_close_w30_lvl_1.03-1.04, tmos_close_1day..."
7,21,"[tmos_close_1hour_w30_lvl_-0.95-0.96, close_1d..."
8,22,[tmos_close_1hour_w30_lvl_-0.9299999999999999-...
9,26,"[tmos_close_w30_lvl_-0.96-0.97, tmos_close_1ho..."


In [152]:
i = 43
df_tmp.groupby(0).index.unique()[df_tmp.groupby(0).index.nunique()>1].index[i], df_tmp.groupby(0).index.unique()[df_tmp.groupby(0).index.nunique()>1].iloc[i]

(1144,
 array(['tmos_close_1day_w10_rsi/tmos_close_1day_w60_rsi',
        'tmos_close_1day_w30_rsi/tmos_close_1day_w60_rsi'], dtype=object))

In [156]:
np.sort(df['close_1day_w120_mean_abs_pct'].unique())

array([0.00533912, 0.00539192, 0.00544731, ..., 0.0682639 , 0.06856481,
       0.06863439], dtype=float32)