In [1]:
#!pip3 install torch torchvision torchaudio

In [3]:
#!python3 -m pip install tensorflow

In [5]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import random
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import os

from sklearn.metrics import roc_auc_score

import pickle
def dump_pkl(data, filename):
    """Serialize ``data`` with pickle and write it to ``filename``.

    The file is opened in binary write mode, so an existing file at that path
    is overwritten. ``pickle.HIGHEST_PROTOCOL`` is used for the encoding.
    """
    with open(filename, 'wb') as sink:
        pickle.dump(data, sink, protocol=pickle.HIGHEST_PROTOCOL)

def load_pkl(filename):
    """Read the pickle stored at ``filename`` and return the restored object.

    NOTE(review): unpickling executes code embedded in the file, so this should
    only be pointed at files produced by this project.
    """
    with open(filename, 'rb') as source:
        restored = pickle.load(source)
    return restored

### 1. Load data

In [9]:

# Tickers are taken from the file names in the hourly directory; a file with
# the same name is expected in the daily directory.
# Only `*.pkl` entries are used, so stray directory entries (e.g. `.DS_Store`,
# `.ipynb_checkpoints`) do not become bogus tickers that break `load_pkl`.
stocks = [elem.split('.')[0] for elem in os.listdir('./data/preproc/1hour/') if elem.endswith('.pkl')]

dfs_1hour = []
dfs_1day = []

for stock in tqdm(stocks):
    # One pickle per ticker and per time step.
    dfs_1hour.append(load_pkl(f"./data/preproc/1hour/{stock}.pkl"))
    dfs_1day.append(load_pkl(f"./data/preproc/1day/{stock}.pkl"))

# Stack all tickers; `ignore_index=True` yields a fresh 0..n-1 index, which the
# target computation below relies on (it addresses rows by position).
df_1day = pd.concat(dfs_1day, ignore_index=True)
df_1hour = pd.concat(dfs_1hour, ignore_index=True)

df_1hour.shape, df_1day.shape

100%|██████████████████████████████████████████████████████████████████████| 127/127 [00:00<00:00, 689.11it/s]


((1014350, 8), (88306, 8))

In [11]:
df_1hour.head()

Unnamed: 0,time,open,close,volume,low,high,ticker,tmos_close
0,2022-05-04 10:00:00,32.14,32.1,503380.0,31.46,32.68,AFLT,4.27
1,2022-05-04 11:00:00,32.1,31.38,191664.0,31.14,32.16,AFLT,4.2
2,2022-05-04 12:00:00,31.4,31.8,89256.0,31.38,31.92,AFLT,4.22
3,2022-05-04 13:00:00,31.8,31.36,87929.0,31.3,31.84,AFLT,4.21
4,2022-05-04 14:00:00,31.34,31.54,69671.0,31.14,31.58,AFLT,4.21


### 2. Preproc data

#### 2.1 Make target

In [16]:
# Используем уже знакомую функцию :)

def get_target(df_all, ind, val_first, val_second, points_to_wait):
    """Label the trade opened at row ``ind``: does ``close`` reach ``val_second`` before ``val_first``?

    The look-ahead window is rows ``ind .. ind + points_to_wait`` of ``df_all``
    (clipped to the end of the frame), restricted to rows carrying the same
    ``ticker`` as row ``ind``. ``df_all`` must provide ``time``, ``close`` and
    ``ticker`` columns.

    Direction is inferred from the thresholds:
      * ``val_first < val_second``: ``val_first`` is crossed when
        ``close < val_first``, ``val_second`` when ``close > val_second``;
      * otherwise the comparisons are mirrored (``close > val_first`` /
        ``close < val_second``).
    Both comparisons are strict.

    Returns a tuple ``(label, delta_time, income_rate, res_price, res_ind)``:
      * ``label``: ``'WIN'`` if ``val_second`` is crossed first, ``'LOSE'`` if
        ``val_first`` is crossed first, ``'DNF'`` if neither is crossed, in
        which case the last row of the window is used as the exit;
      * ``delta_time``: ``time`` at the exit row minus ``time`` at row ``ind``;
      * ``income_rate``: exit ``close`` divided by the ``close`` at row ``ind``;
      * ``res_price``: exit ``close``;
      * ``res_ind``: position of the exit row, expressed as offset + ``ind``.
    """
    stop = min(ind + points_to_wait + 1, df_all.shape[0])
    window = df_all.iloc[ind:stop].copy()

    # Drop rows of other tickers that fall inside the positional window.
    same_ticker = np.array(window['ticker'] == window['ticker'].iloc[0])
    window = window.loc[same_ticker, :]

    entry_price = window['close'].iloc[0]

    if val_first < val_second:
        hit_first = np.array(window['close'] < val_first)
        hit_second = np.array(window['close'] > val_second)
    else:
        # Mirrored case: val_first lies above val_second.
        hit_first = np.array(window['close'] > val_first)
        hit_second = np.array(window['close'] < val_second)

    first_positions = np.argwhere(hit_first).ravel()
    second_positions = np.argwhere(hit_second).ravel()
    first_seen = first_positions.size != 0
    second_seen = second_positions.size != 0

    if not first_seen and not second_seen:
        label, exit_pos = 'DNF', window.shape[0] - 1
    elif not first_seen:
        label, exit_pos = 'WIN', second_positions[0]
    elif not second_seen:
        label, exit_pos = 'LOSE', first_positions[0]
    elif first_positions[0] < second_positions[0]:
        label, exit_pos = 'LOSE', first_positions[0]
    elif first_positions[0] > second_positions[0]:
        label, exit_pos = 'WIN', second_positions[0]
    else:
        # Same first position for both thresholds: the two masks are mutually
        # exclusive per row, so this is not expected; like the original code,
        # fall through and return None.
        return None

    exit_price = window['close'].iloc[exit_pos]
    elapsed = window['time'].iloc[exit_pos] - window['time'].iloc[0]
    return label, elapsed, exit_price / entry_price, exit_price, exit_pos + ind


def get_df_target(df, indx, percent_first=None, percent_second=None, points_to_wait=None):
    """Run ``get_target`` for every row position in ``indx`` and tabulate the outcomes.

    For each position the two price thresholds are the row's ``close``
    multiplied by ``percent_first`` and ``percent_second``; ``points_to_wait``
    is passed through to ``get_target`` unchanged.

    Returns a DataFrame with one row per entry of ``indx`` and the columns
    ``ind``, ``time``, ``close``, ``result``, ``ticker``, ``delta_time``,
    ``income_rate``, ``res_price``, ``res_ind``.
    """
    records = []

    for ind in tqdm(indx):
        bar_time = df['time'].iloc[ind]
        bar_close = df['close'].iloc[ind]
        bar_ticker = df['ticker'].iloc[ind]

        outcome = get_target(df, ind,
                             bar_close * percent_first,
                             bar_close * percent_second,
                             points_to_wait)
        result, delta_time, income_rate, res_price, res_ind = outcome

        records.append((bar_time, bar_close, result, bar_ticker,
                        delta_time, income_rate, res_price, res_ind))

    def column(position):
        # Pull one field out of every collected record, preserving order.
        return [record[position] for record in records]

    return pd.DataFrame({'ind' : indx,
                         'time' : column(0),
                         'close' : column(1),
                         'result' : column(2),
                         'ticker' : column(3),
                         'delta_time' : column(4),
                         'income_rate' : column(5),
                         'res_price' : column(6),
                         'res_ind' : column(7)
                        })

In [20]:
inds = np.arange(df_1hour.shape[0])
inds.shape

(1014350,)

In [22]:
# Label every hourly bar: first threshold at 0.98 x close, second at 1.04 x close,
# looking at most 14*2 rows ahead.
df_result = get_df_target(
    df_1hour,
    inds,
    percent_first=0.98,
    percent_second=1.04,
    points_to_wait=14*2,
)
# Cap the realised rate to the [0.98, 1.04] band, since the exit bar can
# overshoot the threshold that triggered it.
df_result["income_rate"] = np.minimum(np.maximum(df_result["income_rate"], 0.98), 1.04)

100%|█████████████████████████████████████████████████████████████| 1014350/1014350 [04:05<00:00, 4127.32it/s]


In [23]:
df_result

Unnamed: 0,ind,time,close,result,ticker,delta_time,income_rate,res_price,res_ind
0,0,2022-05-04 10:00:00,32.100,LOSE,AFLT,0 days 01:00:00,0.980000,31.38,1
1,1,2022-05-04 11:00:00,31.380,LOSE,AFLT,1 days 05:00:00,0.980000,30.64,15
2,2,2022-05-04 12:00:00,31.800,LOSE,AFLT,0 days 04:00:00,0.980000,31.16,6
3,3,2022-05-04 13:00:00,31.360,LOSE,AFLT,1 days 03:00:00,0.980000,30.64,15
4,4,2022-05-04 14:00:00,31.540,LOSE,AFLT,1 days 02:00:00,0.980000,30.64,15
...,...,...,...,...,...,...,...,...,...
1014345,1014345,2025-02-04 19:00:00,11.095,DNF,RBCM,0 days 04:00:00,1.002253,11.12,1014349
1014346,1014346,2025-02-04 20:00:00,11.090,DNF,RBCM,0 days 03:00:00,1.002705,11.12,1014349
1014347,1014347,2025-02-04 21:00:00,11.105,DNF,RBCM,0 days 02:00:00,1.001351,11.12,1014349
1014348,1014348,2025-02-04 22:00:00,11.070,DNF,RBCM,0 days 01:00:00,1.004517,11.12,1014349


In [25]:
df_result['result'].value_counts(normalize=True)

result
DNF     0.470585
LOSE    0.362207
WIN     0.167208
Name: proportion, dtype: float64

In [26]:
df_result['income_rate'].quantile(q=[0, 0.01]+np.arange(0.1, 1, 0.1).tolist()+[0.99, 1])

0.00    0.980000
0.01    0.980000
0.10    0.980000
0.20    0.980000
0.30    0.980000
0.40    0.987910
0.50    0.996739
0.60    1.003706
0.70    1.012245
0.80    1.026535
0.90    1.040000
0.99    1.040000
1.00    1.040000
Name: income_rate, dtype: float64

In [28]:
(df_result['ind'] == df_result.index).all(), 

(True,)

In [32]:
# Create the output directory for the target/feature pickles, including the
# `lgbm` subdirectory that the dump below writes into; `exist_ok=True` keeps
# the cell re-runnable when the directory is already there.
os.makedirs('data/feat_engin/lgbm', exist_ok=True)

mkdir: data/feat_engin: File exists


In [33]:
dump_pkl(df_result, './data/feat_engin/lgbm/df_result_wait_14_2days.pkl')

#### 2.1.2 Загрузим датасет с таргетами

In [37]:
df_result = load_pkl('./data/feat_engin/lgbm/df_result_wait_14_2days.pkl')

In [38]:
df_result

Unnamed: 0,ind,time,close,result,ticker,delta_time,income_rate,res_price,res_ind
0,0,2022-05-04 10:00:00,32.100,LOSE,AFLT,0 days 01:00:00,0.980000,31.38,1
1,1,2022-05-04 11:00:00,31.380,LOSE,AFLT,1 days 05:00:00,0.980000,30.64,15
2,2,2022-05-04 12:00:00,31.800,LOSE,AFLT,0 days 04:00:00,0.980000,31.16,6
3,3,2022-05-04 13:00:00,31.360,LOSE,AFLT,1 days 03:00:00,0.980000,30.64,15
4,4,2022-05-04 14:00:00,31.540,LOSE,AFLT,1 days 02:00:00,0.980000,30.64,15
...,...,...,...,...,...,...,...,...,...
1014345,1014345,2025-02-04 19:00:00,11.095,DNF,RBCM,0 days 04:00:00,1.002253,11.12,1014349
1014346,1014346,2025-02-04 20:00:00,11.090,DNF,RBCM,0 days 03:00:00,1.002705,11.12,1014349
1014347,1014347,2025-02-04 21:00:00,11.105,DNF,RBCM,0 days 02:00:00,1.001351,11.12,1014349
1014348,1014348,2025-02-04 22:00:00,11.070,DNF,RBCM,0 days 01:00:00,1.004517,11.12,1014349


In [39]:
df_result['result'].value_counts(normalize=True)

result
DNF     0.470585
LOSE    0.362207
WIN     0.167208
Name: proportion, dtype: float64

#### 2.2 Link data of different time periods

In [43]:
# Tickers are taken from the file names in the hourly directory. Only `*.pkl`
# entries are used, so stray directory entries (e.g. `.DS_Store`,
# `.ipynb_checkpoints`) do not become bogus tickers that break `load_pkl`.
stocks = [elem.split('.')[0] for elem in os.listdir('./data/preproc/1hour/') if elem.endswith('.pkl')]

dfs_1hour = []
dfs_1day = []

for stock in tqdm(stocks):
    df_1hour = load_pkl(f"./data/preproc/1hour/{stock}.pkl")
    df_1day = load_pkl(f"./data/preproc/1day/{stock}.pkl")

    # Daily frame: the join key is the candle's own calendar date.
    df_1day['time_index'] = df_1day['time'].dt.date

    # Hourly frame: the join key is the PREVIOUS distinct date found in this
    # ticker's hourly data (shift(1) over the de-duplicated dates), so the
    # first date of every ticker gets a missing key.
    # NOTE(review): this presumes the hourly rows are sorted by time - confirm.
    df_join_time = pd.DataFrame({'dt_date' : df_1hour['time'].dt.date.drop_duplicates(keep='first')})
    df_join_time['time_index'] = df_join_time['dt_date'].shift(1)

    df_1hour['dt_date'] = df_1hour['time'].dt.date
    df_1hour = df_1hour.merge(df_join_time, how='left', on='dt_date')

    dfs_1hour.append(df_1hour)
    dfs_1day.append(df_1day)

# Stack all tickers with a fresh 0..n-1 index.
df_1day = pd.concat(dfs_1day, ignore_index=True)
df_1hour = pd.concat(dfs_1hour, ignore_index=True)

df_1hour.shape, df_1day.shape

100%|██████████████████████████████████████████████████████████████████████| 127/127 [00:00<00:00, 230.53it/s]


((1014350, 10), (88306, 9))

In [46]:
# Link the daily data to the hourly data.

# Keep the daily row position as a column and suffix every daily column with
# `_1day` (join keys excluded) so the names do not clash with hourly columns.
# The guard makes the cell safe to re-run: without it a second execution would
# add another `index` column and rename everything to `*_1day_1day`.
if 'index_1day' not in df_1day.columns:
    df_1day = df_1day.reset_index() #index
    df_1day = df_1day.rename(columns={col : col+'_1day' for col in df_1day.columns if col not in ['time_index', 'ticker']})


df = df_1hour.merge(df_1day, on=['time_index', 'ticker'], how='left')

# Check how many (ticker, time_index) pairs found no daily row. The count is
# computed outside the f-string: reusing the same quote type inside an
# f-string expression is a SyntaxError before Python 3.12.
missing_pairs = df.loc[df['close_1day'].isnull(), ['ticker', 'time_index']].groupby(['ticker', 'time_index']).count().shape[0]
print(f'Не подтянулсиь пар (ticker, time_1day) к 1hour (от 1day): {missing_pairs}  акций-дней')

# Check that the join did not duplicate hourly rows.
assert df_1hour.shape[0] == df.shape[0], 'Error: with join dimensions'


Не подтянулсиь пар (ticker, time_1day) к 1hour (от 1day): 1417  акций-дней


In [47]:
# The unmatched pairs come from days missing in df_1day, so carry the last
# known daily values forward, separately for every ticker.
cols_ffil = ['index_1day', 'close_1day', 'time_1day', 'tmos_close_1day', 'volume_1day']

for stock in tqdm(stocks):
    mask_stock = df['ticker'] == stock
    filled_block = df.loc[mask_stock, cols_ffil].ffill()
    df.loc[mask_stock, cols_ffil] = filled_block

100%|███████████████████████████████████████████████████████████████████████| 127/127 [00:03<00:00, 35.04it/s]


In [48]:
# Re-count the (ticker, time_index) pairs that still have no daily data.
# Keys are cast to str so that missing `time_index` values form their own
# group instead of being dropped by groupby. The count is computed outside the
# f-string because nesting the same quote type inside an f-string expression
# is a SyntaxError before Python 3.12.
remaining_pairs = (
    df.loc[df['close_1day'].isnull(), ['ticker', 'time_index']]
    .astype(str)
    .groupby(['ticker', 'time_index'])
    .count()
    .shape[0]
)
print(f'Не подтянулсиь пар (ticker, time_1day) к 1hour (от 1day): {remaining_pairs}  акций-дней')


Не подтянулсиь пар (ticker, time_1day) к 1hour (от 1day): 127  акций-дней


In [49]:
df['ticker'].nunique()

127

In [51]:
#ручные проверки
df.iloc[-10035:]

Unnamed: 0,time,open,close,volume,low,high,ticker,tmos_close,dt_date,time_index,index_1day,time_1day,open_1day,close_1day,volume_1day,low_1day,high_1day,tmos_close_1day
1004315,2024-04-16 15:00:00,928.500,929.000,6798.0,925.000,929.00,AQUA,7.03,2024-04-16,2024-04-15,87404.0,2024-04-15 03:00:00,924.00,921.000,117853.0,918.5,926.50,7.09
1004316,2024-04-16 16:00:00,929.000,928.500,6024.0,926.500,929.50,AQUA,7.04,2024-04-16,2024-04-15,87404.0,2024-04-15 03:00:00,924.00,921.000,117853.0,918.5,926.50,7.09
1004317,2024-04-16 17:00:00,929.000,927.500,15707.0,923.000,929.50,AQUA,7.03,2024-04-16,2024-04-15,87404.0,2024-04-15 03:00:00,924.00,921.000,117853.0,918.5,926.50,7.09
1004318,2024-04-16 18:00:00,927.500,925.000,5873.0,925.000,928.00,AQUA,7.04,2024-04-16,2024-04-15,87404.0,2024-04-15 03:00:00,924.00,921.000,117853.0,918.5,926.50,7.09
1004319,2024-04-16 19:00:00,925.000,928.500,3782.0,920.500,929.00,AQUA,7.04,2024-04-16,2024-04-15,87404.0,2024-04-15 03:00:00,924.00,921.000,117853.0,918.5,926.50,7.09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1014345,2025-02-04 19:00:00,11.120,11.095,50.0,11.060,11.12,RBCM,6.40,2025-02-04,2025-02-03,88304.0,2025-02-03 03:00:00,11.35,11.035,7166.0,11.0,11.35,6.45
1014346,2025-02-04 20:00:00,11.080,11.090,11.0,11.075,11.09,RBCM,6.39,2025-02-04,2025-02-03,88304.0,2025-02-03 03:00:00,11.35,11.035,7166.0,11.0,11.35,6.45
1014347,2025-02-04 21:00:00,11.095,11.105,9.0,11.095,11.14,RBCM,6.36,2025-02-04,2025-02-03,88304.0,2025-02-03 03:00:00,11.35,11.035,7166.0,11.0,11.35,6.45
1014348,2025-02-04 22:00:00,11.100,11.070,105.0,11.070,11.10,RBCM,6.38,2025-02-04,2025-02-03,88304.0,2025-02-03 03:00:00,11.35,11.035,7166.0,11.0,11.35,6.45


In [52]:
#все ок

#### 2.4 Union target and features. Make data_file to train

In [58]:
(df_result['time'] == df['time']).all(), (df_result['close'] == df['close']).all()
#прикольно вышло

(True, True)

In [59]:
(df.index == df_result.index).all()

True

In [60]:
df_result.head()

Unnamed: 0,ind,time,close,result,ticker,delta_time,income_rate,res_price,res_ind
0,0,2022-05-04 10:00:00,32.1,LOSE,AFLT,0 days 01:00:00,0.98,31.38,1
1,1,2022-05-04 11:00:00,31.38,LOSE,AFLT,1 days 05:00:00,0.98,30.64,15
2,2,2022-05-04 12:00:00,31.8,LOSE,AFLT,0 days 04:00:00,0.98,31.16,6
3,3,2022-05-04 13:00:00,31.36,LOSE,AFLT,1 days 03:00:00,0.98,30.64,15
4,4,2022-05-04 14:00:00,31.54,LOSE,AFLT,1 days 02:00:00,0.98,30.64,15


In [61]:
df_result.columns.tolist()

['ind',
 'time',
 'close',
 'result',
 'ticker',
 'delta_time',
 'income_rate',
 'res_price',
 'res_ind']

In [62]:
# Union: put the target columns next to the features, row by row. Both frames
# get a fresh 0..n-1 index first so the column-wise concat aligns by position.
target_columns = ['result', 'delta_time', 'income_rate', 'res_price', 'res_ind']
features_part = df.reset_index(drop=True)
targets_part = df_result[target_columns].reset_index(drop=True)
df = pd.concat([features_part, targets_part], axis=1)
df

Unnamed: 0,time,open,close,volume,low,high,ticker,tmos_close,dt_date,time_index,...,close_1day,volume_1day,low_1day,high_1day,tmos_close_1day,result,delta_time,income_rate,res_price,res_ind
0,2022-05-04 10:00:00,32.140,32.100,503380.0,31.460,32.68,AFLT,4.27,2022-05-04,,...,,,,,,LOSE,0 days 01:00:00,0.980000,31.38,1
1,2022-05-04 11:00:00,32.100,31.380,191664.0,31.140,32.16,AFLT,4.20,2022-05-04,,...,,,,,,LOSE,1 days 05:00:00,0.980000,30.64,15
2,2022-05-04 12:00:00,31.400,31.800,89256.0,31.380,31.92,AFLT,4.22,2022-05-04,,...,,,,,,LOSE,0 days 04:00:00,0.980000,31.16,6
3,2022-05-04 13:00:00,31.800,31.360,87929.0,31.300,31.84,AFLT,4.21,2022-05-04,,...,,,,,,LOSE,1 days 03:00:00,0.980000,30.64,15
4,2022-05-04 14:00:00,31.340,31.540,69671.0,31.140,31.58,AFLT,4.21,2022-05-04,,...,,,,,,LOSE,1 days 02:00:00,0.980000,30.64,15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1014345,2025-02-04 19:00:00,11.120,11.095,50.0,11.060,11.12,RBCM,6.40,2025-02-04,2025-02-03,...,11.035,7166.0,11.0,11.35,6.45,DNF,0 days 04:00:00,1.002253,11.12,1014349
1014346,2025-02-04 20:00:00,11.080,11.090,11.0,11.075,11.09,RBCM,6.39,2025-02-04,2025-02-03,...,11.035,7166.0,11.0,11.35,6.45,DNF,0 days 03:00:00,1.002705,11.12,1014349
1014347,2025-02-04 21:00:00,11.095,11.105,9.0,11.095,11.14,RBCM,6.36,2025-02-04,2025-02-03,...,11.035,7166.0,11.0,11.35,6.45,DNF,0 days 02:00:00,1.001351,11.12,1014349
1014348,2025-02-04 22:00:00,11.100,11.070,105.0,11.070,11.10,RBCM,6.38,2025-02-04,2025-02-03,...,11.035,7166.0,11.0,11.35,6.45,DNF,0 days 01:00:00,1.004517,11.12,1014349


### 2.5 Feature engineering

In [67]:
df

Unnamed: 0,time,open,close,volume,low,high,ticker,tmos_close,dt_date,time_index,...,close_1day,volume_1day,low_1day,high_1day,tmos_close_1day,result,delta_time,income_rate,res_price,res_ind
0,2022-05-04 10:00:00,32.140,32.100,503380.0,31.460,32.68,AFLT,4.27,2022-05-04,,...,,,,,,LOSE,0 days 01:00:00,0.980000,31.38,1
1,2022-05-04 11:00:00,32.100,31.380,191664.0,31.140,32.16,AFLT,4.20,2022-05-04,,...,,,,,,LOSE,1 days 05:00:00,0.980000,30.64,15
2,2022-05-04 12:00:00,31.400,31.800,89256.0,31.380,31.92,AFLT,4.22,2022-05-04,,...,,,,,,LOSE,0 days 04:00:00,0.980000,31.16,6
3,2022-05-04 13:00:00,31.800,31.360,87929.0,31.300,31.84,AFLT,4.21,2022-05-04,,...,,,,,,LOSE,1 days 03:00:00,0.980000,30.64,15
4,2022-05-04 14:00:00,31.340,31.540,69671.0,31.140,31.58,AFLT,4.21,2022-05-04,,...,,,,,,LOSE,1 days 02:00:00,0.980000,30.64,15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1014345,2025-02-04 19:00:00,11.120,11.095,50.0,11.060,11.12,RBCM,6.40,2025-02-04,2025-02-03,...,11.035,7166.0,11.0,11.35,6.45,DNF,0 days 04:00:00,1.002253,11.12,1014349
1014346,2025-02-04 20:00:00,11.080,11.090,11.0,11.075,11.09,RBCM,6.39,2025-02-04,2025-02-03,...,11.035,7166.0,11.0,11.35,6.45,DNF,0 days 03:00:00,1.002705,11.12,1014349
1014347,2025-02-04 21:00:00,11.095,11.105,9.0,11.095,11.14,RBCM,6.36,2025-02-04,2025-02-03,...,11.035,7166.0,11.0,11.35,6.45,DNF,0 days 02:00:00,1.001351,11.12,1014349
1014348,2025-02-04 22:00:00,11.100,11.070,105.0,11.070,11.10,RBCM,6.38,2025-02-04,2025-02-03,...,11.035,7166.0,11.0,11.35,6.45,DNF,0 days 01:00:00,1.004517,11.12,1014349


In [68]:
df_1hour.head()

Unnamed: 0,time,open,close,volume,low,high,ticker,tmos_close,dt_date,time_index
0,2022-05-04 10:00:00,32.14,32.1,503380.0,31.46,32.68,AFLT,4.27,2022-05-04,
1,2022-05-04 11:00:00,32.1,31.38,191664.0,31.14,32.16,AFLT,4.2,2022-05-04,
2,2022-05-04 12:00:00,31.4,31.8,89256.0,31.38,31.92,AFLT,4.22,2022-05-04,
3,2022-05-04 13:00:00,31.8,31.36,87929.0,31.3,31.84,AFLT,4.21,2022-05-04,
4,2022-05-04 14:00:00,31.34,31.54,69671.0,31.14,31.58,AFLT,4.21,2022-05-04,


In [69]:
df_1day.head()

Unnamed: 0,index_1day,time_1day,open_1day,close_1day,volume_1day,low_1day,high_1day,ticker,tmos_close_1day,time_index
0,0,2022-05-04 03:00:00,32.12,31.2,1138130.0,31.04,32.68,AFLT,4.18,2022-05-04
1,1,2022-05-05 03:00:00,31.26,30.7,664101.0,30.4,31.74,AFLT,4.22,2022-05-05
2,2,2022-05-06 03:00:00,30.8,30.1,306164.0,30.04,30.8,AFLT,4.19,2022-05-06
3,3,2022-05-11 03:00:00,30.0,29.92,383481.0,29.62,30.54,AFLT,4.22,2022-05-11
4,4,2022-05-12 03:00:00,29.92,29.32,315954.0,29.14,30.02,AFLT,4.06,2022-05-12


In [73]:
from sklearn.linear_model import LinearRegression

def calculate_bollinger_bands(data, window):
    """Rolling mean/std of ``data`` plus 2- and 3-sigma bands around the mean.

    The rolling window uses ``min_periods=1``, so values are produced from the
    first row on (the std of a single observation is NaN).

    Returns seven numpy arrays, in this order: rolling mean, rolling std,
    std divided by mean, lower 2-std band, upper 2-std band, lower 3-std band,
    upper 3-std band.
    """
    roller = data.rolling(window=window, min_periods=1)
    center = roller.mean().values
    spread = roller.std().values

    relative_spread = spread / center

    bands = []
    for width in (2, 3):
        offset = spread * width
        bands.append(center - offset)
        bands.append(center + offset)
    lower_2, upper_2, lower_3, upper_3 = bands

    return center, spread, relative_spread, lower_2, upper_2, lower_3, upper_3

def calculate_rsi(data, window):
    """Relative Strength Index based on simple rolling means of gains and losses.

    Gains and losses are the positive and (sign-flipped) negative parts of the
    one-step difference of ``data``; both are averaged over ``window`` rows with
    ``min_periods=1``. Wherever the average loss is exactly zero the RSI is set
    to 100. Returns a numpy array aligned with ``data``.
    """
    step = data.diff()
    ups = step.clip(lower=0)
    downs = -step.clip(upper=0)

    mean_up = ups.rolling(window=window, min_periods=1).mean()
    mean_down = downs.rolling(window=window, min_periods=1).mean()

    rsi = 100 - (100 / (1 + mean_up / mean_down))

    # Zero average loss: the ratio is inf (or NaN for 0/0); pin the value to 100.
    no_losses = mean_down == 0
    rsi[no_losses] = 100

    return rsi.values

def calculate_roc(data, periods):
    """Rate of change: relative difference between each value and the one ``periods`` rows earlier.

    Returns a numpy array; the first ``periods`` entries are NaN.
    """
    earlier = data.shift(periods)
    change = (data - earlier) / earlier
    return change.values



def calc_stats(data, window=None, feat_name=None):
    """Build a frame of rolling-window statistics for one series.

    ``data`` is a pandas Series, ``window`` the rolling window length in rows
    and ``feat_name`` the prefix of every output column. The result has one row
    per row of ``data`` and the columns ``<feat_name>_ma``, ``_std``,
    ``_norm_std``, ``_ma_low_2std``, ``_ma_up_2std``, ``_ma_low_3std``,
    ``_ma_up_3std``, ``_mean_abs_pct``, ``_alpha``, ``_min``, ``_max``,
    ``_rsi``, ``_roc`` and ``_diff``.
    """
    # Rolling mean / std and the 2- and 3-sigma bands.
    (ma, std, norm_std,
     low_2std, up_2std, low_3std, up_3std) = calculate_bollinger_bands(data, window)

    # Mean absolute one-step percentage change over the window.
    def _mean_abs(chunk):
        return chunk.abs().mean()

    mean_abs_pct = data.pct_change(periods=1).rolling(window=window, min_periods=1).apply(_mean_abs)

    # Linear-regression coefficient over the window (needs at least 2 points).
    # NOTE(review): the regression uses the values as the input and the row
    # position as the target, i.e. the coefficient is position-per-unit-value
    # rather than value-per-step - confirm this direction is intended.
    def _slope(chunk):
        model = LinearRegression().fit(chunk.values.reshape(-1, 1), np.arange(chunk.shape[0]))
        return model.coef_[0]

    alpha = data.rolling(window=window, min_periods=2).apply(_slope)

    # Rolling extremes.
    rolling_min = data.rolling(window=window, min_periods=1).min().values
    rolling_max = data.rolling(window=window, min_periods=1).max().values

    # Momentum-style indicators over the same horizon.
    rsi = calculate_rsi(data, window)
    roc = calculate_roc(data, window)
    diff = data.diff(window).values

    # Idea for later: a slope-angle feature could be added to detect market phases.

    features = {}
    features[f'{feat_name}_ma'] = ma
    features[f'{feat_name}_std'] = std
    features[f'{feat_name}_norm_std'] = norm_std
    features[f'{feat_name}_ma_low_2std'] = low_2std
    features[f'{feat_name}_ma_up_2std'] = up_2std
    features[f'{feat_name}_ma_low_3std'] = low_3std
    features[f'{feat_name}_ma_up_3std'] = up_3std
    features[f'{feat_name}_mean_abs_pct'] = mean_abs_pct
    features[f'{feat_name}_alpha'] = alpha
    features[f'{feat_name}_min'] = rolling_min
    features[f'{feat_name}_max'] = rolling_max
    features[f'{feat_name}_rsi'] = rsi
    features[f'{feat_name}_roc'] = roc
    features[f'{feat_name}_diff'] = diff
    return pd.DataFrame(features)


def calc_stats_diff_1(data, feat_name=None):
    """One-step change features for a series.

    Returns a DataFrame with ``<feat_name>_roc`` (one-step percentage change)
    and ``<feat_name>_diff`` (one-step absolute difference); the first row of
    both columns is NaN.
    """
    one_step = {}
    one_step[f'{feat_name}_roc'] = data.pct_change(periods=1).values
    one_step[f'{feat_name}_diff'] = data.diff(1).values
    return pd.DataFrame(one_step)

def calc_levels(data, window=None, levels=None, feat_name=None):
    """Count, per rolling window, how many values fall into bands around the window's last value.

    ``levels`` is an increasing sequence of relative offsets (e.g.
    ``[0, 0.005, 0.01]``). For every consecutive pair ``(low, high)`` two
    columns are produced, where ``last`` is the last value of the window:

      * ``<feat_name>_lvl_{1+low}-{1+high}``: number of window values ``x``
        with ``(1+low)*last < x <= (1+high)*last`` (band above ``last``);
      * ``<feat_name>_lvl_-{1-high}-{1-low}``: number of window values ``x``
        with ``(1-high)*last <= x < (1-low)*last`` (band below ``last``).

    Windows use ``min_periods=1``. Returns a DataFrame with one row per row of
    ``data``; an empty frame if ``levels`` has fewer than two entries.

    The counting callbacks receive plain ndarrays (``raw=True``) instead of a
    Series per window; the counts are the same but the per-window Series
    construction is avoided, which matters because this runs once per row.

    NOTE(review): column names embed the raw float arithmetic (e.g. ``1-0.07``
    formats as ``0.9299999999999999``); they are kept as-is because downstream
    code may select columns by these names.
    """
    roller = data.rolling(window=window, min_periods=1)

    columns = {}
    for level_low, level_high in zip(levels[:-1], levels[1:]):
        # Default arguments bind the current pair to each callback.
        def count_above(values, low=level_low, high=level_high):
            last = values[-1]
            return (((1 + low) * last < values) & (values <= (1 + high) * last)).sum()

        def count_below(values, low=level_low, high=level_high):
            last = values[-1]
            return (((1 - high) * last <= values) & (values < (1 - low) * last)).sum()

        columns[f"{feat_name}_lvl_{1+level_low}-{1+level_high}"] = roller.apply(count_above, raw=True).values
        columns[f"{feat_name}_lvl_-{1-level_high}-{1-level_low}"] = roller.apply(count_below, raw=True).values

    return pd.DataFrame(columns)


#### 1hour

In [81]:
# Hourly feature engineering, one ticker at a time so that rolling windows
# never span two tickers.

# Relative level grids (identical for every ticker, so defined once).
levels =      [0, 0.005, 0.01, 0.02, 0.03, 0.04, 0.05, 0.07]
levels_tmos = [0, 0.005, 0.01, 0.015, 0.02, 0.03, 0.04, 0.05]

base_cols = ['close', 'volume', 'tmos_close']             # series to derive features from
stat_windows = [5, 14, 70]                                # rolling windows in hourly bars (70 = 14*5)
level_windows = {70}                                      # windows that also get level-count features
levels_by_col = {'close': levels, 'tmos_close': levels_tmos}  # volume has no level features

dfs = []
for ticker in tqdm(df['ticker'].unique()):
    mask = np.array(df['ticker'] == ticker)
    # reset_index() keeps the original row label in an `index` column; it is
    # restored as the index after all tickers are stacked.
    df_ticker = df.loc[mask].copy().reset_index()

    parts = [df_ticker]

    # w1: one-step changes
    parts += [calc_stats_diff_1(df_ticker[col], feat_name=f'{col}_w1') for col in base_cols]

    # Rolling statistics per window; level counts directly follow the stats of
    # the same series, which fixes the output column order.
    for window in stat_windows:
        for col in base_cols:
            feat_name = f'{col}_w{window}'
            parts.append(calc_stats(df_ticker[col], window=window, feat_name=feat_name))
            if window in level_windows and col in levels_by_col:
                parts.append(calc_levels(df_ticker[col], window=window, levels=levels_by_col[col], feat_name=feat_name))

    assert all(part.shape[0] == df_ticker.shape[0] for part in parts), f'Error: feature row count mismatch for {ticker}'

    df_ticker_fe = pd.concat(parts, axis=1)

    # A misaligned index would add rows; the column count must equal the sum of the parts.
    assert (df_ticker.shape[0] == df_ticker_fe.shape[0]) and (df_ticker_fe.shape[1] == sum(part.shape[1] for part in parts)), 'Error concat'

    dfs.append(df_ticker_fe)

df_fe = pd.concat(dfs).set_index('index')


100%|███████████████████████████████████████████████████████████████████████| 127/127 [24:19<00:00, 11.49s/it]


In [83]:
(df_fe['close'] == df['close']).all()

True

#### 1day

In [86]:
df_1day

Unnamed: 0,index_1day,time_1day,open_1day,close_1day,volume_1day,low_1day,high_1day,ticker,tmos_close_1day,time_index
0,0,2022-05-04 03:00:00,32.120,31.200,1138130.0,31.04,32.68,AFLT,4.18,2022-05-04
1,1,2022-05-05 03:00:00,31.260,30.700,664101.0,30.40,31.74,AFLT,4.22,2022-05-05
2,2,2022-05-06 03:00:00,30.800,30.100,306164.0,30.04,30.80,AFLT,4.19,2022-05-06
3,3,2022-05-11 03:00:00,30.000,29.920,383481.0,29.62,30.54,AFLT,4.22,2022-05-11
4,4,2022-05-12 03:00:00,29.920,29.320,315954.0,29.14,30.02,AFLT,4.06,2022-05-12
...,...,...,...,...,...,...,...,...,...,...
88301,88301,2025-01-29 03:00:00,11.485,11.310,10983.0,11.22,11.51,RBCM,6.42,2025-01-29
88302,88302,2025-01-30 03:00:00,11.325,11.380,9882.0,11.31,11.52,RBCM,6.49,2025-01-30
88303,88303,2025-01-31 03:00:00,11.480,11.240,9564.0,11.15,11.48,RBCM,6.48,2025-01-31
88304,88304,2025-02-03 03:00:00,11.350,11.035,7166.0,11.00,11.35,RBCM,6.45,2025-02-03


In [90]:
# Daily feature engineering, one ticker at a time so that rolling windows
# never span two tickers.

# Relative level grids (identical for every ticker, so defined once).
levels =      [0, 0.005, 0.01, 0.02, 0.03, 0.04, 0.05, 0.07]
levels_tmos = [0, 0.005, 0.01, 0.015, 0.02, 0.03, 0.04, 0.05]

base_cols = ['close_1day', 'volume_1day', 'tmos_close_1day']  # series to derive features from
stat_windows = [3, 5, 5*4, 100]                               # rolling windows in daily bars
level_windows = {5*4, 100}                                    # windows that also get level-count features
levels_by_col = {'close_1day': levels, 'tmos_close_1day': levels_tmos}  # volume has no level features

dfs = []
for ticker in tqdm(df_1day['ticker'].unique()):
    mask = np.array(df_1day['ticker'] == ticker)
    # reset_index() keeps the original row label in an `index` column; it is
    # restored as the index after all tickers are stacked.
    df_ticker = df_1day.loc[mask].copy().reset_index()

    parts = [df_ticker]

    # w1: one-step changes
    parts += [calc_stats_diff_1(df_ticker[col], feat_name=f'{col}_w1') for col in base_cols]

    # Rolling statistics per window; level counts directly follow the stats of
    # the same series, which fixes the output column order.
    for window in stat_windows:
        for col in base_cols:
            feat_name = f'{col}_w{window}'
            parts.append(calc_stats(df_ticker[col], window=window, feat_name=feat_name))
            if window in level_windows and col in levels_by_col:
                parts.append(calc_levels(df_ticker[col], window=window, levels=levels_by_col[col], feat_name=feat_name))

    assert all(part.shape[0] == df_ticker.shape[0] for part in parts), f'Error: feature row count mismatch for {ticker}'

    df_ticker_fe = pd.concat(parts, axis=1)

    # A misaligned index would add rows; the column count must equal the sum of the parts.
    assert (df_ticker.shape[0] == df_ticker_fe.shape[0]) and (df_ticker_fe.shape[1] == sum(part.shape[1] for part in parts)), 'Error concat'

    dfs.append(df_ticker_fe)

df_1day_fe = pd.concat(dfs).set_index('index')


100%|███████████████████████████████████████████████████████████████████████| 127/127 [03:02<00:00,  1.44s/it]


In [92]:
(df_1day_fe['close_1day'] == df_1day['close_1day']).all(), (df_1day_fe.index == df_1day_fe['index_1day']).all()

(True, True)

In [97]:
dump_pkl(df_fe, 'df_fe.pkl')

In [98]:
dump_pkl(df_1day_fe, 'df_1day_fe.pkl')

In [133]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import random
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import os

from sklearn.metrics import roc_auc_score

import pickle
def dump_pkl(data, filename):
    """Serialize ``data`` with pickle and write it to ``filename``.

    The file is opened in binary write mode, so an existing file at that path
    is overwritten. ``pickle.HIGHEST_PROTOCOL`` is used for the encoding.
    NOTE(review): this repeats a definition made earlier in the notebook.
    """
    with open(filename, 'wb') as sink:
        pickle.dump(data, sink, protocol=pickle.HIGHEST_PROTOCOL)

def load_pkl(filename):
    """Read the pickle stored at ``filename`` and return the restored object.

    NOTE(review): unpickling executes code embedded in the file, so this should
    only be pointed at files produced by this project. This repeats a
    definition made earlier in the notebook.
    """
    with open(filename, 'rb') as source:
        restored = pickle.load(source)
    return restored

In [135]:
df_fe = load_pkl('df_fe.pkl')
df_1day_fe = load_pkl('df_1day_fe.pkl')

In [139]:
df_1day_fe.rename(columns={'time_1day' : 'time'}, inplace=True)

### time features

In [142]:
#hour
df_fe['hour'] = df_fe['time'].dt.hour

#day
df_fe['day'] = df_fe['time'].dt.day

#day_of_week: linear 0..1 scale, weekend days (5, 6) are folded onto Friday (4)
df_fe['weekday'] = np.minimum(df_fe['time'].dt.dayofweek, 4) / 4

#month
#df_fe['month'] = df_fe['time'].dt.month


#hour (cyclic): offset from 10:00 on the same date, as a fraction of a 13-hour cycle
time_cyclic = (df_fe['time'] - pd.to_datetime(df_fe['time'].dt.date) - pd.Timedelta('10:00:00')) / pd.Timedelta('13:00:00')
df_fe['sin_time_hour'] = np.sin(time_cyclic * 2 * np.pi)
df_fe['cos_time_hour'] = np.cos(time_cyclic * 2 * np.pi)

#day of week (cyclic): the clipped weekday takes 5 distinct values (0..4), so
#the period must be 5; with a period of 4, Friday (4/4 = one full turn) gets
#exactly the same sin/cos as Monday (0).
day_of_week_cyclic = np.minimum(df_1day_fe['time'].dt.dayofweek, 4) / 5
df_1day_fe['sin_time_weekday'] = np.sin(day_of_week_cyclic * 2 * np.pi)
df_1day_fe['cos_time_weekday'] = np.cos(day_of_week_cyclic * 2 * np.pi)

#day of month (cyclic): days run 1..31, so the period is 31; with a period of
#30, day 31 lands on the same angle as day 1.
day_of_month_cyclic = df_1day_fe['time'].dt.day / 31
df_1day_fe['sin_time_monthday'] = np.sin(day_of_month_cyclic * 2 * np.pi)
df_1day_fe['cos_time_monthday'] = np.cos(day_of_month_cyclic * 2 * np.pi)


### resize memory

In [147]:
[elem for elem in df_fe.columns if 'ind' in elem]

['time_index', 'index_1day', 'res_ind']

In [149]:
# Downcast every column whose name does not contain 'ind' to float32 to save
# memory (names containing 'ind' are left untouched).
cols = [elem for elem in df_fe.columns if 'ind' not in elem]

for col in tqdm(cols):
    try:
        df_fe[col] = df_fe[col].astype(np.float32)
    except (TypeError, ValueError):
        # Column cannot be represented as float32 (datetime, timedelta,
        # strings, ...): keep its dtype and report the name. Only conversion
        # errors are caught so that KeyboardInterrupt and genuine bugs still
        # surface.
        print(col)


 24%|████████████████▉                                                      | 44/185 [00:00<00:00, 434.51it/s]

time
ticker
dt_date
time_1day
result
delta_time


100%|██████████████████████████████████████████████████████████████████████| 185/185 [00:00<00:00, 447.82it/s]


In [153]:
[elem for elem in df_1day_fe.columns if 'ind' in elem]

['index_1day', 'time_index']

In [155]:
# Downcast every daily column whose name does not contain 'ind' to float32 to
# save memory (names containing 'ind' are left untouched).
cols = [elem for elem in df_1day_fe.columns if 'ind' not in elem]

for col in tqdm(cols):
    try:
        df_1day_fe[col] = df_1day_fe[col].astype(np.float32)
    except (TypeError, ValueError):
        # Column cannot be represented as float32 (datetime, strings, ...):
        # keep its dtype and report the name. Only conversion errors are
        # caught so that KeyboardInterrupt and genuine bugs still surface.
        print(col)

100%|█████████████████████████████████████████████████████████████████████| 242/242 [00:00<00:00, 4329.39it/s]

time
ticker





In [158]:
#df_fe.isnull().sum().sort_values()

In [160]:
#df_1day_fe.isnull().sum().sort_values()

### Delete useless (duplicated) columns


In [163]:
# Remove the listed columns from the hourly frame; names that are not present
# are skipped, exactly as the isin-based mask did.
df_fe = df_fe.drop(
    columns=[
        'open', 'low', 'high',
        'open_1day', 'close_1day', 'volume_1day', 'low_1day', 'high_1day', 'tmos_close_1day',
        'time_index', 'dt_date',
        'time_1day',
    ],
    errors='ignore',
)

# Surviving column labels, kept under the same name the cell exposed before.
cols = df_fe.columns

In [166]:
# Remove the listed columns from the daily frame; names that are not present
# are skipped, exactly as the isin-based mask did.
df_1day_fe = df_1day_fe.drop(
    columns=[
        'index_1day', 'open_1day', 'low_1day', 'high_1day',  # 'index_start_1day',
        # 'close_1day_w100_roc', 'close_1day_w100_diff',
        # 'volume_1day_w100_roc', 'volume_1day_100_diff',
        'time_index', 'dt_date',
    ],
    errors='ignore',
)

# Surviving column labels, kept under the same name the cell exposed before.
cols = df_1day_fe.columns

### Absolute value columns

In [171]:
def flag_delete(col_name):
    """Tell whether ``col_name`` should be put on the deletion list.

    Rules, applied in order:
      1. any name containing ``'norm_std'`` is kept (returns False);
      2. the raw series listed below are deleted (``'close'`` is deliberately
         not in that list);
      3. any name containing one of the substrings ``ma``, ``std``, ``diff``,
         ``min`` or ``max`` is deleted;
      4. everything else is kept.
    """
    if 'norm_std' in col_name:
        return False

    raw_series = ('volume', 'tmos_close', 'close_1day', 'volume_1day', 'tmos_close_1day')  # 'close',
    if col_name in raw_series:
        return True

    # Plain substring match, so e.g. 'ma' also hits names containing 'max'.
    return any(marker in col_name for marker in ('ma', 'std', 'diff', 'min', 'max'))

cols_del_1hour = [elem for elem in df_fe.columns if flag_delete(elem)]
len(cols_del_1hour)

86

In [173]:
cols_del_1hour

['volume',
 'tmos_close',
 'close_w1_diff',
 'volume_w1_diff',
 'tmos_close_w1_diff',
 'close_w5_ma',
 'close_w5_std',
 'close_w5_ma_low_2std',
 'close_w5_ma_up_2std',
 'close_w5_ma_low_3std',
 'close_w5_ma_up_3std',
 'close_w5_min',
 'close_w5_max',
 'close_w5_diff',
 'volume_w5_ma',
 'volume_w5_std',
 'volume_w5_ma_low_2std',
 'volume_w5_ma_up_2std',
 'volume_w5_ma_low_3std',
 'volume_w5_ma_up_3std',
 'volume_w5_min',
 'volume_w5_max',
 'volume_w5_diff',
 'tmos_close_w5_ma',
 'tmos_close_w5_std',
 'tmos_close_w5_ma_low_2std',
 'tmos_close_w5_ma_up_2std',
 'tmos_close_w5_ma_low_3std',
 'tmos_close_w5_ma_up_3std',
 'tmos_close_w5_min',
 'tmos_close_w5_max',
 'tmos_close_w5_diff',
 'close_w14_ma',
 'close_w14_std',
 'close_w14_ma_low_2std',
 'close_w14_ma_up_2std',
 'close_w14_ma_low_3std',
 'close_w14_ma_up_3std',
 'close_w14_min',
 'close_w14_max',
 'close_w14_diff',
 'volume_w14_ma',
 'volume_w14_std',
 'volume_w14_ma_low_2std',
 'volume_w14_ma_up_2std',
 'volume_w14_ma_low_3std',
 '

In [175]:
cols_del_1day = [elem for elem in df_1day_fe.columns if flag_delete(elem)]
len(cols_del_1day)

114

In [177]:
cols_del_1day

['close_1day',
 'volume_1day',
 'tmos_close_1day',
 'close_1day_w1_diff',
 'volume_1day_w1_diff',
 'tmos_close_1day_w1_diff',
 'close_1day_w3_ma',
 'close_1day_w3_std',
 'close_1day_w3_ma_low_2std',
 'close_1day_w3_ma_up_2std',
 'close_1day_w3_ma_low_3std',
 'close_1day_w3_ma_up_3std',
 'close_1day_w3_min',
 'close_1day_w3_max',
 'close_1day_w3_diff',
 'volume_1day_w3_ma',
 'volume_1day_w3_std',
 'volume_1day_w3_ma_low_2std',
 'volume_1day_w3_ma_up_2std',
 'volume_1day_w3_ma_low_3std',
 'volume_1day_w3_ma_up_3std',
 'volume_1day_w3_min',
 'volume_1day_w3_max',
 'volume_1day_w3_diff',
 'tmos_close_1day_w3_ma',
 'tmos_close_1day_w3_std',
 'tmos_close_1day_w3_ma_low_2std',
 'tmos_close_1day_w3_ma_up_2std',
 'tmos_close_1day_w3_ma_low_3std',
 'tmos_close_1day_w3_ma_up_3std',
 'tmos_close_1day_w3_min',
 'tmos_close_1day_w3_max',
 'tmos_close_1day_w3_diff',
 'close_1day_w5_ma',
 'close_1day_w5_std',
 'close_1day_w5_ma_low_2std',
 'close_1day_w5_ma_up_2std',
 'close_1day_w5_ma_low_3std',
 'cl

### Relative features

In [183]:
groups_1hour = [
        #1hour: w1, #w5, #w14, #w70
         ['close_w1_roc', 'close_w5_alpha', 'close_w14_alpha', 'close_w70_alpha'],
         ['volume_w1_roc', 'volume_w5_alpha', 'volume_w14_alpha', 'volume_w70_alpha'],
         ['tmos_close_w1_roc', 'tmos_close_w5_alpha', 'tmos_close_w14_alpha', 'tmos_close_w70_alpha'],

         ['close_w1_roc', 'close_w5_roc', 'close_w14_roc', 'close_w70_roc'],
         ['volume_w1_roc', 'volume_w5_roc', 'volume_w14_roc', 'volume_w70_roc'],
         ['tmos_close_w1_roc', 'tmos_close_w5_roc', 'tmos_close_w14_roc', 'tmos_close_w70_roc'],

         ['close_w5_mean_abs_pct', 'close_w14_mean_abs_pct', 'close_w70_mean_abs_pct'],
         ['volume_w5_mean_abs_pct', 'volume_w14_mean_abs_pct', 'volume_w70_mean_abs_pct'],
         ['tmos_close_w5_mean_abs_pct', 'tmos_close_w14_mean_abs_pct', 'tmos_close_w70_mean_abs_pct'],

         ['close_w5_std', 'close_w14_std', 'close_w70_std'],
         ['volume_w5_std', 'volume_w14_std', 'volume_w70_std'],
         ['tmos_close_w5_std', 'tmos_close_w14_std', 'tmos_close_w70_std'],

         ['close_w5_norm_std', 'close_w14_norm_std', 'close_w70_norm_std'],
         ['volume_w5_norm_std', 'volume_w14_norm_std', 'volume_w70_norm_std'],
         ['tmos_close_w5_norm_std', 'tmos_close_w14_norm_std', 'tmos_close_w70_norm_std'],

         ['close_w5_rsi', 'close_w14_rsi', 'close_w70_rsi'],
         ['volume_w5_rsi', 'volume_w14_rsi', 'volume_w70_rsi'],
         ['tmos_close_w5_rsi', 'tmos_close_w14_rsi', 'tmos_close_w70_rsi'],
        
         ['close', 'close_w5_ma', 'close_w14_ma', 'close_w70_ma'],
         ['volume', 'volume_w5_ma', 'volume_w14_ma', 'volume_w70_ma'],
         ['tmos_close', 'tmos_close_w5_ma', 'tmos_close_w14_ma', 'tmos_close_w70_ma'],

         {'close' : ['close_w5_min', 'close_w14_min', 'close_w70_min']},
         {'volume' : ['volume_w5_min', 'volume_w14_min', 'volume_w70_min']},
         {'tmos_close' : ['tmos_close_w5_min', 'tmos_close_w14_min', 'tmos_close_w70_min']},
    
         {'close' : ['close_w5_max', 'close_w14_max', 'close_w70_max']},
         {'volume' : ['volume_w5_max', 'volume_w14_max', 'volume_w70_max']},
         {'tmos_close' : ['tmos_close_w5_max', 'tmos_close_w14_max', 'tmos_close_w70_max']},


        #w5
         {'close' : [  'close_w5_ma_low_2std', 'close_w5_ma_up_2std', 'close_w5_ma_low_3std', 'close_w5_ma_up_3std']},
         {'volume' : [  'volume_w5_ma_low_2std', 'volume_w5_ma_up_2std', 'volume_w5_ma_low_3std', 'volume_w5_ma_up_3std']},
         {'tmos_close' : [  'tmos_close_w5_ma_low_2std', 'tmos_close_w5_ma_up_2std', 'tmos_close_w5_ma_low_3std', 'tmos_close_w5_ma_up_3std']},
        #w14
         {'close' : [ 'close_w14_ma_low_2std', 'close_w14_ma_up_2std', 'close_w14_ma_low_3std', 'close_w14_ma_up_3std']},
         {'volume' : [ 'volume_w14_ma_low_2std', 'volume_w14_ma_up_2std', 'volume_w14_ma_low_3std', 'volume_w14_ma_up_3std']},
         {'tmos_close' : [ 'tmos_close_w14_ma_low_2std', 'tmos_close_w14_ma_up_2std', 'tmos_close_w14_ma_low_3std', 'tmos_close_w14_ma_up_3std']},
        #w70
         {'close' : [ 'close_w70_ma_low_2std', 'close_w70_ma_up_2std', 'close_w70_ma_low_3std', 'close_w70_ma_up_3std']},
         {'volume' : [ 'volume_w70_ma_low_2std', 'volume_w70_ma_up_2std', 'volume_w70_ma_low_3std', 'volume_w70_ma_up_3std']},
         {'tmos_close' : [ 'tmos_close_w70_ma_low_2std', 'tmos_close_w70_ma_up_2std', 'tmos_close_w70_ma_low_3std', 'tmos_close_w70_ma_up_3std']},


    # min/max ratio within each window
        #w5
        ['close_w5_min', 'close_w5_max'],
        ['volume_w5_min', 'volume_w5_max'],
        ['tmos_close_w5_min', 'tmos_close_w5_max'],
        #w14
        ['close_w14_min', 'close_w14_max'],
        ['volume_w14_min', 'volume_w14_max'],
        ['tmos_close_w14_min', 'tmos_close_w14_max'],
        #w70
        ['close_w70_min', 'close_w70_max'],
        ['volume_w70_min', 'volume_w70_max'],
        ['tmos_close_w70_min', 'tmos_close_w70_max'],
]

In [186]:
groups_1day = [
        #1day: #w1, #w3, #w5, #w20, #w100
         ['close_1day_w1_roc', 'close_1day_w3_alpha', 'close_1day_w5_alpha', 'close_1day_w20_alpha', 'close_1day_w100_alpha'],
         ['volume_1day_w1_roc','volume_1day_w3_alpha', 'volume_1day_w5_alpha', 'volume_1day_w20_alpha', 'volume_1day_w100_alpha'],
         ['tmos_close_1day_w1_roc', 'tmos_close_1day_w3_alpha', 'tmos_close_1day_w5_alpha', 'tmos_close_1day_w20_alpha', 'tmos_close_1day_w100_alpha'],

         ['close_1day_w1_roc', 'close_1day_w3_roc', 'close_1day_w5_roc', 'close_1day_w20_roc', 'close_1day_w100_roc'],
         ['volume_1day_w1_roc', 'volume_1day_w3_roc', 'volume_1day_w5_roc', 'volume_1day_w20_roc', 'volume_1day_w100_roc'],
         ['tmos_close_1day_w1_roc','tmos_close_1day_w3_roc', 'tmos_close_1day_w5_roc', 'tmos_close_1day_w20_roc', 'tmos_close_1day_w100_roc'],

         ['close_1day_w3_mean_abs_pct', 'close_1day_w5_mean_abs_pct', 'close_1day_w20_mean_abs_pct', 'close_1day_w100_mean_abs_pct'],
         ['volume_1day_w3_mean_abs_pct','volume_1day_w5_mean_abs_pct', 'volume_1day_w20_mean_abs_pct', 'volume_1day_w100_mean_abs_pct'],
         ['tmos_close_1day_w3_mean_abs_pct', 'tmos_close_1day_w5_mean_abs_pct', 'tmos_close_1day_w20_mean_abs_pct', 'tmos_close_1day_w100_mean_abs_pct'],

         ['close_1day_w3_std', 'close_1day_w5_std', 'close_1day_w20_std', 'close_1day_w100_std'],
         ['volume_1day_w3_std', 'volume_1day_w5_std', 'volume_1day_w20_std', 'volume_1day_w100_std'],
         ['tmos_close_1day_w3_std', 'tmos_close_1day_w5_std', 'tmos_close_1day_w20_std', 'tmos_close_1day_w100_std'],

         ['close_1day_w3_norm_std', 'close_1day_w5_norm_std', 'close_1day_w20_norm_std', 'close_1day_w100_norm_std'],
         ['volume_1day_w3_norm_std', 'volume_1day_w5_norm_std', 'volume_1day_w20_norm_std', 'volume_1day_w100_norm_std'],
         ['tmos_close_1day_w3_norm_std', 'tmos_close_1day_w5_norm_std', 'tmos_close_1day_w20_norm_std', 'tmos_close_1day_w100_norm_std'],

         ['close_1day_w3_rsi',  'close_1day_w5_rsi', 'close_1day_w20_rsi', 'close_1day_w100_rsi'],
         ['volume_1day_w3_rsi', 'volume_1day_w5_rsi', 'volume_1day_w20_rsi', 'volume_1day_w100_rsi'],
         ['tmos_close_1day_w3_rsi', 'tmos_close_1day_w5_rsi', 'tmos_close_1day_w20_rsi', 'tmos_close_1day_w100_rsi'],
        
         ['close_1day','close_1day_w3_ma', 'close_1day_w5_ma', 'close_1day_w20_ma', 'close_1day_w100_ma'],
         ['volume_1day', 'volume_1day_w3_ma', 'volume_1day_w5_ma', 'volume_1day_w20_ma', 'volume_1day_w100_ma'],
         ['tmos_close_1day','tmos_close_1day_w3_ma', 'tmos_close_1day_w5_ma', 'tmos_close_1day_w20_ma', 'tmos_close_1day_w100_ma'],

         {'close_1day' : ['close_1day_w3_min', 'close_1day_w5_min', 'close_1day_w20_min', 'close_1day_w100_min']},
         {'volume_1day' : ['volume_1day_w3_min', 'volume_1day_w5_min', 'volume_1day_w20_min', 'volume_1day_w100_min']},
         {'tmos_close_1day' : ['tmos_close_1day_w3_min', 'tmos_close_1day_w5_min', 'tmos_close_1day_w20_min', 'tmos_close_1day_w100_min']},
    
         {'close_1day' : ['close_1day_w3_max', 'close_1day_w5_max', 'close_1day_w20_max', 'close_1day_w100_max']},
         {'volume_1day' : ['volume_1day_w3_max', 'volume_1day_w5_max', 'volume_1day_w20_max', 'volume_1day_w100_max']},
         {'tmos_close_1day' : ['tmos_close_1day_w3_max','tmos_close_1day_w5_max', 'tmos_close_1day_w20_max', 'tmos_close_1day_w100_max']},

        #w3
         {'close_1day' : [  'close_1day_w3_ma_low_2std', 'close_1day_w3_ma_up_2std', 'close_1day_w3_ma_low_3std', 'close_1day_w3_ma_up_3std']},
         {'volume_1day' : [  'volume_1day_w3_ma_low_2std', 'volume_1day_w3_ma_up_2std', 'volume_1day_w3_ma_low_3std', 'volume_1day_w3_ma_up_3std']},
         {'tmos_close_1day' : [  'tmos_close_1day_w3_ma_low_2std', 'tmos_close_1day_w3_ma_up_2std', 'tmos_close_1day_w3_ma_low_3std', 'tmos_close_1day_w3_ma_up_3std']},
        #w5
         {'close_1day' : [  'close_1day_w5_ma_low_2std', 'close_1day_w5_ma_up_2std', 'close_1day_w5_ma_low_3std', 'close_1day_w5_ma_up_3std']},
         {'volume_1day' : [  'volume_1day_w5_ma_low_2std', 'volume_1day_w5_ma_up_2std', 'volume_1day_w5_ma_low_3std', 'volume_1day_w5_ma_up_3std']},
         {'tmos_close_1day' : [ 'tmos_close_1day_w5_ma_low_2std', 'tmos_close_1day_w5_ma_up_2std', 'tmos_close_1day_w5_ma_low_3std', 'tmos_close_1day_w5_ma_up_3std']},
        #w20
         {'close_1day' : [ 'close_1day_w20_ma_low_2std', 'close_1day_w20_ma_up_2std', 'close_1day_w20_ma_low_3std', 'close_1day_w20_ma_up_3std']},
         {'volume_1day' : [ 'volume_1day_w20_ma_low_2std', 'volume_1day_w20_ma_up_2std', 'volume_1day_w20_ma_low_3std', 'volume_1day_w20_ma_up_3std']},
         {'tmos_close_1day' : [ 'tmos_close_1day_w20_ma_low_2std', 'tmos_close_1day_w20_ma_up_2std', 'tmos_close_1day_w20_ma_low_3std', 'tmos_close_1day_w20_ma_up_3std']},
        #w100
         {'close_1day' : [ 'close_1day_w100_ma_low_2std', 'close_1day_w100_ma_up_2std', 'close_1day_w100_ma_low_3std', 'close_1day_w100_ma_up_3std']},
         {'volume_1day' : [ 'volume_1day_w100_ma_low_2std', 'volume_1day_w100_ma_up_2std', 'volume_1day_w100_ma_low_3std', 'volume_1day_w100_ma_up_3std']},
         {'tmos_close_1day' : ['tmos_close_1day_w100_ma_low_2std', 'tmos_close_1day_w100_ma_up_2std', 'tmos_close_1day_w100_ma_low_3std', 'tmos_close_1day_w100_ma_up_3std']},

    # min/max ratio within each window
        #w3
        ['close_1day_w3_min', 'close_1day_w3_max'],
        ['volume_1day_w3_min', 'volume_1day_w3_max'],
        ['tmos_close_1day_w3_min', 'tmos_close_1day_w3_max'],    
        #w5
        ['close_1day_w5_min', 'close_1day_w5_max'],
        ['volume_1day_w5_min', 'volume_1day_w5_max'],
        ['tmos_close_1day_w5_min', 'tmos_close_1day_w5_max'],
    
        #w20
        ['close_1day_w20_min', 'close_1day_w20_max'],
        ['volume_1day_w20_min', 'volume_1day_w20_max'],
        ['tmos_close_1day_w20_min', 'tmos_close_1day_w20_max'],
    
        # #w100 - these features should probably be removed - they look like a leak
        # ['close_1day_w100_min', 'close_1day_w100_max'],
        # ['volume_1day_w100_min', 'volume_1day_w100_max'],
        # ['tmos_close_1day_w100_min', 'tmos_close_1day_w100_max'],
]


In [189]:
def uniq_pairs(cols):
    """Return every unordered pair of positions in ``cols`` as a list of tuples.

    Pairs are ``(cols[i], cols[j])`` with ``i < j``, emitted in ascending
    ``i`` then ascending ``j``. Fewer than two items yields an empty list.
    """
    size = len(cols)
    return [
        (cols[left], cols[right])
        for left in range(size - 1)
        for right in range(left + 1, size)
    ]

def calc_relative_features(df, groups):
    """Add ratio columns to ``df`` in place and return the same frame.

    Parameters
    ----------
    df : DataFrame holding every column named in ``groups``.
    groups : iterable whose items are either
        * a list/tuple of column names: one column ``'<a>/<b>'`` is added for
          every pair produced by ``uniq_pairs`` (``a`` precedes ``b``);
        * a dict ``{denominator: [numerators, ...]}``: one column
          ``'<numerator>/<denominator>'`` is added per numerator, for every
          key of the dict.

    Returns
    -------
    ``df`` itself (callers may ignore the return value and rely on the
    in-place mutation).

    Raises
    ------
    TypeError if a group is neither a list/tuple nor a dict.
    """
    # float32 machine epsilon is added to each denominator so that an exact
    # zero does not divide by zero.
    eps = np.finfo(np.float32).eps

    def _add_ratio(numerator, denominator):
        # Single place that defines both the column name and the formula.
        df[f'{numerator}/{denominator}'] = df[numerator] / (df[denominator] + eps)

    for group in tqdm(groups):
        if isinstance(group, dict):
            for denominator, numerators in group.items():
                for numerator in numerators:
                    _add_ratio(numerator, denominator)
        elif isinstance(group, (list, tuple)):
            for numerator, denominator in uniq_pairs(group):
                _add_ratio(numerator, denominator)
        else:
            # Fail loudly rather than silently producing no features.
            raise TypeError(
                f'Unsupported group type {type(group).__name__}: expected list, tuple or dict'
            )

    return df

In [191]:
import warnings
warnings.filterwarnings('ignore')

In [194]:
print(df_fe.shape)
calc_relative_features(df_fe, groups_1hour)
df_fe.shape

(1014350, 176)


100%|████████████████████████████████████████████████████████████████████████| 45/45 [00:00<00:00, 195.51it/s]


(1014350, 329)

In [196]:
print(df_1day_fe.shape)
calc_relative_features(df_1day_fe, groups_1day)
df_1day_fe.shape

(88306, 239)


100%|████████████████████████████████████████████████████████████████████████| 48/48 [00:00<00:00, 809.58it/s]


(88306, 482)

### Delete absolute value columns

In [201]:
len(cols_del_1hour)

86

In [203]:
# Shape before and after removing the columns collected in cols_del_1hour.
# The frame is modified in place.
print(df_fe.shape)
df_fe.drop(columns=cols_del_1hour, inplace=True)
df_fe.shape

(1014350, 329)


(1014350, 243)

In [206]:
len(cols_del_1day)

114

In [208]:
# Shape before and after removing the columns collected in cols_del_1day.
# The frame is modified in place.
print(df_1day_fe.shape)
df_1day_fe.drop(columns=cols_del_1day, inplace=True)
df_1day_fe.shape

(88306, 482)


(88306, 368)

### Concat

In [213]:
df_fe.columns.tolist()

['time',
 'close',
 'ticker',
 'index_1day',
 'result',
 'delta_time',
 'income_rate',
 'res_price',
 'res_ind',
 'close_w1_roc',
 'volume_w1_roc',
 'tmos_close_w1_roc',
 'close_w5_norm_std',
 'close_w5_mean_abs_pct',
 'close_w5_alpha',
 'close_w5_rsi',
 'close_w5_roc',
 'volume_w5_norm_std',
 'volume_w5_mean_abs_pct',
 'volume_w5_alpha',
 'volume_w5_rsi',
 'volume_w5_roc',
 'tmos_close_w5_norm_std',
 'tmos_close_w5_mean_abs_pct',
 'tmos_close_w5_alpha',
 'tmos_close_w5_rsi',
 'tmos_close_w5_roc',
 'close_w14_norm_std',
 'close_w14_mean_abs_pct',
 'close_w14_alpha',
 'close_w14_rsi',
 'close_w14_roc',
 'volume_w14_norm_std',
 'volume_w14_mean_abs_pct',
 'volume_w14_alpha',
 'volume_w14_rsi',
 'volume_w14_roc',
 'tmos_close_w14_norm_std',
 'tmos_close_w14_mean_abs_pct',
 'tmos_close_w14_alpha',
 'tmos_close_w14_rsi',
 'tmos_close_w14_roc',
 'close_w70_norm_std',
 'close_w70_mean_abs_pct',
 'close_w70_alpha',
 'close_w70_rsi',
 'close_w70_roc',
 'close_w70_lvl_1-1.005',
 'close_w70_lvl_-

In [215]:
# del df_fe['index_start'], df_fe['index_start_1day']

In [217]:
df_fe.columns.tolist()

['time',
 'close',
 'ticker',
 'index_1day',
 'result',
 'delta_time',
 'income_rate',
 'res_price',
 'res_ind',
 'close_w1_roc',
 'volume_w1_roc',
 'tmos_close_w1_roc',
 'close_w5_norm_std',
 'close_w5_mean_abs_pct',
 'close_w5_alpha',
 'close_w5_rsi',
 'close_w5_roc',
 'volume_w5_norm_std',
 'volume_w5_mean_abs_pct',
 'volume_w5_alpha',
 'volume_w5_rsi',
 'volume_w5_roc',
 'tmos_close_w5_norm_std',
 'tmos_close_w5_mean_abs_pct',
 'tmos_close_w5_alpha',
 'tmos_close_w5_rsi',
 'tmos_close_w5_roc',
 'close_w14_norm_std',
 'close_w14_mean_abs_pct',
 'close_w14_alpha',
 'close_w14_rsi',
 'close_w14_roc',
 'volume_w14_norm_std',
 'volume_w14_mean_abs_pct',
 'volume_w14_alpha',
 'volume_w14_rsi',
 'volume_w14_roc',
 'tmos_close_w14_norm_std',
 'tmos_close_w14_mean_abs_pct',
 'tmos_close_w14_alpha',
 'tmos_close_w14_rsi',
 'tmos_close_w14_roc',
 'close_w70_norm_std',
 'close_w70_mean_abs_pct',
 'close_w70_alpha',
 'close_w70_rsi',
 'close_w70_roc',
 'close_w70_lvl_1-1.005',
 'close_w70_lvl_-

In [220]:
df_1day_fe.columns.tolist()

['time',
 'ticker',
 'close_1day_w1_roc',
 'volume_1day_w1_roc',
 'tmos_close_1day_w1_roc',
 'close_1day_w3_norm_std',
 'close_1day_w3_mean_abs_pct',
 'close_1day_w3_alpha',
 'close_1day_w3_rsi',
 'close_1day_w3_roc',
 'volume_1day_w3_norm_std',
 'volume_1day_w3_mean_abs_pct',
 'volume_1day_w3_alpha',
 'volume_1day_w3_rsi',
 'volume_1day_w3_roc',
 'tmos_close_1day_w3_norm_std',
 'tmos_close_1day_w3_mean_abs_pct',
 'tmos_close_1day_w3_alpha',
 'tmos_close_1day_w3_rsi',
 'tmos_close_1day_w3_roc',
 'close_1day_w5_norm_std',
 'close_1day_w5_mean_abs_pct',
 'close_1day_w5_alpha',
 'close_1day_w5_rsi',
 'close_1day_w5_roc',
 'volume_1day_w5_norm_std',
 'volume_1day_w5_mean_abs_pct',
 'volume_1day_w5_alpha',
 'volume_1day_w5_rsi',
 'volume_1day_w5_roc',
 'tmos_close_1day_w5_norm_std',
 'tmos_close_1day_w5_mean_abs_pct',
 'tmos_close_1day_w5_alpha',
 'tmos_close_1day_w5_rsi',
 'tmos_close_1day_w5_roc',
 'close_1day_w20_norm_std',
 'close_1day_w20_mean_abs_pct',
 'close_1day_w20_alpha',
 'close

In [222]:
#del df_1day_fe['time'], df_1day_fe['ticker']

In [224]:
df_1day_fe.columns.tolist()

['time',
 'ticker',
 'close_1day_w1_roc',
 'volume_1day_w1_roc',
 'tmos_close_1day_w1_roc',
 'close_1day_w3_norm_std',
 'close_1day_w3_mean_abs_pct',
 'close_1day_w3_alpha',
 'close_1day_w3_rsi',
 'close_1day_w3_roc',
 'volume_1day_w3_norm_std',
 'volume_1day_w3_mean_abs_pct',
 'volume_1day_w3_alpha',
 'volume_1day_w3_rsi',
 'volume_1day_w3_roc',
 'tmos_close_1day_w3_norm_std',
 'tmos_close_1day_w3_mean_abs_pct',
 'tmos_close_1day_w3_alpha',
 'tmos_close_1day_w3_rsi',
 'tmos_close_1day_w3_roc',
 'close_1day_w5_norm_std',
 'close_1day_w5_mean_abs_pct',
 'close_1day_w5_alpha',
 'close_1day_w5_rsi',
 'close_1day_w5_roc',
 'volume_1day_w5_norm_std',
 'volume_1day_w5_mean_abs_pct',
 'volume_1day_w5_alpha',
 'volume_1day_w5_rsi',
 'volume_1day_w5_roc',
 'tmos_close_1day_w5_norm_std',
 'tmos_close_1day_w5_mean_abs_pct',
 'tmos_close_1day_w5_alpha',
 'tmos_close_1day_w5_rsi',
 'tmos_close_1day_w5_roc',
 'close_1day_w20_norm_std',
 'close_1day_w20_mean_abs_pct',
 'close_1day_w20_alpha',
 'close

In [227]:
df_1day_fe.reset_index(inplace=True)

In [229]:
print(df_fe.shape)

# Daily features without 'time' and 'ticker', so those names are not
# duplicated with the hourly frame's own columns after the merge.
daily_features = df_1day_fe.loc[:, ~df_1day_fe.columns.isin(['time', 'ticker'])]

# Inner join (the merge default): hourly rows whose 'index_1day' has no
# matching daily 'index' are dropped from the result.
df = df_fe.merge(daily_features, how='inner', left_on='index_1day', right_on='index')
df.shape

(1014350, 243)


(1013212, 610)

In [231]:
df

Unnamed: 0,time,close,ticker,index_1day,result,delta_time,income_rate,res_price,res_ind,close_w1_roc,...,tmos_close_1day_w100_ma_up_3std/tmos_close_1day,close_1day_w3_min/close_1day_w3_max,volume_1day_w3_min/volume_1day_w3_max,tmos_close_1day_w3_min/tmos_close_1day_w3_max,close_1day_w5_min/close_1day_w5_max,volume_1day_w5_min/volume_1day_w5_max,tmos_close_1day_w5_min/tmos_close_1day_w5_max,close_1day_w20_min/close_1day_w20_max,volume_1day_w20_min/volume_1day_w20_max,tmos_close_1day_w20_min/tmos_close_1day_w20_max
0,2022-05-05 10:00:00,31.240,AFLT,0.0,LOSE,1 days 00:00:00,0.980000,30.379999,18,0.001282,...,,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.00000,1.000000,1.000000
1,2022-05-05 11:00:00,30.980,AFLT,0.0,LOSE,1 days 01:00:00,0.980000,30.280001,20,-0.008323,...,,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.00000,1.000000,1.000000
2,2022-05-05 12:00:00,30.920,AFLT,0.0,LOSE,1 days 00:00:00,0.980000,30.280001,20,-0.001937,...,,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.00000,1.000000,1.000000
3,2022-05-05 13:00:00,30.920,AFLT,0.0,LOSE,0 days 23:00:00,0.980000,30.280001,20,0.000000,...,,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.00000,1.000000,1.000000
4,2022-05-05 14:00:00,31.000,AFLT,0.0,LOSE,0 days 22:00:00,0.980000,30.280001,20,0.002587,...,,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.00000,1.000000,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1013207,2025-02-04 19:00:00,11.095,RBCM,88304.0,DNF,0 days 04:00:00,1.002253,11.120000,1014349,-0.003145,...,1.065056,0.969684,0.725157,0.993837,0.969684,0.652463,0.987673,0.90055,0.101305,0.930663
1013208,2025-02-04 20:00:00,11.090,RBCM,88304.0,DNF,0 days 03:00:00,1.002705,11.120000,1014349,-0.000451,...,1.065056,0.969684,0.725157,0.993837,0.969684,0.652463,0.987673,0.90055,0.101305,0.930663
1013209,2025-02-04 21:00:00,11.105,RBCM,88304.0,DNF,0 days 02:00:00,1.001351,11.120000,1014349,0.001353,...,1.065056,0.969684,0.725157,0.993837,0.969684,0.652463,0.987673,0.90055,0.101305,0.930663
1013210,2025-02-04 22:00:00,11.070,RBCM,88304.0,DNF,0 days 01:00:00,1.004517,11.120000,1014349,-0.003152,...,1.065056,0.969684,0.725157,0.993837,0.969684,0.652463,0.987673,0.90055,0.101305,0.930663


In [234]:
# checks: why some rows were not joined

In [236]:
df_tmp = df_fe.merge(df_1day_fe[df_1day_fe.columns[~df_1day_fe.columns.isin(['time', 'ticker'])]], how='left', left_on='index_1day', right_on='index')
df_tmp.shape

(1014350, 610)

In [238]:
df_tmp.loc[df_tmp['index'].isnull(), 'time'].dt.date.unique()

array([datetime.date(2022, 5, 4), datetime.date(2022, 12, 14)],
      dtype=object)

In [240]:
# first trading day: there is no data for the previous day's trading

In [242]:
mask = df_tmp['index'].isnull() & (df_tmp['time'].dt.date == datetime.date(2022, 12, 14))

df_tmp.loc[mask, 'ticker'].unique()

array(['WUSH'], dtype=object)

In [244]:
# the date the stock started trading

## 2.6 Save data

In [249]:
# Create the output directory (and any missing parents). exist_ok=True keeps
# the cell re-runnable: a plain `mkdir` errors with "File exists" on the
# second run and fails outright when the parent directory is missing.
os.makedirs('data/feat_engin/lgbm', exist_ok=True)

mkdir: data/feat_engin/lgbm: File exists


In [251]:
#save
dump_pkl(df, 'data/feat_engin/lgbm/data_1hour_1day.pkl')

In [253]:
df.shape

(1013212, 610)

In [255]:
df.head()

Unnamed: 0,time,close,ticker,index_1day,result,delta_time,income_rate,res_price,res_ind,close_w1_roc,...,tmos_close_1day_w100_ma_up_3std/tmos_close_1day,close_1day_w3_min/close_1day_w3_max,volume_1day_w3_min/volume_1day_w3_max,tmos_close_1day_w3_min/tmos_close_1day_w3_max,close_1day_w5_min/close_1day_w5_max,volume_1day_w5_min/volume_1day_w5_max,tmos_close_1day_w5_min/tmos_close_1day_w5_max,close_1day_w20_min/close_1day_w20_max,volume_1day_w20_min/volume_1day_w20_max,tmos_close_1day_w20_min/tmos_close_1day_w20_max
0,2022-05-05 10:00:00,31.24,AFLT,0.0,LOSE,1 days 00:00:00,0.98,30.379999,18,0.001282,...,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,2022-05-05 11:00:00,30.98,AFLT,0.0,LOSE,1 days 01:00:00,0.98,30.280001,20,-0.008323,...,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,2022-05-05 12:00:00,30.92,AFLT,0.0,LOSE,1 days 00:00:00,0.98,30.280001,20,-0.001937,...,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,2022-05-05 13:00:00,30.92,AFLT,0.0,LOSE,0 days 23:00:00,0.98,30.280001,20,0.0,...,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,2022-05-05 14:00:00,31.0,AFLT,0.0,LOSE,0 days 22:00:00,0.98,30.280001,20,0.002587,...,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [257]:
df.columns.tolist()

['time',
 'close',
 'ticker',
 'index_1day',
 'result',
 'delta_time',
 'income_rate',
 'res_price',
 'res_ind',
 'close_w1_roc',
 'volume_w1_roc',
 'tmos_close_w1_roc',
 'close_w5_norm_std',
 'close_w5_mean_abs_pct',
 'close_w5_alpha',
 'close_w5_rsi',
 'close_w5_roc',
 'volume_w5_norm_std',
 'volume_w5_mean_abs_pct',
 'volume_w5_alpha',
 'volume_w5_rsi',
 'volume_w5_roc',
 'tmos_close_w5_norm_std',
 'tmos_close_w5_mean_abs_pct',
 'tmos_close_w5_alpha',
 'tmos_close_w5_rsi',
 'tmos_close_w5_roc',
 'close_w14_norm_std',
 'close_w14_mean_abs_pct',
 'close_w14_alpha',
 'close_w14_rsi',
 'close_w14_roc',
 'volume_w14_norm_std',
 'volume_w14_mean_abs_pct',
 'volume_w14_alpha',
 'volume_w14_rsi',
 'volume_w14_roc',
 'tmos_close_w14_norm_std',
 'tmos_close_w14_mean_abs_pct',
 'tmos_close_w14_alpha',
 'tmos_close_w14_rsi',
 'tmos_close_w14_roc',
 'close_w70_norm_std',
 'close_w70_mean_abs_pct',
 'close_w70_alpha',
 'close_w70_rsi',
 'close_w70_roc',
 'close_w70_lvl_1-1.005',
 'close_w70_lvl_-

### 2.7 Save data to NN

In [128]:
#don't use NN

In [131]:
# df_fe.head()

In [133]:
# df_1day_fe.head()

In [135]:
# df_fe.shape, df_1day_fe.shape

In [137]:
# dump_pkl(df_fe, 'data/feat_engin/lgbm/data_1hour.pkl')
# dump_pkl(df_1day_fe, 'data/feat_engin/lgbm/data_1day.pkl')

#### uniq_1

In [259]:
# check that the data in the columns is unique (no duplicated columns)
(~df.columns.isin(['ticker', 'result'])).sum(), df[df.columns[~df.columns.isin(['ticker', 'result'])]].mean().nunique()


(608, 607)

In [261]:
df_tmp = df[df.columns[~df.columns.isin(['ticker', 'result', 'time', 'delta_time'])]].mean().reset_index()

In [262]:
df_tmp

Unnamed: 0,index,0
0,close,1451.009521
1,index_1day,44190.071565
2,income_rate,1.001827
3,res_price,1451.635132
4,res_ind,507200.738694
...,...,...
601,volume_1day_w5_min/volume_1day_w5_max,0.319409
602,tmos_close_1day_w5_min/tmos_close_1day_w5_max,0.973014
603,close_1day_w20_min/close_1day_w20_max,0.871535
604,volume_1day_w20_min/volume_1day_w20_max,0.116580


In [263]:
df_tmp[df_tmp[0].isnull()]

Unnamed: 0,index,0


In [264]:
df_tmp.groupby(0).index.unique()[df_tmp.groupby(0).index.nunique()>1]

0
44190.071565    [index_1day, index]
Name: index, dtype: object

#### uniq_2

In [272]:
# check that the data in the columns is unique (no duplicated columns)
(~df.columns.isin(['ticker', 'result'])).sum(), df[df.columns[~df.columns.isin(['ticker', 'result'])]].nunique().nunique()


(608, 522)

In [274]:
df_tmp = df[df.columns[~df.columns.isin(['ticker', 'result', 'time', 'delta_time'])]].nunique().reset_index()

In [275]:
df_tmp

Unnamed: 0,index,0
0,close,112916
1,index_1day,88179
2,income_rate,242654
3,res_price,99183
4,res_ind,572259
...,...,...
601,volume_1day_w5_min/volume_1day_w5_max,57650
602,tmos_close_1day_w5_min/tmos_close_1day_w5_max,537
603,close_1day_w20_min/close_1day_w20_max,38437
604,volume_1day_w20_min/volume_1day_w20_max,20784


In [278]:
df_tmp[df_tmp[0].isnull()]

Unnamed: 0,index,0


In [280]:
pd.set_option('display.max_rows', 100)
df_tmp.groupby(0).index.unique()[df_tmp.groupby(0).index.nunique()>1].reset_index()

Unnamed: 0,0,index
0,5,"[weekday, sin_time_weekday]"
1,8,"[tmos_close_1day_w20_lvl_1.01-1.015, tmos_clos..."
2,9,"[tmos_close_1day_w20_lvl_-0.995-1, tmos_close_..."
3,10,"[tmos_close_1day_w20_lvl_1-1.005, tmos_close_1..."
4,11,"[tmos_close_1day_w20_lvl_-0.96-0.97, tmos_clos..."
5,12,"[tmos_close_1day_w20_lvl_-0.97-0.98, tmos_clos..."
6,14,"[hour, sin_time_hour, close_1day_w20_lvl_-0.99..."
7,15,"[close_1day_w20_lvl_1.005-1.01, close_1day_w20..."
8,16,"[close_1day_w20_lvl_1.02-1.03, close_1day_w20_..."
9,17,"[close_1day_w20_lvl_1-1.005, close_1day_w20_lv..."


In [282]:
i = 43
df_tmp.groupby(0).index.unique()[df_tmp.groupby(0).index.nunique()>1].index[i], df_tmp.groupby(0).index.unique()[df_tmp.groupby(0).index.nunique()>1].iloc[i]

(87990,
 array(['volume_1day_w3_ma_low_3std/volume_1day',
        'volume_1day_w20_ma_low_3std/volume_1day'], dtype=object))

In [288]:
np.sort(df['close_1day_w100_mean_abs_pct'].unique())

array([0.        , 0.0003125 , 0.00034014, ..., 0.18763515, 0.26517856,
              nan], dtype=float32)