In [1]:
#!pip3 install torch torchvision torchaudio

In [None]:
#!python3 -m pip install tensorflow

In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import random
import matplotlib.pyplot as plt
import seaborn as sns
import datetime


from sklearn.metrics import roc_auc_score

import pickle
def dump_pkl(data, filename):
  with open(filename, 'wb') as handle:
    pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)

def load_pkl(filename):
  with open(filename, 'rb') as handle:
    data = pickle.load(handle)
  return data

### 1. Load data

In [4]:
dfs_1min = []
dfs_5min = []
dfs_1hour = []
dfs_1day = []

stocks = [
          'GAZP',
          'SBER',
          'LKOH',
          'MGNT',
          'NVTK',
          'SNGS',
          'GMKN',
          'ROSN',
          'NLMK',
          'TATN',
          'MTSS',
          'ALRS',  
          'YDEX',
          'CHMF',
          
          'MAGN',
          'TCSG',
          'OZON',
          'RUAL'  
         ]
for stock in tqdm(stocks):
    df_1min = load_pkl(f"./data/preproc/1min/{stock}.pkl")
    df_5min = load_pkl(f"./data/preproc/5min/{stock}.pkl")
    df_1hour = load_pkl(f"./data/preproc/1hour/{stock}.pkl")
    df_1day = load_pkl(f"./data/preproc/1day/{stock}.pkl")
    
    dfs_1min += [df_1min.copy()]
    dfs_5min += [df_5min.copy()]
    dfs_1hour += [df_1hour.copy()]
    dfs_1day += [df_1day.copy()]
    



df_1min = pd.concat(dfs_1min)
df_5min = pd.concat(dfs_5min)
df_1day = pd.concat(dfs_1day)
df_1hour = pd.concat(dfs_1hour)

df_1min.reset_index(drop=True, inplace=True)
df_5min.reset_index(drop=True, inplace=True)
df_1hour.reset_index(drop=True, inplace=True)
df_1day.reset_index(drop=True, inplace=True)

df_1min.shape, df_5min.shape, df_1hour.shape, df_1day.shape

100%|█████████████████████████████████████████████████████████████████████████| 18/18 [00:00<00:00, 74.52it/s]


((7847346, 7), (1607835, 7), (138247, 7), (9724, 7))

In [5]:
df_1min.head()

Unnamed: 0,time,open,close,volume,low,high,ticker
0,2022-10-03 10:00:00,219.0,219.93,87793.0,218.0,220.25,GAZP
1,2022-10-03 10:01:00,219.8,220.8,61549.0,219.7,220.55,GAZP
2,2022-10-03 10:02:00,220.24,219.34,52640.0,219.2,220.38,GAZP
3,2022-10-03 10:03:00,219.17,218.96,61506.0,217.7,219.4,GAZP
4,2022-10-03 10:04:00,218.82,218.3,51464.0,217.76,218.96,GAZP


### 2. Preproc data

#### 2.1 Make target

In [6]:
# Используем уже знакомую функцию :)

def get_target(df_all, ind, val_first, val_second, days_to_wait):
    #цель - достичь val_second, не достигнув val_first

    points_in_day = 840 # торговая сессия
    points_to_wait = days_to_wait * points_in_day
    ind_end = min(ind+points_to_wait+1, df_all.shape[0])
    
    df = df_all.iloc[ind:ind_end].copy()
    
    mask_stock = np.array(df['ticker'] == df['ticker'].iloc[0])
    df = df.loc[mask_stock, :]
    
    if val_first < val_second:  
        mask_val_first = np.array(df['close'] < val_first)
        mask_val_second = np.array(df['close'] > val_second)
    else:
        mask_val_first = np.array(df['close'] > val_first)
        mask_val_second = np.array(df['close'] < val_second)

    
    if (mask_val_first.sum() == 0) & (mask_val_second.sum() == 0):
        return 'DNF', -1
        
    if (mask_val_first.sum() == 0) & (mask_val_second.sum() != 0):
        ind_val_second = np.argwhere(mask_val_second).ravel()[0]
        delta_time = df['time'].iloc[ind_val_second] - df['time'].iloc[0]
        return 'WIN', delta_time
        
    if (mask_val_first.sum() != 0) & (mask_val_second.sum() == 0):
        ind_val_first = np.argwhere(mask_val_first).ravel()[0]
        delta_time = df['time'].iloc[ind_val_first] - df['time'].iloc[0]
        return 'LOSE', delta_time

    if (mask_val_first.sum() != 0) & (mask_val_second.sum() != 0):
        ind_val_first = np.argwhere(mask_val_first).ravel()[0]
        ind_val_second = np.argwhere(mask_val_second).ravel()[0]
        if ind_val_first < ind_val_second:
            delta_time = df['time'].iloc[ind_val_first] - df['time'].iloc[0]
            return 'LOSE', delta_time
        if ind_val_first > ind_val_second:
            delta_time = df['time'].iloc[ind_val_second] - df['time'].iloc[0]
            return 'WIN', delta_time


def get_df_target(df, indx, percent_first=None, percent_second=None, days_to_wait=None):
    times = []
    results = []
    delta_times = []
    closes = []
    tickers = []
    
    for ind in tqdm(indx):
        time = df['time'].iloc[ind]
        close = df['close'].iloc[ind]
        ticker = df['ticker'].iloc[ind]
        
        val_first = df['close'].iloc[ind] * percent_first
        val_second = df['close'].iloc[ind] * percent_second
        result, delta_time = get_target(df, ind, val_first, val_second, days_to_wait)
        
        times += [time]
        closes += [close]
        tickers += [ticker]
        results += [result]
        delta_times += [delta_time]
        
        

    df_result = pd.DataFrame({'ind' : indx,
                              'time' : times,
                              'close' : closes,
                              'result' : results,
                              'ticker' : tickers, 
                              'delta_time' : delta_times
                             })
    return df_result    

In [7]:
inds = np.arange(df_1min.shape[0])
inds.shape

(7847346,)

In [8]:
df_result = get_df_target(df_1min, inds, percent_first=0.995, percent_second=1.015, days_to_wait=1)

100%|█████████████████████████████████████████████████████████████| 7847346/7847346 [31:43<00:00, 4122.24it/s]


In [9]:
df_result

Unnamed: 0,ind,time,close,result,ticker,delta_time
0,0,2022-10-03 10:00:00,219.930,LOSE,GAZP,0 days 00:04:00
1,1,2022-10-03 10:01:00,220.800,LOSE,GAZP,0 days 00:01:00
2,2,2022-10-03 10:02:00,219.340,LOSE,GAZP,0 days 00:03:00
3,3,2022-10-03 10:03:00,218.960,LOSE,GAZP,0 days 00:02:00
4,4,2022-10-03 10:04:00,218.300,LOSE,GAZP,0 days 00:07:00
...,...,...,...,...,...,...
7847341,7847341,2024-11-19 23:45:00,38.455,DNF,RUAL,-1
7847342,7847342,2024-11-19 23:46:00,38.510,DNF,RUAL,-1
7847343,7847343,2024-11-19 23:47:00,38.520,DNF,RUAL,-1
7847344,7847344,2024-11-19 23:48:00,38.520,DNF,RUAL,-1


In [10]:
df_result['result'].value_counts(normalize=True)

result
LOSE    0.614319
WIN     0.246852
DNF     0.138829
Name: proportion, dtype: float64

In [11]:
(df_result['ind'] == df_result.index).all(), 

(True,)

In [12]:
!mkdir experiments

mkdir: experiments: File exists


In [13]:
!mkdir experiments/model_one_fc

mkdir: experiments/model_one_fc: File exists


In [14]:
dump_pkl(df_result, './experiments/model_one_fc/df_result_wait_1day.pkl')

#### 2.1.2 Загрузим датасет с ожиданием в 1 день

In [3]:
df_result = load_pkl('./experiments/model_one_fc/df_result_wait_1day.pkl')

In [4]:
df_result

Unnamed: 0,ind,time,close,result,ticker,delta_time
0,0,2022-10-03 10:00:00,219.930,LOSE,GAZP,0 days 00:04:00
1,1,2022-10-03 10:01:00,220.800,LOSE,GAZP,0 days 00:01:00
2,2,2022-10-03 10:02:00,219.340,LOSE,GAZP,0 days 00:03:00
3,3,2022-10-03 10:03:00,218.960,LOSE,GAZP,0 days 00:02:00
4,4,2022-10-03 10:04:00,218.300,LOSE,GAZP,0 days 00:07:00
...,...,...,...,...,...,...
7847341,7847341,2024-11-19 23:45:00,38.455,DNF,RUAL,-1
7847342,7847342,2024-11-19 23:46:00,38.510,DNF,RUAL,-1
7847343,7847343,2024-11-19 23:47:00,38.520,DNF,RUAL,-1
7847344,7847344,2024-11-19 23:48:00,38.520,DNF,RUAL,-1


In [5]:
df_result['delta_time'] = df_result['delta_time'].replace(-1, pd.Timedelta('100 days'))
df_result

  df_result['delta_time'] = df_result['delta_time'].replace(-1, pd.Timedelta('100 days'))


Unnamed: 0,ind,time,close,result,ticker,delta_time
0,0,2022-10-03 10:00:00,219.930,LOSE,GAZP,0 days 00:04:00
1,1,2022-10-03 10:01:00,220.800,LOSE,GAZP,0 days 00:01:00
2,2,2022-10-03 10:02:00,219.340,LOSE,GAZP,0 days 00:03:00
3,3,2022-10-03 10:03:00,218.960,LOSE,GAZP,0 days 00:02:00
4,4,2022-10-03 10:04:00,218.300,LOSE,GAZP,0 days 00:07:00
...,...,...,...,...,...,...
7847341,7847341,2024-11-19 23:45:00,38.455,DNF,RUAL,100 days 00:00:00
7847342,7847342,2024-11-19 23:46:00,38.510,DNF,RUAL,100 days 00:00:00
7847343,7847343,2024-11-19 23:47:00,38.520,DNF,RUAL,100 days 00:00:00
7847344,7847344,2024-11-19 23:48:00,38.520,DNF,RUAL,100 days 00:00:00


In [6]:
df_result['result'].value_counts(normalize=True)

result
LOSE    0.614319
WIN     0.246852
DNF     0.138829
Name: proportion, dtype: float64

In [8]:
# mask = df_result['delta_time'] > pd.Timedelta('4 hours')
# df_result.loc[mask, 'result'] = 'DNF'


In [9]:
# df_result['result'].value_counts(normalize=True)

#### 2.2 Link data of different time-period

In [17]:
dfs_1min = []
dfs_5min = []
dfs_1hour = []
dfs_1day = []

stocks = [
          'GAZP',
          'SBER',
          'LKOH',
          'MGNT',
          'NVTK',
          'SNGS',
          'GMKN',
          'ROSN',
          'NLMK',
          'TATN',
          'MTSS',
          'ALRS',  
          'YDEX',
          'CHMF',
          
          'MAGN',
          'TCSG',
          'OZON',
          'RUAL'  
         ]
for stock in tqdm(stocks):
    df_1min = load_pkl(f"./data/preproc/1min/{stock}.pkl")
    df_5min = load_pkl(f"./data/preproc/5min/{stock}.pkl")
    df_1hour = load_pkl(f"./data/preproc/1hour/{stock}.pkl")
    df_1day = load_pkl(f"./data/preproc/1day/{stock}.pkl")

    df_1min['index_1min_start'] = None
    df_1min.loc[0, 'index_1min_start'] = 42
    df_5min['index_start'] = None
    df_5min.loc[0, 'index_start'] = 42
    df_1hour['index_start'] = None
    df_1hour.loc[0, 'index_start'] = 42
    df_1day['index_start'] = None
    df_1day.loc[0, 'index_start'] = 42

    dfs_1min += [df_1min.copy()]
    dfs_5min += [df_5min.copy()]
    dfs_1hour += [df_1hour.copy()]
    dfs_1day += [df_1day.copy()]
    



df_1min = pd.concat(dfs_1min)
df_5min = pd.concat(dfs_5min)
df_1day = pd.concat(dfs_1day)
df_1hour = pd.concat(dfs_1hour)

df_1min.reset_index(drop=True, inplace=True)
df_5min.reset_index(drop=True, inplace=True)
df_1hour.reset_index(drop=True, inplace=True)
df_1day.reset_index(drop=True, inplace=True)

df_1min.shape, df_5min.shape, df_1hour.shape, df_1day.shape

100%|█████████████████████████████████████████████████████████████████████████| 18/18 [00:00<00:00, 37.97it/s]


((7847346, 8), (1607835, 8), (138247, 8), (9724, 8))

In [18]:
pd.set_option('future.no_silent_downcasting', True)

#start index in each df-time
#1min
mask = df_1min['index_1min_start'] == 42
df_1min.loc[mask, 'index_1min_start'] = df_1min.index[mask]
df_1min['index_1min_start'] = df_1min['index_1min_start'].ffill()

#5min
mask = df_5min['index_start'] == 42
df_5min.loc[mask, 'index_start'] = df_5min.index[mask]
df_5min['index_start'] = df_5min['index_start'].ffill()

#1hour
mask = df_1hour['index_start'] == 42
df_1hour.loc[mask, 'index_start'] = df_1hour.index[mask]
df_1hour['index_start'] = df_1hour['index_start'].ffill()

#1day
mask = df_1day['index_start'] == 42
df_1day.loc[mask, 'index_start'] = df_1day.index[mask]
df_1day['index_start'] = df_1day['index_start'].ffill()

In [19]:
#link different time-step date

df_5min = df_5min.reset_index()
df_1hour = df_1hour.reset_index()
df_1day = df_1day.reset_index()

df_5min = df_5min.rename(columns={col : col+'_5min' for col in df_5min.columns if col not in ['time', 'ticker']})
df_1hour = df_1hour.rename(columns={col : col+'_1hour' for col in df_1hour.columns if col not in ['time', 'ticker']})
df_1day = df_1day.rename(columns={col : col+'_1day' for col in df_1day.columns if col not in ['time', 'ticker']})

df_1min['time'] += pd.Timedelta('1min')
df_5min['time'] += pd.Timedelta('5min')
df_1hour['time'] += pd.Timedelta('1hour')
#1) биржа работает до 23:50 
mask = datetime.time(0, 0) == pd.to_datetime(df_1hour['time'], format='%H:%M').dt.time
df_1hour.loc[mask, 'time'] -= pd.Timedelta('10min')
#2)и клирингс 18:50 до 19:05
mask = datetime.time(19, 0) == pd.to_datetime(df_1hour['time'], format='%H:%M').dt.time
df_1hour.loc[mask, 'time'] -= pd.Timedelta('10min')
# end
df_1day['time'] += pd.Timedelta('20:50:00')


df = df_1min.merge(df_5min, on=['time', 'ticker'], how='left')
df = df.merge(df_1hour, on=['time', 'ticker'], how='left')
df = df.merge(df_1day, on=['time', 'ticker'], how='left')

#Проверка, что данные все данные подтянулись
print(f'Пересесчение в данных 1min и 5min: {df['close_5min'].notnull().sum() / df_5min.shape[0]}')
print(f'Пересесчение в данных 1min и 1hour: {df['close_1hour'].notnull().sum() / df_1hour.shape[0]}')
print(f'Пересесчение в данных 1min и 1day: {df['close_1day'].notnull().sum() / df_1day.shape[0]}')
#в минутных данных бывают пропуски, это можно проинтерполировать. Но пока просто игнорируем, вроде как несущественный эффект должен быть

#Проверка на равенство значений
count_diff = ((df['close_5min'].notnull()) & (df['close'] != df['close_5min'])).sum()
assert count_diff == 0, f'Error: close & close_5min, count diff values: {count_diff}' 
count_diff = ((df['close_1hour'].notnull()) & (df['close'] != df['close_1hour'])).sum()
assert count_diff < 200, f'Error: close & close_1hour, count diff values: {count_diff}' #тут есть празничные торги они идут без вечернего клиринга и без вечерних торгов
count_diff = ((df['close_1day'].notnull()) & (df['close'] != df['close_1day'])).sum()
assert count_diff == 0, f'Error: close & close_1day, count diff values: {count_diff}' 

#Проверка что не наджоинилось лишнего
assert df_1min.shape[0] == df.shape[0], 'Error: with join dimensions'

Пересесчение в данных 1min и 5min: 0.9804569498735878
Пересесчение в данных 1min и 1hour: 0.9674929654893054
Пересесчение в данных 1min и 1day: 0.989613327848622


In [20]:
for stock in tqdm(stocks):
    mask_stock = df['ticker'] == stock
    
    cols_ffill = ['index_1min_start', 'index_start_5min', 'index_5min', 'index_start_1hour', 'index_1hour', 'index_start_1day', 'index_1day']
    df.loc[mask_stock, cols_ffill] = df.loc[mask_stock, cols_ffill].ffill()

100%|█████████████████████████████████████████████████████████████████████████| 18/18 [00:10<00:00,  1.77it/s]


In [21]:
df

Unnamed: 0,time,open,close,volume,low,high,ticker,index_1min_start,index_5min,open_5min,...,low_1hour,high_1hour,index_start_1hour,index_1day,open_1day,close_1day,volume_1day,low_1day,high_1day,index_start_1day
0,2022-10-03 10:01:00,219.000,219.930,87793.0,218.000,220.250,GAZP,0,,,...,,,,,,,,,,
1,2022-10-03 10:02:00,219.800,220.800,61549.0,219.700,220.550,GAZP,0,,,...,,,,,,,,,,
2,2022-10-03 10:03:00,220.240,219.340,52640.0,219.200,220.380,GAZP,0,,,...,,,,,,,,,,
3,2022-10-03 10:04:00,219.170,218.960,61506.0,217.700,219.400,GAZP,0,,,...,,,,,,,,,,
4,2022-10-03 10:05:00,218.820,218.300,51464.0,217.760,218.960,GAZP,0,0.0,219.000,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7847341,2024-11-19 23:46:00,38.485,38.455,1020.0,38.455,38.490,RUAL,7410480,1607833.0,,...,,,130521,9722.0,,,,,,9181
7847342,2024-11-19 23:47:00,38.490,38.510,1102.0,38.490,38.510,RUAL,7410480,1607833.0,,...,,,130521,9722.0,,,,,,9181
7847343,2024-11-19 23:48:00,38.500,38.520,665.0,38.475,38.520,RUAL,7410480,1607833.0,,...,,,130521,9722.0,,,,,,9181
7847344,2024-11-19 23:49:00,38.520,38.520,781.0,38.520,38.530,RUAL,7410480,1607833.0,,...,,,130521,9722.0,,,,,,9181


#### 2.4 Union target and features. Make file to train

In [24]:
(df_result['time']+pd.Timedelta('1min') == df['time']).all(), (df_result['close'] == df['close']).all()

(True, True)

In [26]:
#union
df = pd.concat([df, df_result[['result', 'delta_time']]], axis=1)
df

Unnamed: 0,time,open,close,volume,low,high,ticker,index_1min_start,index_5min,open_5min,...,index_start_1hour,index_1day,open_1day,close_1day,volume_1day,low_1day,high_1day,index_start_1day,result,delta_time
0,2022-10-03 10:01:00,219.000,219.930,87793.0,218.000,220.250,GAZP,0,,,...,,,,,,,,,LOSE,0 days 00:04:00
1,2022-10-03 10:02:00,219.800,220.800,61549.0,219.700,220.550,GAZP,0,,,...,,,,,,,,,LOSE,0 days 00:01:00
2,2022-10-03 10:03:00,220.240,219.340,52640.0,219.200,220.380,GAZP,0,,,...,,,,,,,,,LOSE,0 days 00:03:00
3,2022-10-03 10:04:00,219.170,218.960,61506.0,217.700,219.400,GAZP,0,,,...,,,,,,,,,LOSE,0 days 00:02:00
4,2022-10-03 10:05:00,218.820,218.300,51464.0,217.760,218.960,GAZP,0,0.0,219.000,...,,,,,,,,,LOSE,0 days 00:07:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7847341,2024-11-19 23:46:00,38.485,38.455,1020.0,38.455,38.490,RUAL,7410480,1607833.0,,...,130521,9722.0,,,,,,9181,DNF,100 days 00:00:00
7847342,2024-11-19 23:47:00,38.490,38.510,1102.0,38.490,38.510,RUAL,7410480,1607833.0,,...,130521,9722.0,,,,,,9181,DNF,100 days 00:00:00
7847343,2024-11-19 23:48:00,38.500,38.520,665.0,38.475,38.520,RUAL,7410480,1607833.0,,...,130521,9722.0,,,,,,9181,DNF,100 days 00:00:00
7847344,2024-11-19 23:49:00,38.520,38.520,781.0,38.520,38.530,RUAL,7410480,1607833.0,,...,130521,9722.0,,,,,,9181,DNF,100 days 00:00:00


### 2.5 Feature engineering

In [31]:
df

Unnamed: 0,time,open,close,volume,low,high,ticker,index_1min_start,index_5min,open_5min,...,index_start_1hour,index_1day,open_1day,close_1day,volume_1day,low_1day,high_1day,index_start_1day,result,delta_time
0,2022-10-03 10:01:00,219.000,219.930,87793.0,218.000,220.250,GAZP,0,,,...,,,,,,,,,LOSE,0 days 00:04:00
1,2022-10-03 10:02:00,219.800,220.800,61549.0,219.700,220.550,GAZP,0,,,...,,,,,,,,,LOSE,0 days 00:01:00
2,2022-10-03 10:03:00,220.240,219.340,52640.0,219.200,220.380,GAZP,0,,,...,,,,,,,,,LOSE,0 days 00:03:00
3,2022-10-03 10:04:00,219.170,218.960,61506.0,217.700,219.400,GAZP,0,,,...,,,,,,,,,LOSE,0 days 00:02:00
4,2022-10-03 10:05:00,218.820,218.300,51464.0,217.760,218.960,GAZP,0,0.0,219.000,...,,,,,,,,,LOSE,0 days 00:07:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7847341,2024-11-19 23:46:00,38.485,38.455,1020.0,38.455,38.490,RUAL,7410480,1607833.0,,...,130521,9722.0,,,,,,9181,DNF,100 days 00:00:00
7847342,2024-11-19 23:47:00,38.490,38.510,1102.0,38.490,38.510,RUAL,7410480,1607833.0,,...,130521,9722.0,,,,,,9181,DNF,100 days 00:00:00
7847343,2024-11-19 23:48:00,38.500,38.520,665.0,38.475,38.520,RUAL,7410480,1607833.0,,...,130521,9722.0,,,,,,9181,DNF,100 days 00:00:00
7847344,2024-11-19 23:49:00,38.520,38.520,781.0,38.520,38.530,RUAL,7410480,1607833.0,,...,130521,9722.0,,,,,,9181,DNF,100 days 00:00:00


In [32]:
df_5min.head()

Unnamed: 0,index_5min,time,open_5min,close_5min,volume_5min,low_5min,high_5min,ticker,index_start_5min
0,0,2022-10-03 10:05:00,219.0,218.3,314952.0,217.7,220.55,GAZP,0
1,1,2022-10-03 10:10:00,218.3,218.5,182525.0,216.19,218.78,GAZP,0
2,2,2022-10-03 10:15:00,218.11,215.86,193758.0,215.31,218.13,GAZP,0
3,3,2022-10-03 10:20:00,215.83,216.85,91747.0,215.7,217.39,GAZP,0
4,4,2022-10-03 10:25:00,216.85,216.58,52824.0,216.4,217.16,GAZP,0


In [35]:
df_1hour.head()

Unnamed: 0,index_1hour,time,open_1hour,close_1hour,volume_1hour,low_1hour,high_1hour,ticker,index_start_1hour
0,0,2022-10-03 11:00:00,219.0,215.8,1623411.0,213.63,220.55,GAZP,0
1,1,2022-10-03 12:00:00,215.8,216.36,411786.0,214.96,216.78,GAZP,0
2,2,2022-10-03 13:00:00,216.34,215.88,237084.0,215.43,216.43,GAZP,0
3,3,2022-10-03 14:00:00,215.85,217.1,573509.0,215.53,217.77,GAZP,0
4,4,2022-10-03 15:00:00,217.12,216.94,364609.0,216.5,217.49,GAZP,0


In [36]:
df_1day.head()

Unnamed: 0,index_1day,time,open_1day,close_1day,volume_1day,low_1day,high_1day,ticker,index_start_1day
0,0,2022-09-30 23:50:00,231.68,217.7,24588842.0,189.42,238.72,GAZP,0
1,1,2022-10-03 23:50:00,219.0,215.83,5070201.0,213.63,220.55,GAZP,0
2,2,2022-10-04 23:50:00,216.48,210.72,4975859.0,208.8,216.7,GAZP,0
3,3,2022-10-05 23:50:00,211.5,209.5,5322122.0,202.85,211.5,GAZP,0
4,4,2022-10-06 23:50:00,210.0,212.86,5165564.0,209.6,216.88,GAZP,0


In [38]:
def calculate_bollinger_bands(data, window):
    """Calculate Bollinger Bands"""
    rolling_mean = data.rolling(window=window, min_periods=1).mean().values
    rolling_std = data.rolling(window=window, min_periods=1).std().values
    #upper_band = rolling_mean + (rolling_std * num_of_std)
    #lower_band = rolling_mean - (rolling_std * num_of_std)
    return rolling_mean, rolling_std

def calculate_rsi(data, window):
    """Calculate Relative Strength Index"""
    delta = data.diff()
    gain = delta.clip(lower=0)
    loss = -delta.clip(upper=0)
    avg_gain = gain.rolling(window=window, min_periods=1).mean()
    avg_loss = loss.rolling(window=window, min_periods=1).mean()
    rs = avg_gain / avg_loss
    rsi = 100 - (100 / (1 + rs))

    mask = avg_loss == 0
    rsi[mask] = 100
    
    return rsi.values

def calculate_roc(data, periods):
    """Calculate Rate of Change."""
    roc = ((data - data.shift(periods)) / data.shift(periods))
    return roc.values



def calc_stats(data, window=None, feat_name=None):
    #mean, std
    rolling_mean, rolling_std = calculate_bollinger_bands(data, window)

    #min, max
    rolling_min = data.rolling(window=window, min_periods=1).min().values
    rolling_max = data.rolling(window=window, min_periods=1).max().values
    
    #rsi
    rsi = calculate_rsi(data, window)
    
    #roc
    roc = calculate_roc(data, window)
    diff = data.diff(window).values

    # #diff 1
    # diff = data.diff(1).values
    # percent_change = data.pct_change().values



    return pd.DataFrame({f'{feat_name}_ma' : rolling_mean,
                        f'{feat_name}_std' : rolling_std,
                        f'{feat_name}_min' : rolling_min,
                        f'{feat_name}_max' : rolling_max,
                        f'{feat_name}_rsi' : rsi,
                        f'{feat_name}_roc' : roc,
                        f'{feat_name}_diff' : diff,
                        })
def calc_stats_diff_1(data, feat_name=None):
    return pd.DataFrame({f'{feat_name}_roc' : data.pct_change(periods=1).values,
                        f'{feat_name}_diff' : data.diff(1).values,
                        })

#### 1min

In [52]:
dfs = []
for ticker in tqdm(df['ticker'].unique()):
    mask = np.array(df['ticker'] == ticker)
    df_ticker = df.loc[mask].copy().reset_index()

    df_close_w1 = calc_stats_diff_1(df_ticker['close'], feat_name='close_w1')
    df_volume_w1 = calc_stats_diff_1(df_ticker['volume'], feat_name='volume_w1')
    
    df_close_w60 = calc_stats(df_ticker['close'], window=60, feat_name='close_w60')
    df_volume_w60 = calc_stats(df_ticker['volume'], window=60, feat_name='volume_w60')

    df_close_w180 = calc_stats(df_ticker['close'], window=180, feat_name='close_w180')
    df_volume_w180 = calc_stats(df_ticker['volume'], window=180, feat_name='volume_w180')

    
    
    dfs += [pd.concat([df_ticker,
                       df_close_w1, df_volume_w1,
                       df_close_w60, df_volume_w60,
                       df_close_w180, df_volume_w180], axis=1)]

df_fe = pd.concat(dfs).set_index('index')


100%|█████████████████████████████████████████████████████████████████████████| 18/18 [00:08<00:00,  2.22it/s]


In [61]:
df_fe.head()

Unnamed: 0_level_0,time,open,close,volume,low,high,ticker,index_1min_start,index_5min,open_5min,...,close_w180_rsi,close_w180_roc,close_w180_diff,volume_w180_ma,volume_w180_std,volume_w180_min,volume_w180_max,volume_w180_rsi,volume_w180_roc,volume_w180_diff
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,2022-10-03 10:01:00,219.0,219.93,87793.0,218.0,220.25,GAZP,0,,,...,,,,87793.0,,87793.0,87793.0,,,
1,2022-10-03 10:02:00,219.8,220.8,61549.0,219.7,220.55,GAZP,0,,,...,100.0,,,74671.0,18557.310365,61549.0,87793.0,0.0,,
2,2022-10-03 10:03:00,220.24,219.34,52640.0,219.2,220.38,GAZP,0,,,...,37.339056,,,67327.333333,18274.988491,52640.0,87793.0,0.0,,
3,2022-10-03 10:04:00,219.17,218.96,61506.0,217.7,219.4,GAZP,0,,,...,32.103321,,,65872.0,15202.700966,52640.0,87793.0,20.141303,,
4,2022-10-03 10:05:00,218.82,218.3,51464.0,217.76,218.96,GAZP,0,0.0,219.0,...,25.816024,,,62990.4,14658.092656,51464.0,87793.0,16.399993,,


In [62]:
(df_fe['close'] == df['close']).all()

True

#### 5min

In [69]:
dfs = []
for ticker in tqdm(df_5min['ticker'].unique()):
    mask = np.array(df_5min['ticker'] == ticker)
    df_ticker = df_5min.loc[mask].copy().reset_index()

    df_close_w1 = calc_stats_diff_1(df_ticker['close_5min'], feat_name='close_5min_w1')
    df_volume_w1 = calc_stats_diff_1(df_ticker['volume_5min'], feat_name='volume_5min_w1')
    
    df_close_w12 = calc_stats(df_ticker['close_5min'], window=12, feat_name='close_5min_w12')
    df_volume_w12 = calc_stats(df_ticker['volume_5min'], window=12, feat_name='volume_5min_w12')

    df_close_w168 = calc_stats(df_ticker['close_5min'], window=12*14, feat_name='close_5min_w168')
    df_volume_w168 = calc_stats(df_ticker['volume_5min'], window=12*14, feat_name='volume_5min_w168')

    
    
    dfs += [pd.concat([df_ticker,
                       df_close_w1, df_volume_w1,
                       df_close_w12, df_volume_w12,
                       df_close_w168, df_volume_w168], axis=1)]

df_5min_fe = pd.concat(dfs).set_index('index')

100%|█████████████████████████████████████████████████████████████████████████| 18/18 [00:01<00:00, 12.01it/s]


In [72]:
df_5min_fe.head()

Unnamed: 0_level_0,index_5min,time,open_5min,close_5min,volume_5min,low_5min,high_5min,ticker,index_start_5min,close_5min_w1_roc,...,close_5min_w168_rsi,close_5min_w168_roc,close_5min_w168_diff,volume_5min_w168_ma,volume_5min_w168_std,volume_5min_w168_min,volume_5min_w168_max,volume_5min_w168_rsi,volume_5min_w168_roc,volume_5min_w168_diff
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,2022-10-03 10:05:00,219.0,218.3,314952.0,217.7,220.55,GAZP,0,,...,,,,314952.0,,314952.0,314952.0,,,
1,1,2022-10-03 10:10:00,218.3,218.5,182525.0,216.19,218.78,GAZP,0,0.000916,...,100.0,,,248738.5,93640.029712,182525.0,314952.0,0.0,,
2,2,2022-10-03 10:15:00,218.11,215.86,193758.0,215.31,218.13,GAZP,0,-0.012082,...,7.042254,,,230411.666667,73429.190669,182525.0,314952.0,7.819156,,
3,3,2022-10-03 10:20:00,215.83,216.85,91747.0,215.7,217.39,GAZP,0,0.004586,...,31.070496,,,195745.5,91659.895667,91747.0,314952.0,4.572375,,
4,4,2022-10-03 10:25:00,216.85,216.58,52824.0,216.4,217.16,GAZP,0,-0.001245,...,29.02439,,,167161.2,101913.999959,52824.0,314952.0,3.947026,,


In [74]:
(df_5min_fe['close_5min'] == df_5min['close_5min']).all(), (df_5min_fe.index == df_5min_fe['index_5min']).all()

(True, True)

#### 1hour

In [78]:
dfs = []
for ticker in tqdm(df_1hour['ticker'].unique()):
    mask = np.array(df_1hour['ticker'] == ticker)
    df_ticker = df_1hour.loc[mask].copy().reset_index()

    df_close_w1 = calc_stats_diff_1(df_ticker['close_1hour'], feat_name='close_1hour_w1')
    df_volume_w1 = calc_stats_diff_1(df_ticker['volume_1hour'], feat_name='volume_1hour_w1')
    
    df_close_w14 = calc_stats(df_ticker['close_1hour'], window=14, feat_name='close_1hour_w14')
    df_volume_w14 = calc_stats(df_ticker['volume_1hour'], window=14, feat_name='volume_1hour_w14')

    df_close_w70 = calc_stats(df_ticker['close_1hour'], window=14*5, feat_name='close_1hour_w70')
    df_volume_w70 = calc_stats(df_ticker['volume_1hour'], window=14*5, feat_name='volume_1hour_w70')

    
    
    dfs += [pd.concat([df_ticker,
                       df_close_w1, df_volume_w1,
                       df_close_w14, df_volume_w14,
                       df_close_w70, df_volume_w70], axis=1)]

df_1hour_fe = pd.concat(dfs).set_index('index')


100%|█████████████████████████████████████████████████████████████████████████| 18/18 [00:00<00:00, 90.87it/s]


In [80]:
df_1hour_fe.head()

Unnamed: 0_level_0,index_1hour,time,open_1hour,close_1hour,volume_1hour,low_1hour,high_1hour,ticker,index_start_1hour,close_1hour_w1_roc,...,close_1hour_w70_rsi,close_1hour_w70_roc,close_1hour_w70_diff,volume_1hour_w70_ma,volume_1hour_w70_std,volume_1hour_w70_min,volume_1hour_w70_max,volume_1hour_w70_rsi,volume_1hour_w70_roc,volume_1hour_w70_diff
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,2022-10-03 11:00:00,219.0,215.8,1623411.0,213.63,220.55,GAZP,0,,...,,,,1623411.0,,1623411.0,1623411.0,,,
1,1,2022-10-03 12:00:00,215.8,216.36,411786.0,214.96,216.78,GAZP,0,0.002595,...,100.0,,,1017598.5,856748.253755,411786.0,1623411.0,0.0,,
2,2,2022-10-03 13:00:00,216.34,215.88,237084.0,215.43,216.43,GAZP,0,-0.002219,...,53.846154,,,757427.0,755034.047837,237084.0,1623411.0,0.0,,
3,3,2022-10-03 14:00:00,215.85,217.1,573509.0,215.53,217.77,GAZP,0,0.005651,...,78.761062,,,711447.5,623303.617784,237084.0,1623411.0,19.528348,,
4,4,2022-10-03 15:00:00,217.12,216.94,364609.0,216.5,217.49,GAZP,0,-0.000737,...,73.553719,,,642079.8,561640.400055,237084.0,1623411.0,17.416439,,


In [82]:
(df_1hour_fe['close_1hour'] == df_1hour['close_1hour']).all(), (df_1hour_fe.index == df_1hour_fe['index_1hour']).all()

(True, True)

#### 1day

In [87]:
dfs = []
for ticker in tqdm(df_1day['ticker'].unique()):
    mask = np.array(df_1day['ticker'] == ticker)
    df_ticker = df_1day.loc[mask].copy().reset_index()

    df_close_w1 = calc_stats_diff_1(df_ticker['close_1day'], feat_name='close_1day_w1')
    df_volume_w1 = calc_stats_diff_1(df_ticker['volume_1day'], feat_name='volume_1day_w1')
    
    df_close_w5 = calc_stats(df_ticker['close_1day'], window=5, feat_name='close_1day_w5')
    df_volume_w5 = calc_stats(df_ticker['volume_1day'], window=5, feat_name='volume_1day_w5')

    df_close_w20 = calc_stats(df_ticker['close_1day'], window=5*4, feat_name='close_1day_w20')
    df_volume_w20 = calc_stats(df_ticker['volume_1day'], window=5*4, feat_name='volume_1day_w20')

    df_close_w200 = calc_stats(df_ticker['close_1day'], window=200, feat_name='close_1day_w200')
    df_volume_w200 = calc_stats(df_ticker['volume_1day'], window=200, feat_name='volume_1day_w200')

    
    
    dfs += [pd.concat([df_ticker,
                       df_close_w1, df_volume_w1,
                       df_close_w5, df_volume_w5,
                       df_close_w20, df_volume_w20,
                       df_close_w200, df_volume_w200], axis=1)]

df_1day_fe = pd.concat(dfs).set_index('index')


100%|████████████████████████████████████████████████████████████████████████| 18/18 [00:00<00:00, 150.97it/s]


In [90]:
df_1day_fe.head()

Unnamed: 0_level_0,index_1day,time,open_1day,close_1day,volume_1day,low_1day,high_1day,ticker,index_start_1day,close_1day_w1_roc,...,close_1day_w200_rsi,close_1day_w200_roc,close_1day_w200_diff,volume_1day_w200_ma,volume_1day_w200_std,volume_1day_w200_min,volume_1day_w200_max,volume_1day_w200_rsi,volume_1day_w200_roc,volume_1day_w200_diff
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,2022-09-30 23:50:00,231.68,217.7,24588842.0,189.42,238.72,GAZP,0,,...,,,,24588840.0,,24588842.0,24588842.0,,,
1,1,2022-10-03 23:50:00,219.0,215.83,5070201.0,213.63,220.55,GAZP,0,-0.00859,...,0.0,,,14829520.0,13801760.0,5070201.0,24588842.0,0.0,,
2,2,2022-10-04 23:50:00,216.48,210.72,4975859.0,208.8,216.7,GAZP,0,-0.023676,...,0.0,,,11544970.0,11296430.0,4975859.0,24588842.0,0.0,,
3,3,2022-10-05 23:50:00,211.5,209.5,5322122.0,202.85,211.5,GAZP,0,-0.00579,...,0.0,,,9989256.0,9734155.0,4975859.0,24588842.0,1.73485,,
4,4,2022-10-06 23:50:00,210.0,212.86,5165564.0,209.6,216.88,GAZP,0,0.016038,...,29.065744,,,9024518.0,8701662.0,4975859.0,24588842.0,1.721348,,


In [92]:
(df_1day_fe['close_1day'] == df_1day['close_1day']).all(), (df_1day_fe.index == df_1day_fe['index_1day']).all()

(True, True)

### time features

#### 1hour

In [99]:
df_1hour_fe.iloc[:15]

Unnamed: 0_level_0,index_1hour,time,open_1hour,close_1hour,volume_1hour,low_1hour,high_1hour,ticker,index_start_1hour,close_1hour_w1_roc,...,close_1hour_w70_rsi,close_1hour_w70_roc,close_1hour_w70_diff,volume_1hour_w70_ma,volume_1hour_w70_std,volume_1hour_w70_min,volume_1hour_w70_max,volume_1hour_w70_rsi,volume_1hour_w70_roc,volume_1hour_w70_diff
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,2022-10-03 11:00:00,219.0,215.8,1623411.0,213.63,220.55,GAZP,0,,...,,,,1623411.0,,1623411.0,1623411.0,,,
1,1,2022-10-03 12:00:00,215.8,216.36,411786.0,214.96,216.78,GAZP,0,0.002595,...,100.0,,,1017598.0,856748.253755,411786.0,1623411.0,0.0,,
2,2,2022-10-03 13:00:00,216.34,215.88,237084.0,215.43,216.43,GAZP,0,-0.002219,...,53.846154,,,757427.0,755034.047837,237084.0,1623411.0,0.0,,
3,3,2022-10-03 14:00:00,215.85,217.1,573509.0,215.53,217.77,GAZP,0,0.005651,...,78.761062,,,711447.5,623303.617784,237084.0,1623411.0,19.528348,,
4,4,2022-10-03 15:00:00,217.12,216.94,364609.0,216.5,217.49,GAZP,0,-0.000737,...,73.553719,,,642079.8,561640.400055,237084.0,1623411.0,17.416439,,
5,5,2022-10-03 16:00:00,216.94,216.54,337495.0,216.0,217.25,GAZP,0,-0.001844,...,63.120567,,,591315.7,517507.424659,237084.0,1623411.0,17.175354,,
6,6,2022-10-03 17:00:00,216.56,215.91,333448.0,215.81,216.68,GAZP,0,-0.002909,...,51.594203,,,554477.4,482366.737365,237084.0,1623411.0,17.139942,,
7,7,2022-10-03 18:00:00,215.91,215.3,556993.0,214.5,215.98,GAZP,0,-0.002825,...,43.842365,,,554791.9,446585.706537,237084.0,1623411.0,25.611999,,
8,8,2022-10-03 18:50:00,215.3,215.13,192518.0,214.87,215.3,GAZP,0,-0.00079,...,42.080378,,,514539.2,434846.442579,192518.0,1623411.0,21.952437,,
9,9,2022-10-03 20:00:00,215.5,215.25,84423.0,214.85,215.79,GAZP,0,0.000558,...,43.678161,,,471527.6,431950.536925,84423.0,1623411.0,21.059991,,


In [101]:
time_cyclic = (df_1hour_fe['time'] - pd.to_datetime(df_1hour_fe['time'].dt.date) - pd.Timedelta('11:00:00')) / pd.Timedelta('12:50:00')
df_1hour_fe['sin_time_hour'] = np.sin(time_cyclic * 2 * np.pi)
df_1hour_fe['cos_time_hour'] = np.cos(time_cyclic * 2 * np.pi)

df_1hour_fe

Unnamed: 0_level_0,index_1hour,time,open_1hour,close_1hour,volume_1hour,low_1hour,high_1hour,ticker,index_start_1hour,close_1hour_w1_roc,...,close_1hour_w70_diff,volume_1hour_w70_ma,volume_1hour_w70_std,volume_1hour_w70_min,volume_1hour_w70_max,volume_1hour_w70_rsi,volume_1hour_w70_roc,volume_1hour_w70_diff,sin_time_hour,cos_time_hour
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,2022-10-03 11:00:00,219.000,215.800,1623411.0,213.630,220.550,GAZP,0,,...,,1.623411e+06,,1623411.0,1623411.0,,,,0.000000e+00,1.000000
1,1,2022-10-03 12:00:00,215.800,216.360,411786.0,214.960,216.780,GAZP,0,0.002595,...,,1.017598e+06,856748.253755,411786.0,1623411.0,0.000000,,,4.702719e-01,0.882522
2,2,2022-10-03 13:00:00,216.340,215.880,237084.0,215.430,216.430,GAZP,0,-0.002219,...,,7.574270e+05,755034.047837,237084.0,1623411.0,0.000000,,,8.300502e-01,0.557689
3,3,2022-10-03 14:00:00,215.850,217.100,573509.0,215.530,217.770,GAZP,0,0.005651,...,,7.114475e+05,623303.617784,237084.0,1623411.0,19.528348,,,9.948025e-01,0.101823
4,4,2022-10-03 15:00:00,217.120,216.940,364609.0,216.500,217.490,GAZP,0,-0.000737,...,,6.420798e+05,561640.400055,237084.0,1623411.0,17.416439,,,9.258192e-01,-0.377967
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
138242,138242,2024-11-19 20:00:00,38.470,38.555,81823.0,38.435,38.720,RUAL,130521,0.001559,...,1.405,2.340991e+05,379091.244405,7142.0,2747722.0,50.208076,1.180261,44294.0,-9.535464e-01,-0.301246
138243,138243,2024-11-19 21:00:00,38.560,38.710,68970.0,38.440,38.710,RUAL,130521,0.004020,...,1.575,2.346631e+05,378811.718532,7142.0,2747722.0,50.185359,1.338442,39476.0,-9.831929e-01,0.182570
138244,138244,2024-11-19 22:00:00,38.700,38.570,35080.0,38.570,38.725,RUAL,130521,-0.003617,...,1.420,2.349590e+05,378645.173279,7142.0,2747722.0,50.097101,1.442217,20716.0,-7.818315e-01,0.623490
138245,138245,2024-11-19 23:00:00,38.585,38.540,18640.0,38.515,38.605,RUAL,130521,-0.000778,...,1.435,2.347905e+05,378740.096147,7142.0,2747722.0,49.944730,-0.387487,-11792.0,-3.967734e-01,0.917917


#### 1day

In [107]:
day_of_week_cyclic = np.minimum(df_1day_fe['time'].dt.dayofweek, 4) / 4
df_1day_fe['sin_time_weekday'] = np.sin(day_of_week_cyclic * 2 * np.pi)
df_1day_fe['cos_time_weekday'] = np.cos(day_of_week_cyclic * 2 * np.pi)

df_1day_fe.head()

Unnamed: 0_level_0,index_1day,time,open_1day,close_1day,volume_1day,low_1day,high_1day,ticker,index_start_1day,close_1day_w1_roc,...,close_1day_w200_diff,volume_1day_w200_ma,volume_1day_w200_std,volume_1day_w200_min,volume_1day_w200_max,volume_1day_w200_rsi,volume_1day_w200_roc,volume_1day_w200_diff,sin_time_weekday,cos_time_weekday
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,2022-09-30 23:50:00,231.68,217.7,24588842.0,189.42,238.72,GAZP,0,,...,,24588840.0,,24588842.0,24588842.0,,,,-2.449294e-16,1.0
1,1,2022-10-03 23:50:00,219.0,215.83,5070201.0,213.63,220.55,GAZP,0,-0.00859,...,,14829520.0,13801760.0,5070201.0,24588842.0,0.0,,,0.0,1.0
2,2,2022-10-04 23:50:00,216.48,210.72,4975859.0,208.8,216.7,GAZP,0,-0.023676,...,,11544970.0,11296430.0,4975859.0,24588842.0,0.0,,,1.0,6.123234000000001e-17
3,3,2022-10-05 23:50:00,211.5,209.5,5322122.0,202.85,211.5,GAZP,0,-0.00579,...,,9989256.0,9734155.0,4975859.0,24588842.0,1.73485,,,1.224647e-16,-1.0
4,4,2022-10-06 23:50:00,210.0,212.86,5165564.0,209.6,216.88,GAZP,0,0.016038,...,,9024518.0,8701662.0,4975859.0,24588842.0,1.721348,,,-1.0,-1.83697e-16


## 2.6 Save data

In [237]:
df_fe.columns

Index(['time', 'open', 'close', 'volume', 'low', 'high', 'ticker',
       'index_1min_start', 'index_5min', 'open_5min', 'close_5min',
       'volume_5min', 'low_5min', 'high_5min', 'index_start_5min',
       'index_1hour', 'open_1hour', 'close_1hour', 'volume_1hour', 'low_1hour',
       'high_1hour', 'index_start_1hour', 'index_1day', 'open_1day',
       'close_1day', 'volume_1day', 'low_1day', 'high_1day',
       'index_start_1day', 'result', 'delta_time', 'close_w1_roc',
       'close_w1_diff', 'volume_w1_roc', 'volume_w1_diff', 'close_w60_ma',
       'close_w60_std', 'close_w60_min', 'close_w60_max', 'close_w60_rsi',
       'close_w60_roc', 'close_w60_diff', 'volume_w60_ma', 'volume_w60_std',
       'volume_w60_min', 'volume_w60_max', 'volume_w60_rsi', 'volume_w60_roc',
       'volume_w60_diff', 'close_w180_ma', 'close_w180_std', 'close_w180_min',
       'close_w180_max', 'close_w180_rsi', 'close_w180_roc', 'close_w180_diff',
       'volume_w180_ma', 'volume_w180_std', 'volume_w180

In [240]:
need_cols = ['time',
             'close', 'volume',
            
             'close_w1_roc',
               'close_w1_diff', 'volume_w1_roc', 'volume_w1_diff', 'close_w60_ma',
               'close_w60_std', 'close_w60_min', 'close_w60_max', 'close_w60_rsi',
               'close_w60_roc', 'close_w60_diff', 'volume_w60_ma', 'volume_w60_std',
               'volume_w60_min', 'volume_w60_max', 'volume_w60_rsi', 'volume_w60_roc',
               'volume_w60_diff', 'close_w180_ma', 'close_w180_std', 'close_w180_min',
               'close_w180_max', 'close_w180_rsi', 'close_w180_roc', 'close_w180_diff',
               'volume_w180_ma', 'volume_w180_std', 'volume_w180_min',
               'volume_w180_max', 'volume_w180_rsi', 'volume_w180_roc',
               'volume_w180_diff',
             
             'ticker',
             'index_1min_start',
             'index_5min', 'index_start_5min',
             'index_1hour', 'index_start_1hour',
             'index_1day', 'index_start_1day',
             'result', 'delta_time' 
            ]
df_fe[need_cols]

Unnamed: 0_level_0,time,close,volume,close_w1_roc,close_w1_diff,volume_w1_roc,volume_w1_diff,close_w60_ma,close_w60_std,close_w60_min,...,ticker,index_1min_start,index_5min,index_start_5min,index_1hour,index_start_1hour,index_1day,index_start_1day,result,delta_time
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,2022-10-03 10:01:00,219.930,87793.0,,,,,219.930000,,219.930,...,GAZP,0,,,,,,,LOSE,0 days 00:04:00
1,2022-10-03 10:02:00,220.800,61549.0,0.003956,0.870,-0.298930,-26244.0,220.365000,0.615183,219.930,...,GAZP,0,,,,,,,LOSE,0 days 00:01:00
2,2022-10-03 10:03:00,219.340,52640.0,-0.006612,-1.460,-0.144746,-8909.0,220.023333,0.734461,219.340,...,GAZP,0,,,,,,,LOSE,0 days 00:03:00
3,2022-10-03 10:04:00,218.960,61506.0,-0.001732,-0.380,0.168427,8866.0,219.757500,0.801431,218.960,...,GAZP,0,,,,,,,LOSE,0 days 00:02:00
4,2022-10-03 10:05:00,218.300,51464.0,-0.003014,-0.660,-0.163269,-10042.0,219.466000,0.952145,218.300,...,GAZP,0,0.0,0,,,,,LOSE,0 days 00:07:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7847341,2024-11-19 23:46:00,38.455,1020.0,-0.000780,-0.030,-0.754513,-3135.0,38.507417,0.044802,38.435,...,RUAL,7410480,1607833.0,1517952,138245.0,130521,9722.0,9181,DNF,100 days 00:00:00
7847342,2024-11-19 23:47:00,38.510,1102.0,0.001430,0.055,0.080392,82.0,38.506667,0.044366,38.435,...,RUAL,7410480,1607833.0,1517952,138245.0,130521,9722.0,9181,DNF,100 days 00:00:00
7847343,2024-11-19 23:48:00,38.520,665.0,0.000260,0.010,-0.396552,-437.0,38.506000,0.043849,38.435,...,RUAL,7410480,1607833.0,1517952,138245.0,130521,9722.0,9181,DNF,100 days 00:00:00
7847344,2024-11-19 23:49:00,38.520,781.0,0.000000,0.000,0.174436,116.0,38.505333,0.043315,38.435,...,RUAL,7410480,1607833.0,1517952,138245.0,130521,9722.0,9181,DNF,100 days 00:00:00


In [242]:
for col in tqdm(need_cols):
    try:
        df_fe[col] = df_fe[col].astype(np.float32)
    except:
        print(col)

 16%|███████████▌                                                              | 7/45 [00:00<00:00, 63.99it/s]

time


 82%|████████████████████████████████████████████████████████████             | 37/45 [00:00<00:00, 48.29it/s]

ticker


100%|█████████████████████████████████████████████████████████████████████████| 45/45 [00:01<00:00, 37.18it/s]

result
delta_time





In [115]:
!mkdir experiments/feat_engin/data

mkdir: experiments/feat_engin: File exists


In [244]:
#save
dump_pkl(df_fe[need_cols], 'experiments/feat_engin/data/data_shift_time_1min.pkl')

In [247]:
(~df_fe.columns.isin(['ticker', 'result'])).sum(), df_fe[df_fe.columns[~df_fe.columns.isin(['ticker', 'result'])]].mean().nunique()


(61, 61)

In [250]:
#5min

In [252]:
df_5min_fe.columns

Index(['index_5min', 'time', 'open_5min', 'close_5min', 'volume_5min',
       'low_5min', 'high_5min', 'ticker', 'index_start_5min',
       'close_5min_w1_roc', 'close_5min_w1_diff', 'volume_5min_w1_roc',
       'volume_5min_w1_diff', 'close_5min_w12_ma', 'close_5min_w12_std',
       'close_5min_w12_min', 'close_5min_w12_max', 'close_5min_w12_rsi',
       'close_5min_w12_roc', 'close_5min_w12_diff', 'volume_5min_w12_ma',
       'volume_5min_w12_std', 'volume_5min_w12_min', 'volume_5min_w12_max',
       'volume_5min_w12_rsi', 'volume_5min_w12_roc', 'volume_5min_w12_diff',
       'close_5min_w168_ma', 'close_5min_w168_std', 'close_5min_w168_min',
       'close_5min_w168_max', 'close_5min_w168_rsi', 'close_5min_w168_roc',
       'close_5min_w168_diff', 'volume_5min_w168_ma', 'volume_5min_w168_std',
       'volume_5min_w168_min', 'volume_5min_w168_max', 'volume_5min_w168_rsi',
       'volume_5min_w168_roc', 'volume_5min_w168_diff'],
      dtype='object')

In [254]:
need_cols = ['time', 
             'close_5min', 'volume_5min',

        'close_5min_w1_roc', 'close_5min_w1_diff', 'volume_5min_w1_roc',
       'volume_5min_w1_diff', 'close_5min_w12_ma', 'close_5min_w12_std',
       'close_5min_w12_min', 'close_5min_w12_max', 'close_5min_w12_rsi',
       'close_5min_w12_roc', 'close_5min_w12_diff', 'volume_5min_w12_ma',
       'volume_5min_w12_std', 'volume_5min_w12_min', 'volume_5min_w12_max',
       'volume_5min_w12_rsi', 'volume_5min_w12_roc', 'volume_5min_w12_diff',
       'close_5min_w168_ma', 'close_5min_w168_std', 'close_5min_w168_min',
       'close_5min_w168_max', 'close_5min_w168_rsi', 'close_5min_w168_roc',
       'close_5min_w168_diff', 'volume_5min_w168_ma', 'volume_5min_w168_std',
       'volume_5min_w168_min', 'volume_5min_w168_max', 'volume_5min_w168_rsi',
       'volume_5min_w168_roc', 'volume_5min_w168_diff',
             
             'ticker']

for col in tqdm(need_cols):
    try:
        df_5min_fe[col] = df_5min_fe[col].astype(np.float32)
    except:
        print(col)

dump_pkl(df_5min_fe[need_cols], 'experiments/feat_engin/data/data_shift_time_5min.pkl')

 53%|██████████████████████████████████████                                  | 19/36 [00:00<00:00, 173.59it/s]

time


100%|████████████████████████████████████████████████████████████████████████| 36/36 [00:00<00:00, 153.73it/s]


ticker


In [257]:
(~df_5min_fe.columns.isin(['ticker', 'result'])).sum(), df_5min_fe[df_5min_fe.columns[~df_5min_fe.columns.isin(['ticker', 'result'])]].mean().nunique()


(40, 40)

In [260]:
#1hour

In [262]:
df_1hour_fe.columns

Index(['index_1hour', 'time', 'open_1hour', 'close_1hour', 'volume_1hour',
       'low_1hour', 'high_1hour', 'ticker', 'index_start_1hour',
       'close_1hour_w1_roc', 'close_1hour_w1_diff', 'volume_1hour_w1_roc',
       'volume_1hour_w1_diff', 'close_1hour_w14_ma', 'close_1hour_w14_std',
       'close_1hour_w14_min', 'close_1hour_w14_max', 'close_1hour_w14_rsi',
       'close_1hour_w14_roc', 'close_1hour_w14_diff', 'volume_1hour_w14_ma',
       'volume_1hour_w14_std', 'volume_1hour_w14_min', 'volume_1hour_w14_max',
       'volume_1hour_w14_rsi', 'volume_1hour_w14_roc', 'volume_1hour_w14_diff',
       'close_1hour_w70_ma', 'close_1hour_w70_std', 'close_1hour_w70_min',
       'close_1hour_w70_max', 'close_1hour_w70_rsi', 'close_1hour_w70_roc',
       'close_1hour_w70_diff', 'volume_1hour_w70_ma', 'volume_1hour_w70_std',
       'volume_1hour_w70_min', 'volume_1hour_w70_max', 'volume_1hour_w70_rsi',
       'volume_1hour_w70_roc', 'volume_1hour_w70_diff', 'sin_time_hour',
       'cos_time

In [264]:
need_cols = ['time',
             'sin_time_hour', 'cos_time_hour',
             'close_1hour', 'volume_1hour',

        'close_1hour_w1_roc', 'close_1hour_w1_diff', 'volume_1hour_w1_roc',
       'volume_1hour_w1_diff', 'close_1hour_w14_ma', 'close_1hour_w14_std',
       'close_1hour_w14_min', 'close_1hour_w14_max', 'close_1hour_w14_rsi',
       'close_1hour_w14_roc', 'close_1hour_w14_diff', 'volume_1hour_w14_ma',
       'volume_1hour_w14_std', 'volume_1hour_w14_min', 'volume_1hour_w14_max',
       'volume_1hour_w14_rsi', 'volume_1hour_w14_roc', 'volume_1hour_w14_diff',
       'close_1hour_w70_ma', 'close_1hour_w70_std', 'close_1hour_w70_min',
       'close_1hour_w70_max', 'close_1hour_w70_rsi', 'close_1hour_w70_roc',
       'close_1hour_w70_diff', 'volume_1hour_w70_ma', 'volume_1hour_w70_std',
       'volume_1hour_w70_min', 'volume_1hour_w70_max', 'volume_1hour_w70_rsi',
       'volume_1hour_w70_roc', 'volume_1hour_w70_diff',
             
             'ticker']

for col in tqdm(need_cols):
    try:
        df_1hour_fe[col] = df_1hour_fe[col].astype(np.float32)
    except:
        print(col)

dump_pkl(df_1hour_fe[need_cols], 'experiments/feat_engin/data/data_shift_time_1hour.pkl')

100%|███████████████████████████████████████████████████████████████████████| 38/38 [00:00<00:00, 2199.97it/s]

time
ticker





In [267]:
(~df_1hour_fe.columns.isin(['ticker', 'result'])).sum(), df_1hour_fe[df_1hour_fe.columns[~df_1hour_fe.columns.isin(['ticker', 'result'])]].mean().nunique()


(42, 42)

In [270]:
#1day

In [275]:
df_1day_fe.columns

Index(['index_1day', 'time', 'open_1day', 'close_1day', 'volume_1day',
       'low_1day', 'high_1day', 'ticker', 'index_start_1day',
       'close_1day_w1_roc', 'close_1day_w1_diff', 'volume_1day_w1_roc',
       'volume_1day_w1_diff', 'close_1day_w5_ma', 'close_1day_w5_std',
       'close_1day_w5_min', 'close_1day_w5_max', 'close_1day_w5_rsi',
       'close_1day_w5_roc', 'close_1day_w5_diff', 'volume_1day_w5_ma',
       'volume_1day_w5_std', 'volume_1day_w5_min', 'volume_1day_w5_max',
       'volume_1day_w5_rsi', 'volume_1day_w5_roc', 'volume_1day_w5_diff',
       'close_1day_w20_ma', 'close_1day_w20_std', 'close_1day_w20_min',
       'close_1day_w20_max', 'close_1day_w20_rsi', 'close_1day_w20_roc',
       'close_1day_w20_diff', 'volume_1day_w20_ma', 'volume_1day_w20_std',
       'volume_1day_w20_min', 'volume_1day_w20_max', 'volume_1day_w20_rsi',
       'volume_1day_w20_roc', 'volume_1day_w20_diff', 'close_1day_w200_ma',
       'close_1day_w200_std', 'close_1day_w200_min', 'close_1day

In [277]:
need_cols = ['time',
             'sin_time_weekday', 'cos_time_weekday',
             'close_1day', 'volume_1day',

             'close_1day_w1_roc', 'close_1day_w1_diff', 'volume_1day_w1_roc',
       'volume_1day_w1_diff', 'close_1day_w5_ma', 'close_1day_w5_std',
       'close_1day_w5_min', 'close_1day_w5_max', 'close_1day_w5_rsi',
       'close_1day_w5_roc', 'close_1day_w5_diff', 'volume_1day_w5_ma',
       'volume_1day_w5_std', 'volume_1day_w5_min', 'volume_1day_w5_max',
       'volume_1day_w5_rsi', 'volume_1day_w5_roc', 'volume_1day_w5_diff',
       'close_1day_w20_ma', 'close_1day_w20_std', 'close_1day_w20_min',
       'close_1day_w20_max', 'close_1day_w20_rsi', 'close_1day_w20_roc',
       'close_1day_w20_diff', 'volume_1day_w20_ma', 'volume_1day_w20_std',
       'volume_1day_w20_min', 'volume_1day_w20_max', 'volume_1day_w20_rsi',
       'volume_1day_w20_roc', 'volume_1day_w20_diff', 'close_1day_w200_ma',
       'close_1day_w200_std', 'close_1day_w200_min', 'close_1day_w200_max',
       'close_1day_w200_rsi',
             #'close_1day_w200_roc', 'close_1day_w200_diff',
       'volume_1day_w200_ma', 'volume_1day_w200_std', 'volume_1day_w200_min',
       'volume_1day_w200_max', 'volume_1day_w200_rsi', 
             #'volume_1day_w200_roc', 'volume_1day_w200_diff',
             
             'ticker']

for col in tqdm(need_cols):
    try:
        df_1day_fe[col] = df_1day_fe[col].astype(np.float32)
    except:
        print(col)

dump_pkl(df_1day_fe[need_cols], 'experiments/feat_engin/data/data_shift_time_1day.pkl')

100%|███████████████████████████████████████████████████████████████████████| 48/48 [00:00<00:00, 3039.26it/s]

time
ticker





In [280]:
(~df_1day_fe.columns.isin(['ticker', 'result'])).sum(), df_1day_fe[df_1day_fe.columns[~df_1day_fe.columns.isin(['ticker', 'result'])]].mean().nunique()

(56, 56)

# TMP

In [229]:
df_fe[df_fe.columns[~df_fe.columns.isin(['ticker', 'time', 'result', 'delta_time'])]].abs().max().max()

38785459.0

In [231]:
df_5min_fe[df_5min_fe.columns[~df_5min_fe.columns.isin(['ticker', 'time'])]].abs().max().max()

3836758.0

In [233]:
df_1hour_fe[df_1hour_fe.columns[~df_1hour_fe.columns.isin(['ticker', 'time'])]].abs().max().max()

15106651.0

In [235]:
df_1day_fe[df_1day_fe.columns[~df_1day_fe.columns.isin(['ticker', 'time'])]].abs().max().max()

38785459.0

In [216]:
np.array([38785459]).astype(np.float32)

array([38785460.], dtype=float32)

In [218]:
df_1day_fe[df_1day_fe.columns[~df_1day_fe.columns.isin(['ticker', 'time'])]].max()

index_1day                          9723
open_1day                         8450.0
close_1day                        8444.0
volume_1day                   38785459.0
low_1day                          8315.0
high_1day                         8488.0
index_start_1day                    9181
close_1day_w1_roc               0.127022
close_1day_w1_diff                 406.0
volume_1day_w1_roc             30.210031
volume_1day_w1_diff           33635365.0
close_1day_w5_ma                  8354.5
close_1day_w5_std             524.285156
close_1day_w5_min                 8290.0
close_1day_w5_max                 8444.0
close_1day_w5_rsi                  100.0
close_1day_w5_roc               0.236323
close_1day_w5_diff                 921.5
volume_1day_w5_ma             24588842.0
volume_1day_w5_std       14241809.977249
volume_1day_w5_min            24588842.0
volume_1day_w5_max            38785459.0
volume_1day_w5_rsi                 100.0
volume_1day_w5_roc             36.823527
volume_1day_w5_d