In [1]:
#!pip3 install torch torchvision torchaudio

In [3]:
#!python3 -m pip install tensorflow

In [4]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import random
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import os

from sklearn.metrics import roc_auc_score

import pickle
def dump_pkl(data, filename):
  with open(filename, 'wb') as handle:
    pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)

def load_pkl(filename):
  with open(filename, 'rb') as handle:
    data = pickle.load(handle)
  return data

### 1. Load data

In [9]:

dfs_1hour = []
dfs_1day = []

stocks = [elem.split('.')[0] for elem in os.listdir('./data/preproc/1hour/')]

for stock in tqdm(stocks):

    df_1hour = load_pkl(f"./data/preproc/1hour/{stock}.pkl")
    df_1day = load_pkl(f"./data/preproc/1day/{stock}.pkl")
    

    dfs_1hour += [df_1hour.copy()]
    dfs_1day += [df_1day.copy()]
    




df_1day = pd.concat(dfs_1day)
df_1hour = pd.concat(dfs_1hour)


df_1hour.reset_index(drop=True, inplace=True)
df_1day.reset_index(drop=True, inplace=True)

df_1hour.shape, df_1day.shape

100%|██████████████████████████████████████████████████████████████████████| 127/127 [00:00<00:00, 708.68it/s]


((972611, 8), (85512, 8))

In [10]:
df_1hour.head()

Unnamed: 0,time,open,close,volume,low,high,ticker,tmos_close
0,2022-05-04 10:00:00,32.14,32.1,503380.0,31.46,32.68,AFLT,4.27
1,2022-05-04 11:00:00,32.1,31.38,191664.0,31.14,32.16,AFLT,4.2
2,2022-05-04 12:00:00,31.4,31.8,89256.0,31.38,31.92,AFLT,4.22
3,2022-05-04 13:00:00,31.8,31.36,87929.0,31.3,31.84,AFLT,4.21
4,2022-05-04 14:00:00,31.34,31.54,69671.0,31.14,31.58,AFLT,4.21


### 2. Preproc data

#### 2.1 Make target

In [16]:
# Используем уже знакомую функцию :)

def get_target(df_all, ind, val_first, val_second, points_to_wait):
    #цель - достичь val_second, не достигнув val_first

    ind_end = min(ind+points_to_wait+1, df_all.shape[0])
    
    df = df_all.iloc[ind:ind_end].copy()
    
    mask_stock = np.array(df['ticker'] == df['ticker'].iloc[0])
    df = df.loc[mask_stock, :]

    start_price = df['close'].iloc[0]
    
    if val_first < val_second:                                 
        mask_val_first = np.array(df['close'] < val_first)
        mask_val_second = np.array(df['close'] > val_second)
    else: 
        mask_val_first = np.array(df['close'] > val_first)      ### тут будущему мне поразбираться нужно будет
        mask_val_second = np.array(df['close'] < val_second)

    
    if (mask_val_first.sum() == 0) & (mask_val_second.sum() == 0):

        ind_end = df.shape[0]-1
        delta_time = df['time'].iloc[ind_end] - df['time'].iloc[0]
        res_price = df['close'].iloc[ind_end]
        income_rate = res_price/start_price
        return 'DNF', delta_time, income_rate, res_price, ind_end+ind
        
    if (mask_val_first.sum() == 0) & (mask_val_second.sum() != 0):
        ind_val_second = np.argwhere(mask_val_second).ravel()[0]
        delta_time = df['time'].iloc[ind_val_second] - df['time'].iloc[0]
        res_price = df['close'].iloc[ind_val_second]
        income_rate = res_price/start_price
        return 'WIN', delta_time, income_rate, res_price, ind_val_second+ind
        
    if (mask_val_first.sum() != 0) & (mask_val_second.sum() == 0):
        ind_val_first = np.argwhere(mask_val_first).ravel()[0]
        delta_time = df['time'].iloc[ind_val_first] - df['time'].iloc[0]
        res_price = df['close'].iloc[ind_val_first]
        income_rate = res_price/start_price
        return 'LOSE', delta_time, income_rate, res_price, ind_val_first+ind

    if (mask_val_first.sum() != 0) & (mask_val_second.sum() != 0):
        ind_val_first = np.argwhere(mask_val_first).ravel()[0]
        ind_val_second = np.argwhere(mask_val_second).ravel()[0]
        if ind_val_first < ind_val_second:
            delta_time = df['time'].iloc[ind_val_first] - df['time'].iloc[0]
            res_price = df['close'].iloc[ind_val_first]
            income_rate = res_price/start_price
            return 'LOSE', delta_time, income_rate, res_price, ind_val_first+ind
        if ind_val_first > ind_val_second:
            delta_time = df['time'].iloc[ind_val_second] - df['time'].iloc[0]
            res_price = df['close'].iloc[ind_val_second]
            income_rate = res_price/start_price
            return 'WIN', delta_time, income_rate, res_price, ind_val_second+ind


def get_df_target(df, indx, percent_first=None, percent_second=None, points_to_wait=None):
    times = []
    results = []
    delta_times = []
    income_rates = []
    closes = []
    tickers = []
    res_prices = []
    res_inds = []
    
    #for ind in indx:
    for ind in tqdm(indx):
        time = df['time'].iloc[ind]
        close = df['close'].iloc[ind]
        ticker = df['ticker'].iloc[ind]
        
        val_first = df['close'].iloc[ind] * percent_first
        val_second = df['close'].iloc[ind] * percent_second
        result, delta_time, income_rate, res_price, res_ind = get_target(df, ind, val_first, val_second, points_to_wait)
        
        times += [time]
        closes += [close]
        tickers += [ticker]
        results += [result]
        delta_times += [delta_time]
        income_rates += [income_rate]
        res_prices += [res_price]
        res_inds += [res_ind]
        

    df_result = pd.DataFrame({'ind' : indx,
                              'time' : times,
                              'close' : closes,
                              'result' : results,
                              'ticker' : tickers, 
                              'delta_time' : delta_times,
                              'income_rate' : income_rates,
                              'res_price' : res_prices,
                              'res_ind' : res_inds
                             })
    return df_result    

In [21]:
inds = np.arange(df_1hour.shape[0])
inds.shape

(972611,)

In [23]:
df_result = get_df_target(df_1hour, inds, percent_first=0.98, percent_second=1.04, points_to_wait=14*2)

  2%|█▎                                                              | 20787/972611 [00:04<03:35, 4415.96it/s]


KeyboardInterrupt: 

In [14]:
df_result

Unnamed: 0,ind,time,close,result,ticker,delta_time,income_rate,res_price,res_ind
0,0,2022-05-04 10:00:00,32.100,LOSE,AFLT,1 days 01:00:00,0.965109,30.98,10
1,1,2022-05-04 11:00:00,31.380,LOSE,AFLT,1 days 23:00:00,0.968133,30.38,18
2,2,2022-05-04 12:00:00,31.800,LOSE,AFLT,1 days 04:00:00,0.963522,30.64,15
3,3,2022-05-04 13:00:00,31.360,LOSE,AFLT,1 days 21:00:00,0.968750,30.38,18
4,4,2022-05-04 14:00:00,31.540,LOSE,AFLT,1 days 20:00:00,0.963221,30.38,18
...,...,...,...,...,...,...,...,...,...
972606,972606,2024-12-31 19:00:00,10.830,DNF,RBCM,0 days 04:00:00,0.994460,10.77,972610
972607,972607,2024-12-31 20:00:00,10.775,DNF,RBCM,0 days 03:00:00,0.999536,10.77,972610
972608,972608,2024-12-31 21:00:00,10.820,DNF,RBCM,0 days 02:00:00,0.995379,10.77,972610
972609,972609,2024-12-31 22:00:00,10.765,DNF,RBCM,0 days 01:00:00,1.000464,10.77,972610


In [25]:
df_result['result'].value_counts(normalize=True)

result
LOSE    0.400748
DNF     0.342301
WIN     0.256951
Name: proportion, dtype: float64

In [27]:
df_result['income_rate'].quantile(q=[0, 0.01]+np.arange(0.1, 1, 0.1).tolist()+[0.99, 1])

0.00    0.679149
0.01    0.932930
0.10    0.961252
0.20    0.966035
0.30    0.968447
0.40    0.969986
0.50    0.995260
0.60    1.009383
0.70    1.028608
0.80    1.052100
0.90    1.059642
0.99    1.123267
1.00    1.821375
Name: income_rate, dtype: float64

In [30]:
(df_result['ind'] == df_result.index).all(), 

(True,)

In [None]:
#тут

In [33]:
!mkdir data/feat_engin

In [37]:
dump_pkl(df_result, './data/feat_engin/df_result_wait_35_7days.pkl')

#### 2.1.2 Загрузим датасет с ожиданием в 1 день

In [3]:
df_result = load_pkl('./data/feat_engin/df_result_wait_35_7days.pkl')

In [5]:
df_result

Unnamed: 0,ind,time,close,result,ticker,delta_time,income_rate,res_price,res_ind
0,0,2022-05-04 10:00:00,32.100,LOSE,AFLT,1 days 01:00:00,0.965109,30.98,10
1,1,2022-05-04 11:00:00,31.380,LOSE,AFLT,1 days 23:00:00,0.968133,30.38,18
2,2,2022-05-04 12:00:00,31.800,LOSE,AFLT,1 days 04:00:00,0.963522,30.64,15
3,3,2022-05-04 13:00:00,31.360,LOSE,AFLT,1 days 21:00:00,0.968750,30.38,18
4,4,2022-05-04 14:00:00,31.540,LOSE,AFLT,1 days 20:00:00,0.963221,30.38,18
...,...,...,...,...,...,...,...,...,...
972606,972606,2024-12-31 19:00:00,10.830,DNF,RBCM,0 days 04:00:00,0.994460,10.77,972610
972607,972607,2024-12-31 20:00:00,10.775,DNF,RBCM,0 days 03:00:00,0.999536,10.77,972610
972608,972608,2024-12-31 21:00:00,10.820,DNF,RBCM,0 days 02:00:00,0.995379,10.77,972610
972609,972609,2024-12-31 22:00:00,10.765,DNF,RBCM,0 days 01:00:00,1.000464,10.77,972610


In [7]:
df_result['result'].value_counts(normalize=True)

result
LOSE    0.400748
DNF     0.342301
WIN     0.256951
Name: proportion, dtype: float64

#### 2.2 Link data of different time-period

In [12]:
dfs_1hour = []
dfs_1day = []

stocks = [elem.split('.')[0] for elem in os.listdir('./data/preproc/1hour/')]

for stock in tqdm(stocks):
    df_1hour = load_pkl(f"./data/preproc/1hour/{stock}.pkl")
    df_1day = load_pkl(f"./data/preproc/1day/{stock}.pkl")


    df_1hour['index_start'] = None
    df_1hour.loc[0, 'index_start'] = 42
    df_1day['index_start'] = None
    df_1day.loc[0, 'index_start'] = 42

    dfs_1hour += [df_1hour.copy()]
    dfs_1day += [df_1day.copy()]
    

df_1day = pd.concat(dfs_1day)
df_1hour = pd.concat(dfs_1hour)

df_1hour.reset_index(drop=True, inplace=True)
df_1day.reset_index(drop=True, inplace=True)

df_1hour.shape, df_1day.shape

100%|██████████████████████████████████████████████████████████████████████| 127/127 [00:00<00:00, 567.45it/s]


((972611, 9), (85512, 9))

In [14]:
pd.set_option('future.no_silent_downcasting', True)

#start index in each df-time

#1hour
mask = df_1hour['index_start'] == 42
df_1hour.loc[mask, 'index_start'] = df_1hour.index[mask]
df_1hour['index_start'] = df_1hour['index_start'].ffill()

#1day
mask = df_1day['index_start'] == 42
df_1day.loc[mask, 'index_start'] = df_1day.index[mask]
df_1day['index_start'] = df_1day['index_start'].ffill()

In [17]:
#link different time-step date

#послденюю точку каждого дня сделаем равной счече в 23:00 #каст костыльный, чтобы все удобно сджоинить
mask = (df_1hour['time'].dt.day.diff(-1) != 0)
orig_time_cp = df_1hour.loc[mask, 'time'].copy()
df_1hour.loc[mask, 'time'] = pd.to_datetime(df_1hour.loc[mask, 'time'].dt.date) + pd.Timedelta('23:00:00')


df_1day = df_1day.reset_index()
df_1day = df_1day.rename(columns={col : col+'_1day' for col in df_1day.columns if col not in ['time', 'ticker']})
df_1day['time'] += pd.Timedelta('20:00:00')


df = df_1hour.merge(df_1day, on=['time', 'ticker'], how='left')

#Проверка, что данные все данные подтянулись
print(f'Пересесчение в данных 1hour и 1day: {df['close_1day'].notnull().sum() / df_1day.shape[0]}')

mask_diff = ((df['close_1day'].notnull()) & (df['close'] != df['close_1day']))
print(f'Error: close & close_1day, count diff values: {mask_diff.sum(), mask_diff.sum()/df['close_1day'].notnull().sum()}')

#Проверка что не наджоинилось лишнего
assert df_1hour.shape[0] == df.shape[0], 'Error: with join dimensions'


#вернем время обратно на место
df.loc[mask, 'time'] = orig_time_cp

Пересесчение в данных 1hour и 1day: 1.0
Error: close & close_1day, count diff values: (6854, 0.08015249321732622)


In [20]:
#действительно данные разняться от на дневкеи часовики. Берем в таких случаях данные часовика, как более точные
mask = df['close_1day'].notnull() & (df['close'] != df['close_1day'])
print(mask.sum())

df.loc[mask, "close_1day"] = df.loc[mask, "close"]

mask = df['close_1day'].notnull() & (df['close'] != df['close_1day'])
print('-->', mask.sum())


6854
--> 0


In [23]:
df.head()

Unnamed: 0,time,open,close,volume,low,high,ticker,tmos_close,index_start,index_1day,open_1day,close_1day,volume_1day,low_1day,high_1day,tmos_close_1day,index_start_1day
0,2022-05-04 10:00:00,32.14,32.1,503380.0,31.46,32.68,AFLT,4.27,0,,,,,,,,
1,2022-05-04 11:00:00,32.1,31.38,191664.0,31.14,32.16,AFLT,4.2,0,,,,,,,,
2,2022-05-04 12:00:00,31.4,31.8,89256.0,31.38,31.92,AFLT,4.22,0,,,,,,,,
3,2022-05-04 13:00:00,31.8,31.36,87929.0,31.3,31.84,AFLT,4.21,0,,,,,,,,
4,2022-05-04 14:00:00,31.34,31.54,69671.0,31.14,31.58,AFLT,4.21,0,,,,,,,,


In [25]:
for stock in tqdm(stocks):
    mask_stock = df['ticker'] == stock
    
    cols_ffill = ['index_start_1day', 'index_1day']
    df.loc[mask_stock, cols_ffill] = df.loc[mask_stock, cols_ffill].ffill()

100%|███████████████████████████████████████████████████████████████████████| 127/127 [00:03<00:00, 34.64it/s]


#### 2.4 Union target and features. Make data_file to train

In [30]:
(df_result['time'] == df['time']).all(), (df_result['close'] == df['close']).all()
#прикольно вышло

(True, True)

In [32]:
(df.index == df_result.index).all()

True

In [34]:
df_result.head()

Unnamed: 0,ind,time,close,result,ticker,delta_time,income_rate,res_price,res_ind
0,0,2022-05-04 10:00:00,32.1,LOSE,AFLT,1 days 01:00:00,0.965109,30.98,10
1,1,2022-05-04 11:00:00,31.38,LOSE,AFLT,1 days 23:00:00,0.968133,30.38,18
2,2,2022-05-04 12:00:00,31.8,LOSE,AFLT,1 days 04:00:00,0.963522,30.64,15
3,3,2022-05-04 13:00:00,31.36,LOSE,AFLT,1 days 21:00:00,0.96875,30.38,18
4,4,2022-05-04 14:00:00,31.54,LOSE,AFLT,1 days 20:00:00,0.963221,30.38,18


In [36]:
df_result.columns.tolist()

['ind',
 'time',
 'close',
 'result',
 'ticker',
 'delta_time',
 'income_rate',
 'res_price',
 'res_ind']

In [38]:
#union
df = pd.concat([df, df_result[['result', 'delta_time', 'income_rate', 'res_price', 'res_ind']]], axis=1)
df

Unnamed: 0,time,open,close,volume,low,high,ticker,tmos_close,index_start,index_1day,...,volume_1day,low_1day,high_1day,tmos_close_1day,index_start_1day,result,delta_time,income_rate,res_price,res_ind
0,2022-05-04 10:00:00,32.140,32.100,503380.0,31.460,32.680,AFLT,4.27,0,,...,,,,,,LOSE,1 days 01:00:00,0.965109,30.98,10
1,2022-05-04 11:00:00,32.100,31.380,191664.0,31.140,32.160,AFLT,4.20,0,,...,,,,,,LOSE,1 days 23:00:00,0.968133,30.38,18
2,2022-05-04 12:00:00,31.400,31.800,89256.0,31.380,31.920,AFLT,4.22,0,,...,,,,,,LOSE,1 days 04:00:00,0.963522,30.64,15
3,2022-05-04 13:00:00,31.800,31.360,87929.0,31.300,31.840,AFLT,4.21,0,,...,,,,,,LOSE,1 days 21:00:00,0.968750,30.38,18
4,2022-05-04 14:00:00,31.340,31.540,69671.0,31.140,31.580,AFLT,4.21,0,,...,,,,,,LOSE,1 days 20:00:00,0.963221,30.38,18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
972606,2024-12-31 19:00:00,10.820,10.830,6.0,10.775,10.830,RBCM,6.27,965844,85511.0,...,,,,,84836,DNF,0 days 04:00:00,0.994460,10.77,972610
972607,2024-12-31 20:00:00,10.775,10.775,1.0,10.775,10.775,RBCM,6.27,965844,85511.0,...,,,,,84836,DNF,0 days 03:00:00,0.999536,10.77,972610
972608,2024-12-31 21:00:00,10.820,10.820,10.0,10.820,10.820,RBCM,6.27,965844,85511.0,...,,,,,84836,DNF,0 days 02:00:00,0.995379,10.77,972610
972609,2024-12-31 22:00:00,10.785,10.765,59.0,10.765,10.785,RBCM,6.27,965844,85511.0,...,,,,,84836,DNF,0 days 01:00:00,1.000464,10.77,972610


### 2.5 Feature engineering

In [44]:
df

Unnamed: 0,time,open,close,volume,low,high,ticker,tmos_close,index_start,index_1day,...,volume_1day,low_1day,high_1day,tmos_close_1day,index_start_1day,result,delta_time,income_rate,res_price,res_ind
0,2022-05-04 10:00:00,32.140,32.100,503380.0,31.460,32.680,AFLT,4.27,0,,...,,,,,,LOSE,1 days 01:00:00,0.965109,30.98,10
1,2022-05-04 11:00:00,32.100,31.380,191664.0,31.140,32.160,AFLT,4.20,0,,...,,,,,,LOSE,1 days 23:00:00,0.968133,30.38,18
2,2022-05-04 12:00:00,31.400,31.800,89256.0,31.380,31.920,AFLT,4.22,0,,...,,,,,,LOSE,1 days 04:00:00,0.963522,30.64,15
3,2022-05-04 13:00:00,31.800,31.360,87929.0,31.300,31.840,AFLT,4.21,0,,...,,,,,,LOSE,1 days 21:00:00,0.968750,30.38,18
4,2022-05-04 14:00:00,31.340,31.540,69671.0,31.140,31.580,AFLT,4.21,0,,...,,,,,,LOSE,1 days 20:00:00,0.963221,30.38,18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
972606,2024-12-31 19:00:00,10.820,10.830,6.0,10.775,10.830,RBCM,6.27,965844,85511.0,...,,,,,84836,DNF,0 days 04:00:00,0.994460,10.77,972610
972607,2024-12-31 20:00:00,10.775,10.775,1.0,10.775,10.775,RBCM,6.27,965844,85511.0,...,,,,,84836,DNF,0 days 03:00:00,0.999536,10.77,972610
972608,2024-12-31 21:00:00,10.820,10.820,10.0,10.820,10.820,RBCM,6.27,965844,85511.0,...,,,,,84836,DNF,0 days 02:00:00,0.995379,10.77,972610
972609,2024-12-31 22:00:00,10.785,10.765,59.0,10.765,10.785,RBCM,6.27,965844,85511.0,...,,,,,84836,DNF,0 days 01:00:00,1.000464,10.77,972610


In [46]:
df_1hour.head()

Unnamed: 0,time,open,close,volume,low,high,ticker,tmos_close,index_start
0,2022-05-04 10:00:00,32.14,32.1,503380.0,31.46,32.68,AFLT,4.27,0
1,2022-05-04 11:00:00,32.1,31.38,191664.0,31.14,32.16,AFLT,4.2,0
2,2022-05-04 12:00:00,31.4,31.8,89256.0,31.38,31.92,AFLT,4.22,0
3,2022-05-04 13:00:00,31.8,31.36,87929.0,31.3,31.84,AFLT,4.21,0
4,2022-05-04 14:00:00,31.34,31.54,69671.0,31.14,31.58,AFLT,4.21,0


In [48]:
df_1day.head()

Unnamed: 0,index_1day,time,open_1day,close_1day,volume_1day,low_1day,high_1day,ticker,tmos_close_1day,index_start_1day
0,0,2022-05-04 23:00:00,32.12,31.2,1138130.0,31.04,32.68,AFLT,4.18,0
1,1,2022-05-05 23:00:00,31.26,30.7,664101.0,30.4,31.74,AFLT,4.22,0
2,2,2022-05-06 23:00:00,30.8,30.1,306164.0,30.04,30.8,AFLT,4.19,0
3,3,2022-05-11 23:00:00,30.0,29.92,383481.0,29.62,30.54,AFLT,4.22,0
4,4,2022-05-12 23:00:00,29.92,29.32,315954.0,29.14,30.02,AFLT,4.06,0


In [51]:
def calculate_bollinger_bands(data, window):
    """Calculate Bollinger Bands"""
    rolling_mean = data.rolling(window=window, min_periods=1).mean().values
    rolling_std = data.rolling(window=window, min_periods=1).std().values
    #upper_band = rolling_mean + (rolling_std * num_of_std)
    #lower_band = rolling_mean - (rolling_std * num_of_std)
    
    return rolling_mean, rolling_std

def calculate_rsi(data, window):
    """Calculate Relative Strength Index"""
    delta = data.diff()
    gain = delta.clip(lower=0)
    loss = -delta.clip(upper=0)
    avg_gain = gain.rolling(window=window, min_periods=1).mean()
    avg_loss = loss.rolling(window=window, min_periods=1).mean()
    rs = avg_gain / avg_loss
    rsi = 100 - (100 / (1 + rs))

    mask = avg_loss == 0
    rsi[mask] = 100
    
    return rsi.values

def calculate_roc(data, periods):
    """Calculate Rate of Change."""
    roc = ((data - data.shift(periods)) / data.shift(periods))
    return roc.values



def calc_stats(data, window=None, feat_name=None):
    #mean, std
    rolling_mean, rolling_std = calculate_bollinger_bands(data, window)

    # #min, max
    # rolling_min = data.rolling(window=window, min_periods=1).min().values
    # rolling_max = data.rolling(window=window, min_periods=1).max().values
    
    #rsi
    rsi = calculate_rsi(data, window)
    
    #roc
    roc = calculate_roc(data, window)
    diff = data.diff(window).values

    #можно угол угла наклона добавить, чтобы определять фазы рынка
    
    df_features = pd.DataFrame({f'{feat_name}_ma' : rolling_mean,
                        f'{feat_name}_std' : rolling_std,
                        # f'{feat_name}_min' : rolling_min,
                        # f'{feat_name}_max' : rolling_max,
                        f'{feat_name}_rsi' : rsi,
                        f'{feat_name}_roc' : roc,
                        f'{feat_name}_diff' : diff,
                        })
    return df_features


def calc_stats_diff_1(data, feat_name=None):
    return pd.DataFrame({f'{feat_name}_roc' : data.pct_change(periods=1).values,
                        f'{feat_name}_diff' : data.diff(1).values,
                        })

def calc_levels(data, window=None, levels=None, feat_name=None):
    
    #уровни
    data_levels = []
    column_names = []
    for level in levels:
        data_levels += [data.rolling(window=window, min_periods=1).apply(lambda x: ((x.values[-1] < x.values) & (x.values <= (1+level)*x.values[-1])).mean()).values]
        data_levels += [data.rolling(window=window, min_periods=1).apply(lambda x: (((1-level)*x.values[-1] <= x.values) & (x.values < x.values[-1])).mean()).values]

        column_names += [f"{feat_name}_lvl_{level}"]
        column_names += [f"{feat_name}_lvl_-{level}"]
    df_levels = pd.DataFrame({column_names[i]:data_levels[i] for i in range(len(column_names))})
    return df_levels


In [53]:
df.head()

Unnamed: 0,time,open,close,volume,low,high,ticker,tmos_close,index_start,index_1day,...,volume_1day,low_1day,high_1day,tmos_close_1day,index_start_1day,result,delta_time,income_rate,res_price,res_ind
0,2022-05-04 10:00:00,32.14,32.1,503380.0,31.46,32.68,AFLT,4.27,0,,...,,,,,,LOSE,1 days 01:00:00,0.965109,30.98,10
1,2022-05-04 11:00:00,32.1,31.38,191664.0,31.14,32.16,AFLT,4.2,0,,...,,,,,,LOSE,1 days 23:00:00,0.968133,30.38,18
2,2022-05-04 12:00:00,31.4,31.8,89256.0,31.38,31.92,AFLT,4.22,0,,...,,,,,,LOSE,1 days 04:00:00,0.963522,30.64,15
3,2022-05-04 13:00:00,31.8,31.36,87929.0,31.3,31.84,AFLT,4.21,0,,...,,,,,,LOSE,1 days 21:00:00,0.96875,30.38,18
4,2022-05-04 14:00:00,31.34,31.54,69671.0,31.14,31.58,AFLT,4.21,0,,...,,,,,,LOSE,1 days 20:00:00,0.963221,30.38,18


#### 1hour

In [66]:
dfs = []
for ticker in tqdm(df['ticker'].unique()):
    mask = np.array(df['ticker'] == ticker)
    df_ticker = df.loc[mask].copy().reset_index()

    levels = [0.01, 0.02, 0.03, 0.04, 0.05, 0.07, 0.1]

    #w1
    df_close_w1 = calc_stats_diff_1(df_ticker['close'], feat_name='close_w1')
    df_volume_w1 = calc_stats_diff_1(df_ticker['volume'], feat_name='volume_w1')
    df_tmos_close_w1 = calc_stats_diff_1(df_ticker['tmos_close'], feat_name='tmos_close_w1')
    assert df_ticker.shape[0] == df_close_w1.shape[0] == df_volume_w1.shape[0] == df_tmos_close_w1.shape[0], 'Error w1'
    
    #w5
    df_close_w5 = calc_stats(df_ticker['close'], window=5, feat_name='close_w5')
    df_volume_w5 = calc_stats(df_ticker['volume'], window=5, feat_name='volume_w5')
    df_tmos_close_w5 = calc_stats(df_ticker['tmos_close'], window=5, feat_name='tmos_close_w5')
    assert df_ticker.shape[0] == df_close_w5.shape[0] == df_volume_w5.shape[0] == df_tmos_close_w5.shape[0], 'Error w5'
    
    #w14
    df_close_w14 = calc_stats(df_ticker['close'], window=14, feat_name='close_w14')
    df_volume_w14 = calc_stats(df_ticker['volume'], window=14, feat_name='volume_w14')
    df_tmos_close_w14 = calc_stats(df_ticker['tmos_close'], window=14, feat_name='tmos_close_w14')
    assert df_ticker.shape[0] == df_close_w14.shape[0] == df_volume_w14.shape[0] == df_tmos_close_w14.shape[0], 'Error w14'

    #w70=14*5
    df_close_w70 = calc_stats(df_ticker['close'], window=70, feat_name='close_w70')
    df_close_levels_w70 = calc_levels(df_ticker['close'], window=70, levels=levels, feat_name="close_w70")
    df_volume_w70 = calc_stats(df_ticker['volume'], window=70, feat_name='volume_w70')
    df_tmos_close_w70 = calc_stats(df_ticker['tmos_close'], window=70, feat_name='tmos_close_w70')
    df_tmos_close_levels_w70 = calc_levels(df_ticker['tmos_close'], window=70, levels=levels, feat_name='tmos_close_w70')
    assert df_ticker.shape[0] == df_close_w70.shape[0] == df_close_levels_w70.shape[0] == df_volume_w70.shape[0] == df_tmos_close_w70.shape[0] == df_tmos_close_levels_w70.shape[0], 'Error w70'


    
    dfs += [pd.concat([df_ticker,
                       df_close_w1, df_volume_w1, df_tmos_close_w1,
                       df_close_w5, df_volume_w5, df_tmos_close_w5,
                       df_close_w14, df_volume_w14, df_tmos_close_w14,
                       df_close_w70, df_close_levels_w70, df_volume_w70, df_tmos_close_w70, df_tmos_close_levels_w70
                      ], axis=1)]
    assert (df_ticker.shape[0] == dfs[-1].shape[0]) and (dfs[-1].shape[1] == (df_ticker.shape[1]+3*df_close_w1.shape[1]+9*df_close_w5.shape[1]+2*df_close_levels_w70.shape[1])), 'Error concat'



df_fe = pd.concat(dfs).set_index('index')


100%|███████████████████████████████████████████████████████████████████████| 127/127 [03:56<00:00,  1.86s/it]


In [68]:
df_fe.head()

Unnamed: 0_level_0,time,open,close,volume,low,high,ticker,tmos_close,index_start,index_1day,...,tmos_close_w70_lvl_0.03,tmos_close_w70_lvl_-0.03,tmos_close_w70_lvl_0.04,tmos_close_w70_lvl_-0.04,tmos_close_w70_lvl_0.05,tmos_close_w70_lvl_-0.05,tmos_close_w70_lvl_0.07,tmos_close_w70_lvl_-0.07,tmos_close_w70_lvl_0.1,tmos_close_w70_lvl_-0.1
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,2022-05-04 10:00:00,32.14,32.1,503380.0,31.46,32.68,AFLT,4.27,0,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2022-05-04 11:00:00,32.1,31.38,191664.0,31.14,32.16,AFLT,4.2,0,,...,0.5,0.0,0.5,0.0,0.5,0.0,0.5,0.0,0.5,0.0
2,2022-05-04 12:00:00,31.4,31.8,89256.0,31.38,31.92,AFLT,4.22,0,,...,0.333333,0.333333,0.333333,0.333333,0.333333,0.333333,0.333333,0.333333,0.333333,0.333333
3,2022-05-04 13:00:00,31.8,31.36,87929.0,31.3,31.84,AFLT,4.21,0,,...,0.5,0.25,0.5,0.25,0.5,0.25,0.5,0.25,0.5,0.25
4,2022-05-04 14:00:00,31.34,31.54,69671.0,31.14,31.58,AFLT,4.21,0,,...,0.4,0.2,0.4,0.2,0.4,0.2,0.4,0.2,0.4,0.2


In [79]:
(df_fe['close'] == df['close']).all()

True

#### 1day

In [83]:
df_1day

Unnamed: 0,index_1day,time,open_1day,close_1day,volume_1day,low_1day,high_1day,ticker,tmos_close_1day,index_start_1day
0,0,2022-05-04 23:00:00,32.120,31.200,1138130.0,31.040,32.680,AFLT,4.18,0
1,1,2022-05-05 23:00:00,31.260,30.700,664101.0,30.400,31.740,AFLT,4.22,0
2,2,2022-05-06 23:00:00,30.800,30.100,306164.0,30.040,30.800,AFLT,4.19,0
3,3,2022-05-11 23:00:00,30.000,29.920,383481.0,29.620,30.540,AFLT,4.22,0
4,4,2022-05-12 23:00:00,29.920,29.320,315954.0,29.140,30.020,AFLT,4.06,0
...,...,...,...,...,...,...,...,...,...,...
85507,85507,2024-12-24 23:00:00,10.240,10.090,9990.0,9.980,10.400,RBCM,5.86,84836
85508,85508,2024-12-25 23:00:00,10.050,10.125,11761.0,10.030,10.365,RBCM,5.98,84836
85509,85509,2024-12-26 23:00:00,10.360,10.320,20428.0,10.230,10.565,RBCM,5.97,84836
85510,85510,2024-12-27 23:00:00,10.255,10.205,9970.0,10.185,10.380,RBCM,6.01,84836


In [85]:
dfs = []
for ticker in tqdm(df_1day['ticker'].unique()):
    mask = np.array(df_1day['ticker'] == ticker)
    df_ticker = df_1day.loc[mask].copy().reset_index()

    levels = [0.01, 0.02, 0.03, 0.04, 0.05, 0.07, 0.1]

    #w1
    df_close_w1 = calc_stats_diff_1(df_ticker['close_1day'], feat_name='close_1day_w1')
    df_volume_w1 = calc_stats_diff_1(df_ticker['volume_1day'], feat_name='volume_1day_w1')
    df_tmos_close_w1 = calc_stats_diff_1(df_ticker['tmos_close_1day'], feat_name='tmos_close_1day_w1')
    assert df_ticker.shape[0] == df_close_w1.shape[0] == df_volume_w1.shape[0] == df_tmos_close_w1.shape[0], 'Error w1'
    
    #w3
    df_close_w3 = calc_stats(df_ticker['close_1day'], window=3, feat_name='close_1day_w3')
    df_volume_w3 = calc_stats(df_ticker['volume_1day'], window=3, feat_name='volume_1day_w3')
    df_tmos_close_w3 = calc_stats(df_ticker['tmos_close_1day'], window=3, feat_name='tmos_close_1day_w3')
    assert df_ticker.shape[0] == df_close_w3.shape[0] == df_volume_w3.shape[0] == df_tmos_close_w3.shape[0], 'Error w3'
    
    #w5
    df_close_w5 = calc_stats(df_ticker['close_1day'], window=5, feat_name='close_1day_w5')
    df_volume_w5 = calc_stats(df_ticker['volume_1day'], window=5, feat_name='volume_1day_w5')
    df_tmos_close_w5 = calc_stats(df_ticker['tmos_close_1day'], window=5, feat_name='tmos_close_1day_w5')
    assert df_ticker.shape[0] == df_close_w5.shape[0] == df_volume_w5.shape[0] == df_tmos_close_w5.shape[0], 'Error w5'
    
    #w20
    df_close_w20 = calc_stats(df_ticker['close_1day'], window=5*4, feat_name='close_1day_w20')
    df_close_levels_w20 = calc_levels(df_ticker['close_1day'], window=5*4, levels=levels, feat_name='close_1day_w20')
    df_volume_w20 = calc_stats(df_ticker['volume_1day'], window=5*4, feat_name='volume_1day_w20')
    df_tmos_close_w20 = calc_stats(df_ticker['tmos_close_1day'], window=5*4, feat_name='tmos_close_1day_w20')
    df_tmos_close_levels_w20 = calc_levels(df_ticker['tmos_close_1day'], window=5*4, levels=levels, feat_name='tmos_close_1day_w20')
    assert df_ticker.shape[0] == df_close_w20.shape[0] == df_close_levels_w20.shape[0] == df_volume_w20.shape[0] == df_tmos_close_w20.shape[0] == df_tmos_close_levels_w20.shape[0], 'Error w20'
    
    
    #w100
    df_close_w100 = calc_stats(df_ticker['close_1day'], window=100, feat_name='close_1day_w100')
    df_close_levels_w100 = calc_levels(df_ticker['close_1day'], window=100, levels=levels, feat_name='close_1day_w100')
    df_volume_w100 = calc_stats(df_ticker['volume_1day'], window=100, feat_name='volume_1day_w100')
    df_tmos_close_w100 = calc_stats(df_ticker['tmos_close_1day'], window=100, feat_name='tmos_close_1day_w100')
    df_tmos_close_levels_w100 = calc_levels(df_ticker['tmos_close_1day'], window=100, levels=levels, feat_name='tmos_close_1day_w100')
    assert df_ticker.shape[0] == df_close_w100.shape[0] == df_close_levels_w100.shape[0] == df_volume_w100.shape[0] == df_tmos_close_w100.shape[0] == df_tmos_close_levels_w100.shape[0], 'Error w100'
    
    dfs += [pd.concat([df_ticker,
                       df_close_w1, df_volume_w1, df_tmos_close_w1, 
                       df_close_w3, df_volume_w3, df_tmos_close_w3, 
                       df_close_w5, df_volume_w5, df_tmos_close_w5,
                       df_close_w20, df_close_levels_w20, df_volume_w20, df_tmos_close_w20, df_tmos_close_levels_w20,
                       df_close_w100, df_close_levels_w100, df_volume_w100, df_tmos_close_w100, df_tmos_close_levels_w100], axis=1)]
    
    assert (df_ticker.shape[0] == dfs[-1].shape[0]) and (dfs[-1].shape[1] == (df_ticker.shape[1]+3*df_close_w1.shape[1]+12*df_close_w5.shape[1]+4*df_close_levels_w20.shape[1])), 'Error concat'



df_1day_fe = pd.concat(dfs).set_index('index')


100%|███████████████████████████████████████████████████████████████████████| 127/127 [00:42<00:00,  3.00it/s]


In [87]:
df_1day_fe.head()

Unnamed: 0_level_0,index_1day,time,open_1day,close_1day,volume_1day,low_1day,high_1day,ticker,tmos_close_1day,index_start_1day,...,tmos_close_1day_w100_lvl_0.03,tmos_close_1day_w100_lvl_-0.03,tmos_close_1day_w100_lvl_0.04,tmos_close_1day_w100_lvl_-0.04,tmos_close_1day_w100_lvl_0.05,tmos_close_1day_w100_lvl_-0.05,tmos_close_1day_w100_lvl_0.07,tmos_close_1day_w100_lvl_-0.07,tmos_close_1day_w100_lvl_0.1,tmos_close_1day_w100_lvl_-0.1
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,2022-05-04 23:00:00,32.12,31.2,1138130.0,31.04,32.68,AFLT,4.18,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,2022-05-05 23:00:00,31.26,30.7,664101.0,30.4,31.74,AFLT,4.22,0,...,0.0,0.5,0.0,0.5,0.0,0.5,0.0,0.5,0.0,0.5
2,2,2022-05-06 23:00:00,30.8,30.1,306164.0,30.04,30.8,AFLT,4.19,0,...,0.333333,0.333333,0.333333,0.333333,0.333333,0.333333,0.333333,0.333333,0.333333,0.333333
3,3,2022-05-11 23:00:00,30.0,29.92,383481.0,29.62,30.54,AFLT,4.22,0,...,0.0,0.5,0.0,0.5,0.0,0.5,0.0,0.5,0.0,0.5
4,4,2022-05-12 23:00:00,29.92,29.32,315954.0,29.14,30.02,AFLT,4.06,0,...,0.2,0.0,0.8,0.0,0.8,0.0,0.8,0.0,0.8,0.0


In [90]:
(df_1day_fe['close_1day'] == df_1day['close_1day']).all(), (df_1day_fe.index == df_1day_fe['index_1day']).all()

(True, True)

### time features

#### 1hour

In [97]:
df_fe.iloc[:15]

Unnamed: 0_level_0,time,open,close,volume,low,high,ticker,tmos_close,index_start,index_1day,...,tmos_close_w70_lvl_0.03,tmos_close_w70_lvl_-0.03,tmos_close_w70_lvl_0.04,tmos_close_w70_lvl_-0.04,tmos_close_w70_lvl_0.05,tmos_close_w70_lvl_-0.05,tmos_close_w70_lvl_0.07,tmos_close_w70_lvl_-0.07,tmos_close_w70_lvl_0.1,tmos_close_w70_lvl_-0.1
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,2022-05-04 10:00:00,32.14,32.1,503380.0,31.46,32.68,AFLT,4.27,0,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2022-05-04 11:00:00,32.1,31.38,191664.0,31.14,32.16,AFLT,4.2,0,,...,0.5,0.0,0.5,0.0,0.5,0.0,0.5,0.0,0.5,0.0
2,2022-05-04 12:00:00,31.4,31.8,89256.0,31.38,31.92,AFLT,4.22,0,,...,0.333333,0.333333,0.333333,0.333333,0.333333,0.333333,0.333333,0.333333,0.333333,0.333333
3,2022-05-04 13:00:00,31.8,31.36,87929.0,31.3,31.84,AFLT,4.21,0,,...,0.5,0.25,0.5,0.25,0.5,0.25,0.5,0.25,0.5,0.25
4,2022-05-04 14:00:00,31.34,31.54,69671.0,31.14,31.58,AFLT,4.21,0,,...,0.4,0.2,0.4,0.2,0.4,0.2,0.4,0.2,0.4,0.2
5,2022-05-04 15:00:00,31.52,31.36,34953.0,31.22,31.54,AFLT,4.22,0,,...,0.166667,0.5,0.166667,0.5,0.166667,0.5,0.166667,0.5,0.166667,0.5
6,2022-05-04 16:00:00,31.32,31.16,62035.0,31.06,31.38,AFLT,4.21,0,,...,0.428571,0.142857,0.428571,0.142857,0.428571,0.142857,0.428571,0.142857,0.428571,0.142857
7,2022-05-04 17:00:00,31.18,31.22,49352.0,31.04,31.46,AFLT,4.21,0,,...,0.375,0.125,0.375,0.125,0.375,0.125,0.375,0.125,0.375,0.125
8,2022-05-04 18:00:00,31.32,31.2,43918.0,31.16,31.44,AFLT,4.18,0,0.0,...,0.888889,0.0,0.888889,0.0,0.888889,0.0,0.888889,0.0,0.888889,0.0
9,2022-05-05 10:00:00,31.26,31.24,124578.0,31.18,31.74,AFLT,4.22,0,0.0,...,0.1,0.6,0.1,0.6,0.1,0.6,0.1,0.6,0.1,0.6


In [99]:
time_cyclic = (df_fe['time'] - pd.to_datetime(df_fe['time'].dt.date) - pd.Timedelta('10:00:00')) / pd.Timedelta('13:00:00')
df_fe['sin_time_hour'] = np.sin(time_cyclic * 2 * np.pi)
df_fe['cos_time_hour'] = np.cos(time_cyclic * 2 * np.pi)

df_fe

Unnamed: 0_level_0,time,open,close,volume,low,high,ticker,tmos_close,index_start,index_1day,...,tmos_close_w70_lvl_0.04,tmos_close_w70_lvl_-0.04,tmos_close_w70_lvl_0.05,tmos_close_w70_lvl_-0.05,tmos_close_w70_lvl_0.07,tmos_close_w70_lvl_-0.07,tmos_close_w70_lvl_0.1,tmos_close_w70_lvl_-0.1,sin_time_hour,cos_time_hour
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,2022-05-04 10:00:00,32.140,32.100,503380.0,31.460,32.680,AFLT,4.27,0,,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000e+00,1.000000
1,2022-05-04 11:00:00,32.100,31.380,191664.0,31.140,32.160,AFLT,4.20,0,,...,0.500000,0.000000,0.500000,0.000000,0.500000,0.000000,0.500000,0.000000,4.647232e-01,0.885456
2,2022-05-04 12:00:00,31.400,31.800,89256.0,31.380,31.920,AFLT,4.22,0,,...,0.333333,0.333333,0.333333,0.333333,0.333333,0.333333,0.333333,0.333333,8.229839e-01,0.568065
3,2022-05-04 13:00:00,31.800,31.360,87929.0,31.300,31.840,AFLT,4.21,0,,...,0.500000,0.250000,0.500000,0.250000,0.500000,0.250000,0.500000,0.250000,9.927089e-01,0.120537
4,2022-05-04 14:00:00,31.340,31.540,69671.0,31.140,31.580,AFLT,4.21,0,,...,0.400000,0.200000,0.400000,0.200000,0.400000,0.200000,0.400000,0.200000,9.350162e-01,-0.354605
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
972606,2024-12-31 19:00:00,10.820,10.830,6.0,10.775,10.830,RBCM,6.27,965844,85511.0,...,0.000000,0.185714,0.000000,0.671429,0.000000,0.828571,0.000000,0.842857,-9.350162e-01,-0.354605
972607,2024-12-31 20:00:00,10.775,10.775,1.0,10.775,10.775,RBCM,6.27,965844,85511.0,...,0.000000,0.185714,0.000000,0.671429,0.000000,0.814286,0.000000,0.828571,-9.927089e-01,0.120537
972608,2024-12-31 21:00:00,10.820,10.820,10.0,10.820,10.820,RBCM,6.27,965844,85511.0,...,0.000000,0.185714,0.000000,0.671429,0.000000,0.800000,0.000000,0.814286,-8.229839e-01,0.568065
972609,2024-12-31 22:00:00,10.785,10.765,59.0,10.765,10.785,RBCM,6.27,965844,85511.0,...,0.000000,0.185714,0.000000,0.671429,0.000000,0.785714,0.000000,0.800000,-4.647232e-01,0.885456


#### 1day

In [102]:
#day of week

day_of_week_cyclic = np.minimum(df_1day_fe['time'].dt.dayofweek, 4) / 4
df_1day_fe['sin_time_weekday'] = np.sin(day_of_week_cyclic * 2 * np.pi)
df_1day_fe['cos_time_weekday'] = np.cos(day_of_week_cyclic * 2 * np.pi)

df_1day_fe.head()

Unnamed: 0_level_0,index_1day,time,open_1day,close_1day,volume_1day,low_1day,high_1day,ticker,tmos_close_1day,index_start_1day,...,tmos_close_1day_w100_lvl_0.04,tmos_close_1day_w100_lvl_-0.04,tmos_close_1day_w100_lvl_0.05,tmos_close_1day_w100_lvl_-0.05,tmos_close_1day_w100_lvl_0.07,tmos_close_1day_w100_lvl_-0.07,tmos_close_1day_w100_lvl_0.1,tmos_close_1day_w100_lvl_-0.1,sin_time_weekday,cos_time_weekday
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,2022-05-04 23:00:00,32.12,31.2,1138130.0,31.04,32.68,AFLT,4.18,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.224647e-16,-1.0
1,1,2022-05-05 23:00:00,31.26,30.7,664101.0,30.4,31.74,AFLT,4.22,0,...,0.0,0.5,0.0,0.5,0.0,0.5,0.0,0.5,-1.0,-1.83697e-16
2,2,2022-05-06 23:00:00,30.8,30.1,306164.0,30.04,30.8,AFLT,4.19,0,...,0.333333,0.333333,0.333333,0.333333,0.333333,0.333333,0.333333,0.333333,-2.449294e-16,1.0
3,3,2022-05-11 23:00:00,30.0,29.92,383481.0,29.62,30.54,AFLT,4.22,0,...,0.0,0.5,0.0,0.5,0.0,0.5,0.0,0.5,1.224647e-16,-1.0
4,4,2022-05-12 23:00:00,29.92,29.32,315954.0,29.14,30.02,AFLT,4.06,0,...,0.8,0.0,0.8,0.0,0.8,0.0,0.8,0.0,-1.0,-1.83697e-16


In [106]:
#day of month
day_of_month_cyclic = df_1day_fe['time'].dt.day / 31
df_1day_fe['sin_time_monthday'] = np.sin(day_of_month_cyclic * 2 * np.pi)
df_1day_fe['cos_time_monthday'] = np.cos(day_of_month_cyclic * 2 * np.pi)

df_1day_fe.head()

Unnamed: 0_level_0,index_1day,time,open_1day,close_1day,volume_1day,low_1day,high_1day,ticker,tmos_close_1day,index_start_1day,...,tmos_close_1day_w100_lvl_0.05,tmos_close_1day_w100_lvl_-0.05,tmos_close_1day_w100_lvl_0.07,tmos_close_1day_w100_lvl_-0.07,tmos_close_1day_w100_lvl_0.1,tmos_close_1day_w100_lvl_-0.1,sin_time_weekday,cos_time_weekday,sin_time_monthday,cos_time_monthday
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,2022-05-04 23:00:00,32.12,31.2,1138130.0,31.04,32.68,AFLT,4.18,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.224647e-16,-1.0,0.724793,0.688967
1,1,2022-05-05 23:00:00,31.26,30.7,664101.0,30.4,31.74,AFLT,4.22,0,...,0.0,0.5,0.0,0.5,0.0,0.5,-1.0,-1.83697e-16,0.848644,0.528964
2,2,2022-05-06 23:00:00,30.8,30.1,306164.0,30.04,30.8,AFLT,4.19,0,...,0.333333,0.333333,0.333333,0.333333,0.333333,0.333333,-2.449294e-16,1.0,0.937752,0.347305
3,3,2022-05-11 23:00:00,30.0,29.92,383481.0,29.62,30.54,AFLT,4.22,0,...,0.0,0.5,0.0,0.5,0.0,0.5,1.224647e-16,-1.0,0.790776,-0.612106
4,4,2022-05-12 23:00:00,29.92,29.32,315954.0,29.14,30.02,AFLT,4.06,0,...,0.8,0.0,0.8,0.0,0.8,0.0,-1.0,-1.83697e-16,0.651372,-0.758758


## 2.6 Save data

In [128]:
df_fe.columns.tolist()

['time',
 'open',
 'close',
 'volume',
 'low',
 'high',
 'ticker',
 'tmos_close',
 'index_start',
 'index_1day',
 'open_1day',
 'close_1day',
 'volume_1day',
 'low_1day',
 'high_1day',
 'tmos_close_1day',
 'index_start_1day',
 'result',
 'delta_time',
 'income_rate',
 'res_price',
 'res_ind',
 'close_w1_roc',
 'close_w1_diff',
 'volume_w1_roc',
 'volume_w1_diff',
 'tmos_close_w1_roc',
 'tmos_close_w1_diff',
 'close_w5_ma',
 'close_w5_std',
 'close_w5_rsi',
 'close_w5_roc',
 'close_w5_diff',
 'volume_w5_ma',
 'volume_w5_std',
 'volume_w5_rsi',
 'volume_w5_roc',
 'volume_w5_diff',
 'tmos_close_w5_ma',
 'tmos_close_w5_std',
 'tmos_close_w5_rsi',
 'tmos_close_w5_roc',
 'tmos_close_w5_diff',
 'close_w14_ma',
 'close_w14_std',
 'close_w14_rsi',
 'close_w14_roc',
 'close_w14_diff',
 'volume_w14_ma',
 'volume_w14_std',
 'volume_w14_rsi',
 'volume_w14_roc',
 'volume_w14_diff',
 'tmos_close_w14_ma',
 'tmos_close_w14_std',
 'tmos_close_w14_rsi',
 'tmos_close_w14_roc',
 'tmos_close_w14_diff',
 'cl

In [130]:
!mkdir data/feat_engin/data

In [134]:
cols = df_fe.columns[~df_fe.columns.isin(['open', 'low', 'high', 
                    'open_1day', 'close_1day', 'volume_1day', 'low_1day', 'high_1day', 'tmos_close_1day',
                   ])]

for col in tqdm(cols):
    try:
        df_fe[col] = df_fe[col].astype(np.float32)
    except:
        print(col)

#save
dump_pkl(df_fe[cols], 'data/feat_engin/data/data_1hour.pkl')

100%|████████████████████████████████████████████████████████████████████████| 94/94 [00:00<00:00, 846.83it/s]


time
ticker
result
delta_time


In [136]:
#проверка уникальности данных в колонках
(~df_fe.columns.isin(['ticker', 'result'])).sum(), df_fe[df_fe.columns[~df_fe.columns.isin(['ticker', 'result'])]].mean().nunique()


(101, 101)

In [105]:
#1day

In [140]:
df_1day_fe.columns.tolist()

['index_1day',
 'time',
 'open_1day',
 'close_1day',
 'volume_1day',
 'low_1day',
 'high_1day',
 'ticker',
 'tmos_close_1day',
 'index_start_1day',
 'close_1day_w1_roc',
 'close_1day_w1_diff',
 'volume_1day_w1_roc',
 'volume_1day_w1_diff',
 'tmos_close_1day_w1_roc',
 'tmos_close_1day_w1_diff',
 'close_1day_w3_ma',
 'close_1day_w3_std',
 'close_1day_w3_rsi',
 'close_1day_w3_roc',
 'close_1day_w3_diff',
 'volume_1day_w3_ma',
 'volume_1day_w3_std',
 'volume_1day_w3_rsi',
 'volume_1day_w3_roc',
 'volume_1day_w3_diff',
 'tmos_close_1day_w3_ma',
 'tmos_close_1day_w3_std',
 'tmos_close_1day_w3_rsi',
 'tmos_close_1day_w3_roc',
 'tmos_close_1day_w3_diff',
 'close_1day_w5_ma',
 'close_1day_w5_std',
 'close_1day_w5_rsi',
 'close_1day_w5_roc',
 'close_1day_w5_diff',
 'volume_1day_w5_ma',
 'volume_1day_w5_std',
 'volume_1day_w5_rsi',
 'volume_1day_w5_roc',
 'volume_1day_w5_diff',
 'tmos_close_1day_w5_ma',
 'tmos_close_1day_w5_std',
 'tmos_close_1day_w5_rsi',
 'tmos_close_1day_w5_roc',
 'tmos_close_

In [142]:
cols = df_1day_fe.columns[~df_1day_fe.columns.isin(['index_1day', 'open_1day', 'low_1day', 'high_1day', 'index_start_1day',
                                                    # 'close_1day_w100_roc', 'close_1day_w100_diff',
                                                    # 'volume_1day_w100_roc', 'volume_1day_100_diff',
                   ])]

for col in tqdm(cols):
    try:
        df_1day_fe[col] = df_1day_fe[col].astype(np.float32)
    except:
        print(col)

dump_pkl(df_1day_fe[cols], 'data/feat_engin/data/data_1day.pkl')

100%|█████████████████████████████████████████████████████████████████████| 131/131 [00:00<00:00, 2234.77it/s]

time
ticker





In [145]:
#проверка уникальности данных в колонках
(~df_1day_fe.columns.isin(['ticker', 'result'])).sum(), df_1day_fe[df_1day_fe.columns[~df_1day_fe.columns.isin(['ticker', 'result'])]].mean().nunique()

(135, 135)

# TMP

In [116]:
df_fe[df_fe.columns[~df_fe.columns.isin(['ticker', 'time', 'result', 'delta_time'])]].abs().max().max()

162785051.0

In [118]:
df_1day_fe[df_1day_fe.columns[~df_1day_fe.columns.isin(['ticker', 'time'])]].abs().max().max()

162785051.0

In [122]:
np.array([162785051]).astype(np.float32)

array([1.6278506e+08], dtype=float32)

In [124]:
df_1day_fe[df_1day_fe.columns[~df_1day_fe.columns.isin(['ticker', 'time'])]].max()

index_1day                             85511
open_1day                            62340.0
close_1day                           64000.0
volume_1day                      162785051.0
low_1day                             60400.0
                                    ...     
tmos_close_1day_w100_lvl_-0.1           0.99
sin_time_weekday                         1.0
cos_time_weekday                         1.0
sin_time_monthday                   0.998717
cos_time_monthday                        1.0
Length: 134, dtype: object