In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import random
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import os

from sklearn.metrics import roc_auc_score

import pickle
def dump_pkl(data, filename):
  with open(filename, 'wb') as handle:
    pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)

def load_pkl(filename):
  with open(filename, 'rb') as handle:
    data = pickle.load(handle)
  return data

In [2]:
stocks = ['ABIO',
 'AFKS',
 'AFLT',
 'ALRS',
 'APTK',
 'AQUA',
 'BANE',
 'BANEP',
 'BELU',
 'BSPB',
 'CBOM',
 'CHMF',
 'ENPG',
 'FEES',
 'FESH',
 'FLOT',
 'GAZP',
 'GMKN',
 'GTRK',
 'HEAD',
 'HYDR',
 'IRAO',
 'IRKT',
 'KMAZ',
 'LENT',
 'LIFE',
 'LKOH',
 'LSRG',
 'MAGN',
 'MDMG',
 'MGNT',
 'MOEX',
 'MTLR',
 'MTLRP',
 'MTSS',
 'MVID',
 'NLMK',
 'NMTP',
 'NVTK',
 'OGKB',
 'PHOR',
 'PIKK',
 'PLZL',
 'POSI',
 'RASP',
 'RENI',
 'RNFT',
 'ROSN',
 'RTKM',
 'RTKMP',
 'RUAL',
 'SBER',
 'SBERP',
 'SELG',
 'SFIN',
 'SGZH',
 'SIBN',
 'SMLT',
 'SNGS',
 'SNGSP',
 'SPBE',
 'SVAV',
 'T',
 'TATN',
 'TATNP',
 'TGKN',
 'TRMK',
 'TRNFP',
 'UNAC',
 'UPRO',
 'UWGN',
 'VKCO',
 'VSMO',
 'VTBR',
 'WUSH',
 'YDEX',


#'TMOS',
]

len(stocks)

76

#### 2.2 Link data of different time-period

In [9]:
dfs_1hour = []
dfs_1day = []

#stocks = [elem.split('.')[0] for elem in os.listdir('./data/preproc/1hour/')]

for stock in tqdm(stocks):

    df_1hour = load_pkl(f"./data/preproc/1hour/{stock}.pkl")
    df_1day = load_pkl(f"./data/preproc/1day/{stock}.pkl")

    
    #time_index 1hour
    df_1hour['date'] = df_1hour['time'].dt.date

    df_join_time = pd.DataFrame({'date' : df_1hour['time'].dt.date.drop_duplicates(keep='first')})
    df_join_time['date_day_index'] = df_join_time['date'].shift(1)

    
    #time_index 1day
    df_1day['date_day_index'] = df_1day['time'].dt.date
    

    df_1hour = df_1hour.merge(df_join_time, how='left', on='date')
    

    dfs_1hour += [df_1hour.copy()]
    dfs_1day += [df_1day.copy()]
    
    

df_1day = pd.concat(dfs_1day)
df_1hour = pd.concat(dfs_1hour)


df_1hour.reset_index(drop=True, inplace=True)
df_1day.reset_index(drop=True, inplace=True)

#date_day_index
#в конце свечи 23:00 уже известна инфа о конце дня
mask = df_1hour['time'].dt.hour == 23
df_1hour.loc[mask, 'date_day_index'] = df_1hour.loc[mask, 'time'].dt.date
#спец. корректировка для 18:00 (акции без вечерних торгов)
mask = (df_1hour['time'].dt.hour == 18) & (df_1hour['time'].dt.day.diff(-1) != 0)
df_1hour.loc[mask, 'date_day_index'] = df_1hour.loc[mask, 'time'].dt.date


df_1hour.shape, df_1day.shape

100%|████████████████████████████████████████████████████████████████████████| 76/76 [00:00<00:00, 218.13it/s]


((693950, 7), (50955, 6))

In [12]:
#проверка глазами
i = 123542 + 12*6
df_1hour.iloc[i:i+20]

Unnamed: 0,time,close,volume,ticker,tmos_close,date,date_day_index
123614,2024-03-20 23:00:00,0.11922,2591.0,FEES,6.66,2024-03-20,2024-03-20
123615,2024-03-21 09:00:00,0.11958,2.0,FEES,6.66,2024-03-21,2024-03-20
123616,2024-03-21 10:00:00,0.11978,37812.0,FEES,6.66,2024-03-21,2024-03-20
123617,2024-03-21 11:00:00,0.11938,30362.0,FEES,6.66,2024-03-21,2024-03-20
123618,2024-03-21 12:00:00,0.11864,22250.0,FEES,6.66,2024-03-21,2024-03-20
123619,2024-03-21 13:00:00,0.11856,9717.0,FEES,6.66,2024-03-21,2024-03-20
123620,2024-03-21 14:00:00,0.1182,35978.0,FEES,6.66,2024-03-21,2024-03-20
123621,2024-03-21 15:00:00,0.11712,75279.0,FEES,6.63,2024-03-21,2024-03-20
123622,2024-03-21 16:00:00,0.11684,26337.0,FEES,6.64,2024-03-21,2024-03-20
123623,2024-03-21 17:00:00,0.1168,90750.0,FEES,6.64,2024-03-21,2024-03-20


In [14]:
df_1day.head()

Unnamed: 0,time,close,volume,ticker,tmos_close,date_day_index
0,2022-06-30 03:00:00,57.82,111300.0,ABIO,4.02,2022-06-30
1,2022-07-01 03:00:00,56.28,48126.0,ABIO,3.96,2022-07-01
2,2022-07-04 03:00:00,56.9,58944.0,ABIO,3.92,2022-07-04
3,2022-07-05 03:00:00,56.38,39756.0,ABIO,3.97,2022-07-05
4,2022-07-06 03:00:00,60.68,275700.0,ABIO,3.96,2022-07-06


In [18]:
#link different time-step date

df_1day.reset_index(inplace=True)
df_1day = df_1day.rename(columns={col : col+'_1day' for col in df_1day.columns if col not in ['date_day_index', 'ticker']})
df = df_1hour.merge(df_1day, on=['date_day_index', 'ticker'], how='left')



#Проверка что не наджоинилось лишнего
assert df_1hour.shape[0] == df.shape[0], 'Error: with join dimensions'

In [20]:
df

Unnamed: 0,time,close,volume,ticker,tmos_close,date,date_day_index,index_1day,time_1day,close_1day,volume_1day,tmos_close_1day
0,2022-07-01 09:00:00,57.78,33.0,ABIO,4.00,2022-07-01,,,NaT,,,
1,2022-07-01 10:00:00,56.40,13083.0,ABIO,3.93,2022-07-01,,,NaT,,,
2,2022-07-01 11:00:00,56.36,6195.0,ABIO,3.92,2022-07-01,,,NaT,,,
3,2022-07-01 12:00:00,56.92,7632.0,ABIO,3.97,2022-07-01,,,NaT,,,
4,2022-07-01 13:00:00,56.18,8748.0,ABIO,3.99,2022-07-01,,,NaT,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
693945,2025-02-24 19:00:00,4645.50,26140.0,YDEX,7.25,2025-02-24,2025-02-21,50953.0,2025-02-21 03:00:00,4667.0,685047.0,7.21
693946,2025-02-24 20:00:00,4658.00,19307.0,YDEX,7.27,2025-02-24,2025-02-21,50953.0,2025-02-21 03:00:00,4667.0,685047.0,7.21
693947,2025-02-24 21:00:00,4660.00,27796.0,YDEX,7.29,2025-02-24,2025-02-21,50953.0,2025-02-21 03:00:00,4667.0,685047.0,7.21
693948,2025-02-24 22:00:00,4666.00,13530.0,YDEX,7.29,2025-02-24,2025-02-21,50953.0,2025-02-21 03:00:00,4667.0,685047.0,7.21


In [24]:
#Проверка, что данные все данные подтянулись
print(f'Не подтянулсиь пар (ticker, time_1day) к 1hour (от 1day): {df.loc[df['index_1day'].isnull(), ['ticker', 'date']].groupby(['ticker', 'date']).count().shape[0]}  акций-дней')



Не подтянулсиь пар (ticker, time_1day) к 1hour (от 1day): 2003  акций-дней


In [26]:
df['index_1day'].isnull().mean()

0.022252323654441963

In [28]:
df['ticker'].nunique()

76

In [30]:
2003 / 76 # в среднем на акцию пропущено дней

26.355263157894736

In [34]:
#Прикол в том, что в эти праздничные дни 5-мин, 1-час данные есть, а 1-дневных нет
df.loc[df['index_1day'].isnull(), 'time'].dt.date.value_counts()

time
2025-01-02    1106
2024-11-05    1036
2025-01-03    1035
2025-01-08    1035
2024-05-10    1004
2024-06-13     997
2024-05-02     995
2024-03-11     822
2024-02-26     821
2024-01-03     777
2023-06-13     679
2022-07-01     668
2023-03-09     646
2023-05-02     633
2023-05-10     619
2023-02-24     610
2023-01-03     606
2022-11-07     450
2024-05-09      75
2025-01-07      73
2024-06-12      73
2024-05-01      73
2024-12-31      72
2024-11-04      70
2024-03-08      63
2024-02-23      62
2023-06-12      50
2023-03-08      48
2023-02-23      47
2023-05-01      47
2023-05-09      46
2023-01-02      46
2022-11-04      35
2024-01-02      20
2022-12-14       3
Name: count, dtype: int64

In [37]:
#Это происходит из-за пропущенных дней в df_1day (праздников), поэтому протянем их ffill
for stock in tqdm(stocks):
    mask_stock = df['ticker'] == stock
    cols_ffil = ['index_1day']
    df.loc[mask_stock, cols_ffil] = df.loc[mask_stock, cols_ffil].ffill()

100%|█████████████████████████████████████████████████████████████████████████| 76/76 [00:01<00:00, 58.40it/s]


In [38]:
#Проверка, что данные все данные подтянулись
print(f'Не подтянулсиь пар (ticker, time_1day) к 1hour (от 1day): {df.loc[df['index_1day'].isnull(), ['ticker', 'date']].groupby(['ticker', 'date']).count().shape[0]}  акций-дней')



Не подтянулсиь пар (ticker, time_1day) к 1hour (от 1day): 76  акций-дней


In [43]:
df

Unnamed: 0,time,close,volume,ticker,tmos_close,date,date_day_index,index_1day,time_1day,close_1day,volume_1day,tmos_close_1day
0,2022-07-01 09:00:00,57.78,33.0,ABIO,4.00,2022-07-01,,,NaT,,,
1,2022-07-01 10:00:00,56.40,13083.0,ABIO,3.93,2022-07-01,,,NaT,,,
2,2022-07-01 11:00:00,56.36,6195.0,ABIO,3.92,2022-07-01,,,NaT,,,
3,2022-07-01 12:00:00,56.92,7632.0,ABIO,3.97,2022-07-01,,,NaT,,,
4,2022-07-01 13:00:00,56.18,8748.0,ABIO,3.99,2022-07-01,,,NaT,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
693945,2025-02-24 19:00:00,4645.50,26140.0,YDEX,7.25,2025-02-24,2025-02-21,50953.0,2025-02-21 03:00:00,4667.0,685047.0,7.21
693946,2025-02-24 20:00:00,4658.00,19307.0,YDEX,7.27,2025-02-24,2025-02-21,50953.0,2025-02-21 03:00:00,4667.0,685047.0,7.21
693947,2025-02-24 21:00:00,4660.00,27796.0,YDEX,7.29,2025-02-24,2025-02-21,50953.0,2025-02-21 03:00:00,4667.0,685047.0,7.21
693948,2025-02-24 22:00:00,4666.00,13530.0,YDEX,7.29,2025-02-24,2025-02-21,50953.0,2025-02-21 03:00:00,4667.0,685047.0,7.21


In [46]:
df = df[['time', 'close', 'volume', 'ticker', 'tmos_close', 'index_1day']]
df

Unnamed: 0,time,close,volume,ticker,tmos_close,index_1day
0,2022-07-01 09:00:00,57.78,33.0,ABIO,4.00,
1,2022-07-01 10:00:00,56.40,13083.0,ABIO,3.93,
2,2022-07-01 11:00:00,56.36,6195.0,ABIO,3.92,
3,2022-07-01 12:00:00,56.92,7632.0,ABIO,3.97,
4,2022-07-01 13:00:00,56.18,8748.0,ABIO,3.99,
...,...,...,...,...,...,...
693945,2025-02-24 19:00:00,4645.50,26140.0,YDEX,7.25,50953.0
693946,2025-02-24 20:00:00,4658.00,19307.0,YDEX,7.27,50953.0
693947,2025-02-24 21:00:00,4660.00,27796.0,YDEX,7.29,50953.0
693948,2025-02-24 22:00:00,4666.00,13530.0,YDEX,7.29,50953.0


### 2.5 Feature engineering

In [55]:
df

Unnamed: 0,time,close,volume,ticker,tmos_close,index_1day
0,2022-07-01 09:00:00,57.78,33.0,ABIO,4.00,
1,2022-07-01 10:00:00,56.40,13083.0,ABIO,3.93,
2,2022-07-01 11:00:00,56.36,6195.0,ABIO,3.92,
3,2022-07-01 12:00:00,56.92,7632.0,ABIO,3.97,
4,2022-07-01 13:00:00,56.18,8748.0,ABIO,3.99,
...,...,...,...,...,...,...
693945,2025-02-24 19:00:00,4645.50,26140.0,YDEX,7.25,50953.0
693946,2025-02-24 20:00:00,4658.00,19307.0,YDEX,7.27,50953.0
693947,2025-02-24 21:00:00,4660.00,27796.0,YDEX,7.29,50953.0
693948,2025-02-24 22:00:00,4666.00,13530.0,YDEX,7.29,50953.0


In [58]:
df_1day.head()

Unnamed: 0,index_1day,time_1day,close_1day,volume_1day,ticker,tmos_close_1day,date_day_index
0,0,2022-06-30 03:00:00,57.82,111300.0,ABIO,4.02,2022-06-30
1,1,2022-07-01 03:00:00,56.28,48126.0,ABIO,3.96,2022-07-01
2,2,2022-07-04 03:00:00,56.9,58944.0,ABIO,3.92,2022-07-04
3,3,2022-07-05 03:00:00,56.38,39756.0,ABIO,3.97,2022-07-05
4,4,2022-07-06 03:00:00,60.68,275700.0,ABIO,3.96,2022-07-06


In [74]:
from sklearn.linear_model import LinearRegression

# def calculate_exp_ma(data, window):
#     alpha = 2 / (window + 1)
#     coeffs = ((1 - alpha)**(np.arange(window)[::-1])) * (alpha)
#     coeffs[0] /= alpha
    
#     return data.rolling(window=window, min_periods=window).apply(lambda x: (x*coeffs).sum()).values


def calculate_bollinger_bands(data, window):
    #exp_ma
    alpha = 2 / (window + 1)
    coeffs = ((1 - alpha)**(np.arange(window)[::-1])) * (alpha)
    coeffs[0] /= alpha
    exp_ma = data.rolling(window=window, min_periods=window).apply(lambda x: (x*coeffs).sum()).values

    """Calculate Bollinger Bands"""
    rolling_std = data.rolling(window=window, min_periods=window).std().values
    # norm_rolling_std = rolling_std / (exp_ma + np.finfo(np.float64).eps)

    num_of_std = 2
    lower_band_2std = exp_ma - (rolling_std * num_of_std)
    upper_band_2std = exp_ma + (rolling_std * num_of_std)
    
    num_of_std = 3
    lower_band_3std = exp_ma - (rolling_std * num_of_std)
    upper_band_3std = exp_ma + (rolling_std * num_of_std)
    
    
    return exp_ma, lower_band_2std, upper_band_2std, lower_band_3std, upper_band_3std

def calculate_rsi(data, window):
    """Calculate Relative Strength Index"""
    delta = data.diff()
    gain = delta.clip(lower=0)
    loss = -delta.clip(upper=0)
    avg_gain = gain.rolling(window=window, min_periods=window).mean()
    avg_loss = loss.rolling(window=window, min_periods=window).mean()
    rs = avg_gain / avg_loss
    rsi = 100 - (100 / (1 + rs))

    mask = avg_loss == 0
    rsi[mask] = 100
    
    return rsi.values

# def calculate_roc(data, periods):
#     """Calculate Rate of Change"""
#     roc = (data - data.shift(periods)) / (data.shift(periods)+np.finfo(np.float32).eps)
#     return roc



def calc_stats(data, window=None, feat_name=None):
    #mean, std
    exp_ma, \
    lower_band_2std, upper_band_2std, lower_band_3std, upper_band_3std = calculate_bollinger_bands(data, window)

    # #mean_abs_pct
    # mean_abs_pct = calculate_roc(data, 1).rolling(window=window, min_periods=window).apply(lambda x: x.abs().mean()).values
        
    # #alpha
    alpha = data.rolling(window=window, min_periods=window).apply(lambda x: LinearRegression().fit(x.values.reshape(-1, 1), np.arange(x.shape[0])).coef_[0]).values

    #min, max
    rolling_min = data.rolling(window=window, min_periods=window).min().values
    rolling_max = data.rolling(window=window, min_periods=window).max().values
    
    #rsi
    rsi = calculate_rsi(data, window)
    
    # #roc
    # roc = calculate_roc(data, window).values
    # # diff = data.diff(window).values

    # #exp_ma
    # exp_ma = calculate_exp_ma(data, window)
    
    df_features = pd.DataFrame({f'{feat_name}_expma' : exp_ma,
                        # f'{feat_name}_std' : rolling_std,
                        # f'{feat_name}_norm_std' : norm_rolling_std,
                        f'{feat_name}_expma_low_2std' : lower_band_2std,
                        f'{feat_name}_expma_up_2std' : upper_band_2std,
                        f'{feat_name}_expma_low_3std' : lower_band_3std,
                        f'{feat_name}_expma_up_3std' : upper_band_3std, 

                        # f'{feat_name}_mean_abs_pct' : mean_abs_pct,
                            
                        f'{feat_name}_alpha' : alpha,
                            
                        f'{feat_name}_min' : rolling_min,
                        f'{feat_name}_max' : rolling_max,
                        f'{feat_name}_rsi' : rsi,
                        # f'{feat_name}_roc' : roc,
                        # f'{feat_name}_diff' : diff,
                        # f'{feat_name}_expma' : exp_ma,
                        })
    return df_features


# def calc_stats_diff_1(data, feat_name=None):
#     return pd.DataFrame({f'{feat_name}_roc' : calculate_roc(data, 1).values,
#                         # f'{feat_name}_diff' : data.diff(1).values,
#                         }).astype(np.float32)

def calc_levels(data, window=None, levels=None, feat_name=None):
    
    #уровни
    data_levels = []
    column_names = []
    for i in range(1, len(levels)):
        level_low = levels[i-1]
        level_high = levels[i]
        data_levels += [data.rolling(window=window, min_periods=window).apply(lambda x: (((1+level_low)*x.values[-1] < x.values) & (x.values <= (1+level_high)*x.values[-1])).sum()).values]
        data_levels += [data.rolling(window=window, min_periods=window).apply(lambda x: (((1-level_high)*x.values[-1] <= x.values) & (x.values < (1-level_low)*x.values[-1])).sum()).values]

        column_names += [f"{feat_name}_lvl_{1+level_low}-{1+level_high}"]
        column_names += [f"{feat_name}_lvl_-{1-level_high}-{1-level_low}"]
    df_levels = pd.DataFrame({column_names[i]:data_levels[i] for i in range(len(column_names))}).astype(np.float32)
    return df_levels


In [77]:
def calculate_features(df_ticker, postfix=None):
    dfs = [df_ticker]
    
    levels =       [0, 0.03, 0.05, 0.07, 0.1]
    levels_tmos =  [0, 0.03, 0.05, 0.07, 0.1]


    # #w1
    # df_close = calc_stats_diff_1(df_ticker[f'close{postfix}'], feat_name=f'close{postfix}_w1')
    # df_volume = calc_stats_diff_1(df_ticker[f'volume{postfix}'], feat_name=f'volume{postfix}_w1')
    # df_tmos_close = calc_stats_diff_1(df_ticker[f'tmos_close{postfix}'], feat_name=f'tmos_close{postfix}_w1')
    # assert df_ticker.shape[0] == df_close.shape[0] == df_volume.shape[0] == df_tmos_close.shape[0], 'Error w1'
    # dfs += [df_close.copy(), df_volume.copy(), df_tmos_close.copy()]

    #w5
    window = 5
    df_close = calc_stats(df_ticker[f'close{postfix}'], window=window, feat_name=f'close{postfix}_w{window}')
    #df_volume = calc_stats(df_ticker[f'volume{postfix}'], window=window, feat_name=f'volume{postfix}_w{window}')
    df_tmos_close = calc_stats(df_ticker[f'tmos_close{postfix}'], window=window, feat_name=f'tmos_close{postfix}_w{window}')
    assert df_ticker.shape[0] == df_close.shape[0] == df_tmos_close.shape[0], f'Error w{window}'
    dfs += [df_close.copy(), df_tmos_close.copy()]
    
    #w10
    window = 10
    df_close = calc_stats(df_ticker[f'close{postfix}'], window=window, feat_name=f'close{postfix}_w{window}')
    #df_volume = calc_stats(df_ticker[f'volume{postfix}'], window=window, feat_name=f'volume{postfix}_w{window}')
    df_tmos_close = calc_stats(df_ticker[f'tmos_close{postfix}'], window=window, feat_name=f'tmos_close{postfix}_w{window}')
    assert df_ticker.shape[0] == df_close.shape[0] == df_tmos_close.shape[0], f'Error w{window}'
    dfs += [df_close.copy(), df_tmos_close.copy()]
    
    #w20
    window = 20
    df_close = calc_stats(df_ticker[f'close{postfix}'], window=window, feat_name=f'close{postfix}_w{window}')
    #df_volume = calc_stats(df_ticker[f'volume{postfix}'], window=window, feat_name=f'volume{postfix}_w{window}')
    df_tmos_close = calc_stats(df_ticker[f'tmos_close{postfix}'], window=window, feat_name=f'tmos_close{postfix}_w{window}')
    assert df_ticker.shape[0] == df_close.shape[0] == df_tmos_close.shape[0], f'Error w{window}'
    dfs += [df_close.copy(), df_tmos_close.copy()]
    
    #w30
    window = 30
    df_close = calc_stats(df_ticker[f'close{postfix}'], window=window, feat_name=f'close{postfix}_w{window}')
    #df_volume = calc_stats(df_ticker[f'volume{postfix}'], window=window, feat_name=f'volume{postfix}_w{window}')
    df_tmos_close = calc_stats(df_ticker[f'tmos_close{postfix}'], window=window, feat_name=f'tmos_close{postfix}_w{window}')
    df_close_levels = calc_levels(df_ticker[f'close{postfix}'], window=window, levels=levels, feat_name=f'close{postfix}_w{window}')
    df_tmos_close_levels = calc_levels(df_ticker[f'tmos_close{postfix}'], window=window, levels=levels_tmos, feat_name=f'tmos_close{postfix}_w{window}')
    assert df_ticker.shape[0] == df_close.shape[0] == df_tmos_close.shape[0] == df_close_levels.shape[0] == df_tmos_close_levels.shape[0], f'Error w{window}'
    dfs += [df_close.copy(), df_tmos_close.copy(), df_close_levels.copy(), df_tmos_close_levels.copy()]
    
    #w60
    window = 60
    df_close = calc_stats(df_ticker[f'close{postfix}'], window=window, feat_name=f'close{postfix}_w{window}')
    #df_volume = calc_stats(df_ticker[f'volume{postfix}'], window=window, feat_name=f'volume{postfix}_w{window}')
    df_tmos_close = calc_stats(df_ticker[f'tmos_close{postfix}'], window=window, feat_name=f'tmos_close{postfix}_w{window}')
    assert df_ticker.shape[0] == df_close.shape[0] == df_tmos_close.shape[0], f'Error w{window}'
    dfs += [df_close.copy(), df_tmos_close.copy()]
    
    #w120
    window = 120
    df_close = calc_stats(df_ticker[f'close{postfix}'], window=window, feat_name=f'close{postfix}_w{window}')
    #df_volume = calc_stats(df_ticker[f'volume{postfix}'], window=window, feat_name=f'volume{postfix}_w{window}')
    df_tmos_close = calc_stats(df_ticker[f'tmos_close{postfix}'], window=window, feat_name=f'tmos_close{postfix}_w{window}')
    df_close_levels = calc_levels(df_ticker[f'close{postfix}'], window=window, levels=levels, feat_name=f'close{postfix}_w{window}')
    df_tmos_close_levels = calc_levels(df_ticker[f'tmos_close{postfix}'], window=window, levels=levels_tmos, feat_name=f'tmos_close{postfix}_w{window}')
    assert df_ticker.shape[0] == df_close.shape[0] == df_tmos_close.shape[0] == df_close_levels.shape[0] == df_tmos_close_levels.shape[0], f'Error w{window}'
    dfs += [df_close.copy(), df_tmos_close.copy(), df_close_levels.copy(), df_tmos_close_levels.copy()]

   
    df = pd.concat(dfs, axis=1)
    assert (df_ticker.shape[0] == df.shape[0]) and (df.shape[1] == sum([elem.shape[1] for elem in dfs])), 'Error concat'

    return df


#### 1hour

In [82]:
df.head()

Unnamed: 0,time,close,volume,ticker,tmos_close,index_1day
0,2022-07-01 09:00:00,57.78,33.0,ABIO,4.0,
1,2022-07-01 10:00:00,56.4,13083.0,ABIO,3.93,
2,2022-07-01 11:00:00,56.36,6195.0,ABIO,3.92,
3,2022-07-01 12:00:00,56.92,7632.0,ABIO,3.97,
4,2022-07-01 13:00:00,56.18,8748.0,ABIO,3.99,


In [84]:
dfs = []
for ticker in tqdm(df['ticker'].unique()):
    mask = np.array(df['ticker'] == ticker)
    df_ticker = df.loc[mask].copy().reset_index()

    df_ticker_fe = calculate_features(df_ticker, postfix='')
    
    dfs += [df_ticker_fe.copy()]

df_fe = pd.concat(dfs).set_index('index')

100%|█████████████████████████████████████████████████████████████████████████| 76/76 [22:31<00:00, 17.79s/it]


In [85]:
df_fe

Unnamed: 0_level_0,time,close,volume,ticker,tmos_close,index_1day,close_w5_expma,close_w5_expma_low_2std,close_w5_expma_up_2std,close_w5_expma_low_3std,...,close_w120_lvl_1.07-1.1,close_w120_lvl_-0.9-0.9299999999999999,tmos_close_w120_lvl_1-1.03,tmos_close_w120_lvl_-0.97-1,tmos_close_w120_lvl_1.03-1.05,tmos_close_w120_lvl_-0.95-0.97,tmos_close_w120_lvl_1.05-1.07,tmos_close_w120_lvl_-0.9299999999999999-0.95,tmos_close_w120_lvl_1.07-1.1,tmos_close_w120_lvl_-0.9-0.9299999999999999
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,2022-07-01 09:00:00,57.78,33.0,ABIO,4.00,,,,,,...,,,,,,,,,,
1,2022-07-01 10:00:00,56.40,13083.0,ABIO,3.93,,,,,,...,,,,,,,,,,
2,2022-07-01 11:00:00,56.36,6195.0,ABIO,3.92,,,,,,...,,,,,,,,,,
3,2022-07-01 12:00:00,56.92,7632.0,ABIO,3.97,,,,,,...,,,,,,,,,,
4,2022-07-01 13:00:00,56.18,8748.0,ABIO,3.99,,56.708889,55.410089,58.007688,54.760690,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
693945,2025-02-24 19:00:00,4645.50,26140.0,YDEX,7.25,50953.0,4642.895062,4609.852665,4675.937459,4593.331466,...,0.0,2.0,10.0,79.0,0.0,23.0,0.0,0.0,0.0,3.0
693946,2025-02-24 20:00:00,4658.00,19307.0,YDEX,7.27,50953.0,4644.506173,4607.106975,4681.905371,4588.407376,...,0.0,1.0,4.0,87.0,0.0,22.0,0.0,2.0,0.0,2.0
693947,2025-02-24 21:00:00,4660.00,27796.0,YDEX,7.29,50953.0,4651.382716,4624.874227,4677.891205,4611.619982,...,0.0,0.0,3.0,88.0,0.0,24.0,0.0,2.0,0.0,1.0
693948,2025-02-24 22:00:00,4666.00,13530.0,YDEX,7.29,50953.0,4660.271605,4645.165315,4675.377895,4637.612170,...,0.0,0.0,3.0,88.0,0.0,24.0,0.0,2.0,0.0,0.0


In [86]:
(df_fe['close'].values == df['close'].values).all()

True

In [89]:
NEED_POINTS = 122

mask_avbl_all = np.zeros(df_fe.shape[0]).astype(bool)

for ticker in tqdm(stocks):
    mask_ticker = np.array(df_fe['ticker'] == ticker)
    df_ticker = df_fe[mask_ticker]
    #1. date available
    count_days = df_ticker['time'].dt.date.nunique()
    mask_avbl = np.zeros(df_fe.shape[0]).astype(bool)
    if count_days > NEED_POINTS:
        date_first_avbl =  np.sort(df_ticker['time'].dt.date.unique())[NEED_POINTS]
        #print(ticker, date_first_avbl)
        mask_avbl = np.array(df_fe["time"] >= pd.to_datetime(date_first_avbl))
    
    mask_avbl_ticker = mask_ticker & mask_avbl

    mask_avbl_all |= mask_avbl_ticker
    
mask_avbl_all.mean()

100%|█████████████████████████████████████████████████████████████████████████| 76/76 [00:01<00:00, 43.81it/s]


0.8487196483896534

In [90]:
for col in tqdm(df_fe.columns): # consist 'index_1day']
    assert df_fe.loc[mask_avbl_all, col].isnull().sum() == 0, f'Nulls {col}'

100%|██████████████████████████████████████████████████████████████████████| 146/146 [00:00<00:00, 349.47it/s]


In [91]:
df_fe = df_fe[mask_avbl_all]
df_fe.reset_index(drop=True, inplace=True)

In [92]:
df_fe

Unnamed: 0,time,close,volume,ticker,tmos_close,index_1day,close_w5_expma,close_w5_expma_low_2std,close_w5_expma_up_2std,close_w5_expma_low_3std,...,close_w120_lvl_1.07-1.1,close_w120_lvl_-0.9-0.9299999999999999,tmos_close_w120_lvl_1-1.03,tmos_close_w120_lvl_-0.97-1,tmos_close_w120_lvl_1.03-1.05,tmos_close_w120_lvl_-0.95-0.97,tmos_close_w120_lvl_1.05-1.07,tmos_close_w120_lvl_-0.9299999999999999-0.95,tmos_close_w120_lvl_1.07-1.1,tmos_close_w120_lvl_-0.9-0.9299999999999999
0,2022-12-21 09:00:00,60.36,63.0,ABIO,4.05,122.0,60.405926,60.173375,60.638477,60.057099,...,47.0,0.0,56.0,50.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2022-12-21 10:00:00,60.74,37821.0,ABIO,4.06,122.0,60.485679,60.135822,60.835536,59.960893,...,36.0,0.0,42.0,64.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2022-12-21 11:00:00,60.60,11712.0,ABIO,4.06,122.0,60.534321,60.173655,60.894987,59.993322,...,40.0,0.0,42.0,64.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2022-12-21 12:00:00,60.96,3129.0,ABIO,4.06,122.0,60.657778,60.113278,61.202278,59.841028,...,24.0,0.0,42.0,64.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2022-12-21 13:00:00,60.80,4074.0,ABIO,4.06,122.0,60.713086,60.260892,61.165281,60.034794,...,26.0,0.0,41.0,64.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
588964,2025-02-24 19:00:00,4645.50,26140.0,YDEX,7.25,50953.0,4642.895062,4609.852665,4675.937459,4593.331466,...,0.0,2.0,10.0,79.0,0.0,23.0,0.0,0.0,0.0,3.0
588965,2025-02-24 20:00:00,4658.00,19307.0,YDEX,7.27,50953.0,4644.506173,4607.106975,4681.905371,4588.407376,...,0.0,1.0,4.0,87.0,0.0,22.0,0.0,2.0,0.0,2.0
588966,2025-02-24 21:00:00,4660.00,27796.0,YDEX,7.29,50953.0,4651.382716,4624.874227,4677.891205,4611.619982,...,0.0,0.0,3.0,88.0,0.0,24.0,0.0,2.0,0.0,1.0
588967,2025-02-24 22:00:00,4666.00,13530.0,YDEX,7.29,50953.0,4660.271605,4645.165315,4675.377895,4637.612170,...,0.0,0.0,3.0,88.0,0.0,24.0,0.0,2.0,0.0,0.0


In [94]:
dump_pkl(df_fe, './data/feat_engin/df_fe.pkl')

#### 1day

In [100]:
dfs = []
for ticker in tqdm(df_1day['ticker'].unique()):
    mask = np.array(df_1day['ticker'] == ticker)
    df_ticker = df_1day.loc[mask].copy().reset_index()

    df_ticker_fe = calculate_features(df_ticker, postfix='_1day')
    
    dfs += [df_ticker_fe.copy()]

df_1day_fe = pd.concat(dfs).set_index('index')

100%|█████████████████████████████████████████████████████████████████████████| 76/76 [01:38<00:00,  1.29s/it]


In [101]:
(df_1day_fe['close_1day'] == df_1day['close_1day']).all(), (df_1day_fe.index.values == df_1day_fe['index_1day'].values).all()

(True, True)

In [102]:
df_1day_fe

Unnamed: 0_level_0,index_1day,time_1day,close_1day,volume_1day,ticker,tmos_close_1day,date_day_index,close_1day_w5_expma,close_1day_w5_expma_low_2std,close_1day_w5_expma_up_2std,...,close_1day_w120_lvl_1.07-1.1,close_1day_w120_lvl_-0.9-0.9299999999999999,tmos_close_1day_w120_lvl_1-1.03,tmos_close_1day_w120_lvl_-0.97-1,tmos_close_1day_w120_lvl_1.03-1.05,tmos_close_1day_w120_lvl_-0.95-0.97,tmos_close_1day_w120_lvl_1.05-1.07,tmos_close_1day_w120_lvl_-0.9299999999999999-0.95,tmos_close_1day_w120_lvl_1.07-1.1,tmos_close_1day_w120_lvl_-0.9-0.9299999999999999
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,2022-06-30 03:00:00,57.82,111300.0,ABIO,4.02,2022-06-30,,,,...,,,,,,,,,,
1,1,2022-07-01 03:00:00,56.28,48126.0,ABIO,3.96,2022-07-01,,,,...,,,,,,,,,,
2,2,2022-07-04 03:00:00,56.90,58944.0,ABIO,3.92,2022-07-04,,,,...,,,,,,,,,,
3,3,2022-07-05 03:00:00,56.38,39756.0,ABIO,3.97,2022-07-05,,,,...,,,,,,,,,,
4,4,2022-07-06 03:00:00,60.68,275700.0,ABIO,3.96,2022-07-06,58.164938,54.524213,61.805663,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50950,50950,2025-02-18 03:00:00,4550.00,1392270.0,YDEX,7.12,2025-02-18,4573.432099,4404.286599,4742.577598,...,0.0,13.0,0.0,3.0,1.0,0.0,0.0,1.0,0.0,14.0
50951,50951,2025-02-19 03:00:00,4646.00,1127973.0,YDEX,7.22,2025-02-19,4594.000000,4420.279535,4767.720465,...,0.0,5.0,1.0,3.0,0.0,1.0,0.0,0.0,0.0,3.0
50952,50952,2025-02-20 03:00:00,4649.00,1692660.0,YDEX,7.22,2025-02-20,4619.839506,4480.637495,4759.041518,...,0.0,5.0,1.0,3.0,0.0,1.0,0.0,0.0,0.0,3.0
50953,50953,2025-02-21 03:00:00,4667.00,685047.0,YDEX,7.21,2025-02-21,4656.629630,4539.930015,4773.329244,...,0.0,4.0,3.0,4.0,0.0,0.0,0.0,0.0,0.0,5.0


In [104]:
dump_pkl(df_1day_fe, './data/feat_engin/df_1day_fe.pkl')

# #Load data

In [128]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import random
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import os
import gc

from sklearn.metrics import roc_auc_score

import pickle
def dump_pkl(data, filename):
  with open(filename, 'wb') as handle:
    pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)

def load_pkl(filename):
  with open(filename, 'rb') as handle:
    data = pickle.load(handle)
  return data

In [130]:
df_fe = load_pkl('./data/feat_engin/df_fe.pkl')
df_1day_fe = load_pkl('./data/feat_engin/df_1day_fe.pkl')

df_fe.shape, df_1day_fe.shape, 

((588969, 146), (50955, 147))

In [132]:
df_fe.head()

Unnamed: 0,time,close,volume,ticker,tmos_close,index_1day,close_w5_expma,close_w5_expma_low_2std,close_w5_expma_up_2std,close_w5_expma_low_3std,...,close_w120_lvl_1.07-1.1,close_w120_lvl_-0.9-0.9299999999999999,tmos_close_w120_lvl_1-1.03,tmos_close_w120_lvl_-0.97-1,tmos_close_w120_lvl_1.03-1.05,tmos_close_w120_lvl_-0.95-0.97,tmos_close_w120_lvl_1.05-1.07,tmos_close_w120_lvl_-0.9299999999999999-0.95,tmos_close_w120_lvl_1.07-1.1,tmos_close_w120_lvl_-0.9-0.9299999999999999
0,2022-12-21 09:00:00,60.36,63.0,ABIO,4.05,122.0,60.405926,60.173375,60.638477,60.057099,...,47.0,0.0,56.0,50.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2022-12-21 10:00:00,60.74,37821.0,ABIO,4.06,122.0,60.485679,60.135822,60.835536,59.960893,...,36.0,0.0,42.0,64.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2022-12-21 11:00:00,60.6,11712.0,ABIO,4.06,122.0,60.534321,60.173655,60.894987,59.993322,...,40.0,0.0,42.0,64.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2022-12-21 12:00:00,60.96,3129.0,ABIO,4.06,122.0,60.657778,60.113278,61.202278,59.841028,...,24.0,0.0,42.0,64.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2022-12-21 13:00:00,60.8,4074.0,ABIO,4.06,122.0,60.713086,60.260892,61.165281,60.034794,...,26.0,0.0,41.0,64.0,0.0,0.0,0.0,0.0,0.0,0.0


### time features

In [138]:
#hour
df_fe['hour'] = df_fe['time'].dt.hour

#day
df_fe['day'] = df_fe['time'].dt.day

#day_of_week
df_fe['weekday'] = np.minimum(df_fe['time'].dt.dayofweek, 4) / 4

#month
#df_fe['month'] = df_fe['time'].dt.month


#hour
# time_cyclic = (df_fe['time'] - pd.to_datetime(df_fe['time'].dt.date) - pd.Timedelta('10:00:00')) / pd.Timedelta('13:00:00')
# df_fe['sin_time_hour'] = np.sin(time_cyclic * 2 * np.pi)
# df_fe['cos_time_hour'] = np.cos(time_cyclic * 2 * np.pi)

#day of week
# day_of_week_cyclic = np.minimum(df_fe['time'].dt.dayofweek, 4) / 4
# df_fe['sin_time_weekday'] = np.sin(day_of_week_cyclic * 2 * np.pi)
# df_fe['cos_time_weekday'] = np.cos(day_of_week_cyclic * 2 * np.pi)

#day of month
# day_of_month_cyclic = df_1day_fe['time'].dt.day / 30
# df_1day_fe['sin_time_monthday'] = np.sin(day_of_month_cyclic * 2 * np.pi)
# df_1day_fe['cos_time_monthday'] = np.cos(day_of_month_cyclic * 2 * np.pi)


In [141]:
df_fe.head()

Unnamed: 0,time,close,volume,ticker,tmos_close,index_1day,close_w5_expma,close_w5_expma_low_2std,close_w5_expma_up_2std,close_w5_expma_low_3std,...,tmos_close_w120_lvl_-0.97-1,tmos_close_w120_lvl_1.03-1.05,tmos_close_w120_lvl_-0.95-0.97,tmos_close_w120_lvl_1.05-1.07,tmos_close_w120_lvl_-0.9299999999999999-0.95,tmos_close_w120_lvl_1.07-1.1,tmos_close_w120_lvl_-0.9-0.9299999999999999,hour,day,weekday
0,2022-12-21 09:00:00,60.36,63.0,ABIO,4.05,122.0,60.405926,60.173375,60.638477,60.057099,...,50.0,0.0,0.0,0.0,0.0,0.0,0.0,9,21,0.5
1,2022-12-21 10:00:00,60.74,37821.0,ABIO,4.06,122.0,60.485679,60.135822,60.835536,59.960893,...,64.0,0.0,0.0,0.0,0.0,0.0,0.0,10,21,0.5
2,2022-12-21 11:00:00,60.6,11712.0,ABIO,4.06,122.0,60.534321,60.173655,60.894987,59.993322,...,64.0,0.0,0.0,0.0,0.0,0.0,0.0,11,21,0.5
3,2022-12-21 12:00:00,60.96,3129.0,ABIO,4.06,122.0,60.657778,60.113278,61.202278,59.841028,...,64.0,0.0,0.0,0.0,0.0,0.0,0.0,12,21,0.5
4,2022-12-21 13:00:00,60.8,4074.0,ABIO,4.06,122.0,60.713086,60.260892,61.165281,60.034794,...,64.0,0.0,0.0,0.0,0.0,0.0,0.0,13,21,0.5


In [143]:
df_1day_fe.head()

Unnamed: 0_level_0,index_1day,time_1day,close_1day,volume_1day,ticker,tmos_close_1day,date_day_index,close_1day_w5_expma,close_1day_w5_expma_low_2std,close_1day_w5_expma_up_2std,...,close_1day_w120_lvl_1.07-1.1,close_1day_w120_lvl_-0.9-0.9299999999999999,tmos_close_1day_w120_lvl_1-1.03,tmos_close_1day_w120_lvl_-0.97-1,tmos_close_1day_w120_lvl_1.03-1.05,tmos_close_1day_w120_lvl_-0.95-0.97,tmos_close_1day_w120_lvl_1.05-1.07,tmos_close_1day_w120_lvl_-0.9299999999999999-0.95,tmos_close_1day_w120_lvl_1.07-1.1,tmos_close_1day_w120_lvl_-0.9-0.9299999999999999
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,2022-06-30 03:00:00,57.82,111300.0,ABIO,4.02,2022-06-30,,,,...,,,,,,,,,,
1,1,2022-07-01 03:00:00,56.28,48126.0,ABIO,3.96,2022-07-01,,,,...,,,,,,,,,,
2,2,2022-07-04 03:00:00,56.9,58944.0,ABIO,3.92,2022-07-04,,,,...,,,,,,,,,,
3,3,2022-07-05 03:00:00,56.38,39756.0,ABIO,3.97,2022-07-05,,,,...,,,,,,,,,,
4,4,2022-07-06 03:00:00,60.68,275700.0,ABIO,3.96,2022-07-06,58.164938,54.524213,61.805663,...,,,,,,,,,,


### Absolute value columns

In [148]:
def flag_delete(col_name):
    if 'norm_std' in col_name:
        return False
    
    for stop_word in ['ma', 'std', 'diff', 'min', 'max']:
        if stop_word in col_name:
            return True
    return False



In [150]:
cols_del_1hour = [elem for elem in df_fe.columns if flag_delete(elem)]
len(cols_del_1hour), cols_del_1hour

(84,
 ['close_w5_expma',
  'close_w5_expma_low_2std',
  'close_w5_expma_up_2std',
  'close_w5_expma_low_3std',
  'close_w5_expma_up_3std',
  'close_w5_min',
  'close_w5_max',
  'tmos_close_w5_expma',
  'tmos_close_w5_expma_low_2std',
  'tmos_close_w5_expma_up_2std',
  'tmos_close_w5_expma_low_3std',
  'tmos_close_w5_expma_up_3std',
  'tmos_close_w5_min',
  'tmos_close_w5_max',
  'close_w10_expma',
  'close_w10_expma_low_2std',
  'close_w10_expma_up_2std',
  'close_w10_expma_low_3std',
  'close_w10_expma_up_3std',
  'close_w10_min',
  'close_w10_max',
  'tmos_close_w10_expma',
  'tmos_close_w10_expma_low_2std',
  'tmos_close_w10_expma_up_2std',
  'tmos_close_w10_expma_low_3std',
  'tmos_close_w10_expma_up_3std',
  'tmos_close_w10_min',
  'tmos_close_w10_max',
  'close_w20_expma',
  'close_w20_expma_low_2std',
  'close_w20_expma_up_2std',
  'close_w20_expma_low_3std',
  'close_w20_expma_up_3std',
  'close_w20_min',
  'close_w20_max',
  'tmos_close_w20_expma',
  'tmos_close_w20_expma_low_

In [152]:
cols_del_1day = [elem for elem in df_1day_fe.columns if flag_delete(elem)]
len(cols_del_1day), cols_del_1day

(84,
 ['close_1day_w5_expma',
  'close_1day_w5_expma_low_2std',
  'close_1day_w5_expma_up_2std',
  'close_1day_w5_expma_low_3std',
  'close_1day_w5_expma_up_3std',
  'close_1day_w5_min',
  'close_1day_w5_max',
  'tmos_close_1day_w5_expma',
  'tmos_close_1day_w5_expma_low_2std',
  'tmos_close_1day_w5_expma_up_2std',
  'tmos_close_1day_w5_expma_low_3std',
  'tmos_close_1day_w5_expma_up_3std',
  'tmos_close_1day_w5_min',
  'tmos_close_1day_w5_max',
  'close_1day_w10_expma',
  'close_1day_w10_expma_low_2std',
  'close_1day_w10_expma_up_2std',
  'close_1day_w10_expma_low_3std',
  'close_1day_w10_expma_up_3std',
  'close_1day_w10_min',
  'close_1day_w10_max',
  'tmos_close_1day_w10_expma',
  'tmos_close_1day_w10_expma_low_2std',
  'tmos_close_1day_w10_expma_up_2std',
  'tmos_close_1day_w10_expma_low_3std',
  'tmos_close_1day_w10_expma_up_3std',
  'tmos_close_1day_w10_min',
  'tmos_close_1day_w10_max',
  'close_1day_w20_expma',
  'close_1day_w20_expma_low_2std',
  'close_1day_w20_expma_up_2st

## Concat

In [158]:
df_fe.columns.tolist()

['time',
 'close',
 'volume',
 'ticker',
 'tmos_close',
 'index_1day',
 'close_w5_expma',
 'close_w5_expma_low_2std',
 'close_w5_expma_up_2std',
 'close_w5_expma_low_3std',
 'close_w5_expma_up_3std',
 'close_w5_alpha',
 'close_w5_min',
 'close_w5_max',
 'close_w5_rsi',
 'tmos_close_w5_expma',
 'tmos_close_w5_expma_low_2std',
 'tmos_close_w5_expma_up_2std',
 'tmos_close_w5_expma_low_3std',
 'tmos_close_w5_expma_up_3std',
 'tmos_close_w5_alpha',
 'tmos_close_w5_min',
 'tmos_close_w5_max',
 'tmos_close_w5_rsi',
 'close_w10_expma',
 'close_w10_expma_low_2std',
 'close_w10_expma_up_2std',
 'close_w10_expma_low_3std',
 'close_w10_expma_up_3std',
 'close_w10_alpha',
 'close_w10_min',
 'close_w10_max',
 'close_w10_rsi',
 'tmos_close_w10_expma',
 'tmos_close_w10_expma_low_2std',
 'tmos_close_w10_expma_up_2std',
 'tmos_close_w10_expma_low_3std',
 'tmos_close_w10_expma_up_3std',
 'tmos_close_w10_alpha',
 'tmos_close_w10_min',
 'tmos_close_w10_max',
 'tmos_close_w10_rsi',
 'close_w20_expma',
 'clo

In [160]:
df_1day_fe.columns.tolist()

['index_1day',
 'time_1day',
 'close_1day',
 'volume_1day',
 'ticker',
 'tmos_close_1day',
 'date_day_index',
 'close_1day_w5_expma',
 'close_1day_w5_expma_low_2std',
 'close_1day_w5_expma_up_2std',
 'close_1day_w5_expma_low_3std',
 'close_1day_w5_expma_up_3std',
 'close_1day_w5_alpha',
 'close_1day_w5_min',
 'close_1day_w5_max',
 'close_1day_w5_rsi',
 'tmos_close_1day_w5_expma',
 'tmos_close_1day_w5_expma_low_2std',
 'tmos_close_1day_w5_expma_up_2std',
 'tmos_close_1day_w5_expma_low_3std',
 'tmos_close_1day_w5_expma_up_3std',
 'tmos_close_1day_w5_alpha',
 'tmos_close_1day_w5_min',
 'tmos_close_1day_w5_max',
 'tmos_close_1day_w5_rsi',
 'close_1day_w10_expma',
 'close_1day_w10_expma_low_2std',
 'close_1day_w10_expma_up_2std',
 'close_1day_w10_expma_low_3std',
 'close_1day_w10_expma_up_3std',
 'close_1day_w10_alpha',
 'close_1day_w10_min',
 'close_1day_w10_max',
 'close_1day_w10_rsi',
 'tmos_close_1day_w10_expma',
 'tmos_close_1day_w10_expma_low_2std',
 'tmos_close_1day_w10_expma_up_2std

In [163]:
df_fe.shape, df_1day_fe.shape

((588969, 149), (50955, 147))

In [165]:
df = df_fe.merge(df_1day_fe, on=['index_1day', 'ticker'], how='left')
df.shape

(588969, 294)

In [166]:
df.head()

Unnamed: 0,time,close,volume,ticker,tmos_close,index_1day,close_w5_expma,close_w5_expma_low_2std,close_w5_expma_up_2std,close_w5_expma_low_3std,...,close_1day_w120_lvl_1.07-1.1,close_1day_w120_lvl_-0.9-0.9299999999999999,tmos_close_1day_w120_lvl_1-1.03,tmos_close_1day_w120_lvl_-0.97-1,tmos_close_1day_w120_lvl_1.03-1.05,tmos_close_1day_w120_lvl_-0.95-0.97,tmos_close_1day_w120_lvl_1.05-1.07,tmos_close_1day_w120_lvl_-0.9299999999999999-0.95,tmos_close_1day_w120_lvl_1.07-1.1,tmos_close_1day_w120_lvl_-0.9-0.9299999999999999
0,2022-12-21 09:00:00,60.36,63.0,ABIO,4.05,122.0,60.405926,60.173375,60.638477,60.057099,...,25.0,3.0,32.0,25.0,0.0,11.0,2.0,8.0,11.0,19.0
1,2022-12-21 10:00:00,60.74,37821.0,ABIO,4.06,122.0,60.485679,60.135822,60.835536,59.960893,...,25.0,3.0,32.0,25.0,0.0,11.0,2.0,8.0,11.0,19.0
2,2022-12-21 11:00:00,60.6,11712.0,ABIO,4.06,122.0,60.534321,60.173655,60.894987,59.993322,...,25.0,3.0,32.0,25.0,0.0,11.0,2.0,8.0,11.0,19.0
3,2022-12-21 12:00:00,60.96,3129.0,ABIO,4.06,122.0,60.657778,60.113278,61.202278,59.841028,...,25.0,3.0,32.0,25.0,0.0,11.0,2.0,8.0,11.0,19.0
4,2022-12-21 13:00:00,60.8,4074.0,ABIO,4.06,122.0,60.713086,60.260892,61.165281,60.034794,...,25.0,3.0,32.0,25.0,0.0,11.0,2.0,8.0,11.0,19.0


In [171]:
for col in tqdm(df.columns):
    assert df[col].isnull().sum() == 0, f'Nulls {col}'
    # if df[col].isnull().sum() != 0:
    #     print(col, df[col].isnull().sum())

100%|█████████████████████████████████████████████████████████████████████| 294/294 [00:00<00:00, 1120.76it/s]


### Relative features

In [174]:
groups_1hour = [
        #windows: 
        #close: w1, w5, w10, w20, w30, w60, w120
        #tmos_close: w1, w5, w10, w20, w30, w60, w120

         ['close_w5_alpha', 'close_w10_alpha', 'close_w20_alpha', 'close_w30_alpha', 'close_w60_alpha', 'close_w120_alpha'],
         ['tmos_close_w5_alpha', 'tmos_close_w10_alpha', 'tmos_close_w20_alpha', 'tmos_close_w30_alpha', 'tmos_close_w60_alpha', 'tmos_close_w120_alpha'],


         ['close', 'close_w5_expma', 'close_w10_expma', 'close_w20_expma', 'close_w30_expma', 'close_w60_expma', 'close_w120_expma'],
         ['tmos_close', 'tmos_close_w5_expma', 'tmos_close_w10_expma', 'tmos_close_w20_expma', 'tmos_close_w30_expma', 'tmos_close_w60_expma', 'tmos_close_w120_expma'],


         {'close' : ['close_w5_min', 'close_w10_min', 'close_w20_min', 'close_w30_min', 'close_w60_min', 'close_w120_min']},
         {'tmos_close' : ['tmos_close_w5_min', 'tmos_close_w10_min', 'tmos_close_w20_min', 'tmos_close_w30_min', 'tmos_close_w60_min', 'tmos_close_w120_min']},

    
         {'close' : ['close_w5_max', 'close_w10_max', 'close_w20_max', 'close_w30_max', 'close_w60_max', 'close_w120_max']},
         {'tmos_close' : ['tmos_close_w5_max', 'tmos_close_w10_max', 'tmos_close_w20_max', 'tmos_close_w30_max', 'tmos_close_w60_max', 'tmos_close_w120_max']},



        #w5
         {'close' : [  'close_w5_expma_low_2std', 'close_w5_expma_up_2std', 'close_w5_expma_low_3std', 'close_w5_expma_up_3std']},
         {'tmos_close' : [  'tmos_close_w5_expma_low_2std', 'tmos_close_w5_expma_up_2std', 'tmos_close_w5_expma_low_3std', 'tmos_close_w5_expma_up_3std']},
        #w10
         {'close' : [  'close_w10_expma_low_2std', 'close_w10_expma_up_2std', 'close_w10_expma_low_3std', 'close_w10_expma_up_3std']},
         {'tmos_close' : [  'tmos_close_w10_expma_low_2std', 'tmos_close_w10_expma_up_2std', 'tmos_close_w10_expma_low_3std', 'tmos_close_w10_expma_up_3std']},
        #w20
         {'close' : [  'close_w20_expma_low_2std', 'close_w20_expma_up_2std', 'close_w20_expma_low_3std', 'close_w20_expma_up_3std']},
         {'tmos_close' : [  'tmos_close_w20_expma_low_2std', 'tmos_close_w20_expma_up_2std', 'tmos_close_w20_expma_low_3std', 'tmos_close_w20_expma_up_3std']},
        #w30
         {'close' : [  'close_w30_expma_low_2std', 'close_w30_expma_up_2std', 'close_w30_expma_low_3std', 'close_w30_expma_up_3std']},
         {'tmos_close' : [  'tmos_close_w30_expma_low_2std', 'tmos_close_w30_expma_up_2std', 'tmos_close_w30_expma_low_3std', 'tmos_close_w30_expma_up_3std']},
        #w60
         {'close' : [  'close_w60_expma_low_2std', 'close_w60_expma_up_2std', 'close_w60_expma_low_3std', 'close_w60_expma_up_3std']},
         {'tmos_close' : [  'tmos_close_w60_expma_low_2std', 'tmos_close_w60_expma_up_2std', 'tmos_close_w60_expma_low_3std', 'tmos_close_w60_expma_up_3std']},
        #w120
         {'close' : [  'close_w120_expma_low_2std', 'close_w120_expma_up_2std', 'close_w120_expma_low_3std', 'close_w120_expma_up_3std']},
         {'tmos_close' : [  'tmos_close_w120_expma_low_2std', 'tmos_close_w120_expma_up_2std', 'tmos_close_w120_expma_low_3std', 'tmos_close_w120_expma_up_3std']},


        #w5
        ['close_w5_min', 'close_w5_max'],
        ['tmos_close_w5_min', 'tmos_close_w5_max'],
        #w10
        ['close_w10_min', 'close_w10_max'],
        ['tmos_close_w10_min', 'tmos_close_w10_max'],
        #w20
        ['close_w20_min', 'close_w20_max'],
        ['tmos_close_w20_min', 'tmos_close_w20_max'],
        #w30
        ['close_w30_min', 'close_w30_max'],
        ['tmos_close_w30_min', 'tmos_close_w30_max'],
        #w60
        ['close_w60_min', 'close_w60_max'],
        ['tmos_close_w60_min', 'tmos_close_w60_max'],
        #w120
        ['close_w120_min', 'close_w120_max'],
        ['tmos_close_w120_min', 'tmos_close_w120_max'],
]

In [176]:
groups_1day= [
        #windows: 
        #close_1day: w1, w5, w10, w20, w30, w60, w120
        #tmos_close_1day: w1, w5, w10, w20, w30, w60, w120

    
         ['close_1day_w5_alpha', 'close_1day_w10_alpha', 'close_1day_w20_alpha', 'close_1day_w30_alpha', 'close_1day_w60_alpha', 'close_1day_w120_alpha'],
         ['tmos_close_1day_w5_alpha', 'tmos_close_1day_w10_alpha', 'tmos_close_1day_w20_alpha', 'tmos_close_1day_w30_alpha', 'tmos_close_1day_w60_alpha', 'tmos_close_1day_w120_alpha'],

         ['close_1day', 'close_1day_w5_expma', 'close_1day_w10_expma', 'close_1day_w20_expma', 'close_1day_w30_expma', 'close_1day_w60_expma', 'close_1day_w120_expma'],
         ['tmos_close_1day', 'tmos_close_1day_w5_expma', 'tmos_close_1day_w10_expma', 'tmos_close_1day_w20_expma', 'tmos_close_1day_w30_expma', 'tmos_close_1day_w60_expma', 'tmos_close_1day_w120_expma'],


         {'close_1day' : ['close_1day_w5_min', 'close_1day_w10_min', 'close_1day_w20_min', 'close_1day_w30_min', 'close_1day_w60_min', 'close_1day_w120_min']},
         {'tmos_close_1day' : ['tmos_close_1day_w5_min', 'tmos_close_1day_w10_min', 'tmos_close_1day_w20_min', 'tmos_close_1day_w30_min', 'tmos_close_1day_w60_min', 'tmos_close_1day_w120_min']},

    
         {'close_1day' : ['close_1day_w5_max', 'close_1day_w10_max', 'close_1day_w20_max', 'close_1day_w30_max', 'close_1day_w60_max', 'close_1day_w120_max']},
         {'tmos_close_1day' : ['tmos_close_1day_w5_max', 'tmos_close_1day_w10_max', 'tmos_close_1day_w20_max', 'tmos_close_1day_w30_max', 'tmos_close_1day_w60_max', 'tmos_close_1day_w120_max']},



        #w5
         {'close_1day' : [  'close_1day_w5_expma_low_2std', 'close_1day_w5_expma_up_2std', 'close_1day_w5_expma_low_3std', 'close_1day_w5_expma_up_3std']},
         {'tmos_close_1day' : [  'tmos_close_1day_w5_expma_low_2std', 'tmos_close_1day_w5_expma_up_2std', 'tmos_close_1day_w5_expma_low_3std', 'tmos_close_1day_w5_expma_up_3std']},
        #w10
         {'close_1day' : [  'close_1day_w10_expma_low_2std', 'close_1day_w10_expma_up_2std', 'close_1day_w10_expma_low_3std', 'close_1day_w10_expma_up_3std']},
         {'tmos_close_1day' : [  'tmos_close_1day_w10_expma_low_2std', 'tmos_close_1day_w10_expma_up_2std', 'tmos_close_1day_w10_expma_low_3std', 'tmos_close_1day_w10_expma_up_3std']},
        #w20
         {'close_1day' : [  'close_1day_w20_expma_low_2std', 'close_1day_w20_expma_up_2std', 'close_1day_w20_expma_low_3std', 'close_1day_w20_expma_up_3std']},
         {'tmos_close_1day' : [  'tmos_close_1day_w20_expma_low_2std', 'tmos_close_1day_w20_expma_up_2std', 'tmos_close_1day_w20_expma_low_3std', 'tmos_close_1day_w20_expma_up_3std']},
        #w30
         {'close_1day' : [  'close_1day_w30_expma_low_2std', 'close_1day_w30_expma_up_2std', 'close_1day_w30_expma_low_3std', 'close_1day_w30_expma_up_3std']},
         {'tmos_close_1day' : [  'tmos_close_1day_w30_expma_low_2std', 'tmos_close_1day_w30_expma_up_2std', 'tmos_close_1day_w30_expma_low_3std', 'tmos_close_1day_w30_expma_up_3std']},
        #w60
         {'close_1day' : [  'close_1day_w60_expma_low_2std', 'close_1day_w60_expma_up_2std', 'close_1day_w60_expma_low_3std', 'close_1day_w60_expma_up_3std']},
         {'tmos_close_1day' : [  'tmos_close_1day_w60_expma_low_2std', 'tmos_close_1day_w60_expma_up_2std', 'tmos_close_1day_w60_expma_low_3std', 'tmos_close_1day_w60_expma_up_3std']},
        #w120
         {'close_1day' : [  'close_1day_w120_expma_low_2std', 'close_1day_w120_expma_up_2std', 'close_1day_w120_expma_low_3std', 'close_1day_w120_expma_up_3std']},
         {'tmos_close_1day' : [  'tmos_close_1day_w120_expma_low_2std', 'tmos_close_1day_w120_expma_up_2std', 'tmos_close_1day_w120_expma_low_3std', 'tmos_close_1day_w120_expma_up_3std']},


        #w5
        ['close_1day_w5_min', 'close_1day_w5_max'],
        ['tmos_close_1day_w5_min', 'tmos_close_1day_w5_max'],
        #w10
        ['close_1day_w10_min', 'close_1day_w10_max'],
        ['tmos_close_1day_w10_min', 'tmos_close_1day_w10_max'],
        #w20
        ['close_1day_w20_min', 'close_1day_w20_max'],
        ['tmos_close_1day_w20_min', 'tmos_close_1day_w20_max'],
        #w30
        ['close_1day_w30_min', 'close_1day_w30_max'],
        ['tmos_close_1day_w30_min', 'tmos_close_1day_w30_max'],
        #w60
        ['close_1day_w60_min', 'close_1day_w60_max'],
        ['tmos_close_1day_w60_min', 'tmos_close_1day_w60_max'],
        #w120
        ['close_1day_w120_min', 'close_1day_w120_max'],
        ['tmos_close_1day_w120_min', 'tmos_close_1day_w120_max'],
]

In [179]:
def uniq_pairs(cols):
    pairs = []
    for i in range(len(cols)-1):
        for j in range(i+1, len(cols)):
            pairs += [(cols[i], cols[j])]
    return pairs

def calc_relative_features(df, groups):
    for group in tqdm(groups):
        if type(group) == list:
            pairs = uniq_pairs(group)
            for pair in pairs:
                new_col = f'{pair[0]}/{pair[1]}'
                df[new_col] = df[pair[0]] / (df[pair[1]] + np.finfo(np.float64).eps)

        if type(group) == dict:
            pair1 = list(group.keys())[0]
            for pair0 in group[pair1]:
                new_col = f'{pair0}/{pair1}'
                df[new_col] = df[pair0] / (df[pair1] + np.finfo(np.float64).eps)
                

    return df

In [181]:
import warnings
warnings.filterwarnings('ignore')

In [185]:
print(df.shape)
calc_relative_features(df, groups_1hour)
df.shape

(588969, 294)


100%|█████████████████████████████████████████████████████████████████████████| 32/32 [00:00<00:00, 86.73it/s]


(588969, 450)

In [188]:
print(df.shape)
calc_relative_features(df, groups_1day)
df.shape

(588969, 450)


100%|█████████████████████████████████████████████████████████████████████████| 32/32 [00:00<00:00, 66.02it/s]


(588969, 606)

In [192]:
for col in tqdm(df.columns):
    assert df[col].isnull().sum() == 0, f'Nulls {col}'

100%|█████████████████████████████████████████████████████████████████████| 606/606 [00:00<00:00, 1415.33it/s]


### Delete absolute valu columns

In [196]:
len(cols_del_1hour), len(cols_del_1day)

(84, 84)

In [198]:
print(df.shape)
for col in cols_del_1hour+cols_del_1day:
    del df[col]
df.shape

(588969, 606)


(588969, 438)

In [202]:
# pd.set_option('display.max_rows', 2000)
# df.dtypes

## 2.6 Save data

In [206]:
!mkdir data/feat_engin/lgbm

mkdir: data/feat_engin/lgbm: File exists


In [208]:
#save
dump_pkl(df, 'data/feat_engin/lgbm/data_1hour_1day.pkl')

In [210]:
df.shape

(588969, 438)

In [212]:
df.head()

Unnamed: 0,time,close,volume,ticker,tmos_close,index_1day,close_w5_alpha,close_w5_rsi,tmos_close_w5_alpha,tmos_close_w5_rsi,...,close_1day_w10_min/close_1day_w10_max,tmos_close_1day_w10_min/tmos_close_1day_w10_max,close_1day_w20_min/close_1day_w20_max,tmos_close_1day_w20_min/tmos_close_1day_w20_max,close_1day_w30_min/close_1day_w30_max,tmos_close_1day_w30_min/tmos_close_1day_w30_max,close_1day_w60_min/close_1day_w60_max,tmos_close_1day_w60_min/tmos_close_1day_w60_max,close_1day_w120_min/close_1day_w120_max,tmos_close_1day_w120_min/tmos_close_1day_w120_max
0,2022-12-21 09:00:00,60.36,63.0,ABIO,4.05,122.0,-9.985207,53.658537,134.615385,100.0,...,0.919266,0.970588,0.909531,0.965854,0.892783,0.956522,0.724484,0.835749,0.577882,0.765101
1,2022-12-21 10:00:00,60.74,37821.0,ABIO,4.06,122.0,5.555556,57.777778,117.647059,100.0,...,0.919266,0.970588,0.909531,0.965854,0.892783,0.956522,0.724484,0.835749,0.577882,0.765101
2,2022-12-21 11:00:00,60.6,11712.0,ABIO,4.06,122.0,5.842558,65.0,117.647059,100.0,...,0.919266,0.970588,0.909531,0.965854,0.892783,0.956522,0.724484,0.835749,0.577882,0.765101
3,2022-12-21 12:00:00,60.96,3129.0,ABIO,4.06,122.0,5.261738,74.074074,156.25,100.0,...,0.919266,0.970588,0.909531,0.965854,0.892783,0.956522,0.724484,0.835749,0.577882,0.765101
4,2022-12-21 13:00:00,60.8,4074.0,ABIO,4.06,122.0,5.379499,72.727273,250.0,100.0,...,0.919266,0.970588,0.909531,0.965854,0.892783,0.956522,0.724484,0.835749,0.577882,0.765101


In [214]:
df.columns.tolist()

['time',
 'close',
 'volume',
 'ticker',
 'tmos_close',
 'index_1day',
 'close_w5_alpha',
 'close_w5_rsi',
 'tmos_close_w5_alpha',
 'tmos_close_w5_rsi',
 'close_w10_alpha',
 'close_w10_rsi',
 'tmos_close_w10_alpha',
 'tmos_close_w10_rsi',
 'close_w20_alpha',
 'close_w20_rsi',
 'tmos_close_w20_alpha',
 'tmos_close_w20_rsi',
 'close_w30_alpha',
 'close_w30_rsi',
 'tmos_close_w30_alpha',
 'tmos_close_w30_rsi',
 'close_w30_lvl_1-1.03',
 'close_w30_lvl_-0.97-1',
 'close_w30_lvl_1.03-1.05',
 'close_w30_lvl_-0.95-0.97',
 'close_w30_lvl_1.05-1.07',
 'close_w30_lvl_-0.9299999999999999-0.95',
 'close_w30_lvl_1.07-1.1',
 'close_w30_lvl_-0.9-0.9299999999999999',
 'tmos_close_w30_lvl_1-1.03',
 'tmos_close_w30_lvl_-0.97-1',
 'tmos_close_w30_lvl_1.03-1.05',
 'tmos_close_w30_lvl_-0.95-0.97',
 'tmos_close_w30_lvl_1.05-1.07',
 'tmos_close_w30_lvl_-0.9299999999999999-0.95',
 'tmos_close_w30_lvl_1.07-1.1',
 'tmos_close_w30_lvl_-0.9-0.9299999999999999',
 'close_w60_alpha',
 'close_w60_rsi',
 'tmos_close_w6

In [351]:
no_features = [
 'time',
 'close',
 'tmos_close',
 'ticker',

 'index_1day',
 'date_day_index',
    
 'time_1day',
 'close_1day',
 'tmos_close_1day',]


### 2.7 Save data to NN

In [330]:
#don't use NN

In [333]:
# df_fe.head()

In [335]:
# df_1day_fe.head()

In [337]:
# df_fe.shape, df_1day_fe.shape

In [339]:
# dump_pkl(df_fe, 'data/feat_engin/lgbm/data_1hour.pkl')
# dump_pkl(df_1day_fe, 'data/feat_engin/lgbm/data_1day.pkl')

# TMP

In [343]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import random
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import os

from sklearn.metrics import roc_auc_score

import pickle
def dump_pkl(data, filename):
  with open(filename, 'wb') as handle:
    pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)

def load_pkl(filename):
  with open(filename, 'rb') as handle:
    data = pickle.load(handle)
  return data

In [345]:
df = load_pkl('data/feat_engin/lgbm/data_1hour_1day.pkl')

In [347]:
df

Unnamed: 0,time,close,volume,ticker,tmos_close,index_1day,close_w5_rsi,tmos_close_w5_rsi,close_w10_rsi,tmos_close_w10_rsi,...,close_1day_w10_min/close_1day_w10_max,tmos_close_1day_w10_min/tmos_close_1day_w10_max,close_1day_w20_min/close_1day_w20_max,tmos_close_1day_w20_min/tmos_close_1day_w20_max,close_1day_w30_min/close_1day_w30_max,tmos_close_1day_w30_min/tmos_close_1day_w30_max,close_1day_w60_min/close_1day_w60_max,tmos_close_1day_w60_min/tmos_close_1day_w60_max,close_1day_w120_min/close_1day_w120_max,tmos_close_1day_w120_min/tmos_close_1day_w120_max
0,2022-12-21 09:00:00,60.36,63.0,ABIO,4.05,122.0,53.658537,100.000000,49.586777,100.000000,...,0.919266,0.970588,0.909531,0.965854,0.892783,0.956522,0.724484,0.835749,0.577882,0.765101
1,2022-12-21 10:00:00,60.74,37821.0,ABIO,4.06,122.0,57.777778,100.000000,67.521368,100.000000,...,0.919266,0.970588,0.909531,0.965854,0.892783,0.956522,0.724484,0.835749,0.577882,0.765101
2,2022-12-21 11:00:00,60.60,11712.0,ABIO,4.06,122.0,65.000000,100.000000,61.206897,100.000000,...,0.919266,0.970588,0.909531,0.965854,0.892783,0.956522,0.724484,0.835749,0.577882,0.765101
3,2022-12-21 12:00:00,60.96,3129.0,ABIO,4.06,122.0,74.074074,100.000000,63.414634,100.000000,...,0.919266,0.970588,0.909531,0.965854,0.892783,0.956522,0.724484,0.835749,0.577882,0.765101
4,2022-12-21 13:00:00,60.80,4074.0,ABIO,4.06,122.0,72.727273,100.000000,52.678571,100.000000,...,0.919266,0.970588,0.909531,0.965854,0.892783,0.956522,0.724484,0.835749,0.577882,0.765101
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
588964,2025-02-24 19:00:00,4645.50,26140.0,YDEX,7.25,50953.0,45.789474,71.428571,38.526912,54.545455,...,0.895519,0.886921,0.837757,0.862398,0.837757,0.847411,0.681886,0.711172,0.681886,0.711172
588965,2025-02-24 20:00:00,4658.00,19307.0,YDEX,7.27,50953.0,58.031088,100.000000,51.935484,58.333333,...,0.895519,0.886921,0.837757,0.862398,0.837757,0.847411,0.681886,0.711172,0.681886,0.711172
588966,2025-02-24 21:00:00,4660.00,27796.0,YDEX,7.29,50953.0,80.000000,100.000000,48.972603,75.000000,...,0.895519,0.886921,0.837757,0.862398,0.837757,0.847411,0.681886,0.711172,0.681886,0.711172
588967,2025-02-24 22:00:00,4666.00,13530.0,YDEX,7.29,50953.0,77.862595,100.000000,52.188552,75.000000,...,0.895519,0.886921,0.837757,0.862398,0.837757,0.847411,0.681886,0.711172,0.681886,0.711172


#### uniq_1

In [362]:
no_analyze = [
'time',
 # 'close',
 # 'tmos_close',
 'ticker',

 # 'index_1day',
 'date_day_index',
    
  'time_1day',
 # 'close_1day',
 # 'tmos_close_1day',
]

In [365]:
# df_tmp = df[df.columns[~df.columns.isin(no_analyze)]].mean().reset_index()

df_tmp = []
for col in tqdm(df.columns[~df.columns.isin(no_analyze)]):
    df_tmp += [df[col].mean()]

df_tmp = pd.DataFrame({'index' : df.columns[~df.columns.isin(no_analyze)],
                      0 : df_tmp})

df_tmp

100%|█████████████████████████████████████████████████████████████████████| 350/350 [00:00<00:00, 2042.15it/s]


Unnamed: 0,index,0
0,close,1324.305797
1,volume,123089.619494
2,tmos_close,5.927450
3,index_1day,25593.817829
4,close_w5_rsi,50.193852
...,...,...
345,tmos_close_1day_w30_min/tmos_close_1day_w30_max,0.910564
346,close_1day_w60_min/close_1day_w60_max,0.776903
347,tmos_close_1day_w60_min/tmos_close_1day_w60_max,0.867584
348,close_1day_w120_min/close_1day_w120_max,0.673623


In [367]:
df_tmp[df_tmp[0].isnull()]

Unnamed: 0,index,0


In [369]:
df_tmp.groupby(0).index.unique()[df_tmp.groupby(0).index.nunique()>1]

Series([], Name: index, dtype: object)

In [371]:
df_tmp.groupby(0).index.unique()[df_tmp.groupby(0).index.nunique()>1].values

array([], dtype=object)

#### uniq_2

In [377]:
#df_tmp = df[df.columns[~df.columns.isin(no_analyze)]].nunique().reset_index()

df_tmp = []
for col in tqdm(df.columns[~df.columns.isin(no_analyze)]):
    df_tmp += [df[col].nunique()]

df_tmp = pd.DataFrame({'index' : df.columns[~df.columns.isin(no_analyze)],
                      0 : df_tmp})

df_tmp

100%|██████████████████████████████████████████████████████████████████████| 350/350 [00:01<00:00, 306.74it/s]


Unnamed: 0,index,0
0,close,98415
1,volume,178047
2,tmos_close,325
3,index_1day,41719
4,close_w5_rsi,201828
...,...,...
345,tmos_close_1day_w30_min/tmos_close_1day_w30_max,308
346,close_1day_w60_min/close_1day_w60_max,12597
347,tmos_close_1day_w60_min/tmos_close_1day_w60_max,299
348,close_1day_w120_min/close_1day_w120_max,10119


In [379]:
df_tmp[df_tmp[0].isnull()]

Unnamed: 0,index,0


In [381]:
pd.set_option('display.max_rows', 120)
df_tmp.groupby(0).index.unique()[df_tmp.groupby(0).index.nunique()>1].reset_index()

Unnamed: 0,0,index
0,18,"[tmos_close_w30_lvl_1.07-1.1, tmos_close_1day_..."
1,19,"[tmos_close_w30_lvl_1.05-1.07, tmos_close_1day..."
2,21,"[tmos_close_1day_w30_lvl_1.03-1.05, tmos_close..."
3,22,"[tmos_close_w30_lvl_-0.9299999999999999-0.95, ..."
4,24,"[close_1day_w30_lvl_1.07-1.1, tmos_close_1day_..."
5,25,"[close_1day_w30_lvl_1.03-1.05, close_1day_w30_..."
6,27,"[tmos_close_w30_lvl_1.03-1.05, tmos_close_1day..."
7,30,"[close_w30_lvl_1-1.03, close_w30_lvl_-0.97-1, ..."
8,48,"[tmos_close_1day_w120_lvl_-0.95-0.97, tmos_clo..."
9,57,"[close_1day_w120_lvl_1.05-1.07, tmos_close_1da..."


In [386]:
i = 25
df_tmp.groupby(0).index.unique()[df_tmp.groupby(0).index.nunique()>1].index[i], df_tmp.groupby(0).index.unique()[df_tmp.groupby(0).index.nunique()>1].iloc[i]

(36769,
 array(['tmos_close/tmos_close_w60_expma',
        'tmos_close_w5_expma/tmos_close_w60_expma',
        'tmos_close_w10_expma/tmos_close_w60_expma',
        'tmos_close_w20_expma/tmos_close_w60_expma',
        'tmos_close_w30_expma/tmos_close_w60_expma'], dtype=object))

In [389]:
np.sort(df['tmos_close/tmos_close_w60_expma'].unique())

array([0.93309498, 0.93582123, 0.93691547, ..., 1.08817353, 1.0909974 ,
       1.0922795 ])