In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import random
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import os

from sklearn.metrics import roc_auc_score

import pickle
def dump_pkl(data, filename):
    """Serialize ``data`` into the file ``filename`` using pickle's highest protocol."""
    with open(filename, 'wb') as out_file:
        pickle.dump(data, out_file, protocol=pickle.HIGHEST_PROTOCOL)

def load_pkl(filename):
    """Return the object stored in the pickle file ``filename``.

    Unpickling can execute arbitrary code, so only call this on files that
    were produced by this project.
    """
    with open(filename, 'rb') as in_file:
        return pickle.load(in_file)

In [3]:
# Tickers processed by this notebook, in alphabetical order.
stocks = [
    'ABIO', 'AFKS', 'AFLT', 'ALRS', 'APTK', 'AQUA', 'BANE', 'BANEP',
    'BELU', 'BSPB', 'CBOM', 'CHMF', 'ENPG', 'FEES', 'FESH', 'FLOT',
    'GAZP', 'GMKN', 'GTRK', 'HEAD', 'HYDR', 'IRAO', 'IRKT', 'KMAZ',
    'LENT', 'LIFE', 'LKOH', 'LSRG', 'MAGN', 'MDMG', 'MGNT', 'MOEX',
    'MTLR', 'MTLRP', 'MTSS', 'MVID', 'NLMK', 'NMTP', 'NVTK', 'OGKB',
    'PHOR', 'PIKK', 'PLZL', 'POSI', 'RASP', 'RENI', 'RNFT', 'ROSN',
    'RTKM', 'RTKMP', 'RUAL', 'SBER', 'SBERP', 'SELG', 'SFIN', 'SGZH',
    'SIBN', 'SMLT', 'SNGS', 'SNGSP', 'SPBE', 'SVAV', 'T', 'TATN',
    'TATNP', 'TGKN', 'TRMK', 'TRNFP', 'UNAC', 'UPRO', 'UWGN', 'VKCO',
    'VSMO', 'VTBR', 'WUSH', 'YDEX',
    # 'TMOS',  (kept commented out of the list)
]

len(stocks)

76

### 2.5 Feature engineering

In [10]:
from sklearn.linear_model import LinearRegression

def calculate_exp_ma(data, window):
    """Exponentially weighted moving average over a fixed rolling window.

    Args:
        data: pandas Series of values.
        window: window length; also used as ``min_periods``, so the first
            ``window - 1`` outputs are NaN.

    Returns:
        numpy array with one value per element of ``data``.
    """
    alpha = 2 / (window + 1)
    # Weights from oldest to newest: alpha * (1 - alpha)**k, k = window-1 .. 0.
    coeffs = ((1 - alpha)**(np.arange(window)[::-1])) * (alpha)
    # The oldest observation keeps the whole remaining mass (no alpha factor),
    # so the weights sum to 1.
    coeffs[0] /= alpha

    # raw=True hands each window to the lambda as an ndarray instead of building
    # a Series per window; the arithmetic is the same, only much faster.
    weighted = data.rolling(window=window, min_periods=window).apply(
        lambda x: (x * coeffs).sum(), raw=True
    )
    return weighted.values


def calculate_bollinger_bands(data, window):
    """Rolling mean/std of ``data`` plus the 2-sigma and 3-sigma Bollinger bands.

    Returns a 7-tuple of numpy arrays:
    (mean, std, std / (mean + eps), mean - 2*std, mean + 2*std,
    mean - 3*std, mean + 3*std). The first ``window - 1`` entries are NaN.
    """
    roller = data.rolling(window=window, min_periods=window)
    center = roller.mean().values
    spread = roller.std().values
    # float32 epsilon guards against division by an exactly-zero mean.
    relative_spread = spread / (center + np.finfo(np.float32).eps)

    def band(num_of_std):
        width = spread * num_of_std
        return center - width, center + width

    low_2, high_2 = band(2)
    low_3, high_3 = band(3)
    return center, spread, relative_spread, low_2, high_2, low_3, high_3

def calculate_rsi(data, window):
    """Relative Strength Index based on simple rolling means of gains and losses.

    Windows whose average loss is exactly zero are assigned an RSI of 100
    (this includes completely flat windows). Returns a numpy array; the leading
    entries without a full window are NaN.
    """
    step = data.diff()
    ups = step.clip(lower=0)
    downs = -step.clip(upper=0)

    mean_up = ups.rolling(window=window, min_periods=window).mean()
    mean_down = downs.rolling(window=window, min_periods=window).mean()

    strength = mean_up / mean_down
    rsi = 100 - (100 / (1 + strength))

    # No losses in the window: pin the index to its upper bound.
    rsi = rsi.mask(mean_down == 0, 100)
    return rsi.values

def calculate_roc(data, periods):
    """Rate of change of ``data`` relative to its value ``periods`` steps earlier.

    Returns a pandas Series; the first ``periods`` entries are NaN.
    """
    lagged = data.shift(periods)
    # float32 epsilon keeps the denominator away from an exact zero.
    return (data - lagged) / (lagged + np.finfo(np.float32).eps)



def calc_stats(data, window=None, feat_name=None):
    """Compute rolling-window statistics of one series as a float32 DataFrame.

    Args:
        data: pandas Series with the raw values.
        window: rolling window length (also used as ``min_periods``).
        feat_name: prefix for every produced column name.

    Returns:
        DataFrame with a fresh RangeIndex and the columns
        ``{feat_name}_ma, _std, _norm_std, _ma_low_2std, _ma_up_2std,
        _ma_low_3std, _ma_up_3std, _mean_abs_pct, _alpha, _min, _max, _rsi,
        _roc, _expma``. Rows without a full window are NaN.
    """
    # mean, std and Bollinger bands
    (rolling_mean, rolling_std, norm_rolling_std,
     lower_band_2std, upper_band_2std,
     lower_band_3std, upper_band_3std) = calculate_bollinger_bands(data, window)

    # Mean absolute one-step rate of change inside the window.
    # raw=True passes plain ndarrays to the lambda, which avoids building a
    # Series for every window (the dominant cost of this function).
    mean_abs_pct = (
        calculate_roc(data, 1)
        .rolling(window=window, min_periods=window)
        .apply(lambda x: np.abs(x).mean(), raw=True)
        .values
    )

    # Linear-regression slope over the window.
    # NOTE(review): the model is fit with the values as X and the time index as
    # y, i.e. the slope is d(index)/d(value), not the trend of the values over
    # time. Kept as is so that existing features do not change -- confirm intent.
    alpha = (
        data.rolling(window=window, min_periods=window)
        .apply(
            lambda x: LinearRegression().fit(x.reshape(-1, 1), np.arange(x.shape[0])).coef_[0],
            raw=True,
        )
        .values
    )

    # min, max
    rolling_min = data.rolling(window=window, min_periods=window).min().values
    rolling_max = data.rolling(window=window, min_periods=window).max().values

    # rsi
    rsi = calculate_rsi(data, window)

    # rate of change over the whole window
    roc = calculate_roc(data, window).values

    # exponentially weighted moving average
    exp_ma = calculate_exp_ma(data, window)

    df_features = pd.DataFrame({
        f'{feat_name}_ma': rolling_mean,
        f'{feat_name}_std': rolling_std,
        f'{feat_name}_norm_std': norm_rolling_std,
        f'{feat_name}_ma_low_2std': lower_band_2std,
        f'{feat_name}_ma_up_2std': upper_band_2std,
        f'{feat_name}_ma_low_3std': lower_band_3std,
        f'{feat_name}_ma_up_3std': upper_band_3std,

        f'{feat_name}_mean_abs_pct': mean_abs_pct,

        f'{feat_name}_alpha': alpha,

        f'{feat_name}_min': rolling_min,
        f'{feat_name}_max': rolling_max,
        f'{feat_name}_rsi': rsi,
        f'{feat_name}_roc': roc,
        f'{feat_name}_expma': exp_ma,
    }).astype(np.float32)
    return df_features


def calc_stats_diff_1(data, feat_name=None):
    """Return a one-column float32 DataFrame with the one-step rate of change.

    The column is named ``{feat_name}_roc`` and the frame has a fresh RangeIndex.
    """
    one_step_roc = calculate_roc(data, 1).values
    features = pd.DataFrame({f'{feat_name}_roc': one_step_roc})
    return features.astype(np.float32)

def calc_levels(data, window=None, levels=None, feat_name=None):
    """Count, per rolling window, how many values fall into bands around the last value.

    For each consecutive pair ``(low, high)`` of ``levels`` two counts are
    produced, both relative to the last value ``p`` of the window:

    * above: values ``v`` with ``(1 + low) * p <  v <= (1 + high) * p``
    * below: values ``v`` with ``(1 - high) * p <= v <  (1 - low) * p``

    Args:
        data: pandas Series of values.
        window: rolling window length (also used as ``min_periods``).
        levels: increasing sequence of relative offsets, e.g. ``[0, 0.005, 0.01]``.
        feat_name: prefix for the produced column names.

    Returns:
        float32 DataFrame with a fresh RangeIndex and two columns per band,
        named ``{feat_name}_lvl_{1+low}-{1+high}`` and
        ``{feat_name}_lvl_-{1-high}-{1-low}``.
    """
    rolling = data.rolling(window=window, min_periods=window)

    # price levels
    columns = {}
    for level_low, level_high in zip(levels[:-1], levels[1:]):
        # raw=True: the lambdas receive ndarrays, so no Series is built per window.
        above = rolling.apply(
            lambda x: (((1 + level_low) * x[-1] < x) & (x <= (1 + level_high) * x[-1])).sum(),
            raw=True,
        ).values
        below = rolling.apply(
            lambda x: (((1 - level_high) * x[-1] <= x) & (x < (1 - level_low) * x[-1])).sum(),
            raw=True,
        ).values

        columns[f"{feat_name}_lvl_{1+level_low}-{1+level_high}"] = above
        columns[f"{feat_name}_lvl_-{1-level_high}-{1-level_low}"] = below

    return pd.DataFrame(columns).astype(np.float32)


In [13]:
def calculate_features(df_ticker, postfix=None):
    """Build the full feature frame for one ticker.

    For the ``close``, ``volume`` and ``tmos_close`` columns (each suffixed with
    ``postfix``) this appends, in order:

    * the one-step rate of change (``_w1``),
    * ``calc_stats`` features for windows 5, 10, 20, 30, 60 and 120,
    * ``calc_levels`` features of ``close`` and ``tmos_close`` for windows 30
      and 120 (placed right after that window's statistics).

    Args:
        df_ticker: DataFrame containing the three source columns.
        postfix: suffix of the source column names; ``None`` is treated as ``''``.

    Returns:
        New DataFrame: the columns of ``df_ticker`` followed by all feature
        columns, with the same rows and index as ``df_ticker``.

    Raises:
        AssertionError: if a feature frame or the result has an unexpected shape.
    """
    postfix = '' if postfix is None else postfix

    base_names = ('close', 'volume', 'tmos_close')
    windows = (5, 10, 20, 30, 60, 120)
    level_windows = (30, 120)

    levels =      [0, 0.005, 0.01, 0.015, 0.02, 0.03]
    levels_tmos = [0, 0.005, 0.01, 0.015, 0.02, 0.03]
    levels_by_name = (('close', levels), ('tmos_close', levels_tmos))

    n_rows = df_ticker.shape[0]

    def _aligned(features, label):
        # Feature frames are built from raw arrays and carry a RangeIndex; give
        # them the ticker's index so the column-wise concat matches rows by
        # position even when df_ticker does not have a default index.
        assert features.shape[0] == n_rows, f'Error {label}'
        features.index = df_ticker.index
        return features

    dfs = [df_ticker]

    # w1: one-step rate of change
    for name in base_names:
        col = f'{name}{postfix}'
        dfs.append(_aligned(calc_stats_diff_1(df_ticker[col], feat_name=f'{col}_w1'), 'w1'))

    # rolling statistics (and level counts for selected windows)
    for window in windows:
        label = f'w{window}'
        for name in base_names:
            col = f'{name}{postfix}'
            dfs.append(_aligned(
                calc_stats(df_ticker[col], window=window, feat_name=f'{col}_w{window}'),
                label,
            ))
        if window in level_windows:
            for name, name_levels in levels_by_name:
                col = f'{name}{postfix}'
                dfs.append(_aligned(
                    calc_levels(df_ticker[col], window=window, levels=name_levels,
                                feat_name=f'{col}_w{window}'),
                    label,
                ))

    df = pd.concat(dfs, axis=1)
    assert (df_ticker.shape[0] == df.shape[0]) and (df.shape[1] == sum([elem.shape[1] for elem in dfs])), 'Error concat'

    return df


In [None]:
# test

In [15]:
%%time
# Smoke test: run the whole feature pipeline for a single ticker and time it.
ticker = 'YDEX'

df_ticker = load_pkl(f'./data/preproc/1min/{ticker}.pkl')
print(df_ticker.shape)
# Cast the three source columns to float32 (same casts as in parallel_worker).
df_ticker['close'] = df_ticker['close'].astype(np.float32)
df_ticker['volume'] = df_ticker['volume'].astype(np.float32)
df_ticker['tmos_close'] = df_ticker['tmos_close'].astype(np.float32)


# postfix='' means the source columns are named exactly close / volume / tmos_close.
df_ticker_fe = calculate_features(df_ticker, postfix='')

(321785, 5)
CPU times: user 16min 44s, sys: 17.5 s, total: 17min 2s
Wall time: 16min 47s


In [87]:
df_ticker_fe

Unnamed: 0,time,close,volume,ticker,tmos_close,close_w1_roc,volume_w1_roc,tmos_close_w1_roc,close_w5_ma,close_w5_std,...,tmos_close_w120_lvl_1-1.005,tmos_close_w120_lvl_-0.995-1,tmos_close_w120_lvl_1.005-1.01,tmos_close_w120_lvl_-0.99-0.995,tmos_close_w120_lvl_1.01-1.015,tmos_close_w120_lvl_-0.985-0.99,tmos_close_w120_lvl_1.015-1.02,tmos_close_w120_lvl_-0.98-0.985,tmos_close_w120_lvl_1.02-1.03,tmos_close_w120_lvl_-0.97-0.98
0,2023-07-03 09:59:00,2456.5,870.0,YDEX,5.50,,,,,,...,,,,,,,,,,
1,2023-07-03 10:00:00,2449.0,2376.0,YDEX,5.51,-0.003053,1.731035,0.001818,,,...,,,,,,,,,,
2,2023-07-03 10:01:00,2448.0,699.0,YDEX,5.51,-0.000408,-0.705808,0.000000,,,...,,,,,,,,,,
3,2023-07-03 10:02:00,2445.5,724.0,YDEX,5.51,-0.001021,0.035765,0.000000,,,...,,,,,,,,,,
4,2023-07-03 10:03:00,2444.0,730.0,YDEX,5.51,-0.000613,0.008287,0.000000,2448.600098,4.839938,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
321780,2025-02-24 23:45:00,4664.0,306.0,YDEX,7.33,-0.000107,-0.539850,0.000000,4667.700195,3.154362,...,0.0,29.0,0.0,89.0,0.0,0.0,0.0,0.0,0.0,0.0
321781,2025-02-24 23:46:00,4664.0,318.0,YDEX,7.33,0.000000,0.039216,0.000000,4666.500000,3.201562,...,0.0,29.0,0.0,88.0,0.0,0.0,0.0,0.0,0.0,0.0
321782,2025-02-24 23:47:00,4664.0,559.0,YDEX,7.33,0.000000,0.757862,0.000000,4665.299805,2.636285,...,0.0,29.0,0.0,87.0,0.0,0.0,0.0,0.0,0.0,0.0
321783,2025-02-24 23:48:00,4663.0,68.0,YDEX,7.32,-0.000214,-0.878354,-0.001364,4663.899902,0.547723,...,4.0,42.0,0.0,65.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
from joblib import Parallel, delayed
import multiprocessing

def parallel_worker(ticker):
    """Compute and cache the feature frame of one ticker.

    Loads ``./data/preproc/1min/{ticker}.pkl``, casts the ``close``, ``volume``
    and ``tmos_close`` columns to float32, runs ``calculate_features`` and
    writes the result to ``tmp/{ticker}.pkl``.

    Args:
        ticker: ticker symbol used to build the input and output file names.

    Returns:
        None. The result is only written to disk.
    """
    import os  # local import keeps the worker self-contained in subprocesses

    # Create the output directory before the expensive computation so that a
    # missing folder cannot make the job fail at the very end.
    os.makedirs('tmp', exist_ok=True)

    df_ticker = load_pkl(f'./data/preproc/1min/{ticker}.pkl')
    for col in ('close', 'volume', 'tmos_close'):
        df_ticker[col] = df_ticker[col].astype(np.float32)

    df_ticker_fe = calculate_features(df_ticker, postfix='')

    dump_pkl(df_ticker_fe, f'tmp/{ticker}.pkl')
    


# Number of joblib worker processes used for feature computation.
num_cores = 6
# One task per ticker. parallel_worker has no return statement, so `res` is a
# list of None; the actual results are the pickles it writes to tmp/.
# tqdm wraps the task generator, so the bar advances as tasks are handed out.
res = Parallel(n_jobs=num_cores)(delayed(parallel_worker)(ticker) for ticker in tqdm(stocks))

100%|██████████████████████████████████████████████████████████████████████| 76/76 [4:38:51<00:00, 220.15s/it]


In [92]:
# Sanity check: reload one cached result written by parallel_worker and display it.
df_tmp = load_pkl(f'tmp/YDEX.pkl')
df_tmp

Unnamed: 0,time,close,volume,ticker,tmos_close,close_w1_roc,volume_w1_roc,tmos_close_w1_roc,close_w5_ma,close_w5_std,...,tmos_close_w120_lvl_1-1.005,tmos_close_w120_lvl_-0.995-1,tmos_close_w120_lvl_1.005-1.01,tmos_close_w120_lvl_-0.99-0.995,tmos_close_w120_lvl_1.01-1.015,tmos_close_w120_lvl_-0.985-0.99,tmos_close_w120_lvl_1.015-1.02,tmos_close_w120_lvl_-0.98-0.985,tmos_close_w120_lvl_1.02-1.03,tmos_close_w120_lvl_-0.97-0.98
0,2023-07-03 09:59:00,2456.5,870.0,YDEX,5.50,,,,,,...,,,,,,,,,,
1,2023-07-03 10:00:00,2449.0,2376.0,YDEX,5.51,-0.003053,1.731035,0.001818,,,...,,,,,,,,,,
2,2023-07-03 10:01:00,2448.0,699.0,YDEX,5.51,-0.000408,-0.705808,0.000000,,,...,,,,,,,,,,
3,2023-07-03 10:02:00,2445.5,724.0,YDEX,5.51,-0.001021,0.035765,0.000000,,,...,,,,,,,,,,
4,2023-07-03 10:03:00,2444.0,730.0,YDEX,5.51,-0.000613,0.008287,0.000000,2448.600098,4.839938,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
321780,2025-02-24 23:45:00,4664.0,306.0,YDEX,7.33,-0.000107,-0.539850,0.000000,4667.700195,3.154362,...,0.0,29.0,0.0,89.0,0.0,0.0,0.0,0.0,0.0,0.0
321781,2025-02-24 23:46:00,4664.0,318.0,YDEX,7.33,0.000000,0.039216,0.000000,4666.500000,3.201562,...,0.0,29.0,0.0,88.0,0.0,0.0,0.0,0.0,0.0,0.0
321782,2025-02-24 23:47:00,4664.0,559.0,YDEX,7.33,0.000000,0.757862,0.000000,4665.299805,2.636285,...,0.0,29.0,0.0,87.0,0.0,0.0,0.0,0.0,0.0,0.0
321783,2025-02-24 23:48:00,4663.0,68.0,YDEX,7.32,-0.000214,-0.878354,-0.001364,4663.899902,0.547723,...,4.0,42.0,0.0,65.0,0.0,0.0,0.0,0.0,0.0,0.0


In [68]:
77*0.34
# 26 million 5-minute data points

26.180000000000003

#### Subsample

In [7]:
# Fraction of the eligible rows of every ticker that is kept.
coeff_subsample = 0.2

# 0-based position, among a ticker's sorted distinct dates, of the first date
# whose rows are eligible; all rows of earlier dates are excluded.
NEED_POINTS = 1

dfs = []
for ticker in stocks:
    df_ticker = load_pkl(f'tmp/{ticker}.pkl')
    df_ticker.reset_index(inplace=True) # keeps the original row position as an 'index' column so the target can be attached later
    print(ticker, df_ticker.shape[0], '->', end=' ')
    
    #1. rows that are available: everything from the NEED_POINTS-th distinct date on
    count_days = df_ticker['time'].dt.date.nunique()
    # Default: nothing is available (used when the ticker has too few distinct dates).
    mask_avbl = np.zeros(df_ticker.shape[0]).astype(bool)
    if count_days > NEED_POINTS:
        date_first_avbl =  np.sort(df_ticker['time'].dt.date.unique())[NEED_POINTS]
        #print(ticker, date_first_avbl)
        mask_avbl = np.array(df_ticker["time"] >= pd.to_datetime(date_first_avbl))


    #2. subsample: random subset of the available row positions
    inds = np.arange(df_ticker.shape[0]).astype(int)[mask_avbl]
    # A new RandomState with the same seed is created for every ticker, so the
    # selection is reproducible per ticker.
    inds_subsample = np.random.RandomState(seed=42).permutation(inds)[:int(len(inds)*coeff_subsample)]

    # Boolean mask selection keeps the sampled rows in their original order.
    mask_subsample = np.zeros(df_ticker.shape[0]).astype(bool)
    mask_subsample[inds_subsample] = True
    print(mask_subsample.sum())

    dfs += [df_ticker[mask_subsample].copy()]
print('\nUNION_SHAPE: ', sum([elem.shape[0] for elem in dfs]))

ABIO 199731 -> 39865
AFKS 330901 -> 66023
AFLT 341436 -> 68125
ALRS 342210 -> 68279
APTK 218838 -> 43674
AQUA 271633 -> 54227
BANE 197307 -> 39366
BANEP 255531 -> 51001
BELU 310903 -> 62047
BSPB 291244 -> 58158
CBOM 242240 -> 48323
CHMF 342336 -> 68304
ENPG 290784 -> 58001
FEES 308534 -> 61556
FESH 307027 -> 61274
FLOT 338186 -> 67474
GAZP 346617 -> 69161
GMKN 334268 -> 66695
GTRK 249326 -> 49836
HEAD 219323 -> 43779
HYDR 313201 -> 62493
IRAO 340766 -> 67992
IRKT 218072 -> 43558
KMAZ 222106 -> 44325
LENT 190075 -> 37935
LIFE 171288 -> 34164
LKOH 344826 -> 68803
LSRG 218364 -> 43567
MAGN 343198 -> 68477
MDMG 227111 -> 45324
MGNT 335863 -> 67010
MOEX 340549 -> 67949
MTLR 345346 -> 68911
MTLRP 306883 -> 61266
MTSS 336789 -> 67195
MVID 317640 -> 63380
NLMK 340518 -> 67943
NMTP 259632 -> 51826
NVTK 344774 -> 68793
OGKB 268189 -> 53478
PHOR 314313 -> 62703
PIKK 315359 -> 62924
PLZL 325631 -> 64968
POSI 326907 -> 65238
RASP 230310 -> 45961
RENI 208290 -> 41556
RNFT 309772 -> 61853
ROSN 346767

In [8]:
# Stack the per-ticker subsamples into one frame and replace the concatenated
# per-ticker row labels with a fresh 0..n-1 index (the old labels are dropped).
df = pd.concat(dfs)
df.reset_index(inplace=True, drop=True)

In [11]:
df.shape

(4436580, 301)

In [13]:
df.head()

Unnamed: 0,index,time,close,volume,ticker,tmos_close,close_w1_roc,volume_w1_roc,tmos_close_w1_roc,close_w5_ma,...,tmos_close_w120_lvl_1-1.005,tmos_close_w120_lvl_-0.995-1,tmos_close_w120_lvl_1.005-1.01,tmos_close_w120_lvl_-0.99-0.995,tmos_close_w120_lvl_1.01-1.015,tmos_close_w120_lvl_-0.985-0.99,tmos_close_w120_lvl_1.015-1.02,tmos_close_w120_lvl_-0.98-0.985,tmos_close_w120_lvl_1.02-1.03,tmos_close_w120_lvl_-0.97-0.98
0,409,2023-07-04 10:03:00,105.120003,498.0,ABIO,5.52,-0.00076,0.099338,0.0,105.335999,...,90.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,411,2023-07-04 10:05:00,104.699997,864.0,ABIO,5.51,-0.002667,-0.711712,-0.001812,105.064003,...,119.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,417,2023-07-04 10:11:00,104.599998,438.0,ABIO,5.5,-0.004568,1.354839,-0.001815,104.519997,...,35.0,0.0,83.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,427,2023-07-04 10:22:00,104.599998,24.0,ABIO,5.5,0.0,7.0,0.0,104.580002,...,37.0,0.0,76.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,436,2023-07-04 10:31:00,104.300003,336.0,ABIO,5.49,-0.00134,-0.404255,-0.001818,104.424004,...,25.0,0.0,94.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
pd.set_option('display.max_rows', 77)
df['ticker'].nunique(), df['ticker'].value_counts()

(76,
 ticker
 ROSN     69191
 SBER     69186
 GAZP     69161
 MTLR     68911
 LKOH     68803
 NVTK     68793
 TATN     68790
 VTBR     68498
 MAGN     68477
 CHMF     68304
 ALRS     68279
 AFLT     68125
 IRAO     67992
 MOEX     67949
 NLMK     67943
 FLOT     67474
 SGZH     67356
 RUAL     67317
 MTSS     67195
 SNGSP    67119
 SIBN     67057
 MGNT     67010
 SBERP    67004
 GMKN     66695
 TATNP    66592
 AFKS     66023
 T        65498
 POSI     65238
 SNGS     65219
 SMLT     65085
 PLZL     64968
 VKCO     64481
 RTKM     64265
 YDEX     64195
 MVID     63380
 TRNFP    63174
 PIKK     62924
 PHOR     62703
 SPBE     62599
 HYDR     62493
 BELU     62047
 RNFT     61853
 FEES     61556
 FESH     61274
 MTLRP    61266
 SELG     60081
 UPRO     59563
 TRMK     58349
 BSPB     58158
 ENPG     58001
 WUSH     56614
 UWGN     56471
 AQUA     54227
 OGKB     53478
 RTKMP    52609
 NMTP     51826
 BANEP    51001
 GTRK     49836
 CBOM     48323
 SFIN     48031
 UNAC     46920
 SVAV     4

In [17]:
# Create the output directory; unlike a plain `mkdir` this does not fail when
# the directory already exists, so the cell can be re-run safely.
os.makedirs('./data/feat_engin', exist_ok=True)

mkdir: ./data/feat_engin: File exists


In [19]:
dump_pkl(df, './data/feat_engin/df_fe.pkl')

In [21]:
# Sanity check: no column of the subsampled frame may contain missing values.
for col in tqdm(df.columns):
    assert df[col].isnull().sum() == 0, f'Nulls {col}'

100%|██████████████████████████████████████████████████████████████████████| 301/301 [00:00<00:00, 426.40it/s]


# Load data

In [28]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import random
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import os
import gc

from sklearn.metrics import roc_auc_score

import pickle
def dump_pkl(data, filename):
    """Serialize ``data`` into the file ``filename`` using pickle's highest protocol."""
    with open(filename, 'wb') as out_file:
        pickle.dump(data, out_file, protocol=pickle.HIGHEST_PROTOCOL)

def load_pkl(filename):
    """Return the object stored in the pickle file ``filename``.

    Unpickling can execute arbitrary code, so only call this on files that
    were produced by this project.
    """
    with open(filename, 'rb') as in_file:
        return pickle.load(in_file)

In [30]:
df = load_pkl('./data/feat_engin/df_fe.pkl')

df.shape

(4436580, 301)

In [31]:
df.head()

Unnamed: 0,index,time,close,volume,ticker,tmos_close,close_w1_roc,volume_w1_roc,tmos_close_w1_roc,close_w5_ma,...,tmos_close_w120_lvl_1-1.005,tmos_close_w120_lvl_-0.995-1,tmos_close_w120_lvl_1.005-1.01,tmos_close_w120_lvl_-0.99-0.995,tmos_close_w120_lvl_1.01-1.015,tmos_close_w120_lvl_-0.985-0.99,tmos_close_w120_lvl_1.015-1.02,tmos_close_w120_lvl_-0.98-0.985,tmos_close_w120_lvl_1.02-1.03,tmos_close_w120_lvl_-0.97-0.98
0,409,2023-07-04 10:03:00,105.120003,498.0,ABIO,5.52,-0.00076,0.099338,0.0,105.335999,...,90.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,411,2023-07-04 10:05:00,104.699997,864.0,ABIO,5.51,-0.002667,-0.711712,-0.001812,105.064003,...,119.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,417,2023-07-04 10:11:00,104.599998,438.0,ABIO,5.5,-0.004568,1.354839,-0.001815,104.519997,...,35.0,0.0,83.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,427,2023-07-04 10:22:00,104.599998,24.0,ABIO,5.5,0.0,7.0,0.0,104.580002,...,37.0,0.0,76.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,436,2023-07-04 10:31:00,104.300003,336.0,ABIO,5.49,-0.00134,-0.404255,-0.001818,104.424004,...,25.0,0.0,94.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### time features

In [35]:
# Hour of the timestamp (integer).
df['hour'] = df['time'].dt.hour

# Day of the month (integer).
df['day'] = df['time'].dt.day

# Day of week (pandas: Monday=0 ... Sunday=6), capped at 4 so Saturday/Sunday
# share Friday's value, then scaled to the range [0, 1].
df['weekday'] = np.minimum(df['time'].dt.dayofweek, 4) / 4

# Month feature -- currently disabled.
#df['month'] = df['time'].dt.month


# Disabled experiment: cyclic (sin/cos) encoding of the time of day.
# time_cyclic = (df['time'] - pd.to_datetime(df['time'].dt.date) - pd.Timedelta('10:00:00')) / pd.Timedelta('13:00:00')
# df['sin_time_hour'] = np.sin(time_cyclic * 2 * np.pi)
# df['cos_time_hour'] = np.cos(time_cyclic * 2 * np.pi)

# Disabled experiment: cyclic encoding of the day of week.
# day_of_week_cyclic = np.minimum(df['time'].dt.dayofweek, 4) / 4
# df['sin_time_weekday'] = np.sin(day_of_week_cyclic * 2 * np.pi)
# df['cos_time_weekday'] = np.cos(day_of_week_cyclic * 2 * np.pi)

# Disabled experiment: cyclic encoding of the day of month.
# NOTE(review): refers to df_1day_fe, which is not defined in this part of the notebook.
# day_of_month_cyclic = df_1day_fe['time'].dt.day / 30
# df_1day_fe['sin_time_monthday'] = np.sin(day_of_month_cyclic * 2 * np.pi)
# df_1day_fe['cos_time_monthday'] = np.cos(day_of_month_cyclic * 2 * np.pi)


In [38]:
df.head()

Unnamed: 0,index,time,close,volume,ticker,tmos_close,close_w1_roc,volume_w1_roc,tmos_close_w1_roc,close_w5_ma,...,tmos_close_w120_lvl_-0.99-0.995,tmos_close_w120_lvl_1.01-1.015,tmos_close_w120_lvl_-0.985-0.99,tmos_close_w120_lvl_1.015-1.02,tmos_close_w120_lvl_-0.98-0.985,tmos_close_w120_lvl_1.02-1.03,tmos_close_w120_lvl_-0.97-0.98,hour,day,weekday
0,409,2023-07-04 10:03:00,105.120003,498.0,ABIO,5.52,-0.00076,0.099338,0.0,105.335999,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10,4,0.25
1,411,2023-07-04 10:05:00,104.699997,864.0,ABIO,5.51,-0.002667,-0.711712,-0.001812,105.064003,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10,4,0.25
2,417,2023-07-04 10:11:00,104.599998,438.0,ABIO,5.5,-0.004568,1.354839,-0.001815,104.519997,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10,4,0.25
3,427,2023-07-04 10:22:00,104.599998,24.0,ABIO,5.5,0.0,7.0,0.0,104.580002,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10,4,0.25
4,436,2023-07-04 10:31:00,104.300003,336.0,ABIO,5.49,-0.00134,-0.404255,-0.001818,104.424004,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10,4,0.25


### Absolute value columns

In [43]:
def flag_delete(col_name):
    """Tell whether ``col_name`` names an absolute-value column to be dropped.

    Columns containing ``norm_std`` are always kept. Otherwise the column is
    flagged when its name contains any of the substrings ``ma``, ``std``,
    ``diff``, ``min`` or ``max`` (plain substring match, so e.g. ``expma``
    is flagged as well).
    """
    if 'norm_std' in col_name:
        return False
    return any(token in col_name for token in ('ma', 'std', 'diff', 'min', 'max'))



In [45]:
# Columns flagged by flag_delete. The list is taken from the current columns of
# df, i.e. before the relative (ratio) features are added, so ratio columns are
# not part of it.
cols_del_1min = [elem for elem in df.columns if flag_delete(elem)]
len(cols_del_1min), cols_del_1min

(162,
 ['close_w5_ma',
  'close_w5_std',
  'close_w5_ma_low_2std',
  'close_w5_ma_up_2std',
  'close_w5_ma_low_3std',
  'close_w5_ma_up_3std',
  'close_w5_min',
  'close_w5_max',
  'close_w5_expma',
  'volume_w5_ma',
  'volume_w5_std',
  'volume_w5_ma_low_2std',
  'volume_w5_ma_up_2std',
  'volume_w5_ma_low_3std',
  'volume_w5_ma_up_3std',
  'volume_w5_min',
  'volume_w5_max',
  'volume_w5_expma',
  'tmos_close_w5_ma',
  'tmos_close_w5_std',
  'tmos_close_w5_ma_low_2std',
  'tmos_close_w5_ma_up_2std',
  'tmos_close_w5_ma_low_3std',
  'tmos_close_w5_ma_up_3std',
  'tmos_close_w5_min',
  'tmos_close_w5_max',
  'tmos_close_w5_expma',
  'close_w10_ma',
  'close_w10_std',
  'close_w10_ma_low_2std',
  'close_w10_ma_up_2std',
  'close_w10_ma_low_3std',
  'close_w10_ma_up_3std',
  'close_w10_min',
  'close_w10_max',
  'close_w10_expma',
  'volume_w10_ma',
  'volume_w10_std',
  'volume_w10_ma_low_2std',
  'volume_w10_ma_up_2std',
  'volume_w10_ma_low_3std',
  'volume_w10_ma_up_3std',
  'volume_

### Relative features

In [51]:
def _build_groups_1min():
    """Assemble the column groups consumed by ``calc_relative_features``.

    A list group yields the ratio of every unique pair of its columns; a dict
    group ``{base: [cols]}`` yields ``col / base`` for each listed column.
    Source series: close, volume, tmos_close. Windows: 5, 10, 20, 30, 60, 120
    (plus the w1 rate of change where noted).
    """
    series = ('close', 'volume', 'tmos_close')
    windows = (5, 10, 20, 30, 60, 120)

    def per_window(name, stat):
        return [f'{name}_w{w}_{stat}' for w in windows]

    groups = []

    # w1 rate of change together with the per-window slope / rate of change
    for stat in ('alpha', 'roc'):
        groups += [[f'{name}_w1_roc'] + per_window(name, stat) for name in series]

    # the same statistic compared across windows
    for stat in ('mean_abs_pct', 'std', 'norm_std', 'rsi'):
        groups += [per_window(name, stat) for name in series]

    # raw value together with its moving averages
    for stat in ('ma', 'expma'):
        groups += [[name] + per_window(name, stat) for name in series]

    # rolling extremes relative to the raw value
    for stat in ('min', 'max'):
        groups += [{name: per_window(name, stat)} for name in series]

    # Bollinger bands relative to the raw value, window by window
    band_suffixes = ('ma_low_2std', 'ma_up_2std', 'ma_low_3std', 'ma_up_3std')
    for w in windows:
        groups += [{name: [f'{name}_w{w}_{suffix}' for suffix in band_suffixes]} for name in series]

    # rolling min against rolling max, window by window
    for w in windows:
        groups += [[f'{name}_w{w}_min', f'{name}_w{w}_max'] for name in series]

    return groups


groups_1min = _build_groups_1min()

In [54]:
def uniq_pairs(cols):
    """Return every unordered pair of entries from ``cols`` exactly once.

    Pairs are built as ``(cols[i], cols[j])`` with ``i < j`` and are ordered
    by ``i`` first, then by ``j``. Fewer than two entries gives an empty list.
    """
    count = len(cols)
    return [
        (cols[first], cols[second])
        for first in range(count)
        for second in range(first + 1, count)
    ]

def calc_relative_features(df, groups):
    """Add ratio columns to ``df`` in place and return the same frame.

    Each new column is named ``'<numerator>/<denominator>'`` and holds
    ``df[numerator] / (df[denominator] + eps)`` cast to float32, where
    ``eps`` is the float32 machine epsilon. Because of ``eps`` a zero
    denominator produces a very large ratio (numerator / eps) instead of
    a division by zero.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that already contains every column named in ``groups``.
    groups : iterable
        Each entry is either
        * a ``list`` of column names: one ratio is added for every unique
          pair, earlier name over later name (see ``uniq_pairs``), or
        * a ``dict`` mapping a denominator column to a list of numerator
          columns: one ratio is added per numerator. Every key of the dict
          is processed, not just the first one.
        Entries of any other type are skipped.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the ratio columns added.
    """
    # Loop-invariant: float32 machine epsilon used to guard the denominator.
    eps = np.finfo(np.float32).eps

    def _add_ratio(numerator, denominator):
        # Single place that defines both the column name and the formula.
        df[f'{numerator}/{denominator}'] = (
            df[numerator] / (df[denominator] + eps)
        ).astype(np.float32)

    for group in tqdm(groups):
        if isinstance(group, list):
            for numerator, denominator in uniq_pairs(group):
                _add_ratio(numerator, denominator)
        elif isinstance(group, dict):
            for denominator, numerators in group.items():
                for numerator in numerators:
                    _add_ratio(numerator, denominator)

    return df

In [56]:
import warnings
# Suppress every warning category for the rest of the kernel session.
# NOTE(review): presumably meant to hide warnings raised while many columns are
# added in the next cell -- confirm, and consider filtering one specific category.
warnings.filterwarnings('ignore')

In [59]:
# Print the shape before the call; calc_relative_features adds the ratio columns
# to `df` in place (its return value is not used); the last line shows the new shape.
print(df.shape)
calc_relative_features(df, groups_1min)
df.shape

(4436580, 304)


100%|█████████████████████████████████████████████████████████████████████████| 66/66 [00:08<00:00,  7.83it/s]


(4436580, 862)

### Delete absolute value columns

In [62]:
# Number of column names listed in cols_del_1min (they are removed from df in the next cell).
len(cols_del_1min)

162

In [63]:
# Remove the columns listed in cols_del_1min with a single in-place drop.
# errors='ignore' keeps the cell re-runnable: columns that were already removed
# are skipped instead of raising KeyError, as a second `del df[col]` would.
print(df.shape)
df.drop(columns=cols_del_1min, inplace=True, errors='ignore')
df.shape

(4436580, 862)


(4436580, 700)

In [69]:
# pd.set_option('display.max_rows', 2000)
# df.dtypes

## 2.6 Save data

In [73]:
# Create the output directory (and any missing parents); exist_ok=True makes the
# cell safe to re-run when the directory is already there.
os.makedirs('data/feat_engin/lgbm', exist_ok=True)

mkdir: data/feat_engin/lgbm: File exists


In [75]:
# Persist df to the pickle file that the TMP section reloads with load_pkl.
dump_pkl(df, 'data/feat_engin/lgbm/data_1min.pkl')

In [76]:
# Show the (rows, columns) shape of the frame that was just saved.
df.shape

(4436580, 700)

In [77]:
# Preview the first rows of the saved frame.
df.head()

Unnamed: 0,index,time,close,volume,ticker,tmos_close,close_w1_roc,volume_w1_roc,tmos_close_w1_roc,close_w5_norm_std,...,tmos_close_w20_min/tmos_close_w20_max,close_w30_min/close_w30_max,volume_w30_min/volume_w30_max,tmos_close_w30_min/tmos_close_w30_max,close_w60_min/close_w60_max,volume_w60_min/volume_w60_max,tmos_close_w60_min/tmos_close_w60_max,close_w120_min/close_w120_max,volume_w120_min/volume_w120_max,tmos_close_w120_min/tmos_close_w120_max
0,409,2023-07-04 10:03:00,105.120003,498.0,ABIO,5.52,-0.00076,0.099338,0.0,0.002112,...,0.998192,0.993567,0.002849,0.998192,0.993567,0.002469,0.998192,0.993567,0.000755,0.998192
1,411,2023-07-04 10:05:00,104.699997,864.0,ABIO,5.51,-0.002667,-0.711712,-0.001812,0.002266,...,0.996383,0.990539,0.001001,0.996383,0.990539,0.001001,0.996383,0.990539,0.000755,0.996383
2,417,2023-07-04 10:11:00,104.599998,438.0,ABIO,5.5,-0.004568,1.354839,-0.001815,0.00397,...,0.994575,0.984863,0.001001,0.994575,0.984863,0.001001,0.994575,0.984863,0.000755,0.994575
3,427,2023-07-04 10:22:00,104.599998,24.0,ABIO,5.5,0.0,7.0,0.0,0.000331,...,0.996377,0.984863,0.001001,0.994575,0.984863,0.001001,0.994575,0.984863,0.000755,0.994575
4,436,2023-07-04 10:31:00,104.300003,336.0,ABIO,5.49,-0.00134,-0.404255,-0.001818,0.001112,...,0.99637,0.988416,0.001001,0.992767,0.984863,0.001001,0.992767,0.984863,0.000755,0.992767


In [55]:
#df.columns.tolist()

In [78]:
# Row-identifier and raw-value column names.
# NOTE(review): presumably the columns to exclude from the model feature set -- confirm.
no_features = ['index', 'time', 'close', 'volume', 'tmos_close', 'ticker']


# TMP

In [65]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import random
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import os

from sklearn.metrics import roc_auc_score

import pickle
def dump_pkl(data, filename):
  """Serialize ``data`` into the file ``filename`` using the highest pickle protocol.

  The file is opened in binary write mode, so existing content is replaced.
  """
  with open(filename, 'wb') as out_file:
    pickle.dump(data, out_file, protocol=pickle.HIGHEST_PROTOCOL)

def load_pkl(filename):
  """Open ``filename`` in binary mode and return the object unpickled from it.

  Unpickling can execute arbitrary code, so only load files from a trusted source.
  """
  with open(filename, 'rb') as in_file:
    return pickle.load(in_file)

In [330]:
# Reload the feature frame from the pickle written in section 2.6.
df = load_pkl('data/feat_engin/lgbm/data_1min.pkl')

In [None]:
# Display the reloaded frame.
df

#### uniq_1

In [67]:
# Columns skipped by the per-column scans below. 'close', 'volume' and
# 'tmos_close' are not listed (their entries were commented out), so they are scanned.
no_analyze = ['time', 'ticker']

In [70]:
# Mean of every column not listed in no_analyze, computed one column at a time.
# Vectorised alternative: df[df.columns[~df.columns.isin(no_analyze)]].mean().reset_index()
analyzed_cols = df.columns[~df.columns.isin(no_analyze)]
col_means = [df[col].mean() for col in tqdm(analyzed_cols)]

# Summary frame: column name under 'index', its mean under 0.
df_tmp = pd.DataFrame({'index': analyzed_cols, 0: col_means})

df_tmp

100%|███████████████████████████████████████████████████████████████████████| 697/697 [00:16<00:00, 41.34it/s]


Unnamed: 0,index,0
0,close,1.154597e+03
1,volume,2.742032e+03
2,tmos_close,6.277404e+00
3,close_w1_roc,5.191222e-07
4,volume_w1_roc,1.134121e+01
...,...,...
692,volume_w60_min/volume_w60_max,5.627767e-03
693,tmos_close_w60_min/tmos_close_w60_max,9.955307e-01
694,close_w120_min/close_w120_max,9.880045e-01
695,volume_w120_min/volume_w120_max,2.587625e-03


In [72]:
# Rows of the summary whose value in column 0 (the column mean) is null.
df_tmp[df_tmp[0].isnull()]

Unnamed: 0,index,0


In [74]:
# Group the column names by their summary value and keep only the values that
# are shared by more than one column.
names_by_value = df_tmp.groupby(0)['index']
names_by_value.unique()[names_by_value.nunique() > 1]

0
0.999999        [close/close_w5_expma, close_w5_expma/close_w2...
0.999999        [close_w10_ma/close_w20_ma, close/close_w10_ex...
1.000000           [close/close_w10_ma, close_w5_ma/close_w20_ma]
1.000000        [close_w5_ma/close_w10_ma, close_w20_ma/close_...
1.000001        [close/close_w20_ma, close_w30_expma/close_w60...
1.000001        [close_w10_ma/close_w30_ma, close_w60_expma/cl...
1.000001        [close_w5_ma/close_w30_ma, tmos_close/tmos_clo...
1.000002        [close_w10_expma/close_w60_expma, tmos_close_w...
1.000002        [tmos_close/tmos_close_w5_ma, close/close_w60_...
1.000002        [close_w30_ma/close_w60_ma, close_w5_expma/clo...
1.000003        [tmos_close/tmos_close_w10_expma, tmos_close_w...
1.000004        [close_w20_ma/close_w60_ma, tmos_close/tmos_cl...
1.000007        [close_w10_expma/close_w120_expma, tmos_close/...
1.000023        [tmos_close/tmos_close_w60_expma, tmos_close_w...
16639.886719    [tmos_close_w20_rsi/tmos_close_w120_rsi, tmos_...
Name: in

In [76]:
# Same selection as the previous cell; `.values` returns the underlying array of name arrays.
df_tmp.groupby(0).index.unique()[df_tmp.groupby(0).index.nunique()>1].values

array([array(['close/close_w5_expma', 'close_w5_expma/close_w20_expma'],
             dtype=object)                                              ,
       array(['close_w10_ma/close_w20_ma', 'close/close_w10_expma',
              'close/close_w20_expma', 'close_w10_expma/close_w20_expma'],
             dtype=object)                                                ,
       array(['close/close_w10_ma', 'close_w5_ma/close_w20_ma'], dtype=object),
       array(['close_w5_ma/close_w10_ma', 'close_w20_ma/close_w30_ma',
              'close_w5_expma/close_w30_expma',
              'close_w10_expma/close_w30_expma'], dtype=object)       ,
       array(['close/close_w20_ma', 'close_w30_expma/close_w60_expma'],
             dtype=object)                                             ,
       array(['close_w10_ma/close_w30_ma', 'close_w60_expma/close_w120_expma'],
             dtype=object)                                                     ,
       array(['close_w5_ma/close_w30_ma', 'tmos_close/tmo

In [None]:
# This is strange, but the numbers come in a different order

In [96]:
# Display the three RSI ratio columns that were grouped under the same mean value.
df[['tmos_close_w20_rsi/tmos_close_w120_rsi',
              'tmos_close_w30_rsi/tmos_close_w120_rsi',
              'tmos_close_w60_rsi/tmos_close_w120_rsi']]

Unnamed: 0,tmos_close_w20_rsi/tmos_close_w120_rsi,tmos_close_w30_rsi/tmos_close_w120_rsi,tmos_close_w60_rsi/tmos_close_w120_rsi
0,0.857143,0.909091,1.000000
1,0.781254,0.833337,1.000001
2,0.708336,0.772730,0.923914
3,0.772730,0.850002,0.986608
4,0.866666,0.773811,0.902779
...,...,...,...
4436575,1.028571,1.015037,0.997536
4436576,1.021223,1.098005,1.011494
4436577,1.244448,1.098040,1.037037
4436578,1.100000,1.100000,1.055556


In [98]:
# Means of the same three ratio columns (all print as 16639.886719 in the recorded output).
# NOTE(review): a mean this large presumably comes from rows where the shared
# denominator is ~0, so the ratio becomes numerator / eps -- confirm.
df[['tmos_close_w20_rsi/tmos_close_w120_rsi',
              'tmos_close_w30_rsi/tmos_close_w120_rsi',
              'tmos_close_w60_rsi/tmos_close_w120_rsi']].mean()

tmos_close_w20_rsi/tmos_close_w120_rsi    16639.886719
tmos_close_w30_rsi/tmos_close_w120_rsi    16639.886719
tmos_close_w60_rsi/tmos_close_w120_rsi    16639.886719
dtype: float32

In [100]:
# Allow up to 100 rows to be displayed (global pandas option).
pd.set_option('display.max_rows', 100)
# Quantile grid: 0 and 0.01, then 0.10 upward in 0.01 steps via np.arange (stop value 1),
# then 0.99 and 1. Quantiles 0.02-0.09 are not part of the grid.
df['tmos_close_w20_rsi/tmos_close_w120_rsi'].quantile(q=[0, 0.01]+np.arange(0.1, 1, 0.01).tolist()+[0.99, 1])

0.00    0.000000e+00
0.01    6.153792e-01
0.10    8.406607e-01
0.11    8.518515e-01
0.12    8.571429e-01
0.13    8.641976e-01
0.14    8.724490e-01
0.15    8.757767e-01
0.16    8.833299e-01
0.17    8.888888e-01
0.18    8.914270e-01
0.19    8.999982e-01
0.20    9.053531e-01
0.21    9.090896e-01
0.22    9.104934e-01
0.23    9.189193e-01
0.24    9.230781e-01
0.25    9.265734e-01
0.26    9.318178e-01
0.27    9.374997e-01
0.28    9.414409e-01
0.29    9.464283e-01
0.30    9.503091e-01
0.31    9.545459e-01
0.32    9.583336e-01
0.33    9.615388e-01
0.34    9.650334e-01
0.35    9.683819e-01
0.36    9.722217e-01
0.37    9.749998e-01
0.38    9.772721e-01
0.39    9.782609e-01
0.40    9.791673e-01
0.41    9.807689e-01
0.42    9.814816e-01
0.43    9.827583e-01
0.44    9.838710e-01
0.45    9.866654e-01
0.46    1.000000e+00
0.47    1.000000e+00
0.48    1.000000e+00
0.49    1.000000e+00
0.50    1.000000e+00
0.51    1.000000e+00
0.52    1.000000e+00
0.53    1.000000e+00
0.54    1.000000e+00
0.55    1.000

In [92]:
# Allow up to 100 rows to be displayed (global pandas option).
pd.set_option('display.max_rows', 100)
# Quantile grid: 0 and 0.01, then 0.10 upward in 0.01 steps via np.arange (stop value 1),
# then 0.99 and 1. Quantiles 0.02-0.09 are not part of the grid.
df['close/close_w5_expma'].quantile(q=[0, 0.01]+np.arange(0.1, 1, 0.01).tolist()+[0.99, 1])

0.00    0.782768
0.01    0.997334
0.10    0.999233
0.11    0.999289
0.12    0.999339
0.13    0.999384
0.14    0.999426
0.15    0.999463
0.16    0.999497
0.17    0.999529
0.18    0.999558
0.19    0.999586
0.20    0.999611
0.21    0.999635
0.22    0.999658
0.23    0.999679
0.24    0.999700
0.25    0.999719
0.26    0.999737
0.27    0.999754
0.28    0.999771
0.29    0.999787
0.30    0.999802
0.31    0.999816
0.32    0.999830
0.33    0.999843
0.34    0.999856
0.35    0.999868
0.36    0.999880
0.37    0.999891
0.38    0.999903
0.39    0.999913
0.40    0.999924
0.41    0.999934
0.42    0.999943
0.43    0.999953
0.44    0.999962
0.45    0.999972
0.46    0.999981
0.47    0.999990
0.48    1.000000
0.49    1.000000
0.50    1.000002
0.51    1.000013
0.52    1.000021
0.53    1.000029
0.54    1.000037
0.55    1.000045
0.56    1.000054
0.57    1.000062
0.58    1.000071
0.59    1.000080
0.60    1.000089
0.61    1.000099
0.62    1.000109
0.63    1.000119
0.64    1.000129
0.65    1.000140
0.66    1.0001

#### uniq_2

In [84]:
# Number of distinct values in every column not listed in no_analyze, one column at a time.
# Vectorised alternative: df[df.columns[~df.columns.isin(no_analyze)]].nunique().reset_index()
analyzed_cols = df.columns[~df.columns.isin(no_analyze)]
col_distinct_counts = [df[col].nunique() for col in tqdm(analyzed_cols)]

# Summary frame: column name under 'index', its distinct-value count under 0.
df_tmp = pd.DataFrame({'index': analyzed_cols, 0: col_distinct_counts})

df_tmp

100%|███████████████████████████████████████████████████████████████████████| 697/697 [01:44<00:00,  6.66it/s]


Unnamed: 0,index,0
0,close,130464
1,volume,77176
2,tmos_close,216
3,close_w1_roc,1001502
4,volume_w1_roc,1464504
...,...,...
692,volume_w60_min/volume_w60_max,473208
693,tmos_close_w60_min/tmos_close_w60_max,2539
694,close_w120_min/close_w120_max,418194
695,volume_w120_min/volume_w120_max,297245


In [102]:
# Rows of the summary whose value in column 0 (the distinct-value count) is null.
df_tmp[df_tmp[0].isnull()]

Unnamed: 0,index,0


In [104]:
# Allow up to 120 rows to be displayed (global pandas option).
pd.set_option('display.max_rows', 120)
# Distinct-value counts shared by more than one column, with the column names for each count.
df_tmp.groupby(0).index.unique()[df_tmp.groupby(0).index.nunique()>1].reset_index()

Unnamed: 0,0,index
0,30,"[close_w30_lvl_1-1.005, close_w30_lvl_-0.995-1..."
1,119,"[tmos_close_w120_lvl_1.01-1.015, tmos_close_w1..."
2,120,"[close_w120_lvl_1-1.005, close_w120_lvl_-0.995..."


In [106]:
# Same selection as the previous cell; `.values` returns the rows as a raw array.
df_tmp.groupby(0).index.unique()[df_tmp.groupby(0).index.nunique()>1].reset_index().values

array([[30,
        array(['close_w30_lvl_1-1.005', 'close_w30_lvl_-0.995-1',
               'close_w30_lvl_1.005-1.01', 'close_w30_lvl_-0.99-0.995',
               'close_w30_lvl_1.01-1.015', 'close_w30_lvl_-0.985-0.99',
               'close_w30_lvl_1.015-1.02', 'close_w30_lvl_-0.98-0.985',
               'close_w30_lvl_1.02-1.03', 'close_w30_lvl_-0.97-0.98',
               'tmos_close_w30_lvl_1-1.005', 'tmos_close_w30_lvl_-0.995-1',
               'tmos_close_w30_lvl_1.005-1.01', 'tmos_close_w30_lvl_-0.99-0.995',
               'tmos_close_w30_lvl_1.01-1.015', 'tmos_close_w30_lvl_-0.985-0.99',
               'tmos_close_w30_lvl_-0.98-0.985'], dtype=object)                  ],
       [119, array(['tmos_close_w120_lvl_1.01-1.015',
                    'tmos_close_w120_lvl_-0.98-0.985'], dtype=object)],
       [120,
        array(['close_w120_lvl_1-1.005', 'close_w120_lvl_-0.995-1',
               'close_w120_lvl_1.005-1.01', 'close_w120_lvl_-0.99-0.995',
               'close_w120_lvl_

In [108]:
# Inspect one entry of the "summary value shared by several columns" selection:
# the shared value itself and the column names that have it.
i = 1
names_by_value = df_tmp.groupby(0)['index']
shared_groups = names_by_value.unique()[names_by_value.nunique() > 1]
shared_groups.index[i], shared_groups.iloc[i]

(119,
 array(['tmos_close_w120_lvl_1.01-1.015',
        'tmos_close_w120_lvl_-0.98-0.985'], dtype=object))

In [80]:
# Sorted distinct values of close_w120_mean_abs_pct; the recorded output ends with nan,
# so the column contains missing values.
np.sort(df['close_w120_mean_abs_pct'].unique())

array([4.6989604e-05, 4.8031005e-05, 4.9073846e-05, ..., 3.0944357e-03,
       3.1055172e-03,           nan], dtype=float32)