In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import random
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import os

from sklearn.metrics import roc_auc_score

import pickle
def dump_pkl(data, filename):
  """Pickle `data` into the file at `filename`, overwriting it.

  The highest pickle protocol available to the running interpreter is used.
  """
  with open(filename, 'wb') as sink:
    pickle.Pickler(sink, protocol=pickle.HIGHEST_PROTOCOL).dump(data)

def load_pkl(filename):
  """Read one pickled object from the file at `filename` and return it.

  NOTE: unpickling can execute arbitrary code embedded in the file, so this
  must only be pointed at files produced by a trusted source.
  """
  with open(filename, 'rb') as source:
    return pickle.Unpickler(source).load()

In [3]:
# Trading pairs (ticker names double as the pickle file names on disk).
# Entries that are commented out are not part of the list.
stocks = [
    # 'ETH-USDT',
    'ETC-USDT',
    'XRP-USDT',
    'BNB-USDT',
    'SOL-USDT',
    'DOGE-USDT',
    'ADA-USDT',
    'TRX-USDT',
    'LINK-USDT',
    'AVAX-USDT',
    'SUI-USDT',
    'XLM-USDT',
    'LTC-USDT',
    'TON-USDT',
    # 'SHIB-USDT',
    'HBAR-USDT',
    'DOT-USDT',
    # 'BCH-USDT',
    'OP-USDT',
    'NEAR-USDT',
    'AAVE-USDT',
    # 'LDO-USDT',
    'ARB-USDT',
    'UNI-USDT',
    # 'ATOM-USDT',
]
len(stocks)

20

### 1. Load data

In [7]:
# Collect one frame per ticker and per time frame, then stack them vertically.
dfs_5min, dfs_1hour, dfs_1day = [], [], []

# Alternative ticker source: the file names found in ./data/preproc/1hour/
# stocks = [elem.split('.')[0] for elem in os.listdir('./data/preproc/1hour/')]

for stock in tqdm(stocks):
    dfs_5min.append(load_pkl(f"./data/preproc/5min/{stock}.pkl").copy())
    dfs_1hour.append(load_pkl(f"./data/preproc/1hour/{stock}.pkl").copy())
    dfs_1day.append(load_pkl(f"./data/preproc/1day/{stock}.pkl").copy())

# ignore_index=True gives each stacked frame a fresh 0..n-1 index.
df_5min = pd.concat(dfs_5min, ignore_index=True)
df_1day = pd.concat(dfs_1day, ignore_index=True)
df_1hour = pd.concat(dfs_1hour, ignore_index=True)

df_5min.shape, df_1hour.shape, df_1day.shape

100%|████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 136.90it/s]


((5366714, 5), (447227, 5), (18636, 5))

In [8]:
df_5min.head()

Unnamed: 0,time,close,volume,ticker,btc_close
0,2022-07-01 00:00:00,14.484,2683.213237,ETC-USDT,18811.0
1,2022-07-01 00:05:00,14.449,802.03137,ETC-USDT,18820.2
2,2022-07-01 00:10:00,14.493,978.818765,ETC-USDT,18893.8
3,2022-07-01 00:15:00,14.51,332.343304,ETC-USDT,18892.8
4,2022-07-01 00:20:00,14.484,446.75158,ETC-USDT,18889.1


In [11]:
df_1hour.head()

Unnamed: 0,time,close,volume,ticker,btc_close
0,2022-07-01 00:00:00,14.39,6308.898013,ETC-USDT,18821.5
1,2022-07-01 01:00:00,14.501,7351.315191,ETC-USDT,18895.6
2,2022-07-01 02:00:00,15.016,18704.233055,ETC-USDT,19945.6
3,2022-07-01 03:00:00,15.256,51126.028345,ETC-USDT,20287.5
4,2022-07-01 04:00:00,15.223,12474.278715,ETC-USDT,20414.5


In [13]:
df_1day.head()

Unnamed: 0,time,close,volume,ticker,btc_close
0,2022-07-01 03:00:00,14.638,283572.166695,ETC-USDT,19276.5
1,2022-07-02 03:00:00,14.77,156729.551346,ETC-USDT,19251.0
2,2022-07-03 03:00:00,14.809,163387.245679,ETC-USDT,19317.0
3,2022-07-04 03:00:00,15.401,332904.831234,ETC-USDT,20235.9
4,2022-07-05 03:00:00,14.924,290508.68466,ETC-USDT,20177.8


### 2. Preproc data

#### 2.1 Make target

In [19]:
def get_target(df_all, ind, val_first, val_second, points_to_wait):
    """Label the trade opened at row `ind`: did `close` reach `val_second`
    before reaching `val_first` within the look-ahead window?

    The window is rows `ind .. ind + points_to_wait` (clipped to the end of
    `df_all`), restricted to rows whose `ticker` equals the ticker at `ind`.
    `val_first < val_second` is treated as a long trade (lose when close drops
    below `val_first`, win when it rises above `val_second`); otherwise it is
    a short trade with the comparisons mirrored.

    Parameters
    ----------
    df_all : DataFrame with at least `time`, `close` and `ticker` columns.
    ind : positional index of the entry row.
    val_first : price level that ends the trade as a loss.
    val_second : price level that ends the trade as a win.
    points_to_wait : number of rows after `ind` to look ahead.

    Returns
    -------
    tuple `(result, delta_time, income_rate, res_price, res_ind)` where
    `result` is 'WIN', 'LOSE' or 'DNF' (neither level touched; the last row of
    the window is used as the exit), `delta_time` is exit time minus entry
    time, `income_rate` is exit/entry for longs and (2*entry - exit)/entry for
    shorts, `res_price` is the exit close and `res_ind` is `ind` plus the exit
    offset inside the window.
    """
    stop = min(ind + points_to_wait + 1, df_all.shape[0])
    window = df_all.iloc[ind:stop]

    # Keep only rows of the entry ticker so a window never leaks into the
    # next ticker of the stacked frame.
    same_ticker = np.array(window['ticker'] == window['ticker'].iloc[0])
    window = window.loc[same_ticker, :]

    entry_price = window['close'].iloc[0]
    is_long = val_first < val_second

    if is_long:
        hit_first = np.array(window['close'] < val_first)
        hit_second = np.array(window['close'] > val_second)
    else:  # short trade
        hit_first = np.array(window['close'] > val_first)
        hit_second = np.array(window['close'] < val_second)

    def outcome(label, pos):
        # Assemble the result tuple for an exit at window offset `pos`.
        elapsed = window['time'].iloc[pos] - window['time'].iloc[0]
        exit_price = window['close'].iloc[pos]
        if is_long:
            rate = exit_price / entry_price
        else:  # short trade: profit when the price falls
            rate = (2 * entry_price - exit_price) / entry_price
        return label, elapsed, rate, exit_price, pos + ind

    first_hits = np.argwhere(hit_first).ravel()
    second_hits = np.argwhere(hit_second).ravel()
    touched_first = first_hits.size != 0
    touched_second = second_hits.size != 0

    if not touched_first and not touched_second:
        return outcome('DNF', window.shape[0] - 1)
    if not touched_first:
        return outcome('WIN', second_hits[0])
    if not touched_second:
        return outcome('LOSE', first_hits[0])

    # Both levels were touched: whichever came first decides the outcome.
    if first_hits[0] < second_hits[0]:
        return outcome('LOSE', first_hits[0])
    if first_hits[0] > second_hits[0]:
        return outcome('WIN', second_hits[0])


def get_df_target(df, indx, percent_first=None, percent_second=None, points_to_wait=None,
                  commission=0.002):
    """Build the target table by calling `get_target` for every row in `indx`.

    Parameters
    ----------
    df : DataFrame with `time`, `close` and `ticker` columns.
    indx : positional row indices of `df` to label (also stored as column `ind`).
    percent_first : multiplier of the entry close giving the losing level.
    percent_second : multiplier of the entry close giving the winning level.
        `percent_first < percent_second` means a long trade, otherwise short.
    points_to_wait : look-ahead horizon in rows, forwarded to `get_target`.
    commission : flat amount subtracted from every `income_rate`. Defaults to
        0.002, the value that used to be hard-coded here (presumably trading
        fees — TODO confirm).

    Returns
    -------
    DataFrame with columns `ind, time, close, result, ticker, delta_time,
    income_rate, res_price, res_ind`, one row per element of `indx`.

    Raises
    ------
    ValueError
        If `percent_first`, `percent_second` or `points_to_wait` is omitted.
    """
    if percent_first is None or percent_second is None or points_to_wait is None:
        raise ValueError('percent_first, percent_second and points_to_wait are required')

    # Look the columns up once instead of on every iteration.
    time_col = df['time']
    close_col = df['close']
    ticker_col = df['ticker']

    times, closes, tickers = [], [], []
    results, delta_times, income_rates, res_prices, res_inds = [], [], [], [], []

    for ind in tqdm(indx):
        close = close_col.iloc[ind]
        result, delta_time, income_rate, res_price, res_ind = get_target(
            df, ind, close * percent_first, close * percent_second, points_to_wait)

        times.append(time_col.iloc[ind])
        closes.append(close)
        tickers.append(ticker_col.iloc[ind])
        results.append(result)
        delta_times.append(delta_time)
        income_rates.append(income_rate)
        res_prices.append(res_price)
        res_inds.append(res_ind)

    df_result = pd.DataFrame({'ind' : indx,
                              'time' : times,
                              'close' : closes,
                              'result' : results,
                              'ticker' : tickers,
                              'delta_time' : delta_times,
                              'income_rate' : income_rates,
                              'res_price' : res_prices,
                              'res_ind' : res_inds
                             })

    # Cap the loss and the gain at the configured levels: the exit close can
    # overshoot a level, but the rate is bounded as if the exit happened at it.
    if percent_first < percent_second:
        max_loss, max_gain = percent_first, percent_second
    else:  # short trade: levels are mirrored around 1
        max_loss, max_gain = 2 - percent_first, 2 - percent_second
    df_result["income_rate"] = np.maximum(df_result["income_rate"], max_loss)
    df_result["income_rate"] = np.minimum(df_result["income_rate"], max_gain)

    df_result['income_rate'] -= commission

    return df_result

In [23]:
inds = np.arange(df_5min.shape[0])
inds.shape

(5366714,)

In [25]:
# Long setup: a row loses once close drops below 0.99x its entry close and wins
# once close rises above 1.03x; the look-ahead is 4*12*1 = 48 five-minute bars
# (4 hours).
df_result = get_df_target(df_5min, inds, percent_first=0.99, percent_second=1.03, points_to_wait=4*12*1)

100%|█████████████████████████████████████████████████████████████| 5366714/5366714 [22:22<00:00, 3998.84it/s]


In [26]:
df_result

Unnamed: 0,ind,time,close,result,ticker,delta_time,income_rate,res_price,res_ind
0,0,2022-07-01 00:00:00,14.484,WIN,ETC-USDT,0 days 02:55:00,1.028000,15.016,35
1,1,2022-07-01 00:05:00,14.449,WIN,ETC-USDT,0 days 02:45:00,1.028000,14.885,34
2,2,2022-07-01 00:10:00,14.493,WIN,ETC-USDT,0 days 02:45:00,1.028000,15.016,35
3,3,2022-07-01 00:15:00,14.510,LOSE,ETC-USDT,0 days 00:50:00,0.988000,14.354,13
4,4,2022-07-01 00:20:00,14.484,WIN,ETC-USDT,0 days 02:35:00,1.028000,15.016,35
...,...,...,...,...,...,...,...,...,...
5366709,5366709,2025-02-23 23:35:00,8.975,DNF,UNI-USDT,0 days 00:20:00,1.001900,9.010,5366713
5366710,5366710,2025-02-23 23:40:00,8.993,DNF,UNI-USDT,0 days 00:15:00,0.999890,9.010,5366713
5366711,5366711,2025-02-23 23:45:00,8.989,DNF,UNI-USDT,0 days 00:10:00,1.000336,9.010,5366713
5366712,5366712,2025-02-23 23:50:00,9.001,DNF,UNI-USDT,0 days 00:05:00,0.999000,9.010,5366713


In [27]:
df_result['result'].value_counts(normalize=True)

result
DNF     0.560318
LOSE    0.373945
WIN     0.065737
Name: proportion, dtype: float64

In [29]:
df_result['income_rate'].quantile(q=[0, 0.01]+np.arange(0.1, 1, 0.1).tolist()+[0.99, 1])

0.00    0.988000
0.01    0.988000
0.10    0.988000
0.20    0.988000
0.30    0.988000
0.40    0.991522
0.50    0.996147
0.60    0.999226
0.70    1.002455
0.80    1.006871
0.90    1.015849
0.99    1.028000
1.00    1.028000
Name: income_rate, dtype: float64

In [31]:
# distribution of the time until exit, in hours
(df_result['delta_time']/pd.Timedelta('1 hour')).quantile(q=[0, 0.01]+np.arange(0.1, 1, 0.1).tolist()+[0.99, 1])



0.00    0.000000
0.01    0.166667
0.10    0.750000
0.20    1.416667
0.30    2.333333
0.40    3.500000
0.50    4.000000
0.60    4.000000
0.70    4.000000
0.80    4.000000
0.90    4.000000
0.99    4.000000
1.00    4.000000
Name: delta_time, dtype: float64

In [33]:
(df_result['ind'] == df_result.index).all(), 

(True,)

In [37]:
# Create the output directory; exist_ok keeps re-runs of the notebook quiet
# instead of failing with "File exists".
os.makedirs('data/feat_engin', exist_ok=True)

mkdir: data/feat_engin: File exists


In [38]:
# Create the model-specific output directory (and any missing parents);
# exist_ok keeps re-runs of the notebook quiet instead of failing with
# "File exists".
os.makedirs('data/feat_engin/lgbm', exist_ok=True)

mkdir: data/feat_engin/lgbm: File exists


In [39]:
dump_pkl(df_result, './data/feat_engin/lgbm/df_result_-1_+3_4hour.pkl')

#### 2.1.2 Load the dataset with targets

In [43]:
df_result = load_pkl('./data/feat_engin/lgbm/df_result_-1_+3_4hour.pkl')

In [44]:
df_result

Unnamed: 0,ind,time,close,result,ticker,delta_time,income_rate,res_price,res_ind
0,0,2022-07-01 00:00:00,14.484,WIN,ETC-USDT,0 days 02:55:00,1.028000,15.016,35
1,1,2022-07-01 00:05:00,14.449,WIN,ETC-USDT,0 days 02:45:00,1.028000,14.885,34
2,2,2022-07-01 00:10:00,14.493,WIN,ETC-USDT,0 days 02:45:00,1.028000,15.016,35
3,3,2022-07-01 00:15:00,14.510,LOSE,ETC-USDT,0 days 00:50:00,0.988000,14.354,13
4,4,2022-07-01 00:20:00,14.484,WIN,ETC-USDT,0 days 02:35:00,1.028000,15.016,35
...,...,...,...,...,...,...,...,...,...
5366709,5366709,2025-02-23 23:35:00,8.975,DNF,UNI-USDT,0 days 00:20:00,1.001900,9.010,5366713
5366710,5366710,2025-02-23 23:40:00,8.993,DNF,UNI-USDT,0 days 00:15:00,0.999890,9.010,5366713
5366711,5366711,2025-02-23 23:45:00,8.989,DNF,UNI-USDT,0 days 00:10:00,1.000336,9.010,5366713
5366712,5366712,2025-02-23 23:50:00,9.001,DNF,UNI-USDT,0 days 00:05:00,0.999000,9.010,5366713


In [45]:
df_result['result'].value_counts(normalize=True)

result
DNF     0.560318
LOSE    0.373945
WIN     0.065737
Name: proportion, dtype: float64

#### 2.2 Link data from different time frames

In [48]:
# Reload every ticker and tag each 5-minute bar with the timestamps of the
# hourly and daily bars it will later be joined to.
dfs_5min, dfs_1hour, dfs_1day = [], [], []

# Alternative ticker source: the file names found in ./data/preproc/1hour/
# stocks = [elem.split('.')[0] for elem in os.listdir('./data/preproc/1hour/')]

for stock in tqdm(stocks):
    df_5min = load_pkl(f"./data/preproc/5min/{stock}.pkl")
    df_1hour = load_pkl(f"./data/preproc/1hour/{stock}.pkl")
    df_1day = load_pkl(f"./data/preproc/1day/{stock}.pkl")

    df_5min = df_5min.assign(
        # Hourly key: shift back 55 minutes, then floor to the hour, so a bar
        # stamped hh:55 maps to hh:00 and anything earlier in that hour maps
        # to the previous hour (presumably to use only completed hourly bars
        # — TODO confirm).
        date_hour_index=(df_5min['time'] - pd.Timedelta(minutes=55)).dt.floor('h'),
        # Daily key: same idea with a 26 h 55 min shift; the +3 h matches the
        # 03:00 timestamps seen in the daily frame.
        date_day_index=(df_5min['time'] - pd.Timedelta('26 hours 55 minutes')).dt.floor('d') + pd.Timedelta(hours=3),
    )

    dfs_5min.append(df_5min)
    dfs_1hour.append(df_1hour.copy())
    dfs_1day.append(df_1day.copy())

# ignore_index=True gives each stacked frame a fresh 0..n-1 index.
df_5min = pd.concat(dfs_5min, ignore_index=True)
df_1day = pd.concat(dfs_1day, ignore_index=True)
df_1hour = pd.concat(dfs_1hour, ignore_index=True)

df_5min.shape, df_1hour.shape, df_1day.shape

100%|█████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 82.88it/s]


((5366714, 7), (447227, 5), (18636, 5))

In [49]:
# visual sanity check of the hourly/daily keys on an arbitrary slice
i = 68000+13*6 + 12*24*90
df_5min.iloc[i:i+20]

Unnamed: 0,time,close,volume,ticker,btc_close,date_hour_index,date_day_index
93998,2023-05-23 09:10:00,18.429,4.743609,ETC-USDT,27395.6,2023-05-23 08:00:00,2023-05-22 03:00:00
93999,2023-05-23 09:15:00,18.429,82.908802,ETC-USDT,27402.2,2023-05-23 08:00:00,2023-05-22 03:00:00
94000,2023-05-23 09:20:00,18.407,17.464024,ETC-USDT,27370.5,2023-05-23 08:00:00,2023-05-22 03:00:00
94001,2023-05-23 09:25:00,18.393,59.055052,ETC-USDT,27356.0,2023-05-23 08:00:00,2023-05-22 03:00:00
94002,2023-05-23 09:30:00,18.374,234.786715,ETC-USDT,27328.2,2023-05-23 08:00:00,2023-05-22 03:00:00
94003,2023-05-23 09:35:00,18.332,391.418062,ETC-USDT,27292.1,2023-05-23 08:00:00,2023-05-22 03:00:00
94004,2023-05-23 09:40:00,18.373,365.120012,ETC-USDT,27332.3,2023-05-23 08:00:00,2023-05-22 03:00:00
94005,2023-05-23 09:45:00,18.334,349.835403,ETC-USDT,27295.6,2023-05-23 08:00:00,2023-05-22 03:00:00
94006,2023-05-23 09:50:00,18.36,164.079758,ETC-USDT,27296.4,2023-05-23 08:00:00,2023-05-22 03:00:00
94007,2023-05-23 09:55:00,18.358,387.380293,ETC-USDT,27295.9,2023-05-23 09:00:00,2023-05-22 03:00:00


In [50]:
df_1hour.head()

Unnamed: 0,time,close,volume,ticker,btc_close
0,2022-07-01 00:00:00,14.39,6308.898013,ETC-USDT,18821.5
1,2022-07-01 01:00:00,14.501,7351.315191,ETC-USDT,18895.6
2,2022-07-01 02:00:00,15.016,18704.233055,ETC-USDT,19945.6
3,2022-07-01 03:00:00,15.256,51126.028345,ETC-USDT,20287.5
4,2022-07-01 04:00:00,15.223,12474.278715,ETC-USDT,20414.5


In [51]:
df_1day.head()

Unnamed: 0,time,close,volume,ticker,btc_close
0,2022-07-01 03:00:00,14.638,283572.166695,ETC-USDT,19276.5
1,2022-07-02 03:00:00,14.77,156729.551346,ETC-USDT,19251.0
2,2022-07-03 03:00:00,14.809,163387.245679,ETC-USDT,19317.0
3,2022-07-04 03:00:00,15.401,332904.831234,ETC-USDT,20235.9
4,2022-07-05 03:00:00,14.924,290508.68466,ETC-USDT,20177.8


In [56]:
# Join the hourly and daily bars onto the 5-minute rows.
# Each coarse frame first exposes its row index as a column and gets a
# time-frame suffix on every column except the shared `ticker` key.
df_1hour = df_1hour.reset_index().rename(
    columns=lambda col: col if col == 'ticker' else col + '_1hour')
df = df_5min.merge(
    df_1hour,
    how='left',
    left_on=['date_hour_index', 'ticker'],
    right_on=['time_1hour', 'ticker'],
)

df_1day = df_1day.reset_index().rename(
    columns=lambda col: col if col == 'ticker' else col + '_1day')
df = df.merge(
    df_1day,
    how='left',
    left_on=['date_day_index', 'ticker'],
    right_on=['time_1day', 'ticker'],
)

# A left join must not multiply rows: guard against duplicated join keys.
assert df_5min.shape[0] == df.shape[0], 'Error: with join dimensions'

In [57]:
df

Unnamed: 0,time,close,volume,ticker,btc_close,date_hour_index,date_day_index,index_1hour,time_1hour,close_1hour,volume_1hour,btc_close_1hour,index_1day,time_1day,close_1day,volume_1day,btc_close_1day
0,2022-07-01 00:00:00,14.484,2683.213237,ETC-USDT,18811.0,2022-06-30 23:00:00,2022-06-29 03:00:00,,NaT,,,,,NaT,,,
1,2022-07-01 00:05:00,14.449,802.031370,ETC-USDT,18820.2,2022-06-30 23:00:00,2022-06-29 03:00:00,,NaT,,,,,NaT,,,
2,2022-07-01 00:10:00,14.493,978.818765,ETC-USDT,18893.8,2022-06-30 23:00:00,2022-06-29 03:00:00,,NaT,,,,,NaT,,,
3,2022-07-01 00:15:00,14.510,332.343304,ETC-USDT,18892.8,2022-06-30 23:00:00,2022-06-29 03:00:00,,NaT,,,,,NaT,,,
4,2022-07-01 00:20:00,14.484,446.751580,ETC-USDT,18889.1,2022-06-30 23:00:00,2022-06-29 03:00:00,,NaT,,,,,NaT,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5366709,2025-02-23 23:35:00,8.975,315.805983,UNI-USDT,95722.0,2025-02-23 22:00:00,2025-02-22 03:00:00,447225.0,2025-02-23 22:00:00,8.988,6423.965756,95763.9,18634.0,2025-02-22 03:00:00,9.042,983057.680085,96552.1
5366710,2025-02-23 23:40:00,8.993,34.203667,UNI-USDT,95751.1,2025-02-23 22:00:00,2025-02-22 03:00:00,447225.0,2025-02-23 22:00:00,8.988,6423.965756,95763.9,18634.0,2025-02-22 03:00:00,9.042,983057.680085,96552.1
5366711,2025-02-23 23:45:00,8.989,174.232575,UNI-USDT,95761.8,2025-02-23 22:00:00,2025-02-22 03:00:00,447225.0,2025-02-23 22:00:00,8.988,6423.965756,95763.9,18634.0,2025-02-22 03:00:00,9.042,983057.680085,96552.1
5366712,2025-02-23 23:50:00,9.001,219.996489,UNI-USDT,95796.0,2025-02-23 22:00:00,2025-02-22 03:00:00,447225.0,2025-02-23 22:00:00,8.988,6423.965756,95763.9,18634.0,2025-02-22 03:00:00,9.042,983057.680085,96552.1


In [60]:
# Check that every 5-minute row found its hourly bar: count the distinct
# (ticker, hour) keys left unmatched by the left join.
# The counts are computed outside the f-strings because reusing the same quote
# character inside a replacement field is a SyntaxError before Python 3.12.
missing_hour_pairs = (
    df.loc[df['close_1hour'].isnull(), ['ticker', 'date_hour_index']]
    .groupby(['ticker', 'date_hour_index'])
    .count()
    .shape[0]
)
print(f'Не подтянулсиь пар (ticker, time_1hour) к 5min (от 1hour): {missing_hour_pairs}  акций-часов')

# Same check for the daily bars.
missing_day_pairs = (
    df.loc[df['close_1day'].isnull(), ['ticker', 'date_day_index']]
    .groupby(['ticker', 'date_day_index'])
    .count()
    .shape[0]
)
print(f'Не подтянулсиь пар (ticker, time_1day) к 5min (от 1day): {missing_day_pairs}  акций-дней')



Не подтянулсиь пар (ticker, time_1hour) к 5min (от 1hour): 20  акций-часов
Не подтянулсиь пар (ticker, time_1day) к 5min (от 1day): 37  акций-дней


In [61]:
# The unmatched rows appear because not every ticker starts at 00:00 — some were launched later than the rest.

In [62]:
df.loc[df['close_1day'].isnull(), ['ticker', 'date_day_index']].groupby(['ticker', 'date_day_index']).count()

ticker,date_day_index
AAVE-USDT,2022-06-29 03:00:00
AAVE-USDT,2022-06-30 03:00:00
ADA-USDT,2022-06-29 03:00:00
ADA-USDT,2022-06-30 03:00:00
ARB-USDT,2023-03-22 03:00:00
AVAX-USDT,2022-06-29 03:00:00
AVAX-USDT,2022-06-30 03:00:00
BNB-USDT,2022-12-20 03:00:00
DOGE-USDT,2022-06-29 03:00:00
DOGE-USDT,2022-06-30 03:00:00


In [63]:
df['close_1hour'].isnull().mean(), df['close_1day'].isnull().mean()

(3.913008966007877e-05, 0.001119120564278253)

In [64]:
df['ticker'].nunique()

20

In [65]:
df = df[['time', 'close', 'volume', 'ticker', 'btc_close', 'index_1hour', 'index_1day']]
df

Unnamed: 0,time,close,volume,ticker,btc_close,index_1hour,index_1day
0,2022-07-01 00:00:00,14.484,2683.213237,ETC-USDT,18811.0,,
1,2022-07-01 00:05:00,14.449,802.031370,ETC-USDT,18820.2,,
2,2022-07-01 00:10:00,14.493,978.818765,ETC-USDT,18893.8,,
3,2022-07-01 00:15:00,14.510,332.343304,ETC-USDT,18892.8,,
4,2022-07-01 00:20:00,14.484,446.751580,ETC-USDT,18889.1,,
...,...,...,...,...,...,...,...
5366709,2025-02-23 23:35:00,8.975,315.805983,UNI-USDT,95722.0,447225.0,18634.0
5366710,2025-02-23 23:40:00,8.993,34.203667,UNI-USDT,95751.1,447225.0,18634.0
5366711,2025-02-23 23:45:00,8.989,174.232575,UNI-USDT,95761.8,447225.0,18634.0
5366712,2025-02-23 23:50:00,9.001,219.996489,UNI-USDT,95796.0,447225.0,18634.0


#### 2.4 Combine targets and features; build the training dataset

In [68]:
(df_result['time'] == df['time']).all(), (df_result['close'] == df['close']).all()

(True, True)

In [69]:
(df.index.values == df_result.index.values).all()

True

In [70]:
df_result.head()

Unnamed: 0,ind,time,close,result,ticker,delta_time,income_rate,res_price,res_ind
0,0,2022-07-01 00:00:00,14.484,WIN,ETC-USDT,0 days 02:55:00,1.028,15.016,35
1,1,2022-07-01 00:05:00,14.449,WIN,ETC-USDT,0 days 02:45:00,1.028,14.885,34
2,2,2022-07-01 00:10:00,14.493,WIN,ETC-USDT,0 days 02:45:00,1.028,15.016,35
3,3,2022-07-01 00:15:00,14.51,LOSE,ETC-USDT,0 days 00:50:00,0.988,14.354,13
4,4,2022-07-01 00:20:00,14.484,WIN,ETC-USDT,0 days 02:35:00,1.028,15.016,35


In [71]:
df_result.columns.tolist()

['ind',
 'time',
 'close',
 'result',
 'ticker',
 'delta_time',
 'income_rate',
 'res_price',
 'res_ind']

In [72]:
#union
df = pd.concat([df.reset_index(drop=True), df_result[['result', 'delta_time', 'income_rate', 'res_price', 'res_ind']].reset_index(drop=True)], axis=1)
df

Unnamed: 0,time,close,volume,ticker,btc_close,index_1hour,index_1day,result,delta_time,income_rate,res_price,res_ind
0,2022-07-01 00:00:00,14.484,2683.213237,ETC-USDT,18811.0,,,WIN,0 days 02:55:00,1.028000,15.016,35
1,2022-07-01 00:05:00,14.449,802.031370,ETC-USDT,18820.2,,,WIN,0 days 02:45:00,1.028000,14.885,34
2,2022-07-01 00:10:00,14.493,978.818765,ETC-USDT,18893.8,,,WIN,0 days 02:45:00,1.028000,15.016,35
3,2022-07-01 00:15:00,14.510,332.343304,ETC-USDT,18892.8,,,LOSE,0 days 00:50:00,0.988000,14.354,13
4,2022-07-01 00:20:00,14.484,446.751580,ETC-USDT,18889.1,,,WIN,0 days 02:35:00,1.028000,15.016,35
...,...,...,...,...,...,...,...,...,...,...,...,...
5366709,2025-02-23 23:35:00,8.975,315.805983,UNI-USDT,95722.0,447225.0,18634.0,DNF,0 days 00:20:00,1.001900,9.010,5366713
5366710,2025-02-23 23:40:00,8.993,34.203667,UNI-USDT,95751.1,447225.0,18634.0,DNF,0 days 00:15:00,0.999890,9.010,5366713
5366711,2025-02-23 23:45:00,8.989,174.232575,UNI-USDT,95761.8,447225.0,18634.0,DNF,0 days 00:10:00,1.000336,9.010,5366713
5366712,2025-02-23 23:50:00,9.001,219.996489,UNI-USDT,95796.0,447225.0,18634.0,DNF,0 days 00:05:00,0.999000,9.010,5366713


#### 2.5 Feature engineering

In [77]:
df

Unnamed: 0,time,close,volume,ticker,btc_close,index_1hour,index_1day,result,delta_time,income_rate,res_price,res_ind
0,2022-07-01 00:00:00,14.484,2683.213237,ETC-USDT,18811.0,,,WIN,0 days 02:55:00,1.028000,15.016,35
1,2022-07-01 00:05:00,14.449,802.031370,ETC-USDT,18820.2,,,WIN,0 days 02:45:00,1.028000,14.885,34
2,2022-07-01 00:10:00,14.493,978.818765,ETC-USDT,18893.8,,,WIN,0 days 02:45:00,1.028000,15.016,35
3,2022-07-01 00:15:00,14.510,332.343304,ETC-USDT,18892.8,,,LOSE,0 days 00:50:00,0.988000,14.354,13
4,2022-07-01 00:20:00,14.484,446.751580,ETC-USDT,18889.1,,,WIN,0 days 02:35:00,1.028000,15.016,35
...,...,...,...,...,...,...,...,...,...,...,...,...
5366709,2025-02-23 23:35:00,8.975,315.805983,UNI-USDT,95722.0,447225.0,18634.0,DNF,0 days 00:20:00,1.001900,9.010,5366713
5366710,2025-02-23 23:40:00,8.993,34.203667,UNI-USDT,95751.1,447225.0,18634.0,DNF,0 days 00:15:00,0.999890,9.010,5366713
5366711,2025-02-23 23:45:00,8.989,174.232575,UNI-USDT,95761.8,447225.0,18634.0,DNF,0 days 00:10:00,1.000336,9.010,5366713
5366712,2025-02-23 23:50:00,9.001,219.996489,UNI-USDT,95796.0,447225.0,18634.0,DNF,0 days 00:05:00,0.999000,9.010,5366713


In [78]:
df_1hour.head()

Unnamed: 0,index_1hour,time_1hour,close_1hour,volume_1hour,ticker,btc_close_1hour
0,0,2022-07-01 00:00:00,14.39,6308.898013,ETC-USDT,18821.5
1,1,2022-07-01 01:00:00,14.501,7351.315191,ETC-USDT,18895.6
2,2,2022-07-01 02:00:00,15.016,18704.233055,ETC-USDT,19945.6
3,3,2022-07-01 03:00:00,15.256,51126.028345,ETC-USDT,20287.5
4,4,2022-07-01 04:00:00,15.223,12474.278715,ETC-USDT,20414.5


In [79]:
df_1day.head()

Unnamed: 0,index_1day,time_1day,close_1day,volume_1day,ticker,btc_close_1day
0,0,2022-07-01 03:00:00,14.638,283572.166695,ETC-USDT,19276.5
1,1,2022-07-02 03:00:00,14.77,156729.551346,ETC-USDT,19251.0
2,2,2022-07-03 03:00:00,14.809,163387.245679,ETC-USDT,19317.0
3,3,2022-07-04 03:00:00,15.401,332904.831234,ETC-USDT,20235.9
4,4,2022-07-05 03:00:00,14.924,290508.68466,ETC-USDT,20177.8


In [82]:
from sklearn.linear_model import LinearRegression

def calculate_exp_ma(data, window):
    """Windowed exponential moving average of `data`, as a NumPy array.

    With smoothing = 2 / (window + 1), the newest value in each window gets
    weight `smoothing`, each older value is further discounted by
    (1 - smoothing), and the oldest value keeps the whole remaining weight so
    the weights sum to one. Positions without a full window are NaN.
    """
    smoothing = 2 / (window + 1)
    weights = (1 - smoothing) ** np.arange(window)[::-1] * smoothing
    # The oldest observation seeds the average, so undo its `smoothing` factor.
    weights[0] /= smoothing

    def weighted_sum(chunk):
        return (chunk * weights).sum()

    rolled = data.rolling(window=window, min_periods=window)
    return rolled.apply(weighted_sum).values


def calculate_bollinger_bands(data, window):
    """Rolling mean, rolling std and the 2- and 3-sigma bands around the mean.

    Returns a 7-tuple of NumPy arrays:
    (mean, std, std / mean, mean - 2*std, mean + 2*std, mean - 3*std,
    mean + 3*std). A float32 epsilon is added to the mean in the ratio to
    avoid division by zero. Positions without a full window are NaN.
    """
    rolled = data.rolling(window=window, min_periods=window)
    center = rolled.mean().values
    spread = rolled.std().values
    relative_spread = spread / (center + np.finfo(np.float32).eps)

    bands = []
    for num_of_std in (2, 3):
        bands.append(center - (spread * num_of_std))
        bands.append(center + (spread * num_of_std))

    return (center, spread, relative_spread, *bands)

def calculate_rsi(data, window):
    """Relative Strength Index over a rolling window, as a NumPy array.

    Gains and losses are averaged with a plain rolling mean. Wherever the
    average loss is exactly zero the result is set to 100. Positions without
    a full window are NaN.
    """
    change = data.diff()
    ups = change.clip(lower=0)
    downs = -change.clip(upper=0)

    mean_up = ups.rolling(window=window, min_periods=window).mean()
    mean_down = downs.rolling(window=window, min_periods=window).mean()

    strength = mean_up / mean_down
    index = 100 - (100 / (1 + strength))

    # No losses in the window: pin the value to 100 instead of keeping the
    # result of dividing by zero.
    return index.mask(mean_down == 0, 100).values

def calculate_roc(data, periods):
    """Rate of change of `data` relative to its value `periods` rows earlier.

    Returns a Series: (current - previous) / (previous + eps), where eps is
    the float32 machine epsilon guarding against division by zero. The first
    `periods` entries are NaN.
    """
    previous = data.shift(periods)
    return (data - previous) / (previous + np.finfo(np.float32).eps)



def calc_stats(data, window=None, feat_name=None):
    """Compute a block of rolling-window features for one series.

    Parameters
    ----------
    data : Series to derive the features from.
    window : rolling window length, also used as `min_periods`.
    feat_name : prefix for every produced column name.

    Returns
    -------
    float32 DataFrame with one row per element of `data` and the columns
    `<feat_name>_{ma, std, norm_std, ma_low_2std, ma_up_2std, ma_low_3std,
    ma_up_3std, mean_abs_pct, alpha, min, max, rsi, roc, expma}`.
    """
    rolled = data.rolling(window=window, min_periods=window)

    def _mean_abs(chunk):
        return chunk.abs().mean()

    def _slope(chunk):
        # NOTE(review): the window values are the regressor and the position
        # 0..n-1 is the response, i.e. this is the slope of index-on-value,
        # not value-on-index. Confirm this direction is intended.
        model = LinearRegression().fit(chunk.values.reshape(-1, 1), np.arange(chunk.shape[0]))
        return model.coef_[0]

    (rolling_mean, rolling_std, norm_rolling_std,
     lower_band_2std, upper_band_2std,
     lower_band_3std, upper_band_3std) = calculate_bollinger_bands(data, window)

    # Mean absolute one-step rate of change inside the window.
    step_roc = calculate_roc(data, 1)
    mean_abs_pct = step_roc.rolling(window=window, min_periods=window).apply(_mean_abs).values

    columns = {
        f'{feat_name}_ma': rolling_mean,
        f'{feat_name}_std': rolling_std,
        f'{feat_name}_norm_std': norm_rolling_std,
        f'{feat_name}_ma_low_2std': lower_band_2std,
        f'{feat_name}_ma_up_2std': upper_band_2std,
        f'{feat_name}_ma_low_3std': lower_band_3std,
        f'{feat_name}_ma_up_3std': upper_band_3std,
        f'{feat_name}_mean_abs_pct': mean_abs_pct,
        f'{feat_name}_alpha': rolled.apply(_slope).values,
        f'{feat_name}_min': rolled.min().values,
        f'{feat_name}_max': rolled.max().values,
        f'{feat_name}_rsi': calculate_rsi(data, window),
        f'{feat_name}_roc': calculate_roc(data, window).values,
        f'{feat_name}_expma': calculate_exp_ma(data, window),
    }
    return pd.DataFrame(columns).astype(np.float32)


def calc_stats_diff_1(data, feat_name=None):
    """One-step rate of change of `data` as a single-column float32 frame.

    The column is named `<feat_name>_roc`.
    """
    one_step_roc = calculate_roc(data, 1).values
    features = pd.DataFrame({f'{feat_name}_roc': one_step_roc})
    return features.astype(np.float32)

def calc_levels(data, window=None, levels=None, feat_name=None):
    """Count, per rolling window, how many values fall into price bands
    defined relative to the window's last value.

    For every consecutive pair (low, high) in `levels` two columns are
    produced: the count of values in ((1+low)*last, (1+high)*last] and the
    count of values in [(1-high)*last, (1-low)*last). Column names embed the
    band multipliers. Positions without a full window are NaN.

    Returns a float32 DataFrame with two columns per pair of levels.
    """
    rolled = data.rolling(window=window, min_periods=window)
    counts = {}

    for level_low, level_high in zip(levels[:-1], levels[1:]):

        def count_above(chunk, lo=level_low, hi=level_high):
            # Values strictly above the lower multiplier, up to the upper one.
            vals = chunk.values
            last = vals[-1]
            return (((1 + lo) * last < vals) & (vals <= (1 + hi) * last)).sum()

        def count_below(chunk, lo=level_low, hi=level_high):
            # Mirror band below the last value.
            vals = chunk.values
            last = vals[-1]
            return (((1 - hi) * last <= vals) & (vals < (1 - lo) * last)).sum()

        counts[f"{feat_name}_lvl_{1+level_low}-{1+level_high}"] = rolled.apply(count_above).values
        counts[f"{feat_name}_lvl_-{1-level_high}-{1-level_low}"] = rolled.apply(count_below).values

    return pd.DataFrame(counts).astype(np.float32)


In [84]:
def calculate_features(df_ticker, postfix=None):
    """Compute all rolling features for one ticker at one time resolution.

    Reads the columns ``close{postfix}``, ``volume{postfix}`` and
    ``btc_close{postfix}`` of ``df_ticker`` and appends, column-wise:

    * window 1: ``calc_stats_diff_1`` for close, volume and btc_close;
    * windows 5 and 10: ``calc_stats`` for close, volume and btc_close;
    * windows 20, 30, 60, 120: ``calc_stats`` for close and btc_close only;
    * windows 30 and 120: additionally ``calc_levels`` for close and btc_close.

    Args:
        df_ticker: frame for a single ticker. The feature frames carry a fresh
            default index, so ``df_ticker`` presumably needs a 0..n-1 index for
            the column-wise concat to line up (the visible callers reset it).
        postfix: suffix of the source column names (``''``, ``'_1hour'``, ``'_1day'``).

    Returns:
        ``df_ticker`` with every feature frame concatenated on the right.

    Raises:
        AssertionError: if a feature frame has a different row count than
            ``df_ticker`` or the concat changed the row/column counts.
    """
    # Band edges for calc_levels; identical today but kept separate so the BTC
    # bands can be tuned independently.
    levels =      [0, 0.01, 0.02, 0.03, 0.04, 0.05, 0.07]
    levels_btc =  [0, 0.01, 0.02, 0.03, 0.04, 0.05, 0.07]

    close_col = f'close{postfix}'
    volume_col = f'volume{postfix}'
    btc_col = f'btc_close{postfix}'

    n_rows = df_ticker.shape[0]
    dfs = [df_ticker]

    def _append(frames, label):
        # Every feature frame must be row-aligned with the input before it is kept.
        assert all(frame.shape[0] == n_rows for frame in frames), f'Error {label}'
        dfs.extend(frame.copy() for frame in frames)

    # w1: one-step features for all three series
    _append(
        [calc_stats_diff_1(df_ticker[col], feat_name=f'{col}_w1')
         for col in (close_col, volume_col, btc_col)],
        'w1',
    )

    # (window, include volume stats, include level counts) - order defines column order
    window_plan = [
        (5,   True,  False),
        (10,  True,  False),
        (20,  False, False),
        (30,  False, True),
        (60,  False, False),
        (120, False, True),
    ]
    for window, with_volume, with_levels in window_plan:
        stat_cols = [close_col] + ([volume_col] if with_volume else []) + [btc_col]
        frames = [
            calc_stats(df_ticker[col], window=window, feat_name=f'{col}_w{window}')
            for col in stat_cols
        ]
        if with_levels:
            frames += [
                calc_levels(df_ticker[close_col], window=window, levels=levels,
                            feat_name=f'{close_col}_w{window}'),
                calc_levels(df_ticker[btc_col], window=window, levels=levels_btc,
                            feat_name=f'{btc_col}_w{window}'),
            ]
        _append(frames, f'w{window}')

    df = pd.concat(dfs, axis=1)
    assert (df_ticker.shape[0] == df.shape[0]) and (df.shape[1] == sum([elem.shape[1] for elem in dfs])), 'Error concat'

    return df


In [87]:
# Share of rows whose 5-minute volume survives a float32 round-trip with a
# relative error below 1e-6.
# NOTE(review): rows with zero volume give 0/0 = NaN and count as failing -
# presumably why this is slightly below 1.0; confirm.
np.mean(np.abs(((df['volume'].astype(np.float32) - df['volume']) / df['volume'])) < 0.00_00_01)

0.9930967068489209

In [88]:
# Share of rows whose 5-minute close survives a float32 round-trip with a
# relative error below 1e-6.
np.mean(np.abs(((df['close'].astype(np.float32) - df['close']) / df['close'])) < 0.00_00_01)

1.0

In [89]:
# Share of rows whose 5-minute BTC close survives a float32 round-trip with a
# relative error below 1e-6.
np.mean(np.abs(((df['btc_close'].astype(np.float32) - df['btc_close']) / df['btc_close'])) < 0.00_00_01)

1.0

In [91]:
# Share of rows whose daily volume survives a float32 round-trip with a
# relative error below 1e-6.
np.mean(np.abs(((df_1day['volume_1day'].astype(np.float32) - df_1day['volume_1day']) / df_1day['volume_1day'])) < 0.00_00_01)

1.0

#### 5min

In [95]:
# Store the raw 5-minute series as float32 before computing features.
for raw_col in ('close', 'volume', 'btc_close'):
    df[raw_col] = df[raw_col].astype(np.float32)

In [96]:
# Compute the 5-minute features ticker by ticker. Each per-ticker result is
# also pickled under tmp/ so a crash late in this multi-hour loop does not
# lose the tickers already processed.
os.makedirs('tmp', exist_ok=True)  # the first dump_pkl would fail if tmp/ is missing

dfs = []
for ticker in tqdm(df['ticker'].unique()):
    mask = np.array(df['ticker'] == ticker)
    # reset_index() keeps the original row label in an 'index' column, which
    # is restored as the index after the concat below.
    df_ticker = df.loc[mask].copy().reset_index()

    df_ticker_fe = calculate_features(df_ticker, postfix='')
    dump_pkl(df_ticker_fe, f'tmp/{ticker}.pkl')

    dfs.append(df_ticker_fe)

df_fe = pd.concat(dfs).set_index('index')

100%|██████████████████████████████████████████████████████████████████████| 20/20 [3:46:42<00:00, 680.12s/it]


In [97]:
(df_fe['close'] == df['close']).all()

True

In [98]:
df_fe.shape

(5366714, 259)

In [99]:
df_fe.head()

Unnamed: 0_level_0,time,close,volume,ticker,btc_close,index_1hour,index_1day,result,delta_time,income_rate,...,btc_close_w120_lvl_1.01-1.02,btc_close_w120_lvl_-0.98-0.99,btc_close_w120_lvl_1.02-1.03,btc_close_w120_lvl_-0.97-0.98,btc_close_w120_lvl_1.03-1.04,btc_close_w120_lvl_-0.96-0.97,btc_close_w120_lvl_1.04-1.05,btc_close_w120_lvl_-0.95-0.96,btc_close_w120_lvl_1.05-1.07,btc_close_w120_lvl_-0.9299999999999999-0.95
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,2022-07-01 00:00:00,14.484,2683.213135,ETC-USDT,18811.0,,,WIN,0 days 02:55:00,1.028,...,,,,,,,,,,
1,2022-07-01 00:05:00,14.449,802.031372,ETC-USDT,18820.199219,,,WIN,0 days 02:45:00,1.028,...,,,,,,,,,,
2,2022-07-01 00:10:00,14.493,978.818787,ETC-USDT,18893.800781,,,WIN,0 days 02:45:00,1.028,...,,,,,,,,,,
3,2022-07-01 00:15:00,14.51,332.343292,ETC-USDT,18892.800781,,,LOSE,0 days 00:50:00,0.988,...,,,,,,,,,,
4,2022-07-01 00:20:00,14.484,446.751587,ETC-USDT,18889.099609,,,WIN,0 days 02:35:00,1.028,...,,,,,,,,,,


In [None]:
#pd.set_option('display.max_rows', 300)
# df_fe.dtypes

#### 1hour

In [101]:
# Store the raw hourly series as float32 before computing features.
for raw_col in ('close_1hour', 'volume_1hour', 'btc_close_1hour'):
    df_1hour[raw_col] = df_1hour[raw_col].astype(np.float32)

In [102]:
# Compute the hourly features ticker by ticker, then stitch them back together.
dfs = []
for ticker in tqdm(df_1hour['ticker'].unique()):
    mask = (df_1hour['ticker'] == ticker).to_numpy()
    # reset_index() keeps the original row label in an 'index' column, which
    # is restored as the index after the concat below.
    df_ticker = df_1hour.loc[mask].copy().reset_index()
    df_ticker_fe = calculate_features(df_ticker, postfix='_1hour')
    dfs.append(df_ticker_fe.copy())

df_1hour_fe = pd.concat(dfs).set_index('index')

100%|█████████████████████████████████████████████████████████████████████████| 20/20 [18:48<00:00, 56.42s/it]


In [103]:
(df_1hour_fe['close_1hour'] == df_1hour['close_1hour']).all(), (df_1hour_fe.index.values == df_1hour_fe['index_1hour'].values).all()

(True, True)

In [104]:
df_1hour_fe

Unnamed: 0_level_0,index_1hour,time_1hour,close_1hour,volume_1hour,ticker,btc_close_1hour,close_1hour_w1_roc,volume_1hour_w1_roc,btc_close_1hour_w1_roc,close_1hour_w5_ma,...,btc_close_1hour_w120_lvl_1.01-1.02,btc_close_1hour_w120_lvl_-0.98-0.99,btc_close_1hour_w120_lvl_1.02-1.03,btc_close_1hour_w120_lvl_-0.97-0.98,btc_close_1hour_w120_lvl_1.03-1.04,btc_close_1hour_w120_lvl_-0.96-0.97,btc_close_1hour_w120_lvl_1.04-1.05,btc_close_1hour_w120_lvl_-0.95-0.96,btc_close_1hour_w120_lvl_1.05-1.07,btc_close_1hour_w120_lvl_-0.9299999999999999-0.95
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,2022-07-01 00:00:00,14.390,6308.897949,ETC-USDT,18821.500000,,,,,...,,,,,,,,,,
1,1,2022-07-01 01:00:00,14.501,7351.315430,ETC-USDT,18895.599609,0.007714,0.165230,0.003937,,...,,,,,,,,,,
2,2,2022-07-01 02:00:00,15.016,18704.232422,ETC-USDT,19945.599609,0.035515,1.544338,0.055568,,...,,,,,,,,,,
3,3,2022-07-01 03:00:00,15.256,51126.027344,ETC-USDT,20287.500000,0.015983,1.733394,0.017142,,...,,,,,,,,,,
4,4,2022-07-01 04:00:00,15.223,12474.278320,ETC-USDT,20414.500000,-0.002163,-0.756009,0.006260,14.877200,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
447222,447222,2025-02-23 19:00:00,8.965,20565.322266,UNI-USDT,95425.703125,0.001676,-0.280913,-0.000272,9.010200,...,43.0,4.0,15.0,0.0,12.0,0.0,1.0,0.0,0.0,0.0
447223,447223,2025-02-23 20:00:00,9.037,22197.498047,UNI-USDT,95660.000000,0.008031,0.079365,0.002455,9.008400,...,25.0,3.0,19.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0
447224,447224,2025-02-23 21:00:00,8.986,4537.909668,UNI-USDT,95746.203125,-0.005643,-0.795567,0.000901,8.991199,...,19.0,2.0,20.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0
447225,447225,2025-02-23 22:00:00,8.988,6423.965820,UNI-USDT,95763.898438,0.000223,0.415622,0.000185,8.985200,...,19.0,1.0,21.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0


In [105]:
# pd.set_option('display.max_rows', 300)
# df_1hour_fe.dtypes

#### 1day

In [108]:
# Store the raw daily series as float32 before computing features.
for raw_col in ('close_1day', 'volume_1day', 'btc_close_1day'):
    df_1day[raw_col] = df_1day[raw_col].astype(np.float32)

In [109]:
# Compute the daily features ticker by ticker, then stitch them back together.
dfs = []
for ticker in tqdm(df_1day['ticker'].unique()):
    mask = (df_1day['ticker'] == ticker).to_numpy()
    # reset_index() keeps the original row label in an 'index' column, which
    # is restored as the index after the concat below.
    df_ticker = df_1day.loc[mask].copy().reset_index()
    df_ticker_fe = calculate_features(df_ticker, postfix='_1day')
    dfs.append(df_ticker_fe.copy())

df_1day_fe = pd.concat(dfs).set_index('index')

100%|█████████████████████████████████████████████████████████████████████████| 20/20 [00:44<00:00,  2.24s/it]


In [110]:
(df_1day_fe['close_1day'] == df_1day['close_1day']).all(), (df_1day_fe.index.values == df_1day_fe['index_1day'].values).all()

(True, True)

In [111]:
df_1day_fe

Unnamed: 0_level_0,index_1day,time_1day,close_1day,volume_1day,ticker,btc_close_1day,close_1day_w1_roc,volume_1day_w1_roc,btc_close_1day_w1_roc,close_1day_w5_ma,...,btc_close_1day_w120_lvl_1.01-1.02,btc_close_1day_w120_lvl_-0.98-0.99,btc_close_1day_w120_lvl_1.02-1.03,btc_close_1day_w120_lvl_-0.97-0.98,btc_close_1day_w120_lvl_1.03-1.04,btc_close_1day_w120_lvl_-0.96-0.97,btc_close_1day_w120_lvl_1.04-1.05,btc_close_1day_w120_lvl_-0.95-0.96,btc_close_1day_w120_lvl_1.05-1.07,btc_close_1day_w120_lvl_-0.9299999999999999-0.95
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,2022-07-01 03:00:00,14.638,2.835722e+05,ETC-USDT,19276.500000,,,,,...,,,,,,,,,,
1,1,2022-07-02 03:00:00,14.770,1.567295e+05,ETC-USDT,19251.000000,0.009018,-0.447303,-0.001323,,...,,,,,,,,,,
2,2,2022-07-03 03:00:00,14.809,1.633872e+05,ETC-USDT,19317.000000,0.002640,0.042479,0.003428,,...,,,,,,,,,,
3,3,2022-07-04 03:00:00,15.401,3.329048e+05,ETC-USDT,20235.900391,0.039976,1.037520,0.047570,,...,,,,,,,,,,
4,4,2022-07-05 03:00:00,14.924,2.905087e+05,ETC-USDT,20177.800781,-0.030972,-0.127352,-0.002871,14.908400,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18631,18631,2025-02-19 03:00:00,9.436,7.106372e+05,UNI-USDT,96644.296875,-0.020755,-0.422673,0.010325,9.684400,...,10.0,7.0,4.0,6.0,6.0,4.0,8.0,3.0,5.0,4.0
18632,18632,2025-02-20 03:00:00,9.342,1.375490e+06,UNI-USDT,98307.898438,-0.009962,0.935572,0.017214,9.589800,...,6.0,16.0,4.0,9.0,7.0,9.0,3.0,4.0,9.0,5.0
18633,18633,2025-02-21 03:00:00,8.743,1.587496e+06,UNI-USDT,96172.898438,-0.064119,0.154131,-0.021717,9.420800,...,14.0,10.0,8.0,2.0,5.0,3.0,3.0,2.0,12.0,5.0
18634,18634,2025-02-22 03:00:00,9.042,9.830577e+05,UNI-USDT,96552.101562,0.034199,-0.380749,0.003943,9.239799,...,12.0,5.0,4.0,7.0,5.0,3.0,7.0,3.0,7.0,5.0


In [100]:
dump_pkl(df_fe, './data/feat_engin/df_fe.pkl')

In [112]:
dump_pkl(df_1hour_fe, './data/feat_engin/df_1hour_fe.pkl')

In [113]:
dump_pkl(df_1day_fe, './data/feat_engin/df_1day_fe.pkl')

# #Load data

In [180]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import random
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import os
import gc

from sklearn.metrics import roc_auc_score

import pickle
def dump_pkl(data, filename):
    """Pickle ``data`` to ``filename`` using the highest available protocol.

    Any existing file at ``filename`` is overwritten.
    """
    with open(filename, 'wb') as out_file:
        pickle.dump(data, out_file, pickle.HIGHEST_PROTOCOL)

def load_pkl(filename):
    """Read and return the object pickled in ``filename``.

    Unpickling can execute arbitrary code, so only use this on files produced
    by this project.
    """
    with open(filename, 'rb') as in_file:
        return pickle.load(in_file)

In [372]:
# Reload the 5-minute, hourly and daily feature tables from ./data/feat_engin/.
# NOTE: these are pickles - only load files this notebook produced itself.
df_fe = load_pkl('./data/feat_engin/df_fe.pkl')
df_1hour_fe = load_pkl('./data/feat_engin/df_1hour_fe.pkl')
df_1day_fe = load_pkl('./data/feat_engin/df_1day_fe.pkl')

df_fe.shape, df_1hour_fe.shape, df_1day_fe.shape, 

((279072, 259), (23256, 253), (969, 253))

### time features

In [114]:
# Calendar features derived from the 5-minute bar timestamp.

# hour of day
df_fe['hour'] = df_fe['time'].dt.hour

# day of month
df_fe['day'] = df_fe['time'].dt.day

# day of week (Monday=0), clipped at 4 so Fri/Sat/Sun share one value, then scaled to [0, 1]
# NOTE(review): the clip looks like a leftover from a weekday-only market; these
# tickers presumably have weekend bars too - confirm the clip is intended.
df_fe['weekday'] = np.minimum(df_fe['time'].dt.dayofweek, 4) / 4

#month
#df_fe['month'] = df_fe['time'].dt.month


#hour
# time_cyclic = (df_fe['time'] - pd.to_datetime(df_fe['time'].dt.date) - pd.Timedelta('10:00:00')) / pd.Timedelta('13:00:00')
# df_fe['sin_time_hour'] = np.sin(time_cyclic * 2 * np.pi)
# df_fe['cos_time_hour'] = np.cos(time_cyclic * 2 * np.pi)

#day of week
# day_of_week_cyclic = np.minimum(df_fe['time'].dt.dayofweek, 4) / 4
# df_fe['sin_time_weekday'] = np.sin(day_of_week_cyclic * 2 * np.pi)
# df_fe['cos_time_weekday'] = np.cos(day_of_week_cyclic * 2 * np.pi)

#day of month
# day_of_month_cyclic = df_1day_fe['time'].dt.day / 30
# df_1day_fe['sin_time_monthday'] = np.sin(day_of_month_cyclic * 2 * np.pi)
# df_1day_fe['cos_time_monthday'] = np.cos(day_of_month_cyclic * 2 * np.pi)


In [116]:
df_fe.head()

Unnamed: 0_level_0,time,close,volume,ticker,btc_close,index_1hour,index_1day,result,delta_time,income_rate,...,btc_close_w120_lvl_-0.97-0.98,btc_close_w120_lvl_1.03-1.04,btc_close_w120_lvl_-0.96-0.97,btc_close_w120_lvl_1.04-1.05,btc_close_w120_lvl_-0.95-0.96,btc_close_w120_lvl_1.05-1.07,btc_close_w120_lvl_-0.9299999999999999-0.95,hour,day,weekday
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,2022-07-01 00:00:00,14.484,2683.213135,ETC-USDT,18811.0,,,WIN,0 days 02:55:00,1.028,...,,,,,,,,0,1,1.0
1,2022-07-01 00:05:00,14.449,802.031372,ETC-USDT,18820.199219,,,WIN,0 days 02:45:00,1.028,...,,,,,,,,0,1,1.0
2,2022-07-01 00:10:00,14.493,978.818787,ETC-USDT,18893.800781,,,WIN,0 days 02:45:00,1.028,...,,,,,,,,0,1,1.0
3,2022-07-01 00:15:00,14.51,332.343292,ETC-USDT,18892.800781,,,LOSE,0 days 00:50:00,0.988,...,,,,,,,,0,1,1.0
4,2022-07-01 00:20:00,14.484,446.751587,ETC-USDT,18889.099609,,,WIN,0 days 02:35:00,1.028,...,,,,,,,,0,1,1.0


In [117]:
df_1hour_fe.head()

Unnamed: 0_level_0,index_1hour,time_1hour,close_1hour,volume_1hour,ticker,btc_close_1hour,close_1hour_w1_roc,volume_1hour_w1_roc,btc_close_1hour_w1_roc,close_1hour_w5_ma,...,btc_close_1hour_w120_lvl_1.01-1.02,btc_close_1hour_w120_lvl_-0.98-0.99,btc_close_1hour_w120_lvl_1.02-1.03,btc_close_1hour_w120_lvl_-0.97-0.98,btc_close_1hour_w120_lvl_1.03-1.04,btc_close_1hour_w120_lvl_-0.96-0.97,btc_close_1hour_w120_lvl_1.04-1.05,btc_close_1hour_w120_lvl_-0.95-0.96,btc_close_1hour_w120_lvl_1.05-1.07,btc_close_1hour_w120_lvl_-0.9299999999999999-0.95
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,2022-07-01 00:00:00,14.39,6308.897949,ETC-USDT,18821.5,,,,,...,,,,,,,,,,
1,1,2022-07-01 01:00:00,14.501,7351.31543,ETC-USDT,18895.599609,0.007714,0.16523,0.003937,,...,,,,,,,,,,
2,2,2022-07-01 02:00:00,15.016,18704.232422,ETC-USDT,19945.599609,0.035515,1.544338,0.055568,,...,,,,,,,,,,
3,3,2022-07-01 03:00:00,15.256,51126.027344,ETC-USDT,20287.5,0.015983,1.733394,0.017142,,...,,,,,,,,,,
4,4,2022-07-01 04:00:00,15.223,12474.27832,ETC-USDT,20414.5,-0.002163,-0.756009,0.00626,14.8772,...,,,,,,,,,,


In [119]:
df_1day_fe.head()

Unnamed: 0_level_0,index_1day,time_1day,close_1day,volume_1day,ticker,btc_close_1day,close_1day_w1_roc,volume_1day_w1_roc,btc_close_1day_w1_roc,close_1day_w5_ma,...,btc_close_1day_w120_lvl_1.01-1.02,btc_close_1day_w120_lvl_-0.98-0.99,btc_close_1day_w120_lvl_1.02-1.03,btc_close_1day_w120_lvl_-0.97-0.98,btc_close_1day_w120_lvl_1.03-1.04,btc_close_1day_w120_lvl_-0.96-0.97,btc_close_1day_w120_lvl_1.04-1.05,btc_close_1day_w120_lvl_-0.95-0.96,btc_close_1day_w120_lvl_1.05-1.07,btc_close_1day_w120_lvl_-0.9299999999999999-0.95
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,2022-07-01 03:00:00,14.638,283572.15625,ETC-USDT,19276.5,,,,,...,,,,,,,,,,
1,1,2022-07-02 03:00:00,14.77,156729.546875,ETC-USDT,19251.0,0.009018,-0.447303,-0.001323,,...,,,,,,,,,,
2,2,2022-07-03 03:00:00,14.809,163387.25,ETC-USDT,19317.0,0.00264,0.042479,0.003428,,...,,,,,,,,,,
3,3,2022-07-04 03:00:00,15.401,332904.84375,ETC-USDT,20235.900391,0.039976,1.03752,0.04757,,...,,,,,,,,,,
4,4,2022-07-05 03:00:00,14.924,290508.6875,ETC-USDT,20177.800781,-0.030972,-0.127352,-0.002871,14.9084,...,,,,,,,,,,


### resize memory

#### #resize dtype

In [124]:
[elem for elem in df_fe.columns if 'ind' in elem]

['index_1hour', 'index_1day', 'res_ind']

In [125]:
# Downcast every column to float32 to save memory, skipping the index-like
# columns (any name containing 'ind') so their values are not altered.
cols = [elem for elem in df_fe.columns if 'ind' not in elem]

for col in tqdm(cols):
    try:
        df_fe[col] = df_fe[col].astype(np.float32)
    except (TypeError, ValueError):
        # Non-numeric columns (timestamps, strings, timedeltas) cannot be cast;
        # report them and leave them as they are. Only conversion errors are
        # caught so an interrupt or a real bug still surfaces.
        print(col)

  8%|██████                                                                 | 22/259 [00:00<00:01, 219.35it/s]

time
ticker
result
delta_time


100%|███████████████████████████████████████████████████████████████████████| 259/259 [00:02<00:00, 86.39it/s]


In [128]:
[elem for elem in df_1hour_fe.columns if 'ind' in elem]

['index_1hour']

In [129]:
# Downcast every column to float32 to save memory, skipping the index-like
# columns (any name containing 'ind') so their values are not altered.
cols = [elem for elem in df_1hour_fe.columns if 'ind' not in elem]

for col in tqdm(cols):
    try:
        df_1hour_fe[col] = df_1hour_fe[col].astype(np.float32)
    except (TypeError, ValueError):
        # Non-numeric columns (timestamps, strings) cannot be cast; report them
        # and leave them as they are. Only conversion errors are caught so an
        # interrupt or a real bug still surfaces.
        print(col)

 98%|███████████████████████████████████████████████████████████████████▋ | 247/252 [00:00<00:00, 1230.60it/s]

time_1hour
ticker


100%|█████████████████████████████████████████████████████████████████████| 252/252 [00:00<00:00, 1220.62it/s]


In [131]:
[elem for elem in df_1day_fe.columns if 'ind' in elem]

['index_1day']

In [132]:
# Downcast every column to float32 to save memory, skipping the index-like
# columns (any name containing 'ind') so their values are not altered.
cols = [elem for elem in df_1day_fe.columns if 'ind' not in elem]

for col in tqdm(cols):
    try:
        df_1day_fe[col] = df_1day_fe[col].astype(np.float32)
    except (TypeError, ValueError):
        # Non-numeric columns (timestamps, strings) cannot be cast; report them
        # and leave them as they are. Only conversion errors are caught so an
        # interrupt or a real bug still surfaces.
        print(col)

100%|████████████████████████████████████████████████████████████████████| 252/252 [00:00<00:00, 19122.62it/s]

time_1day
ticker





#### #subsample

##### #1 time

In [135]:
df_fe.shape[0]

5366714

In [136]:
# Some tickers have a shorter history than others: for each ticker keep only
# the rows dated on or after its (NEED_POINTS + 1)-th distinct trading date.
NEED_POINTS = 125  # "to be sure there is enough" - presumably headroom over the longest (120) rolling window
mask_avbl_all = np.zeros(df_fe.shape[0]).astype(bool)
for ticker in tqdm(df_fe["ticker"].unique()):
    mask_ticker = np.array(df_fe['ticker'] == ticker)
    # Sorted distinct dates of this ticker, computed once and reused below.
    ticker_dates = np.sort(df_fe.loc[mask_ticker, 'time'].dt.date.unique())
    count_days = len(ticker_dates)
    if count_days <= NEED_POINTS:
        # Not enough history. `<=` rather than `<`: with exactly NEED_POINTS
        # dates, ticker_dates[NEED_POINTS] below would be out of range.
        mask_avbl = np.zeros(df_fe.shape[0]).astype(bool)
    else:
        date_first_avbl = ticker_dates[NEED_POINTS]
        #print(ticker, date_first_avbl)
        mask_avbl = np.array(df_fe["time"] >= pd.to_datetime(date_first_avbl))

    mask_avbl_ticker = mask_ticker & mask_avbl

    mask_avbl_all |= mask_avbl_ticker

mask_avbl_all.sum(), mask_avbl_all.mean()

100%|█████████████████████████████████████████████████████████████████████████| 20/20 [00:03<00:00,  5.75it/s]


(4647168, 0.8659242881211855)

In [182]:
# Keep only the rows with enough history and renumber them from 0.
# Chained reset_index (instead of inplace=True on the filtered slice) returns a
# fresh frame, so later column assignments do not act on a flagged copy.
# NOTE: this cell is not idempotent - mask_avbl_all matches the unfiltered frame only.
print(df_fe.shape, '-->')
df_fe = df_fe[mask_avbl_all].reset_index(drop=True)
gc.collect()
df_fe.shape

(4647168, 262)

In [184]:
df_fe['ticker'].nunique(), df_fe['ticker'].value_counts()

(20,
 ticker
 ETC-USDT     243072
 XRP-USDT     243072
 AAVE-USDT    243072
 NEAR-USDT    243072
 OP-USDT      243072
 DOT-USDT     243072
 HBAR-USDT    243072
 TON-USDT     243072
 LTC-USDT     243072
 XLM-USDT     243072
 AVAX-USDT    243072
 LINK-USDT    243072
 TRX-USDT     243072
 ADA-USDT     243072
 DOGE-USDT    243072
 SOL-USDT     243072
 UNI-USDT     243072
 BNB-USDT     193248
 ARB-USDT     166752
 SUI-USDT     154944
 Name: count, dtype: int64)

#### #2 subsample

In [188]:
df_fe.shape

(4647168, 262)

In [190]:
# Fraction of rows to keep in the random subsample.
coeff_subsample = 0.5

# Positional row numbers; a fixed-seed permutation makes the subsample reproducible.
inds = np.arange(df_fe.shape[0]).astype(int)
inds_subsample = np.random.RandomState(seed=42).permutation(inds)[:int(len(inds)*coeff_subsample)]

# Boolean mask form of the selection, so filtering keeps the original row order.
mask_subsample = np.zeros(df_fe.shape[0]).astype(bool)
mask_subsample[inds_subsample] = True

mask_subsample.sum(), mask_subsample.mean()

(2323584, 0.5)

In [192]:
# Keep only the subsampled rows and renumber them from 0.
# Chained reset_index (instead of inplace=True on the filtered slice) returns a
# fresh frame, so later column assignments do not act on a flagged copy.
# NOTE: this cell is not idempotent - mask_subsample matches the unfiltered frame only.
print(df_fe.shape, '-->')
df_fe = df_fe[mask_subsample].reset_index(drop=True)
gc.collect()

df_fe.shape

(4647168, 262) -->


(2323584, 262)

In [195]:
df_fe['ticker'].nunique(), df_fe['ticker'].value_counts()

(20,
 ticker
 LTC-USDT     121993
 DOT-USDT     121913
 HBAR-USDT    121704
 AAVE-USDT    121682
 TRX-USDT     121658
 ETC-USDT     121632
 XLM-USDT     121610
 ADA-USDT     121598
 NEAR-USDT    121561
 DOGE-USDT    121497
 LINK-USDT    121424
 OP-USDT      121397
 XRP-USDT     121378
 SOL-USDT     121368
 TON-USDT     121342
 AVAX-USDT    121173
 UNI-USDT     121097
 BNB-USDT      96704
 ARB-USDT      83404
 SUI-USDT      77449
 Name: count, dtype: int64)

In [198]:
# Every (time, ticker) pair must occur exactly once: the number of groups has
# to equal the number of rows.
assert df_fe[['time', 'ticker']].groupby(['time', 'ticker']).count().shape[0] == df_fe.shape[0], 'Error'

In [201]:
gc.collect()

0

In [203]:
[elem for elem in df_fe.columns if 'ind' in elem]

['index_1hour', 'index_1day', 'res_ind']

In [205]:
# Downcast every column to float32 to save memory, skipping the index-like
# columns (any name containing 'ind') so their values are not altered.
cols = [elem for elem in df_fe.columns if 'ind' not in elem]

for col in tqdm(cols):
    try:
        df_fe[col] = df_fe[col].astype(np.float32)
    except (TypeError, ValueError):
        # Non-numeric columns (timestamps, strings, timedeltas) cannot be cast;
        # report them and leave them as they are. Only conversion errors are
        # caught so an interrupt or a real bug still surfaces.
        print(col)


 14%|█████████▌                                                             | 35/259 [00:00<00:00, 342.37it/s]

time
ticker
result
delta_time


100%|██████████████████████████████████████████████████████████████████████| 259/259 [00:00<00:00, 414.40it/s]


In [28]:
[elem for elem in df_1hour_fe.columns if 'ind' in elem]

['index_1hour', 'date_hour_index']

In [30]:
# Downcast every column to float32 to save memory, skipping the index-like
# columns (any name containing 'ind') so their values are not altered.
cols = [elem for elem in df_1hour_fe.columns if 'ind' not in elem]

for col in tqdm(cols):
    try:
        df_1hour_fe[col] = df_1hour_fe[col].astype(np.float32)
    except (TypeError, ValueError):
        # Non-numeric columns (timestamps, strings) cannot be cast; report them
        # and leave them as they are. Only conversion errors are caught so an
        # interrupt or a real bug still surfaces.
        print(col)

 21%|██████████████▌                                                        | 60/292 [00:00<00:00, 316.19it/s]

time_1hour
ticker


100%|██████████████████████████████████████████████████████████████████████| 292/292 [00:00<00:00, 451.29it/s]


In [33]:
[elem for elem in df_1day_fe.columns if 'ind' in elem]

['index_1day', 'date_index']

In [35]:
# Downcast every column to float32 to save memory, skipping the index-like
# columns (any name containing 'ind') so their values are not altered.
cols = [elem for elem in df_1day_fe.columns if 'ind' not in elem]

for col in tqdm(cols):
    try:
        df_1day_fe[col] = df_1day_fe[col].astype(np.float32)
    except (TypeError, ValueError):
        # Non-numeric columns (timestamps, strings) cannot be cast; report them
        # and leave them as they are. Only conversion errors are caught so an
        # interrupt or a real bug still surfaces.
        print(col)

100%|█████████████████████████████████████████████████████████████████████| 292/292 [00:00<00:00, 4519.71it/s]

time_1day
ticker





### Delete useless (duplicated) columns


In [39]:
#del df_1hour_fe['date_hour_index']

In [41]:
#del df_1day_fe['date_index']

### Absolute value columns

In [207]:
def flag_delete(col_name):
    """Tell whether a feature column should be listed for deletion.

    A column is flagged when its name contains any of the substrings
    'ma', 'std', 'diff', 'min' or 'max'. Names containing 'norm_std' are
    never flagged, whatever else they contain.

    Note that matching is by plain substring, so e.g. 'expma' and 'max'
    are caught by 'ma' as well.
    """
    if 'norm_std' in col_name:
        return False

    stop_words = ('ma', 'std', 'diff', 'min', 'max')
    return any(word in col_name for word in stop_words)



In [209]:
# 5-minute columns selected by flag_delete (only listed here, not dropped).
cols_del_5min = [elem for elem in df_fe.columns if flag_delete(elem)]
len(cols_del_5min), cols_del_5min

(126,
 ['close_w5_ma',
  'close_w5_std',
  'close_w5_ma_low_2std',
  'close_w5_ma_up_2std',
  'close_w5_ma_low_3std',
  'close_w5_ma_up_3std',
  'close_w5_min',
  'close_w5_max',
  'close_w5_expma',
  'volume_w5_ma',
  'volume_w5_std',
  'volume_w5_ma_low_2std',
  'volume_w5_ma_up_2std',
  'volume_w5_ma_low_3std',
  'volume_w5_ma_up_3std',
  'volume_w5_min',
  'volume_w5_max',
  'volume_w5_expma',
  'btc_close_w5_ma',
  'btc_close_w5_std',
  'btc_close_w5_ma_low_2std',
  'btc_close_w5_ma_up_2std',
  'btc_close_w5_ma_low_3std',
  'btc_close_w5_ma_up_3std',
  'btc_close_w5_min',
  'btc_close_w5_max',
  'btc_close_w5_expma',
  'close_w10_ma',
  'close_w10_std',
  'close_w10_ma_low_2std',
  'close_w10_ma_up_2std',
  'close_w10_ma_low_3std',
  'close_w10_ma_up_3std',
  'close_w10_min',
  'close_w10_max',
  'close_w10_expma',
  'volume_w10_ma',
  'volume_w10_std',
  'volume_w10_ma_low_2std',
  'volume_w10_ma_up_2std',
  'volume_w10_ma_low_3std',
  'volume_w10_ma_up_3std',
  'volume_w10_min',

In [211]:
# Hourly columns selected by flag_delete (only listed here, not dropped).
cols_del_1hour = [elem for elem in df_1hour_fe.columns if flag_delete(elem)]
len(cols_del_1hour), cols_del_1hour

(126,
 ['close_1hour_w5_ma',
  'close_1hour_w5_std',
  'close_1hour_w5_ma_low_2std',
  'close_1hour_w5_ma_up_2std',
  'close_1hour_w5_ma_low_3std',
  'close_1hour_w5_ma_up_3std',
  'close_1hour_w5_min',
  'close_1hour_w5_max',
  'close_1hour_w5_expma',
  'volume_1hour_w5_ma',
  'volume_1hour_w5_std',
  'volume_1hour_w5_ma_low_2std',
  'volume_1hour_w5_ma_up_2std',
  'volume_1hour_w5_ma_low_3std',
  'volume_1hour_w5_ma_up_3std',
  'volume_1hour_w5_min',
  'volume_1hour_w5_max',
  'volume_1hour_w5_expma',
  'btc_close_1hour_w5_ma',
  'btc_close_1hour_w5_std',
  'btc_close_1hour_w5_ma_low_2std',
  'btc_close_1hour_w5_ma_up_2std',
  'btc_close_1hour_w5_ma_low_3std',
  'btc_close_1hour_w5_ma_up_3std',
  'btc_close_1hour_w5_min',
  'btc_close_1hour_w5_max',
  'btc_close_1hour_w5_expma',
  'close_1hour_w10_ma',
  'close_1hour_w10_std',
  'close_1hour_w10_ma_low_2std',
  'close_1hour_w10_ma_up_2std',
  'close_1hour_w10_ma_low_3std',
  'close_1hour_w10_ma_up_3std',
  'close_1hour_w10_min',
  'c

In [213]:
# Daily columns selected by flag_delete (only listed here, not dropped).
cols_del_1day = [elem for elem in df_1day_fe.columns if flag_delete(elem)]
len(cols_del_1day), cols_del_1day

(126,
 ['close_1day_w5_ma',
  'close_1day_w5_std',
  'close_1day_w5_ma_low_2std',
  'close_1day_w5_ma_up_2std',
  'close_1day_w5_ma_low_3std',
  'close_1day_w5_ma_up_3std',
  'close_1day_w5_min',
  'close_1day_w5_max',
  'close_1day_w5_expma',
  'volume_1day_w5_ma',
  'volume_1day_w5_std',
  'volume_1day_w5_ma_low_2std',
  'volume_1day_w5_ma_up_2std',
  'volume_1day_w5_ma_low_3std',
  'volume_1day_w5_ma_up_3std',
  'volume_1day_w5_min',
  'volume_1day_w5_max',
  'volume_1day_w5_expma',
  'btc_close_1day_w5_ma',
  'btc_close_1day_w5_std',
  'btc_close_1day_w5_ma_low_2std',
  'btc_close_1day_w5_ma_up_2std',
  'btc_close_1day_w5_ma_low_3std',
  'btc_close_1day_w5_ma_up_3std',
  'btc_close_1day_w5_min',
  'btc_close_1day_w5_max',
  'btc_close_1day_w5_expma',
  'close_1day_w10_ma',
  'close_1day_w10_std',
  'close_1day_w10_ma_low_2std',
  'close_1day_w10_ma_up_2std',
  'close_1day_w10_ma_low_3std',
  'close_1day_w10_ma_up_3std',
  'close_1day_w10_min',
  'close_1day_w10_max',
  'close_1day_

## Concat

In [215]:
df_fe.columns.tolist()

['time',
 'close',
 'volume',
 'ticker',
 'btc_close',
 'index_1hour',
 'index_1day',
 'result',
 'delta_time',
 'income_rate',
 'res_price',
 'res_ind',
 'close_w1_roc',
 'volume_w1_roc',
 'btc_close_w1_roc',
 'close_w5_ma',
 'close_w5_std',
 'close_w5_norm_std',
 'close_w5_ma_low_2std',
 'close_w5_ma_up_2std',
 'close_w5_ma_low_3std',
 'close_w5_ma_up_3std',
 'close_w5_mean_abs_pct',
 'close_w5_alpha',
 'close_w5_min',
 'close_w5_max',
 'close_w5_rsi',
 'close_w5_roc',
 'close_w5_expma',
 'volume_w5_ma',
 'volume_w5_std',
 'volume_w5_norm_std',
 'volume_w5_ma_low_2std',
 'volume_w5_ma_up_2std',
 'volume_w5_ma_low_3std',
 'volume_w5_ma_up_3std',
 'volume_w5_mean_abs_pct',
 'volume_w5_alpha',
 'volume_w5_min',
 'volume_w5_max',
 'volume_w5_rsi',
 'volume_w5_roc',
 'volume_w5_expma',
 'btc_close_w5_ma',
 'btc_close_w5_std',
 'btc_close_w5_norm_std',
 'btc_close_w5_ma_low_2std',
 'btc_close_w5_ma_up_2std',
 'btc_close_w5_ma_low_3std',
 'btc_close_w5_ma_up_3std',
 'btc_close_w5_mean_abs_p

In [217]:
df_1hour_fe.columns.tolist()

['index_1hour',
 'time_1hour',
 'close_1hour',
 'volume_1hour',
 'ticker',
 'btc_close_1hour',
 'close_1hour_w1_roc',
 'volume_1hour_w1_roc',
 'btc_close_1hour_w1_roc',
 'close_1hour_w5_ma',
 'close_1hour_w5_std',
 'close_1hour_w5_norm_std',
 'close_1hour_w5_ma_low_2std',
 'close_1hour_w5_ma_up_2std',
 'close_1hour_w5_ma_low_3std',
 'close_1hour_w5_ma_up_3std',
 'close_1hour_w5_mean_abs_pct',
 'close_1hour_w5_alpha',
 'close_1hour_w5_min',
 'close_1hour_w5_max',
 'close_1hour_w5_rsi',
 'close_1hour_w5_roc',
 'close_1hour_w5_expma',
 'volume_1hour_w5_ma',
 'volume_1hour_w5_std',
 'volume_1hour_w5_norm_std',
 'volume_1hour_w5_ma_low_2std',
 'volume_1hour_w5_ma_up_2std',
 'volume_1hour_w5_ma_low_3std',
 'volume_1hour_w5_ma_up_3std',
 'volume_1hour_w5_mean_abs_pct',
 'volume_1hour_w5_alpha',
 'volume_1hour_w5_min',
 'volume_1hour_w5_max',
 'volume_1hour_w5_rsi',
 'volume_1hour_w5_roc',
 'volume_1hour_w5_expma',
 'btc_close_1hour_w5_ma',
 'btc_close_1hour_w5_std',
 'btc_close_1hour_w5_norm_

In [219]:
df_1day_fe.columns.tolist()

['index_1day',
 'time_1day',
 'close_1day',
 'volume_1day',
 'ticker',
 'btc_close_1day',
 'close_1day_w1_roc',
 'volume_1day_w1_roc',
 'btc_close_1day_w1_roc',
 'close_1day_w5_ma',
 'close_1day_w5_std',
 'close_1day_w5_norm_std',
 'close_1day_w5_ma_low_2std',
 'close_1day_w5_ma_up_2std',
 'close_1day_w5_ma_low_3std',
 'close_1day_w5_ma_up_3std',
 'close_1day_w5_mean_abs_pct',
 'close_1day_w5_alpha',
 'close_1day_w5_min',
 'close_1day_w5_max',
 'close_1day_w5_rsi',
 'close_1day_w5_roc',
 'close_1day_w5_expma',
 'volume_1day_w5_ma',
 'volume_1day_w5_std',
 'volume_1day_w5_norm_std',
 'volume_1day_w5_ma_low_2std',
 'volume_1day_w5_ma_up_2std',
 'volume_1day_w5_ma_low_3std',
 'volume_1day_w5_ma_up_3std',
 'volume_1day_w5_mean_abs_pct',
 'volume_1day_w5_alpha',
 'volume_1day_w5_min',
 'volume_1day_w5_max',
 'volume_1day_w5_rsi',
 'volume_1day_w5_roc',
 'volume_1day_w5_expma',
 'btc_close_1day_w5_ma',
 'btc_close_1day_w5_std',
 'btc_close_1day_w5_norm_std',
 'btc_close_1day_w5_ma_low_2std',

In [222]:
df_fe.shape, df_1hour_fe.shape, df_1day_fe.shape

((2323584, 262), (447227, 253), (18636, 253))

In [224]:
# Attach the hourly features to every 5-minute row via its hourly bar index.
# validate='many_to_one' makes pandas raise if (index_1hour, ticker) is not
# unique in the hourly table, instead of silently multiplying rows.
df = df_fe.merge(df_1hour_fe, on=['index_1hour', 'ticker'], how='left', validate='many_to_one')
df.shape

(2323584, 513)

In [225]:
# Attach the daily features to every row via its daily bar index.
# validate='many_to_one' makes pandas raise if (index_1day, ticker) is not
# unique in the daily table, instead of silently multiplying rows.
# NOTE: not idempotent - re-running merges the daily columns a second time.
df = df.merge(df_1day_fe, on=['index_1day', 'ticker'], how='left', validate='many_to_one')
df.shape

(2323584, 764)

In [228]:
df

Unnamed: 0,time,close,volume,ticker,btc_close,index_1hour,index_1day,result,delta_time,income_rate,...,btc_close_1day_w120_lvl_1.01-1.02,btc_close_1day_w120_lvl_-0.98-0.99,btc_close_1day_w120_lvl_1.02-1.03,btc_close_1day_w120_lvl_-0.97-0.98,btc_close_1day_w120_lvl_1.03-1.04,btc_close_1day_w120_lvl_-0.96-0.97,btc_close_1day_w120_lvl_1.04-1.05,btc_close_1day_w120_lvl_-0.95-0.96,btc_close_1day_w120_lvl_1.05-1.07,btc_close_1day_w120_lvl_-0.9299999999999999-0.95
0,2022-11-03 00:00:00,22.830000,5324.912598,ETC-USDT,20161.699219,2999.0,123.0,DNF,0 days 04:00:00,1.018193,...,6.0,9.0,0.0,6.0,3.0,5.0,4.0,7.0,8.0,26.0
1,2022-11-03 00:05:00,22.825001,6226.672852,ETC-USDT,20139.900391,2999.0,123.0,DNF,0 days 04:00:00,1.017146,...,6.0,9.0,0.0,6.0,3.0,5.0,4.0,7.0,8.0,26.0
2,2022-11-03 00:10:00,22.802999,1182.641846,ETC-USDT,20150.099609,2999.0,123.0,DNF,0 days 04:00:00,1.019006,...,6.0,9.0,0.0,6.0,3.0,5.0,4.0,7.0,8.0,26.0
3,2022-11-03 00:15:00,22.784000,1238.016602,ETC-USDT,20163.699219,2999.0,123.0,DNF,0 days 04:00:00,1.020472,...,6.0,9.0,0.0,6.0,3.0,5.0,4.0,7.0,8.0,26.0
4,2022-11-03 00:25:00,22.872999,6041.620605,ETC-USDT,20101.599609,2999.0,123.0,DNF,0 days 04:00:00,1.016362,...,6.0,9.0,0.0,6.0,3.0,5.0,4.0,7.0,8.0,26.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2323579,2025-02-23 23:15:00,8.982000,217.999313,UNI-USDT,95762.796875,447225.0,18634.0,DNF,0 days 00:40:00,1.001117,...,12.0,5.0,4.0,7.0,5.0,3.0,7.0,3.0,7.0,5.0
2323580,2025-02-23 23:25:00,8.971000,130.078323,UNI-USDT,95734.000000,447225.0,18634.0,DNF,0 days 00:30:00,1.002347,...,12.0,5.0,4.0,7.0,5.0,3.0,7.0,3.0,7.0,5.0
2323581,2025-02-23 23:30:00,8.959000,429.662842,UNI-USDT,95695.500000,447225.0,18634.0,DNF,0 days 00:25:00,1.003693,...,12.0,5.0,4.0,7.0,5.0,3.0,7.0,3.0,7.0,5.0
2323582,2025-02-23 23:50:00,9.001000,219.996490,UNI-USDT,95796.000000,447225.0,18634.0,DNF,0 days 00:05:00,0.999000,...,12.0,5.0,4.0,7.0,5.0,3.0,7.0,3.0,7.0,5.0


In [231]:
df.loc[df['index_1hour'].isnull(), 'time'].dt.date.unique()

array([], dtype=object)

In [233]:
df.loc[df['index_1day'].isnull(), 'time'].dt.date.unique()

array([], dtype=object)

In [236]:
# The merged table must be free of missing values; stop at the first column
# that contains any and name it in the error message.
for col in tqdm(df.columns):
    assert df[col].isnull().sum() == 0, f'Nulls {col}'
    # if df[col].isnull().sum() != 0:
    #     print(col, df[col].isnull().sum())

100%|██████████████████████████████████████████████████████████████████████| 764/764 [00:01<00:00, 479.44it/s]


In [None]:
#df.loc[df['volume_w1_roc'].isnull()]

### Relative features

In [238]:
def make_relative_groups(timeframe=''):
    """Build the column groups that `calc_relative_features` turns into ratio features.

    Parameters
    ----------
    timeframe : str
        Infix that follows the series name in indicator columns: '' for the
        5-minute indicators, '_1hour' or '_1day' for the coarser ones
        (e.g. 'close_w5_ma' vs. 'close_1hour_w5_ma').

    Returns
    -------
    list
        58 groups in a fixed order. A *list* group holds columns that are
        compared pairwise with each other; a *dict* group
        ``{raw_column: [indicator columns]}`` holds columns that are each
        compared against one raw column. The raw columns 'close', 'volume'
        and 'btc_close' never carry the timeframe infix.
    """
    series = ('close', 'volume', 'btc_close')
    all_windows = (5, 10, 20, 30, 60, 120)
    # Only windows 5 and 10 are used for volume; price series use all six.
    windows = {'close': all_windows, 'volume': (5, 10), 'btc_close': all_windows}
    band_stats = ('ma_low_2std', 'ma_up_2std', 'ma_low_3std', 'ma_up_3std')

    def col(base, window, stat):
        return f'{base}{timeframe}_w{window}_{stat}'

    groups = []

    # alpha across windows, anchored on the one-step rate of change
    for base in series:
        groups.append([col(base, 1, 'roc')] + [col(base, w, 'alpha') for w in windows[base]])

    # rate of change across windows (window 1 included)
    for base in series:
        groups.append([col(base, w, 'roc') for w in (1, *windows[base])])

    # statistics compared only among their own windows
    for stat in ('mean_abs_pct', 'std', 'norm_std', 'rsi'):
        for base in series:
            groups.append([col(base, w, stat) for w in windows[base]])

    # moving averages compared among windows and with the raw series
    for stat in ('ma', 'expma'):
        for base in series:
            groups.append([base] + [col(base, w, stat) for w in windows[base]])

    # rolling min / max, each compared with the raw series only
    for stat in ('min', 'max'):
        for base in series:
            groups.append({base: [col(base, w, stat) for w in windows[base]]})

    # `*_ma_low/up_2std/3std` columns of each window, compared with the raw series
    for w in all_windows:
        for base in series:
            if w in windows[base]:
                groups.append({base: [col(base, w, stat) for stat in band_stats]})

    # rolling min vs. rolling max inside the same window
    for w in all_windows:
        for base in series:
            if w in windows[base]:
                groups.append([col(base, w, 'min'), col(base, w, 'max')])

    return groups


groups_5min = make_relative_groups()
groups_1hour = make_relative_groups('_1hour')
groups_1day = make_relative_groups('_1day')

In [245]:
def uniq_pairs(cols):
    """Return every pair `(cols[i], cols[j])` with `i < j`, ordered by `i` then `j`."""
    n_cols = len(cols)
    return [
        (cols[first], cols[second])
        for first in range(n_cols - 1)
        for second in range(first + 1, n_cols)
    ]

def calc_relative_features(df, groups):
    """Add ratio ("relative") feature columns to `df` in place and return `df`.

    Parameters
    ----------
    df : DataFrame
        Frame holding every column named in `groups`; it is extended in place.
    groups : iterable
        Each entry is either

        * a list (or tuple) of column names: one ratio column is added for
          every pair from `uniq_pairs`, earlier name divided by later name;
        * a dict ``{denominator: [numerators]}``: one ratio column is added
          for every numerator, for every key of the dict.

    Returns
    -------
    DataFrame
        The same `df` object, with the new columns named
        ``'<numerator>/<denominator>'``.

    Raises
    ------
    TypeError
        If a group is neither a list/tuple nor a dict.
    """
    # float32 machine epsilon, added to every denominator so that an exact zero
    # does not divide by zero.
    # NOTE(review): it is added regardless of sign, so a denominator equal to
    # -eps still yields a division by zero; confirm this is acceptable.
    eps = np.finfo(np.float32).eps

    for group in tqdm(groups):
        if isinstance(group, dict):
            ratio_pairs = [
                (numerator, denominator)
                for denominator, numerators in group.items()
                for numerator in numerators
            ]
        elif isinstance(group, (list, tuple)):
            ratio_pairs = uniq_pairs(group)
        else:
            raise TypeError(
                f'group must be a list or a dict, got {type(group).__name__}'
            )

        # Columns are assigned one at a time so that the caller's frame is mutated.
        for numerator, denominator in ratio_pairs:
            df[f'{numerator}/{denominator}'] = df[numerator] / (df[denominator] + eps)

    return df

In [247]:
import warnings
warnings.filterwarnings('ignore')

In [250]:
# Shape before and after adding the 5-minute ratio features.
print(df.shape)
df = calc_relative_features(df, groups_5min)  # returns the same frame it extended in place
df.shape

(2323584, 764)


100%|█████████████████████████████████████████████████████████████████████████| 58/58 [00:03<00:00, 17.70it/s]


(2323584, 1166)

In [253]:
# Shape before and after adding the 1-hour ratio features.
print(df.shape)
df = calc_relative_features(df, groups_1hour)  # returns the same frame it extended in place
df.shape

(2323584, 1166)


100%|█████████████████████████████████████████████████████████████████████████| 58/58 [00:03<00:00, 17.09it/s]


(2323584, 1568)

In [256]:
# Shape before and after adding the 1-day ratio features.
print(df.shape)
df = calc_relative_features(df, groups_1day)  # returns the same frame it extended in place
df.shape

(2323584, 1568)


100%|█████████████████████████████████████████████████████████████████████████| 58/58 [00:02<00:00, 20.10it/s]


(2323584, 1970)

In [260]:
# Guard: the new ratio columns must not have introduced missing values.
for col in tqdm(df.columns):
    n_missing = df[col].isnull().sum()
    assert n_missing == 0, f'Nulls {col}'

100%|█████████████████████████████████████████████████████████████████████| 1970/1970 [01:17<00:00, 25.48it/s]


### Delete absolute value columns

In [264]:
# How many columns are scheduled for deletion, per timeframe.
tuple(len(cols) for cols in (cols_del_5min, cols_del_1hour, cols_del_1day))

(126, 126, 126)

In [266]:
# Shape before and after removing the scheduled columns.
print(df.shape)
cols_to_delete = cols_del_5min + cols_del_1hour + cols_del_1day
# Deleted one by one, directly on `df`, rather than building a reduced copy.
for col in cols_to_delete:
    del df[col]
df.shape

(2323584, 1970)


(2323584, 1592)

## 2.6 Save data

In [273]:
# Create the output directory (and any missing parents); re-running is a no-op.
os.makedirs('data/feat_engin/lgbm', exist_ok=True)

mkdir: data/feat_engin/lgbm: File exists


In [275]:
# Persist the feature frame to disk.
dump_pkl(data=df, filename='data/feat_engin/lgbm/data_5min_1hour_1day.pkl')

In [276]:
df.shape

(2323584, 1592)

In [277]:
df.head()

Unnamed: 0,time,close,volume,ticker,btc_close,index_1hour,index_1day,result,delta_time,income_rate,...,volume_1day_w10_min/volume_1day_w10_max,btc_close_1day_w10_min/btc_close_1day_w10_max,close_1day_w20_min/close_1day_w20_max,btc_close_1day_w20_min/btc_close_1day_w20_max,close_1day_w30_min/close_1day_w30_max,btc_close_1day_w30_min/btc_close_1day_w30_max,close_1day_w60_min/close_1day_w60_max,btc_close_1day_w60_min/btc_close_1day_w60_max,close_1day_w120_min/close_1day_w120_max,btc_close_1day_w120_min/btc_close_1day_w120_max
0,2022-11-03 00:00:00,22.83,5324.912598,ETC-USDT,20161.699219,2999.0,123.0,DNF,0 days 04:00:00,1.018193,...,0.248473,0.929076,0.837558,0.915216,0.776918,0.915216,0.54627,0.824325,0.31748,0.755364
1,2022-11-03 00:05:00,22.825001,6226.672852,ETC-USDT,20139.900391,2999.0,123.0,DNF,0 days 04:00:00,1.017146,...,0.248473,0.929076,0.837558,0.915216,0.776918,0.915216,0.54627,0.824325,0.31748,0.755364
2,2022-11-03 00:10:00,22.802999,1182.641846,ETC-USDT,20150.099609,2999.0,123.0,DNF,0 days 04:00:00,1.019006,...,0.248473,0.929076,0.837558,0.915216,0.776918,0.915216,0.54627,0.824325,0.31748,0.755364
3,2022-11-03 00:15:00,22.784,1238.016602,ETC-USDT,20163.699219,2999.0,123.0,DNF,0 days 04:00:00,1.020472,...,0.248473,0.929076,0.837558,0.915216,0.776918,0.915216,0.54627,0.824325,0.31748,0.755364
4,2022-11-03 00:25:00,22.872999,6041.620605,ETC-USDT,20101.599609,2999.0,123.0,DNF,0 days 04:00:00,1.016362,...,0.248473,0.929076,0.837558,0.915216,0.776918,0.915216,0.54627,0.824325,0.31748,0.755364


In [None]:
#df.columns.tolist()

In [278]:
# Columns listed as non-features, grouped by where they come from.
no_features = [
    # raw 5-minute series
    'time', 'close', 'volume', 'btc_close',
    # instrument identifier
    'ticker',
    # join indices and outcome columns
    'index_1hour', 'index_1day',
    'result', 'delta_time', 'income_rate', 'res_price', 'res_ind',
    # raw 1-hour series
    'time_1hour', 'close_1hour', 'volume_1hour', 'btc_close_1hour',
    # raw 1-day series
    'time_1day', 'close_1day', 'volume_1day', 'btc_close_1day',
]


### 2.7 Save data to NN

In [128]:
#don't use NN

In [131]:
# df_fe.head()

In [133]:
# df_1day_fe.head()

In [135]:
# df_fe.shape, df_1day_fe.shape

In [137]:
# dump_pkl(df_fe, 'data/feat_engin/lgbm/data_1hour.pkl')
# dump_pkl(df_1day_fe, 'data/feat_engin/lgbm/data_1day.pkl')

# TMP

In [328]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import random
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import os

from sklearn.metrics import roc_auc_score

import pickle
def dump_pkl(data, filename):
  """Pickle `data` into the file `filename` using the highest available protocol."""
  with open(filename, 'wb') as sink:
    # Stream straight into the file; the payload is never held as one bytes object.
    pickle.Pickler(sink, protocol=pickle.HIGHEST_PROTOCOL).dump(data)

def load_pkl(filename):
  """Return the object unpickled from the file `filename`.

  Unpickling can execute arbitrary code, so only open files you produced yourself.
  """
  with open(filename, 'rb') as source:
    return pickle.Unpickler(source).load()

In [330]:
# Reload the frame written by the save cell above.
df = load_pkl(filename='data/feat_engin/lgbm/data_5min_1hour_1day.pkl')

In [None]:
df

#### uniq_1

In [279]:
# Columns left out of the per-column statistics below.
# Every other column, including the raw price/volume and index columns, is analysed.
no_analyze = [
    'time', 'ticker', 'result', 'delta_time',
    'time_1hour',
    'time_1day',
]

In [281]:
# Mean of every analysed column, laid out like `df[...].mean().reset_index()`
# (columns 'index' and 0) but computed one column at a time.
analyzed_cols = df.columns[~df.columns.isin(no_analyze)]

col_means = []
for col in tqdm(analyzed_cols):
    col_means.append(df[col].mean())

df_tmp = pd.DataFrame({'index': analyzed_cols, 0: col_means})

df_tmp

100%|█████████████████████████████████████████████████████████████████████| 1586/1586 [01:14<00:00, 21.33it/s]


Unnamed: 0,index,0
0,close,37.984493
1,volume,152452.921875
2,btc_close,49723.546875
3,index_1hour,224976.713657
4,index_1day,9373.087177
...,...,...
1581,btc_close_1day_w30_min/btc_close_1day_w30_max,0.836011
1582,close_1day_w60_min/close_1day_w60_max,0.643641
1583,btc_close_1day_w60_min/btc_close_1day_w60_max,0.757959
1584,close_1day_w120_min/close_1day_w120_max,0.528726


In [282]:
df_tmp[df_tmp[0].isnull()]

Unnamed: 0,index,0


In [283]:
# Column names that share exactly the same mean; empty when every mean is distinct.
names_by_mean = df_tmp.groupby(0)['index']
names_by_mean.unique()[names_by_mean.nunique() > 1]

Series([], Name: index, dtype: object)

In [284]:
# Same lookup as the previous cell, returned as a plain array of name lists.
names_by_mean = df_tmp.groupby(0)['index']
names_by_mean.unique()[names_by_mean.nunique() > 1].values

array([], dtype=object)

In [None]:
#df[['volume/volume_1hour_w5_ma', 'volume/volume_1hour_w5_expma']]

#### uniq_2

In [285]:
# Distinct-value count of every analysed column, laid out like
# `df[...].nunique().reset_index()` (columns 'index' and 0) but computed one
# column at a time.
analyzed_cols = df.columns[~df.columns.isin(no_analyze)]

col_nunique = []
for col in tqdm(analyzed_cols):
    col_nunique.append(df[col].nunique())

df_tmp = pd.DataFrame({'index': analyzed_cols, 0: col_nunique})

df_tmp

100%|█████████████████████████████████████████████████████████████████████| 1586/1586 [01:53<00:00, 13.92it/s]


Unnamed: 0,index,0
0,close,167702
1,volume,2278768
2,btc_close,169862
3,index_1hour,387194
4,index_1day,16156
...,...,...
1581,btc_close_1day_w30_min/btc_close_1day_w30_max,305
1582,close_1day_w60_min/close_1day_w60_max,4137
1583,btc_close_1day_w60_min/btc_close_1day_w60_max,240
1584,close_1day_w120_min/close_1day_w120_max,2871


In [286]:
df_tmp[df_tmp[0].isnull()]

Unnamed: 0,index,0


In [287]:
pd.set_option('display.max_rows', 120)
# Groups of column names that have the same number of distinct values.
names_by_nunique = df_tmp.groupby(0)['index']
names_by_nunique.unique()[names_by_nunique.nunique() > 1].reset_index()

Unnamed: 0,0,index
0,11,"[btc_close_1day_w30_lvl_1.02-1.03, btc_close_1..."
1,12,"[close_1day_w30_lvl_1.03-1.04, close_1day_w30_..."
2,13,"[close_1day_w30_lvl_1.04-1.05, close_1day_w30_..."
3,14,"[close_1day_w30_lvl_-0.99-1, close_1day_w30_lv..."
4,15,"[close_1day_w30_lvl_-0.98-0.99, btc_close_1day..."
...,...,...
126,2321645,"[volume_w5_ma_low_2std/volume, volume_w5_ma_up..."
127,2321994,"[volume_w10_ma_low_2std/volume, volume_w10_ma_..."
128,2322443,"[close_w20_alpha/close_w60_alpha, close_w20_al..."
129,2322598,"[close_w30_alpha/close_w60_alpha, close_w30_al..."


In [289]:
# Inspect one group of columns that share a distinct-value count:
# the shared count first, then the column names in that group.
i = 43
names_by_nunique = df_tmp.groupby(0)['index']
shared_nunique = names_by_nunique.unique()[names_by_nunique.nunique() > 1]
shared_nunique.index[i], shared_nunique.iloc[i]

(16156,
 array(['index_1day', 'volume_1day_w1_roc', 'volume_1day_w5_alpha',
        'volume_1day_w10_alpha', 'close_1day_w60_alpha',
        'close_1day_w120_alpha',
        'close_1day_w5_alpha/close_1day_w10_alpha',
        'close_1day_w5_alpha/close_1day_w20_alpha',
        'close_1day_w5_alpha/close_1day_w30_alpha',
        'close_1day_w5_alpha/close_1day_w60_alpha',
        'close_1day_w5_alpha/close_1day_w120_alpha',
        'close_1day_w10_alpha/close_1day_w20_alpha',
        'close_1day_w10_alpha/close_1day_w30_alpha',
        'close_1day_w10_alpha/close_1day_w60_alpha',
        'close_1day_w10_alpha/close_1day_w120_alpha',
        'close_1day_w20_alpha/close_1day_w30_alpha',
        'close_1day_w20_alpha/close_1day_w60_alpha',
        'close_1day_w20_alpha/close_1day_w120_alpha',
        'close_1day_w30_alpha/close_1day_w60_alpha',
        'close_1day_w30_alpha/close_1day_w120_alpha',
        'close_1day_w60_alpha/close_1day_w120_alpha',
        'volume_1day_w1_roc/volume_1day

In [292]:
np.sort(df['close_1day_w120_mean_abs_pct'].unique())

array([0.01082697, 0.01085083, 0.01090025, ..., 0.06862602, 0.06863569,
       0.06913449], dtype=float32)