In [None]:
#!pip3 install torch torchvision torchaudio

In [None]:
#!python3 -m pip install tensorflow

In [3]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import random
import matplotlib.pyplot as plt
import seaborn as sns
import datetime


from sklearn.metrics import roc_auc_score

import pickle
def dump_pkl(data, filename):
  with open(filename, 'wb') as handle:
    pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)

def load_pkl(filename):
  with open(filename, 'rb') as handle:
    data = pickle.load(handle)
  return data

### 1. Load data

In [4]:
dfs_1min = []
dfs_5min = []
dfs_1hour = []
dfs_1day = []

stocks = [
          'GAZP',
          'SBER',
          'LKOH',
          'MGNT',
          'NVTK',
          'SNGS',
          'GMKN',
          'ROSN',
          'NLMK',
          'TATN',
          'MTSS',
          'ALRS',  
          'YDEX',
          'CHMF',
          
          'MAGN',
          'TCSG',
          'OZON',
          'RUAL'  
         ]
for stock in tqdm(stocks):
    df_1min = load_pkl(f"./data/preproc/1min/{stock}.pkl")
    df_5min = load_pkl(f"./data/preproc/5min/{stock}.pkl")
    df_1hour = load_pkl(f"./data/preproc/1hour/{stock}.pkl")
    df_1day = load_pkl(f"./data/preproc/1day/{stock}.pkl")
    
    dfs_1min += [df_1min.copy()]
    dfs_5min += [df_5min.copy()]
    dfs_1hour += [df_1hour.copy()]
    dfs_1day += [df_1day.copy()]
    



df_1min = pd.concat(dfs_1min)
df_5min = pd.concat(dfs_5min)
df_1day = pd.concat(dfs_1day)
df_1hour = pd.concat(dfs_1hour)

df_1min.reset_index(drop=True, inplace=True)
df_5min.reset_index(drop=True, inplace=True)
df_1hour.reset_index(drop=True, inplace=True)
df_1day.reset_index(drop=True, inplace=True)

df_1min.shape, df_5min.shape, df_1hour.shape, df_1day.shape

100%|█████████████████████████████████████████████████████████████████████████| 18/18 [00:00<00:00, 74.52it/s]


((7847346, 7), (1607835, 7), (138247, 7), (9724, 7))

In [5]:
df_1min.head()

Unnamed: 0,time,open,close,volume,low,high,ticker
0,2022-10-03 10:00:00,219.0,219.93,87793.0,218.0,220.25,GAZP
1,2022-10-03 10:01:00,219.8,220.8,61549.0,219.7,220.55,GAZP
2,2022-10-03 10:02:00,220.24,219.34,52640.0,219.2,220.38,GAZP
3,2022-10-03 10:03:00,219.17,218.96,61506.0,217.7,219.4,GAZP
4,2022-10-03 10:04:00,218.82,218.3,51464.0,217.76,218.96,GAZP


### 2. Preproc data

#### 2.1 Make target

In [6]:
# Используем уже знакомую функцию :)

def get_target(df_all, ind, val_first, val_second, days_to_wait):
    #цель - достичь val_second, не достигнув val_first

    points_in_day = 840 # торговая сессия
    points_to_wait = days_to_wait * points_in_day
    ind_end = min(ind+points_to_wait+1, df_all.shape[0])
    
    df = df_all.iloc[ind:ind_end].copy()
    
    mask_stock = np.array(df['ticker'] == df['ticker'].iloc[0])
    df = df.loc[mask_stock, :]
    
    if val_first < val_second:  
        mask_val_first = np.array(df['close'] < val_first)
        mask_val_second = np.array(df['close'] > val_second)
    else:
        mask_val_first = np.array(df['close'] > val_first)
        mask_val_second = np.array(df['close'] < val_second)

    
    if (mask_val_first.sum() == 0) & (mask_val_second.sum() == 0):
        return 'DNF', -1
        
    if (mask_val_first.sum() == 0) & (mask_val_second.sum() != 0):
        ind_val_second = np.argwhere(mask_val_second).ravel()[0]
        delta_time = df['time'].iloc[ind_val_second] - df['time'].iloc[0]
        return 'WIN', delta_time
        
    if (mask_val_first.sum() != 0) & (mask_val_second.sum() == 0):
        ind_val_first = np.argwhere(mask_val_first).ravel()[0]
        delta_time = df['time'].iloc[ind_val_first] - df['time'].iloc[0]
        return 'LOSE', delta_time

    if (mask_val_first.sum() != 0) & (mask_val_second.sum() != 0):
        ind_val_first = np.argwhere(mask_val_first).ravel()[0]
        ind_val_second = np.argwhere(mask_val_second).ravel()[0]
        if ind_val_first < ind_val_second:
            delta_time = df['time'].iloc[ind_val_first] - df['time'].iloc[0]
            return 'LOSE', delta_time
        if ind_val_first > ind_val_second:
            delta_time = df['time'].iloc[ind_val_second] - df['time'].iloc[0]
            return 'WIN', delta_time


def get_df_target(df, indx, percent_first=None, percent_second=None, days_to_wait=None):
    times = []
    results = []
    delta_times = []
    closes = []
    tickers = []
    
    for ind in tqdm(indx):
        time = df['time'].iloc[ind]
        close = df['close'].iloc[ind]
        ticker = df['ticker'].iloc[ind]
        
        val_first = df['close'].iloc[ind] * percent_first
        val_second = df['close'].iloc[ind] * percent_second
        result, delta_time = get_target(df, ind, val_first, val_second, days_to_wait)
        
        times += [time]
        closes += [close]
        tickers += [ticker]
        results += [result]
        delta_times += [delta_time]
        
        

    df_result = pd.DataFrame({'ind' : indx,
                              'time' : times,
                              'close' : closes,
                              'result' : results,
                              'ticker' : tickers, 
                              'delta_time' : delta_times
                             })
    return df_result    

In [7]:
inds = np.arange(df_1min.shape[0])
inds.shape

(7847346,)

In [8]:
df_result = get_df_target(df_1min, inds, percent_first=0.995, percent_second=1.015, days_to_wait=1)

100%|█████████████████████████████████████████████████████████████| 7847346/7847346 [31:43<00:00, 4122.24it/s]


In [9]:
df_result

Unnamed: 0,ind,time,close,result,ticker,delta_time
0,0,2022-10-03 10:00:00,219.930,LOSE,GAZP,0 days 00:04:00
1,1,2022-10-03 10:01:00,220.800,LOSE,GAZP,0 days 00:01:00
2,2,2022-10-03 10:02:00,219.340,LOSE,GAZP,0 days 00:03:00
3,3,2022-10-03 10:03:00,218.960,LOSE,GAZP,0 days 00:02:00
4,4,2022-10-03 10:04:00,218.300,LOSE,GAZP,0 days 00:07:00
...,...,...,...,...,...,...
7847341,7847341,2024-11-19 23:45:00,38.455,DNF,RUAL,-1
7847342,7847342,2024-11-19 23:46:00,38.510,DNF,RUAL,-1
7847343,7847343,2024-11-19 23:47:00,38.520,DNF,RUAL,-1
7847344,7847344,2024-11-19 23:48:00,38.520,DNF,RUAL,-1


In [10]:
df_result['result'].value_counts(normalize=True)

result
LOSE    0.614319
WIN     0.246852
DNF     0.138829
Name: proportion, dtype: float64

In [11]:
(df_result['ind'] == df_result.index).all(), 

(True,)

In [12]:
!mkdir experiments

mkdir: experiments: File exists


In [13]:
!mkdir experiments/model_one_fc

mkdir: experiments/model_one_fc: File exists


In [14]:
dump_pkl(df_result, './experiments/model_one_fc/df_result_wait_1day.pkl')

#### 2.1.2 Загрузим датасет с ожиданием в 1 день и сменим таргет на вызоевание дадим 4 часа (скальпинг)

In [4]:
df_result = load_pkl('./experiments/model_one_fc/df_result_wait_1day.pkl')

In [6]:
df_result

Unnamed: 0,ind,time,close,result,ticker,delta_time
0,0,2022-10-03 10:00:00,219.930,LOSE,GAZP,0 days 00:04:00
1,1,2022-10-03 10:01:00,220.800,LOSE,GAZP,0 days 00:01:00
2,2,2022-10-03 10:02:00,219.340,LOSE,GAZP,0 days 00:03:00
3,3,2022-10-03 10:03:00,218.960,LOSE,GAZP,0 days 00:02:00
4,4,2022-10-03 10:04:00,218.300,LOSE,GAZP,0 days 00:07:00
...,...,...,...,...,...,...
7847341,7847341,2024-11-19 23:45:00,38.455,DNF,RUAL,-1
7847342,7847342,2024-11-19 23:46:00,38.510,DNF,RUAL,-1
7847343,7847343,2024-11-19 23:47:00,38.520,DNF,RUAL,-1
7847344,7847344,2024-11-19 23:48:00,38.520,DNF,RUAL,-1


In [10]:
df_result['delta_time'] = df_result['delta_time'].replace(-1, pd.Timedelta('100 days'))
df_result

  df_result['delta_time'] = df_result['delta_time'].replace(-1, pd.Timedelta('100 days'))


Unnamed: 0,ind,time,close,result,ticker,delta_time
0,0,2022-10-03 10:00:00,219.930,LOSE,GAZP,0 days 00:04:00
1,1,2022-10-03 10:01:00,220.800,LOSE,GAZP,0 days 00:01:00
2,2,2022-10-03 10:02:00,219.340,LOSE,GAZP,0 days 00:03:00
3,3,2022-10-03 10:03:00,218.960,LOSE,GAZP,0 days 00:02:00
4,4,2022-10-03 10:04:00,218.300,LOSE,GAZP,0 days 00:07:00
...,...,...,...,...,...,...
7847341,7847341,2024-11-19 23:45:00,38.455,DNF,RUAL,100 days 00:00:00
7847342,7847342,2024-11-19 23:46:00,38.510,DNF,RUAL,100 days 00:00:00
7847343,7847343,2024-11-19 23:47:00,38.520,DNF,RUAL,100 days 00:00:00
7847344,7847344,2024-11-19 23:48:00,38.520,DNF,RUAL,100 days 00:00:00


In [11]:
mask = df_result['delta_time'] > pd.Timedelta('4 hours')
df_result.loc[mask, 'result'] = 'DNF'


In [14]:
df_result['result'].value_counts(normalize=True)

result
DNF     0.610235
LOSE    0.307435
WIN     0.082329
Name: proportion, dtype: float64

#### 2.2 Link data of different time-period

In [15]:
dfs_1min = []
dfs_5min = []
dfs_1hour = []
dfs_1day = []

stocks = [
          'GAZP',
          'SBER',
          'LKOH',
          'MGNT',
          'NVTK',
          'SNGS',
          'GMKN',
          'ROSN',
          'NLMK',
          'TATN',
          'MTSS',
          'ALRS',  
          'YDEX',
          'CHMF',
          
          'MAGN',
          'TCSG',
          'OZON',
          'RUAL'  
         ]
for stock in tqdm(stocks):
    df_1min = load_pkl(f"./data/preproc/1min/{stock}.pkl")
    df_5min = load_pkl(f"./data/preproc/5min/{stock}.pkl")
    df_1hour = load_pkl(f"./data/preproc/1hour/{stock}.pkl")
    df_1day = load_pkl(f"./data/preproc/1day/{stock}.pkl")

    df_1min['index_1min_start'] = None
    df_1min.loc[0, 'index_1min_start'] = 42
    df_5min['index_start'] = None
    df_5min.loc[0, 'index_start'] = 42
    df_1hour['index_start'] = None
    df_1hour.loc[0, 'index_start'] = 42
    df_1day['index_start'] = None
    df_1day.loc[0, 'index_start'] = 42

    dfs_1min += [df_1min.copy()]
    dfs_5min += [df_5min.copy()]
    dfs_1hour += [df_1hour.copy()]
    dfs_1day += [df_1day.copy()]
    



df_1min = pd.concat(dfs_1min)
df_5min = pd.concat(dfs_5min)
df_1day = pd.concat(dfs_1day)
df_1hour = pd.concat(dfs_1hour)

df_1min.reset_index(drop=True, inplace=True)
df_5min.reset_index(drop=True, inplace=True)
df_1hour.reset_index(drop=True, inplace=True)
df_1day.reset_index(drop=True, inplace=True)

df_1min.shape, df_5min.shape, df_1hour.shape, df_1day.shape

100%|█████████████████████████████████████████████████████████████████████████| 18/18 [00:00<00:00, 30.80it/s]


((7847346, 8), (1607835, 8), (138247, 8), (9724, 8))

In [16]:
pd.set_option('future.no_silent_downcasting', True)

#start index in each df-time
#1min
mask = df_1min['index_1min_start'] == 42
df_1min.loc[mask, 'index_1min_start'] = df_1min.index[mask]
df_1min['index_1min_start'] = df_1min['index_1min_start'].ffill()

#5min
mask = df_5min['index_start'] == 42
df_5min.loc[mask, 'index_start'] = df_5min.index[mask]
df_5min['index_start'] = df_5min['index_start'].ffill()

#1hour
mask = df_1hour['index_start'] == 42
df_1hour.loc[mask, 'index_start'] = df_1hour.index[mask]
df_1hour['index_start'] = df_1hour['index_start'].ffill()

#1day
mask = df_1day['index_start'] == 42
df_1day.loc[mask, 'index_start'] = df_1day.index[mask]
df_1day['index_start'] = df_1day['index_start'].ffill()

In [17]:
#link different time-step date

df_5min = df_5min.reset_index()
df_1hour = df_1hour.reset_index()
df_1day = df_1day.reset_index()

df_5min = df_5min.rename(columns={col : col+'_5min' for col in df_5min.columns if col not in ['time', 'ticker']})
df_1hour = df_1hour.rename(columns={col : col+'_1hour' for col in df_1hour.columns if col not in ['time', 'ticker']})
df_1day = df_1day.rename(columns={col : col+'_1day' for col in df_1day.columns if col not in ['time', 'ticker']})

df_1min['time'] += pd.Timedelta('1min')
df_5min['time'] += pd.Timedelta('5min')
df_1hour['time'] += pd.Timedelta('1hour')
#1) биржа работает до 23:50 
mask = datetime.time(0, 0) == pd.to_datetime(df_1hour['time'], format='%H:%M').dt.time
df_1hour.loc[mask, 'time'] -= pd.Timedelta('10min')
#2)и клирингс 18:50 до 19:05
mask = datetime.time(19, 0) == pd.to_datetime(df_1hour['time'], format='%H:%M').dt.time
df_1hour.loc[mask, 'time'] -= pd.Timedelta('10min')
# end
df_1day['time'] += pd.Timedelta('20:50:00')


df = df_1min.merge(df_5min, on=['time', 'ticker'], how='left')
df = df.merge(df_1hour, on=['time', 'ticker'], how='left')
df = df.merge(df_1day, on=['time', 'ticker'], how='left')

#Проверка, что данные все данные подтянулись
print(f'Пересесчение в данных 1min и 5min: {df['close_5min'].notnull().sum() / df_5min.shape[0]}')
print(f'Пересесчение в данных 1min и 1hour: {df['close_1hour'].notnull().sum() / df_1hour.shape[0]}')
print(f'Пересесчение в данных 1min и 1day: {df['close_1day'].notnull().sum() / df_1day.shape[0]}')
#в минутных данных бывают пропуски, это можно проинтерполировать. Но пока просто игнорируем, вроде как несущественный эффект должен быть

#Проверка на равенство значений
count_diff = ((df['close_5min'].notnull()) & (df['close'] != df['close_5min'])).sum()
assert count_diff == 0, f'Error: close & close_5min, count diff values: {count_diff}' 
count_diff = ((df['close_1hour'].notnull()) & (df['close'] != df['close_1hour'])).sum()
assert count_diff < 200, f'Error: close & close_1hour, count diff values: {count_diff}' #тут есть празничные торги они идут без вечернего клиринга и без вечерних торгов
count_diff = ((df['close_1day'].notnull()) & (df['close'] != df['close_1day'])).sum()
assert count_diff == 0, f'Error: close & close_1day, count diff values: {count_diff}' 

#Проверка что не наджоинилось лишнего
assert df_1min.shape[0] == df.shape[0], 'Error: with join dimensions'

Пересесчение в данных 1min и 5min: 0.9804569498735878
Пересесчение в данных 1min и 1hour: 0.9674929654893054
Пересесчение в данных 1min и 1day: 0.989613327848622


In [18]:
for stock in tqdm(stocks):
    mask_stock = df['ticker'] == stock
    
    cols_ffill = ['index_1min_start', 'index_start_5min', 'index_5min', 'index_start_1hour', 'index_1hour', 'index_start_1day', 'index_1day']
    df.loc[mask_stock, cols_ffill] = df.loc[mask_stock, cols_ffill].ffill()

100%|█████████████████████████████████████████████████████████████████████████| 18/18 [00:10<00:00,  1.79it/s]


In [19]:
df

Unnamed: 0,time,open,close,volume,low,high,ticker,index_1min_start,index_5min,open_5min,...,low_1hour,high_1hour,index_start_1hour,index_1day,open_1day,close_1day,volume_1day,low_1day,high_1day,index_start_1day
0,2022-10-03 10:01:00,219.000,219.930,87793.0,218.000,220.250,GAZP,0,,,...,,,,,,,,,,
1,2022-10-03 10:02:00,219.800,220.800,61549.0,219.700,220.550,GAZP,0,,,...,,,,,,,,,,
2,2022-10-03 10:03:00,220.240,219.340,52640.0,219.200,220.380,GAZP,0,,,...,,,,,,,,,,
3,2022-10-03 10:04:00,219.170,218.960,61506.0,217.700,219.400,GAZP,0,,,...,,,,,,,,,,
4,2022-10-03 10:05:00,218.820,218.300,51464.0,217.760,218.960,GAZP,0,0.0,219.000,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7847341,2024-11-19 23:46:00,38.485,38.455,1020.0,38.455,38.490,RUAL,7410480,1607833.0,,...,,,130521,9722.0,,,,,,9181
7847342,2024-11-19 23:47:00,38.490,38.510,1102.0,38.490,38.510,RUAL,7410480,1607833.0,,...,,,130521,9722.0,,,,,,9181
7847343,2024-11-19 23:48:00,38.500,38.520,665.0,38.475,38.520,RUAL,7410480,1607833.0,,...,,,130521,9722.0,,,,,,9181
7847344,2024-11-19 23:49:00,38.520,38.520,781.0,38.520,38.530,RUAL,7410480,1607833.0,,...,,,130521,9722.0,,,,,,9181


#### 2.4 Union target and features. Make file to train

In [20]:
(df_result['time']+pd.Timedelta('1min') == df['time']).all(), (df_result['close'] == df['close']).all()

(True, True)

In [21]:
#union
df = pd.concat([df, df_result[['result', 'delta_time']]], axis=1)
df

Unnamed: 0,time,open,close,volume,low,high,ticker,index_1min_start,index_5min,open_5min,...,index_start_1hour,index_1day,open_1day,close_1day,volume_1day,low_1day,high_1day,index_start_1day,result,delta_time
0,2022-10-03 10:01:00,219.000,219.930,87793.0,218.000,220.250,GAZP,0,,,...,,,,,,,,,LOSE,0 days 00:04:00
1,2022-10-03 10:02:00,219.800,220.800,61549.0,219.700,220.550,GAZP,0,,,...,,,,,,,,,LOSE,0 days 00:01:00
2,2022-10-03 10:03:00,220.240,219.340,52640.0,219.200,220.380,GAZP,0,,,...,,,,,,,,,LOSE,0 days 00:03:00
3,2022-10-03 10:04:00,219.170,218.960,61506.0,217.700,219.400,GAZP,0,,,...,,,,,,,,,LOSE,0 days 00:02:00
4,2022-10-03 10:05:00,218.820,218.300,51464.0,217.760,218.960,GAZP,0,0.0,219.000,...,,,,,,,,,LOSE,0 days 00:07:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7847341,2024-11-19 23:46:00,38.485,38.455,1020.0,38.455,38.490,RUAL,7410480,1607833.0,,...,130521,9722.0,,,,,,9181,DNF,100 days 00:00:00
7847342,2024-11-19 23:47:00,38.490,38.510,1102.0,38.490,38.510,RUAL,7410480,1607833.0,,...,130521,9722.0,,,,,,9181,DNF,100 days 00:00:00
7847343,2024-11-19 23:48:00,38.500,38.520,665.0,38.475,38.520,RUAL,7410480,1607833.0,,...,130521,9722.0,,,,,,9181,DNF,100 days 00:00:00
7847344,2024-11-19 23:49:00,38.520,38.520,781.0,38.520,38.530,RUAL,7410480,1607833.0,,...,130521,9722.0,,,,,,9181,DNF,100 days 00:00:00


In [22]:
df.columns

Index(['time', 'open', 'close', 'volume', 'low', 'high', 'ticker',
       'index_1min_start', 'index_5min', 'open_5min', 'close_5min',
       'volume_5min', 'low_5min', 'high_5min', 'index_start_5min',
       'index_1hour', 'open_1hour', 'close_1hour', 'volume_1hour', 'low_1hour',
       'high_1hour', 'index_start_1hour', 'index_1day', 'open_1day',
       'close_1day', 'volume_1day', 'low_1day', 'high_1day',
       'index_start_1day', 'result', 'delta_time'],
      dtype='object')

In [23]:
need_cols = ['time', 'open', 'close', 'volume', 'low', 'high', 'ticker',
             'index_1min_start',
             'index_5min', 'index_start_5min',
             'index_1hour', 'index_start_1hour',
             'index_1day', 'index_start_1day',
             'result', 'delta_time' 
            ]
df[need_cols]

Unnamed: 0,time,open,close,volume,low,high,ticker,index_1min_start,index_5min,index_start_5min,index_1hour,index_start_1hour,index_1day,index_start_1day,result,delta_time
0,2022-10-03 10:01:00,219.000,219.930,87793.0,218.000,220.250,GAZP,0,,,,,,,LOSE,0 days 00:04:00
1,2022-10-03 10:02:00,219.800,220.800,61549.0,219.700,220.550,GAZP,0,,,,,,,LOSE,0 days 00:01:00
2,2022-10-03 10:03:00,220.240,219.340,52640.0,219.200,220.380,GAZP,0,,,,,,,LOSE,0 days 00:03:00
3,2022-10-03 10:04:00,219.170,218.960,61506.0,217.700,219.400,GAZP,0,,,,,,,LOSE,0 days 00:02:00
4,2022-10-03 10:05:00,218.820,218.300,51464.0,217.760,218.960,GAZP,0,0.0,0,,,,,LOSE,0 days 00:07:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7847341,2024-11-19 23:46:00,38.485,38.455,1020.0,38.455,38.490,RUAL,7410480,1607833.0,1517952,138245.0,130521,9722.0,9181,DNF,100 days 00:00:00
7847342,2024-11-19 23:47:00,38.490,38.510,1102.0,38.490,38.510,RUAL,7410480,1607833.0,1517952,138245.0,130521,9722.0,9181,DNF,100 days 00:00:00
7847343,2024-11-19 23:48:00,38.500,38.520,665.0,38.475,38.520,RUAL,7410480,1607833.0,1517952,138245.0,130521,9722.0,9181,DNF,100 days 00:00:00
7847344,2024-11-19 23:49:00,38.520,38.520,781.0,38.520,38.530,RUAL,7410480,1607833.0,1517952,138245.0,130521,9722.0,9181,DNF,100 days 00:00:00


In [24]:
!mkdir experiments/model_one_fc/data

mkdir: experiments/model_one_fc/data: File exists


In [25]:
#save
dump_pkl(df[need_cols], 'experiments/model_one_fc/data/data_shift_time_1min.pkl')

In [26]:
#5min

In [27]:
df_5min

Unnamed: 0,index_5min,time,open_5min,close_5min,volume_5min,low_5min,high_5min,ticker,index_start_5min
0,0,2022-10-03 10:05:00,219.000,218.300,314952.0,217.700,220.550,GAZP,0
1,1,2022-10-03 10:10:00,218.300,218.500,182525.0,216.190,218.780,GAZP,0
2,2,2022-10-03 10:15:00,218.110,215.860,193758.0,215.310,218.130,GAZP,0
3,3,2022-10-03 10:20:00,215.830,216.850,91747.0,215.700,217.390,GAZP,0
4,4,2022-10-03 10:25:00,216.850,216.580,52824.0,216.400,217.160,GAZP,0
...,...,...,...,...,...,...,...,...,...
1607830,1607830,2024-11-19 23:30:00,38.470,38.465,795.0,38.465,38.470,RUAL,1517952
1607831,1607831,2024-11-19 23:35:00,38.465,38.440,2684.0,38.435,38.470,RUAL,1517952
1607832,1607832,2024-11-19 23:40:00,38.445,38.440,1072.0,38.435,38.445,RUAL,1517952
1607833,1607833,2024-11-19 23:45:00,38.440,38.485,7204.0,38.435,38.490,RUAL,1517952


In [28]:
df_5min.columns

Index(['index_5min', 'time', 'open_5min', 'close_5min', 'volume_5min',
       'low_5min', 'high_5min', 'ticker', 'index_start_5min'],
      dtype='object')

In [29]:
need_cols = ['time', 'open_5min', 'close_5min', 'volume_5min',
       'low_5min', 'high_5min', 'ticker']
dump_pkl(df_5min[need_cols], 'experiments/model_one_fc/data/data_shift_time_5min.pkl')

In [30]:
#1hour

In [31]:
df_1hour

Unnamed: 0,index_1hour,time,open_1hour,close_1hour,volume_1hour,low_1hour,high_1hour,ticker,index_start_1hour
0,0,2022-10-03 11:00:00,219.000,215.800,1623411.0,213.630,220.550,GAZP,0
1,1,2022-10-03 12:00:00,215.800,216.360,411786.0,214.960,216.780,GAZP,0
2,2,2022-10-03 13:00:00,216.340,215.880,237084.0,215.430,216.430,GAZP,0
3,3,2022-10-03 14:00:00,215.850,217.100,573509.0,215.530,217.770,GAZP,0
4,4,2022-10-03 15:00:00,217.120,216.940,364609.0,216.500,217.490,GAZP,0
...,...,...,...,...,...,...,...,...,...
138242,138242,2024-11-19 20:00:00,38.470,38.555,81823.0,38.435,38.720,RUAL,130521
138243,138243,2024-11-19 21:00:00,38.560,38.710,68970.0,38.440,38.710,RUAL,130521
138244,138244,2024-11-19 22:00:00,38.700,38.570,35080.0,38.570,38.725,RUAL,130521
138245,138245,2024-11-19 23:00:00,38.585,38.540,18640.0,38.515,38.605,RUAL,130521


In [32]:
df_1hour.columns

Index(['index_1hour', 'time', 'open_1hour', 'close_1hour', 'volume_1hour',
       'low_1hour', 'high_1hour', 'ticker', 'index_start_1hour'],
      dtype='object')

In [33]:
need_cols = ['time', 'open_1hour', 'close_1hour', 'volume_1hour',
       'low_1hour', 'high_1hour', 'ticker']
dump_pkl(df_1hour[need_cols], 'experiments/model_one_fc/data/data_shift_time_1hour.pkl')

In [34]:
#1day

In [35]:
df_1day

Unnamed: 0,index_1day,time,open_1day,close_1day,volume_1day,low_1day,high_1day,ticker,index_start_1day
0,0,2022-09-30 23:50:00,231.680,217.700,24588842.0,189.420,238.720,GAZP,0
1,1,2022-10-03 23:50:00,219.000,215.830,5070201.0,213.630,220.550,GAZP,0
2,2,2022-10-04 23:50:00,216.480,210.720,4975859.0,208.800,216.700,GAZP,0
3,3,2022-10-05 23:50:00,211.500,209.500,5322122.0,202.850,211.500,GAZP,0
4,4,2022-10-06 23:50:00,210.000,212.860,5165564.0,209.600,216.880,GAZP,0
...,...,...,...,...,...,...,...,...,...
9719,9719,2024-11-13 23:50:00,37.900,37.605,1378504.0,36.915,38.200,RUAL,9181
9720,9720,2024-11-14 23:50:00,37.650,36.910,1071422.0,36.700,37.945,RUAL,9181
9721,9721,2024-11-15 23:50:00,36.850,39.650,8844376.0,36.715,39.690,RUAL,9181
9722,9722,2024-11-18 23:50:00,38.955,38.985,2859643.0,38.770,39.635,RUAL,9181


In [36]:
df_1day.columns

Index(['index_1day', 'time', 'open_1day', 'close_1day', 'volume_1day',
       'low_1day', 'high_1day', 'ticker', 'index_start_1day'],
      dtype='object')

In [37]:
need_cols = ['time', 'open_1day', 'close_1day', 'volume_1day',
       'low_1day', 'high_1day', 'ticker']
dump_pkl(df_1day[need_cols], 'experiments/model_one_fc/data/data_shift_time_1day.pkl')