In [28]:
import numpy as np

## Converting time in MOEX

In [63]:
def convert_time(time_str):
    """Parse a raw order-log TIME value into a ``datetime``.

    The value is laid out as 12 digits: hours (2), minutes (2), seconds (2),
    milliseconds (3) and microseconds (3).

    Parameters
    ----------
    time_str : int or str
        Raw TIME value. It may have been read as an int, in which case the
        leading zeros are missing; they are restored by left-padding to 12.

    Returns
    -------
    datetime
        The time of day on the placeholder date 0001-01-01.
    """
    digits = str(time_str).zfill(12)
    # (start, stop) offsets of hours, minutes, seconds, milliseconds, microseconds.
    field_bounds = ((0, 2), (2, 4), (4, 6), (6, 9), (9, 12))
    hh, mm, ss, millis, micros = (int(digits[lo:hi]) for lo, hi in field_bounds)
    # datetime has a single sub-second field, so fold both parts into microseconds.
    return datetime(1, 1, 1, hh, mm, ss, millis * 1000 + micros)

def get_date_from_filename(filename):
    """Extract the date embedded in an order-log file name.

    Parameters
    ----------
    filename : str
        Path or file name containing the date as eight consecutive digits in
        ``YYYYMMDD`` form, e.g. ``'MOEX/OrderLog20220104.csv.gz'``. The first
        run of eight digits found anywhere in the string is used.

    Returns
    -------
    datetime
        Midnight of the parsed date.

    Raises
    ------
    ValueError
        If the name contains no eight-digit run, or the digits are not a
        valid ``YYYYMMDD`` date.
    """
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    match = re.search(r'\d{8}', filename)
    if match is None:
        raise ValueError(f"no YYYYMMDD date found in file name: {filename!r}")
    return datetime.strptime(match.group(0), '%Y%m%d')

def process_file(filename):
    """Load one order-log CSV and turn its ``TIME`` column into full timestamps.

    The calendar date is taken from the file name via
    ``get_date_from_filename``; every ``TIME`` value is parsed by
    ``convert_time`` and then stamped with that date.

    Parameters
    ----------
    filename : str
        Path of the CSV file; passed unchanged to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The frame read from ``filename`` with ``TIME`` replaced.
    """
    file_date = get_date_from_filename(filename)
    frame = pd.read_csv(filename)

    def stamp_with_file_date(moment):
        # convert_time yields a placeholder date; swap in the real one.
        return moment.replace(year=file_date.year,
                              month=file_date.month,
                              day=file_date.day)

    clock_times = frame['TIME'].apply(convert_time)
    frame['TIME'] = clock_times.apply(stamp_with_file_date)
    return frame

# Process a single order-log file; process_file takes the date from the file name.
filename = 'MOEX/OrderLog20220104.csv.gz'
df = process_file(filename)

## Preprocess MOEX

In [3]:
import glob
import os
import pandas as pd
from datetime import datetime, timedelta
import re
import numpy as np

# Time-conversion helper: parses a raw TIME value into a datetime
def convert_time(time_str):
    """Turn a raw order-log TIME value into a ``datetime``.

    Layout of the 12-digit value: ``HH`` ``MM`` ``SS`` followed by three
    millisecond digits and three microsecond digits.

    Parameters
    ----------
    time_str : int or str
        Raw TIME value; an int is accepted because the column may have been
        parsed as a number, which drops leading zeros.

    Returns
    -------
    datetime
        The time of day, with year, month and day all set to 1.
    """
    # Restore leading zeros lost when the value was read as an int.
    text = str(time_str).zfill(12)
    hour, minute, second = int(text[:2]), int(text[2:4]), int(text[4:6])
    # The last six characters are milliseconds then microseconds.
    sub_second = int(text[6:9]) * 1000 + int(text[9:12])
    return datetime(year=1, month=1, day=1,
                    hour=hour, minute=minute, second=second,
                    microsecond=sub_second)

def get_date_from_filename(filename):
    """Extract the date embedded in an order-log file name.

    Parameters
    ----------
    filename : str
        Path or file name containing the date as eight consecutive digits in
        ``YYYYMMDD`` form, e.g. ``'MOEX/OrderLog20220104.csv.gz'``. The first
        run of eight digits found anywhere in the string is used.

    Returns
    -------
    datetime
        Midnight of the parsed date.

    Raises
    ------
    ValueError
        If the name contains no eight-digit run, or the digits are not a
        valid ``YYYYMMDD`` date.
    """
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    match = re.search(r'\d{8}', filename)
    if match is None:
        raise ValueError(f"no YYYYMMDD date found in file name: {filename!r}")
    return datetime.strptime(match.group(0), '%Y%m%d')

def process_file(filename):
    """Read an order-log CSV and rebuild ``TIME`` as date-aware timestamps.

    ``get_date_from_filename`` supplies the calendar date and
    ``convert_time`` parses each raw ``TIME`` value; the parsed time is then
    given the file's year, month and day.

    Parameters
    ----------
    filename : str
        Path of the CSV file; passed unchanged to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with its ``TIME`` column replaced.
    """
    trade_date = get_date_from_filename(filename)
    # Keyword arguments for datetime.replace, built once instead of per row.
    date_parts = {
        'year': trade_date.year,
        'month': trade_date.month,
        'day': trade_date.day,
    }
    orders = pd.read_csv(filename)
    orders['TIME'] = orders['TIME'].apply(convert_time)
    orders['TIME'] = orders['TIME'].apply(lambda moment: moment.replace(**date_parts))
    return orders
    
# Collect every file matching 'OrderLog*.csv.gz'. glob returns paths in
# arbitrary order, so sort them: the bars of each ticker are then appended in
# file-name order (chronological, assuming the wildcard part is the YYYYMMDD date).
files = sorted(glob.glob('MOEX/OrderLog*.csv.gz'))

# Column order of the per-ticker CSV files written at the end.
output_columns = ['open', 'high', 'low', 'close', 'volume',
                  'date', 'time', 'bid_to_order_volume_ratio']

# Ticker -> list of 5-minute bar frames, one entry per processed file.
# Frames are concatenated once at the end rather than inside the loop.
bars_by_ticker = {}

# Read and process every file. The counter comes from enumerate so that it
# keeps increasing across files instead of being reset on each iteration.
for i, file in enumerate(files, start=1):
    orders = process_file(file)
    orders['TIME'] = pd.to_datetime(orders['TIME'])
    orders = orders.set_index('TIME')
    print('file', i, 'read successfully')

    # Build 5-minute bars separately for every ticker (SECCODE).
    for name, group in orders.groupby('SECCODE'):
        # NOTE(review): ACTION == 2 rows are used as trades and ACTION == 1
        # rows as newly placed orders - confirm against the order-log spec.
        executed = group[group['ACTION'] == 2]
        group_resampled = executed.resample('5min').agg({
            'TRADEPRICE': ['first', 'max', 'min', 'last'],
            'VOLUME': 'sum'
        })
        group_resampled.columns = ['open', 'high', 'low', 'close', 'volume']
        group_resampled['date'] = group_resampled.index.date
        group_resampled['time'] = group_resampled.index.time

        # Volume of ACTION == 1 rows divided by volume of ACTION == 2 rows per
        # bin; 'volume' already holds the latter. The assignment aligns on the
        # bar index, so bins outside the trade range are discarded.
        new_bid_volume = group[group['ACTION'] == 1].resample('5min').VOLUME.sum()
        group_resampled['bid_to_order_volume_ratio'] = (
            new_bid_volume / group_resampled['volume']
        )

        # Bins without ACTION == 2 rows yield NaN prices and a division by
        # zero; both are stored as 0.
        group_resampled = group_resampled.replace([np.inf, -np.inf], 0).fillna(0)

        bars_by_ticker.setdefault(name, []).append(group_resampled)

        print(name, 'for file', i, 'read')

# Ticker -> single frame holding all of its bars.
df_dict = {}
for name, frames in bars_by_ticker.items():
    combined = pd.concat(frames)[output_columns]
    # Keep the CSV layout unchanged: the timestamp index is written without a header.
    combined.index.name = None
    df_dict[name] = combined

# Save every per-ticker frame to its own file.
# NOTE(review): files go to the current working directory, while the clean-up
# cell further down reads from './tmp' - confirm the intended location.
for name, df in df_dict.items():
    df.to_csv(f'{name}.csv')

In [4]:
import os
import pandas as pd

dir_name = './tmp'

# Rewrite every CSV in dir_name in place: shorten the OHLCV column names,
# trim 'time' to HH:MM, keep a fixed set of columns and drop empty bars.
for filename in os.listdir(dir_name):
    if not filename.endswith('.csv'):
        continue
    file_path = os.path.join(dir_name, filename)

    data = pd.read_csv(file_path)
    data = data.rename(columns={'open': 'o', 'high': 'h', 'low': 'l', 'close': 'c', 'volume': 'v'})
    # read_csv loads 'time' as text (e.g. '19:25:00'), so the .dt accessor is
    # not available on it; slicing the string yields the HH:MM part directly.
    data['time'] = data['time'].str[:5]
    data = data[['c', 'date', 'h', 'l', 'o', 'time', 'bid_to_order_volume_ratio', 'v']]
    # Drop every row that has a 0 or a missing value in any kept column.
    data = data.replace(0, np.nan).dropna()

    data.to_csv(file_path, index=False)

print("CSV files updated successfully.")

CSV files updated successfully.


In [30]:
# Show `resampled_df` as the cell output.
# NOTE(review): `resampled_df` is not assigned in any cell visible here - confirm where it is created.
resampled_df

Unnamed: 0_level_0,close,date,high,low,open,time,volume,bid_to_order_volume_ratio
Converted_TIME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2016-03-01 19:25:00,345.92,2016-03-01,23100.0,39.760,6592.00,19:25:00,2157998,21.073377
2016-03-01 19:30:00,345.30,2016-03-01,23084.0,40.015,295.49,19:30:00,885680,26.103135
2016-03-01 19:35:00,295.10,2016-03-01,23090.0,39.875,345.43,19:35:00,756622,40.930159
2016-03-01 19:40:00,345.20,2016-03-01,23080.0,39.900,295.02,19:40:00,442488,30.009634
2016-03-01 19:45:00,344.86,2016-03-01,23080.0,39.905,6597.00,19:45:00,377398,47.515480
...,...,...,...,...,...,...,...,...
2016-03-03 17:10:00,0.00,2016-03-03,0.0,0.000,0.00,17:10:00,0,0.000000
2016-03-03 17:15:00,0.00,2016-03-03,0.0,0.000,0.00,17:15:00,0,0.000000
2016-03-03 17:20:00,0.00,2016-03-03,0.0,0.000,0.00,17:20:00,0,0.000000
2016-03-03 17:25:00,0.00,2016-03-03,0.0,0.000,0.00,17:25:00,0,0.000000


In [14]:
# Load raw_data/8001.csv into `r` for inspection.
r = pd.read_csv('raw_data/8001.csv')

In [15]:
# Print the column dtypes, non-null counts and memory usage of `r`.
r.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91130 entries, 0 to 91129
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  91130 non-null  int64  
 1   c           91130 non-null  float64
 2   date        91130 non-null  object 
 3   h           91130 non-null  float64
 4   l           91130 non-null  float64
 5   o           91130 non-null  float64
 6   time        91130 non-null  object 
 7   v           91130 non-null  int64  
dtypes: float64(4), int64(2), object(2)
memory usage: 5.6+ MB


In [21]:
# Show `m` as the cell output.
# NOTE(review): in file order `m` is first assigned in a later cell
# (pd.read_csv('raw_data/TATN.csv')); unless it is defined elsewhere, a
# top-to-bottom run fails here.
m

Unnamed: 0.1,Unnamed: 0,c,date,h,l,o,time,bid_to_order_volume_ratio,v
0,1.0,504.0,2022-01-03,505.0,504.0,504.3,07:10,98.230794,12678.0
1,2.0,504.8,2022-01-03,504.9,504.1,504.2,07:15,480.586420,8262.0
2,3.0,505.0,2022-01-03,505.0,504.6,504.8,07:20,316.891673,5548.0
3,4.0,505.5,2022-01-03,505.6,504.6,505.0,07:25,305.799101,12240.0
4,5.0,505.7,2022-01-03,505.9,505.4,505.5,07:30,44.673729,6844.0
...,...,...,...,...,...,...,...,...,...
3941,3942.0,492.3,2022-02-11,492.9,491.8,492.7,18:20,70.930824,29432.0
3942,3943.0,493.3,2022-02-11,493.5,492.0,492.4,18:25,15.542444,118462.0
3943,3944.0,492.6,2022-02-11,493.6,492.5,493.3,18:30,15.169047,75494.0
3944,3945.0,492.5,2022-02-11,493.1,491.1,492.6,18:35,20.795720,103730.0


In [16]:
# Load raw_data/TATN.csv into `m`.
m = pd.read_csv('raw_data/TATN.csv')