In [1]:
# Library
import pandas as pd
import numpy as np
import seaborn as sns
from tqdm import tqdm
import time
import matplotlib.pyplot as plt
from collections import defaultdict
from datetime import datetime, timezone, timedelta
import os
import gc
import re
import h5py
import exchange_calendars as xcals
import random

import torch
from torch.utils.data import Dataset, get_worker_info

# First and last calendar day of the sample window used throughout the notebook.
ST = datetime(2015,1,1)
ED = datetime(2024,12,15)

# Location of the HDF5 store that holds every chart table.
PATH = "e:/Financial_Data/"
H5PATH = PATH + 'data.h5'


def _strip_tz_and_shift_back_5h(ts):
    """Drop the timezone of ``ts`` and move the resulting naive value back five hours."""
    # NOTE(review): presumably the calendar reports UTC, which would make this a
    # fixed UTC-5 clock with no daylight-saving adjustment — confirm.
    return ts.tz_localize(None) - timedelta(hours=5)


# Per-session open/close timestamps of the XNAS calendar between ST and ED,
# converted with the fixed five-hour shift above.
SCHEDULE = (
    xcals.get_calendar('XNAS')
    .schedule.loc[ST:ED, ['open', 'close']]
    .map(_strip_tz_and_shift_back_5h)
)

# Bar horizons for which a chart table is read per ticker.
THZ = ['1m', '5m', '30m', '1d', '1w']

In [6]:
# Distinct time-of-day values at which a session closes in SCHEDULE.
SCHEDULE['close'].map(lambda close_ts: close_ts.time()).unique()

array([datetime.time(16, 0), datetime.time(15, 0), datetime.time(13, 0),
       datetime.time(12, 0)], dtype=object)

In [None]:
# pd.read_hdf(H5PATH, key=f'chart/delisted_1m/')
# with pd.HDFStore(H5PATH, mode='r') as store:
# with h5py.File(H5PATH, 'r') as store:

In [2]:
# Ticker names found under the 1-minute chart groups: [delisted, inlisted].
with h5py.File(H5PATH, 'r') as h5file:
    l = [
        list(h5file[group_path].keys())
        for group_path in ('chart/delisted_1m/', 'chart/inlisted_1m/')
    ]

In [None]:
class StockDatasetHDF5(Dataset):
    """Map-style dataset returning, per ticker, one chart frame for every horizon in ``THZ``.

    Item ``i`` addresses the ``i``-th ticker of ``ticker_list[0]`` (read from the
    ``chart/inlisted_*`` groups) followed by the tickers of ``ticker_list[1]``
    (read from the ``chart/delisted_*`` groups).

    Args:
        min_peroid: Amount added to the first 1-minute timestamp to obtain the
            start of the returned window.
        min_peroid_label: Amount subtracted from the last 1-minute timestamp to
            obtain the end of the returned window.
        ticker_list: Optional pair ``[inlisted_tickers, delisted_tickers]``.
            Defaults to every ticker found under the two 1-minute groups.
        date_range: Optional ``[start, end]`` bounds the window is clamped to.
            Defaults to ``[ST, ED]``.
        transform: Stored on the instance; ``__getitem__`` does not apply it.
        target_transform: Stored on the instance; ``__getitem__`` does not apply it.

    The instance keeps a read-only ``pd.HDFStore`` handle open until ``close()``
    is called. The code currently assumes it is used without DataLoader worker
    processes.
    """

    # Position in ``ticker_list`` -> prefix of the HDF5 group ("inlisted" / "delisted").
    _GROUP_PREFIXES = ('in', 'de')

    def __init__(self, min_peroid:timedelta, min_peroid_label:timedelta,
        ticker_list=None, date_range=None,
        transform=None, target_transform=None
    ):
        with h5py.File(H5PATH, 'r') as store:
            self._tickers = [list(store['chart/inlisted_1m/'].keys()), list(store['chart/delisted_1m/'].keys())]
        self._h5ins = pd.HDFStore(H5PATH, mode='r')

        self.ticker_list = ticker_list if ticker_list else self._tickers
        self.date_range = date_range if date_range else [ST, ED]
        self.min_peroid = min_peroid
        self.min_peroid_label = min_peroid_label
        # Kept so the arguments are no longer silently discarded.
        self.transform = transform
        self.target_transform = target_transform

    def __len__(self):
        """Return the total number of tickers across both groups."""
        return sum(len(group) for group in self.ticker_list)

    def _locate(self, index):
        """Translate a flat item index into ``(group_prefix, ticker)``.

        Raises:
            IndexError: If ``index`` is outside ``[-len(self), len(self))``.
        """
        total = len(self)
        position = index + total if index < 0 else index
        if not 0 <= position < total:
            raise IndexError(f'index {index} is out of range for {total} tickers')
        for prefix, group in zip(self._GROUP_PREFIXES, self.ticker_list):
            if position < len(group):
                return prefix, group[position]
            # Not in this group: make the position relative to the next one.
            position -= len(group)
        raise IndexError(f'index {index} is out of range for {total} tickers')

    def _clip_to_window(self, dfs):
        """Slice every frame of ``dfs`` (in place) to the usable time window.

        The window runs from ``first 1m bar + min_peroid`` to
        ``last 1m bar - min_peroid_label``, clamped to ``date_range``. The
        ``'1d'`` frame starts at midnight of the window's first day and the
        ``'1w'`` frame at the Monday of that week; every other frame starts at
        the exact window start.
        """
        st = dfs['1m'].index[0] + self.min_peroid
        ed = dfs['1m'].index[-1] - self.min_peroid_label
        if st < self.date_range[0]: st = self.date_range[0]
        if ed > self.date_range[1]: ed = self.date_range[1]

        # normalize() also clears sub-second parts, unlike replace(hour=0, ...).
        day_st = pd.Timestamp(st).normalize()
        week_st = day_st - timedelta(days=day_st.isoweekday() - 1)
        coarse_starts = {'1d': day_st, '1w': week_st}
        for hz in dfs:
            dfs[hz] = dfs[hz].loc[coarse_starts.get(hz, st):ed]
        return dfs

    def __getitem__(self, index):
        """Return ``{horizon: DataFrame}`` for the ticker at ``index``, clipped to the window."""
        # wkinfo = get_worker_info()
        # For now, assume no DataLoader workers (worker=0).
        prefix, ticker = self._locate(index)
        dfs = {hz: self._h5ins.get(f'chart/{prefix}listed_{hz}/{ticker}') for hz in THZ}
        return self._clip_to_window(dfs)

    def close(self):
        """Close the underlying HDF5 store handle."""
        self._h5ins.close()

#### 이전 코드들

In [2]:
# Ticker names stored under the delisted daily-chart group.
# The context manager closes the file even if the group lookup raises.
with h5py.File(H5PATH, 'r') as f:
    tickers = list(f['chart/delisted_1d/'].keys())

In [2]:
# One timestamp per minute of every session in SCHEDULE, starting at the open
# and stopping before the close.
sample_index = [
    session_open + timedelta(minutes=offset)
    for session_open, session_close in zip(SCHEDULE['open'], SCHEDULE['close'])
    for offset in range(int((session_close - session_open).total_seconds() / 60))
]
temp = pd.Index(sample_index, name='timestamp')

In [7]:
# Remove pre-market and after-hours bars: keep only the 1-minute rows whose
# timestamp is one of the regular-session minutes in `temp`, and write the
# result back over the same key.
# The store is opened once for the whole run instead of once per ticker.
with pd.HDFStore(H5PATH, mode='a', complevel=5, complib='zlib') as store:
    for ticker in tqdm(tickers):
        key = f"chart/delisted_1m/{ticker}"
        df = store.get(key)
        # Convert epoch seconds (shifted back five hours) only when the index
        # has not been converted yet, so re-running the cell does not fail.
        if not isinstance(df.index, pd.DatetimeIndex):
            df.index = pd.to_datetime(df.index - 3600*5, unit="s")
        filtered_df = df.loc[df.index.intersection(temp)]

        store.put(key, filtered_df, format='table')
        

100%|██████████| 7852/7852 [1:31:54<00:00,  1.42it/s]  


In [67]:
# Aggregate the 1-minute bars into 5-minute and 30-minute bars and store each
# result under chart/inlisted_{unit}m/{ticker}.
unit_list = [5, 30]

# How each column is combined when several 1-minute bars fall into one bin.
# NOTE(review): 'vwap' is an unweighted mean of the per-minute values, which is
# presumably not the volume-weighted price of the merged bar — confirm intent.
bar_aggregation = {
    "open": "first",
    "high": "max",
    "low": "min",
    "close": "last",
    'volume': 'sum',
    'vwap': 'mean',
    'transactions': 'sum',
}

# The store is opened once for the whole run instead of once per ticker.
with pd.HDFStore(H5PATH, mode='a', complevel=5, complib='zlib') as store:
    for ticker in tqdm(tickers):
        df = store.get(f'chart/inlisted_1m/{ticker}')
        for unit in unit_list:
            # dropna removes bins in which any aggregated column is NaN.
            bars = df.resample(f"{unit}min").agg(bar_aggregation).dropna(axis=0)
            store.put(f"chart/inlisted_{unit}m/{ticker}", bars, format='table')

100%|██████████| 8920/8920 [1:42:05<00:00,  1.46it/s]  


In [4]:
# Build weekly bars from the daily bars and switch the daily table to a
# datetime index.
# How each column is combined when several daily bars fall into one week.
# NOTE(review): 'vwap' is an unweighted mean of the daily values, which is
# presumably not the volume-weighted price of the week — confirm intent.
weekly_aggregation = {
    "open": "first",
    "high": "max",
    "low": "min",
    "close": "last",
    'volume': 'sum',
    'vwap': 'mean',
    'transactions': 'sum',
}

# The store is opened once for the whole run instead of once per ticker.
with pd.HDFStore(H5PATH, mode='a', complevel=5, complib='zlib') as store:
    for ticker in tqdm(tickers):
        daily_key = f"chart/delisted_1d/{ticker}"
        df = store.get(daily_key)
        # Convert epoch seconds (shifted back five hours) only when the index
        # has not been converted yet, so re-running the cell does not fail.
        needs_conversion = not isinstance(df.index, pd.DatetimeIndex)
        if needs_conversion:
            df.index = pd.to_datetime(df.index - 3600*5, unit="s")
        # Weekly bins are anchored on Monday and labelled with their left edge.
        df_filtered = df.resample("1W-Mon", label='left', closed='left').agg(
            weekly_aggregation
        ).dropna(axis=0)

        store.put(f"chart/delisted_1w/{ticker}", df_filtered, format='table')
        # The daily table only needs rewriting when its index was just converted.
        if needs_conversion:
            store.put(daily_key, df, format='table')

100%|██████████| 7854/7854 [37:12<00:00,  3.52it/s]


In [3]:
# Delete one specific key from each of the listed chart groups.
groups_to_purge = ['delisted_1d', 'delisted_1m', 'delisted_30m', 'delisted_5m']
with h5py.File(H5PATH, "a") as h5file:
    for group_name in groups_to_purge:
        node_path = 'chart/' + group_name + '/MURAV'  # key to delete

        # Nothing to do when the key is absent from this group.
        if node_path not in h5file:
            continue
        print(group_name, 'deleted')
        del h5file[node_path]

delisted_1m deleted
