In [1]:
# Library
import sys, os
sys.path.append(os.path.abspath('..'))

from hdf5_loader import StockDatasetHDF5
from myconfig import *
import subclass as sc

import pandas as pd
import numpy as np
import seaborn as sns
from tqdm import tqdm
import time
import matplotlib.pyplot as plt
from collections import defaultdict, OrderedDict
from datetime import datetime, timedelta
import os, shutil, wandb
from itertools import permutations

import torch
import torch.nn as nn
from torch.utils.data import IterableDataset, DataLoader
import torch.nn.functional as F
import torchsummary

# np.set_printoptions(precision=4, suppress=True, linewidth=120)
# Print tensors in fixed-point notation (no scientific notation) with 4 decimal places.
torch.set_printoptions(sci_mode=False, precision=4)
# _ = plt.tight_layout()

In [2]:
import models.encdec as encdec
import models.mybuffer as buf

# Experiment configuration shared by the cells below.
# ST, ED and THZ are provided by `from myconfig import *`.

# Symbols handed to the dataset constructor.
ticker_list = ['AAPL', 'MSFT', 'GOOGL', 'META', 'IBM', 'INTC']

# [start, end] pair handed to the dataset constructor.
date_range = [ST, ED]

# Number of rows per horizon key; used below as the zero-padding length of each array.
hz_dim = dict.fromkeys(THZ, 128)

# Horizon whose bars drive the sample loop.
targ_hz = '5m'

# One weight per horizon key, paired positionally with THZ.
# NOTE(review): not consumed by any cell visible here — confirm where it is used.
label_weight = dict(zip(THZ, [0.1, 0.3, 0.5, 0.1, 0]))

# Number of random samples drawn per batch.
batch_size = 3

In [3]:
import importlib
# Reload the project modules (same order as before) so that edits made on disk
# are picked up without restarting the kernel.
for _module in (encdec, sc, buf):
    _ = importlib.reload(_module)

In [67]:
# Build the dataset object for the configured tickers and date range.
# NOTE(review): presumably backed by an HDF5 store, judging by the class name — confirm in `subclass`.
hdf5_inst = sc.StockDatasetHDF5(ticker_list, date_range)
# Sample source created by the project helper; it is not consumed by the cells visible below.
# NOTE(review): presumably yields tensors (tensor=True) keyed by horizon for `targ_hz` — confirm in `sc.get_samples`.
envgen = sc.get_samples(hdf5_inst, hz_dim, targ_hz, tensor=True)

In [68]:
# Take item 0 of the dataset; the cells below index it by horizon key (e.g. d['1m']) to get a DataFrame.
# NOTE(review): presumably item 0 corresponds to ticker_list[0] — confirm.
d = hdf5_inst[0]

In [69]:
# Show the columns, dtypes, row count and memory footprint of the 1-minute frame.
d['1m'].info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 973474 entries, 2015-01-02 09:30:00 to 2024-12-13 15:59:00
Data columns (total 7 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   open          973474 non-null  float32
 1   high          973474 non-null  float32
 2   low           973474 non-null  float32
 3   close         973474 non-null  float32
 4   volume        973474 non-null  float32
 5   vwap          973474 non-null  float32
 6   transactions  973474 non-null  float32
dtypes: float32(7)
memory usage: 33.4 MB


In [70]:
# Build one zero-padded NumPy array per horizon.
# Each array holds the frame's columns plus a trailing 'timestamp' column,
# preceded by hz_dim[hz] all-zero rows.
npdf = dict()
for hz in THZ:
    # The index is datetime64[ns]; casting to int64 gives nanoseconds, so the
    # integer division by 10**9 turns it into whole seconds.
    # `.assign` returns a new frame, so the DataFrames held by `d` are NOT
    # modified (the previous `df['timestamp'] = ...` added the column to d[hz]
    # itself, leaving hidden state behind for every later cell).
    df = d[hz].assign(timestamp=d[hz].index.astype('int64') // 10**9)

    # Leading zero rows; their timestamp of 0 marks them as padding.
    padding = np.zeros(shape=(hz_dim[hz], df.shape[1]))
    npdf[hz] = np.concatenate([padding, df.to_numpy()], axis=0)

In [49]:
# Peek at the first five index entries of the 5-minute frame to check the bar spacing.
d['5m'][:5].index

DatetimeIndex(['2015-01-02 09:30:00', '2015-01-02 09:35:00',
               '2015-01-02 09:40:00', '2015-01-02 09:45:00',
               '2015-01-02 09:50:00'],
              dtype='datetime64[ns]', name='timestamp', freq=None)

In [64]:
# make sample
# For every bar of the target horizon (skipping its zero padding) record, per
# horizon, the row index of the first bar that has not yet ended at the target
# timestamp, i.e. the first row with `timestamp + UNIT_TS[hz] > targ_time`.
#   il['T']  -> target timestamps
#   il[hz]   -> matching row indices into npdf[hz]
# Every list starts with a dummy 0 so the scan always has a previous position
# to resume from; the dummy is sliced off at the end.
ts_col = 7  # position of the 'timestamp' column appended after the 7 data columns

# Timestamp column of every horizon, sliced once instead of on every probe.
ts_by_hz = {hz: npdf[hz][:, ts_col] for hz in THZ}

il = {hz: [0] for hz in THZ + ['T']}
for i in range(hz_dim[targ_hz], len(npdf[targ_hz])):
    targ_time = npdf[targ_hz][i, ts_col]
    il['T'].append(targ_time)
    # indexing
    for hz in THZ:
        ts = ts_by_hz[hz]
        # Resume from the previous match: the scan never moves backwards.
        v = il[hz][-1]
        # Advance until the bar at row v ends after the target timestamp.
        # Padding rows (timestamp 0) are skipped by the same test.
        # Running past the end of the array raises IndexError.
        while not (ts[v] + UNIT_TS[hz] > targ_time):
            v += 1
        il[hz].append(v)

# Drop the dummy first entry and cut every list to a common length.
minlen = min([len(il[hz]) for hz in THZ])
for hz in THZ + ['T']:
    il[hz] = np.array(il[hz])[1:minlen]

In [48]:
# Rows 126-128 of the padded 1-minute array, column 7 (the appended 'timestamp' column).
# With 128 padding rows, rows 126-127 are zero padding and row 128 is the first real bar.
npdf['1m'][126:129,7]

array([0.000000e+00, 0.000000e+00, 1.420191e+09])

In [9]:
def extract_windows(data, index_list, window_size=128):
    """Gather fixed-length row windows that end just before each given index.

    For every entry ``k`` of ``index_list`` the rows ``k - window_size`` up to
    and including ``k - 1`` of ``data`` are selected; row ``k`` itself is not
    part of the window.

    Args:
        data: 2-D array indexed as ``data[row, column]``.
        index_list: 1-D sequence or array of integer row indices (window ends,
            exclusive).
        window_size: number of rows per window.

    Returns:
        Array of shape ``(len(index_list), window_size, data.shape[1])``.

    Raises:
        ValueError: if any index is smaller than ``window_size``. Such a window
            would start at a negative row, which NumPy would silently wrap
            around to the end of ``data``.
        IndexError: raised by NumPy if a window reaches past the end of ``data``.
    """
    index_list = np.asarray(index_list)
    # Guard against silent negative-index wraparound (empty input is allowed).
    if index_list.size and index_list.min() < window_size:
        raise ValueError(
            f"every index must be >= window_size ({window_size}); "
            f"got minimum index {index_list.min()}"
        )
    offsets = np.arange(window_size)
    # Broadcast (n, 1) window starts against (window_size,) offsets -> (n, window_size).
    indices = index_list[:, None] - window_size + offsets
    return data[indices, :]

# Draw `batch_size` random sample positions and gather, for every horizon, the
# window of rows that ends just before the matched row index.
res = dict()
# Positions index into the il[...] arrays; the lower bound skips the earliest samples.
rand_i = np.random.randint(max(hz_dim.values()), len(il[targ_hz]), size=batch_size)
for hz in THZ:
    # Use the configured per-horizon length instead of the function's
    # hard-coded default so the window always matches the padding length.
    res[hz] = extract_windows(npdf[hz], il[hz][rand_i], window_size=hz_dim[hz])

In [10]:
# Sanity check: print the shape of the extracted batch for every horizon,
# one line per horizon in THZ order.
for hz in THZ:
    print(res[hz].shape)

(3, 128, 8)
(3, 128, 8)
(3, 128, 8)
(3, 128, 8)
(3, 128, 8)


In [11]:
# Print, per horizon, the row indices into npdf[hz] selected by the random batch positions.
for hz in THZ:
    print(il[hz][rand_i])

[231408 648822 245148]
[ 46384 129870  49132]
[ 7837 21753  8295]
[ 722 1797  758]
[251 474 259]


In [12]:
# Shape of the 1-minute batch: (batch, window rows, columns).
res['1m'].shape

(3, 128, 8)