In [2]:
import os
import json

import matplotlib.pyplot as plt
import pandas as pd
import h5py


In [3]:
def list_files_in_folder(folder_path):
    file_list = []
    for file in os.listdir(folder_path):
        if ".csv" in file:
            file_list.append(os.path.join(folder_path, file))
    return file_list

# Function to convert date string to nanosecond epoch time
def date_to_nanoseconds(date_str):
    dt = pd.to_datetime(date_str)
    return int(dt.timestamp() * 1e9)

def parse_meta_data(meta_data):
    # Loop through each entry in the mappings and convert the dates
    for key, entries in meta_data["symbology"]['mappings'].items():
        for entry in entries:
            entry['d0'] = date_to_nanoseconds(entry['d0'])
            entry['d1'] = date_to_nanoseconds(entry['d1'])

global path
path = r"C:\Users\natha\OneDrive\Desktop\Data\DataBento\TechTest"

with open(os.path.join(path,"metadata.json"), "r") as file:
    meta_data = json.load(file)
    parse_meta_data(meta_data)

files = list_files_in_folder(path)
df = pd.concat([pd.read_csv(file) for file in files])
df

Unnamed: 0,ts_event,publisher_id,instrument_id,open,high,low,close,volume
0,1686729600000000000,2,6857,334790000000,334790000000,334230000000,334520000000,232
1,1686729600000000000,2,2376,50960000000,50960000000,50760000000,50760000000,27
2,1686729600000000000,2,7353,413200000000,414000000000,412710000000,413710000000,982
3,1686729600000000000,2,471,127460000000,128110000000,125900000000,128110000000,6682
4,1686729600000000000,2,5416,34020000000,34020000000,34000000000,34000000000,1444
...,...,...,...,...,...,...,...,...
4038,1689292620000000000,2,7322,463670000000,463850000000,463640000000,463710000000,713
4039,1689292680000000000,2,7322,463770000000,464000000000,463770000000,463990000000,1923
4040,1689292740000000000,2,6827,342680000000,342680000000,342680000000,342680000000,4
4041,1689292740000000000,2,7322,464000000000,464330000000,464000000000,464330000000,947


In [4]:
df["ticker"] = ""
mappings = meta_data["symbology"]["mappings"]
for ticker in list(mappings.keys()):
    for range in mappings[ticker]:
        df.loc[
            (df['ts_event'] >= range["d0"]) &
            (df['ts_event'] <= range["d1"]) &
            (df["instrument_id"] == int(range["s"])),
            "ticker"
        ] = ticker


In [9]:
def save_data(ext : str) -> None:
    if ext == ".h5":
        _path = os.path.join(path, "hdf5", "data.h5")
        if os.path.exists(_path):
            os.remove(_path)

    for ticker in list(mappings.keys()):
        df_mid = df[df["ticker"] == ticker].copy()
        df_mid = df_mid[["ts_event","open","high","low","close","volume"]]
        df_mid[["open","high","low","close"]] /= 1e9
        
        if ext == ".parquet":
            df_mid.to_parquet(
                os.path.join(path, "parquet", ticker + ".parquet"),
                index=False
            )
        elif ext == ".csv":
            df_mid['ts_event'] = pd.to_datetime(df_mid['ts_event'])
            df_mid['ts_event'] = df_mid['ts_event'].dt.strftime('%Y-%m-%d %H:%M:%S')
            df_mid.to_csv(
                os.path.join(path, "csv", ticker + ".csv"),
                index=False
            )
        elif ext == ".h5":
            _path = os.path.join(path, "hdf5", "data.h5")
            with h5py.File(_path, "a") as file:
                # Convert the DataFrame to a NumPy array
                cols = ["open","high","low","close","volume"]
                data = df_mid[cols].to_numpy()

                # Create a new dataset and save the data
                file.create_dataset(f"{ticker}/datetime", data=df_mid["ts_event"].to_numpy())
                dataset = file.create_dataset(f"{ticker}/data", data=data)

                # Store column names as attributes
                for col_name in cols:
                    dataset.attrs[col_name] = col_name
            
            #with pd.HDFStore(_path, mode='a') as store:
            #    store.put(ticker, df_mid, format='table', data_columns=True, index=False, append=False)
        else:
            raise RuntimeError("invalid ext")
        
        print(ticker, df_mid["ts_event"].values[0], df_mid["ts_event"].values[-1], len(df_mid))

save_data(".h5")

AMD 1686729600000000000 1689292740000000000 15439
NVDA 1686729600000000000 1689292740000000000 16898
MSFT 1686729600000000000 1689292740000000000 15204
ORCL 1686729660000000000 1689291120000000000 10009
INTC 1686729600000000000 1689292500000000000 12065
CSCO 1686729600000000000 1689291840000000000 8837


In [10]:
_path = os.path.join(path, "hdf5", "data.h5")
def print_datasets(name, obj):
    if isinstance(obj, h5py.Dataset):
        print("Dataset Name:", name)

with h5py.File(_path, 'r') as h5_file:
    h5_file.visititems(print_datasets)

Dataset Name: AMD/data
Dataset Name: AMD/datetime
Dataset Name: CSCO/data
Dataset Name: CSCO/datetime
Dataset Name: INTC/data
Dataset Name: INTC/datetime
Dataset Name: MSFT/data
Dataset Name: MSFT/datetime
Dataset Name: NVDA/data
Dataset Name: NVDA/datetime
Dataset Name: ORCL/data
Dataset Name: ORCL/datetime


In [21]:
x = df[df["ts_event"] >= 1686729600000000000]
x[["open","high","low","close"]] /= 1e9
x = x[x["ticker"] == "MSFT"]
x

Unnamed: 0,ts_event,publisher_id,instrument_id,open,high,low,close,volume,ticker
0,1686729600000000000,2,6857,334.79,334.79,334.23,334.52,232,MSFT
6,1686729660000000000,2,6857,334.69,334.69,334.44,334.44,127,MSFT
9,1686729720000000000,2,6857,334.48,334.48,334.48,334.48,168,MSFT
13,1686729780000000000,2,6857,334.29,334.48,334.29,334.42,122,MSFT
18,1686729840000000000,2,6857,334.29,334.29,334.29,334.29,5,MSFT
...,...,...,...,...,...,...,...,...,...
3999,1689291660000000000,2,6827,342.85,342.87,342.80,342.80,319,MSFT
4017,1689292020000000000,2,6827,342.66,342.66,342.60,342.60,106,MSFT
4018,1689292080000000000,2,6827,342.50,342.50,342.31,342.31,21,MSFT
4028,1689292320000000000,2,6827,342.50,342.50,342.12,342.12,290,MSFT


In [25]:
v = x[["open","high","low","close"]].values
indices = np.where(v == 1)

AttributeError: 'numpy.ndarray' object has no attribute 'argwhere'