In [6]:
import pandas as pd
import os
from pathlib import Path
import gc
import multiprocessing
from joblib import Parallel, delayed
from tqdm import tqdm

import cudf
import numba
from numba import cuda

from signalslite.data_utils import load_recent_data_from_file, save_in_folders, get_latest_date, read_available_dates
from signalslite.constants import Directories

In [7]:
dir_config = Directories()

In [8]:
from signalslite.technical_features import (
    simple_moving_average,
    exponential_moving_average,
    bollinger_bands,
    rsi,
    macd,
    average_true_range,
)

In [None]:
DAILY_DATA_DIR = ".." / DAILY_DATA_DIR
DAILY_PRIMARY_FEATURES_DIR = ".." / DAILY_PRIMARY_FEATURES_DIR

In [4]:
n_days_to_load = -1

# if some of secondary features in days are there then take last 1000 days in adjusted data: 1000
# else take all days in adjusted data: -1
if os.path.exists(DAILY_PRIMARY_FEATURES_DIR):
    dates = read_available_dates(DAILY_PRIMARY_FEATURES_DIR)
    if len(dates) > 0:
        n_days_to_load = 1000

print(f"n_days_to_load: {n_days_to_load}")    

NameError: name 'DAILY_PRIMARY_FEATURES_DIR' is not defined

In [6]:
recent_data = load_recent_data_from_file(DAILY_DATA_DIR, n_days=n_days_to_load, ascending=False).reset_index().sort_values(
    by=["bloomberg_ticker", "date"]
)

recent_data[
    ["open", "close", "high", "low", "adjusted_close", "dividend_amount", "split_ratio"]
] = recent_data[
    ["open", "close", "high", "low", "adjusted_close", "dividend_amount", "split_ratio"]
].astype(
    "float16"
)

# filter out tickers with less than 100 days of data
recent_data = recent_data.groupby("bloomberg_ticker").filter(lambda x: len(x) > 100)
recent_data = recent_data.groupby("date").filter(lambda x: len(x) > 500)
gc.collect()

recent_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6557330 entries, 6566927 to 4088
Data columns (total 13 columns):
 #   Column            Dtype         
---  ------            -----         
 0   date              datetime64[ns]
 1   open              float16       
 2   high              float16       
 3   low               float16       
 4   close             float16       
 5   adjusted_close    float16       
 6   volume            float64       
 7   data_provider     object        
 8   bloomberg_ticker  object        
 9   dividend_amount   float16       
 10  split_ratio       float16       
 11  date_str          object        
 12  split_factor      object        
dtypes: datetime64[ns](1), float16(7), float64(1), object(4)
memory usage: 437.7+ MB


In [7]:


function_to_window: dict = {
    simple_moving_average: [5, 10, 20, 50, 100, 200],
    exponential_moving_average: [5, 10, 20, 50, 100, 200],
    bollinger_bands: [5, 10, 20, 50, 100, 200],
    rsi: [5, 10, 20, 50, 100, 200],
    average_true_range: [5, 10, 20, 50, 100, 200],
    macd: [(12, 26), (20, 50)],
}


def compute_features(df):
    features = []
    for func, windows in function_to_window.items():
        for window in windows:
            # pass windows as a tuple if the function takes more than one window
            if isinstance(window, tuple):
                _feat = func(df, *window)
            else:
                _feat = func(df, window)

            if isinstance(_feat, tuple):
                features.extend(_feat)
            else:
                features.append(_feat)

    # print type of features
    cated = cudf.concat(features, axis=1).astype("float32").add_prefix("feature_1_")
    return cated


In [8]:
tickers_list = recent_data["bloomberg_ticker"].unique().tolist()

# iterate over ticker chunks in 500
res = []
for i in tqdm(range(0, len(tickers_list), 1000)):
    tickers = tickers_list[i : i + 1000]
    # print(tickers)
    tickers_data = recent_data[recent_data["bloomberg_ticker"].isin(tickers)]

    _df_gpu = cudf.from_pandas(tickers_data)
    _res = compute_features(_df_gpu)
    _res = _res.to_pandas().astype("float16")
    _res["date"] = _df_gpu["date"].to_pandas()
    _res["bloomberg_ticker"] = _df_gpu["bloomberg_ticker"].to_pandas()
    _res["close"] = _df_gpu["close"].to_pandas()
    _res["volume"] = _df_gpu["volume"].to_pandas()
    _res["open"] = _df_gpu["open"].to_pandas()
    _res["high"] = _df_gpu["high"].to_pandas()
    _res["low"] = _df_gpu["low"].to_pandas()

    res.append(_res)

    del _df_gpu, _res


100%|██████████| 9/9 [00:14<00:00,  1.61s/it]


In [9]:
gc.collect()
res = pd.concat(res, axis=0)
res = res.dropna(axis=0)

# convert float 16 to float 32 in a loop
for col in res.columns:
    if res[col].dtype == "float16":
        res[col] = res[col].astype("float32")
    gc.collect()

gc.collect()

del recent_data
gc.collect()

0

In [10]:
# loop over all unique dates in chunks of 100; save each chunk to a separate file
res["date_str"] = res["date"].dt.strftime("%Y-%m-%d")
dates = res["date_str"].unique()

for i in tqdm(range(0, len(dates), 100)):
    # use save_in_folders function to save each chunk to a separate folder
    _tmp = res[res["date_str"].isin(dates[i : i + 100])]
    save_in_folders(_tmp, DAILY_PRIMARY_FEATURES_DIR)

    del _tmp
    gc.collect()

100%|██████████| 100/100 [00:01<00:00, 55.55it/s]
100%|██████████| 100/100 [00:00<00:00, 381.64it/s]
100%|██████████| 100/100 [00:00<00:00, 395.25it/s]
100%|██████████| 100/100 [00:00<00:00, 365.26it/s]
100%|██████████| 100/100 [00:00<00:00, 349.17it/s]
100%|██████████| 100/100 [00:00<00:00, 393.26it/s]
100%|██████████| 100/100 [00:00<00:00, 382.34it/s]
100%|██████████| 100/100 [00:00<00:00, 394.20it/s]
100%|██████████| 40/40 [00:00<00:00, 900.74it/s]
100%|██████████| 9/9 [00:11<00:00,  1.23s/it]


In [20]:
_tmp = pd.read_parquet("/mnt/d/nmr/signalslite/data/01_daily_adjusted/2005/03/2005-03-01.parquet")