In [2]:
import numpy as np
import pandas as pd
import os
from pathlib import Path
import gc
import multiprocessing
from joblib import Parallel, delayed
from tqdm import tqdm

from typing import Union, List, Tuple, Optional, Dict, Any
import cudf
import numba
from numba import cuda

import itertools

from src.data_utils import (
    load_recent_data_from_file,
    save_daily_data,
    save_in_folders,
    get_latest_date,
    read_available_dates,
)
from src.config import DAILY_DATA_DIR, DATA_DIR, DAILY_PRIMARY_FEATURES_DIR


In [3]:
def get_combination_ratio(
    df: Union[pd.DataFrame, "cudf.DataFrame"], feature_prefix: str
):
    """Build ratio features from every column whose name contains ``feature_prefix``.

    Two families of columns are produced:

    * for every unordered pair ``(f1, f2)`` of matching columns, taken in
      column order: ``1 - df[f2] / df[f1]``, named
      ``feature_2_ratio_<type>_<w1>_<w2>``;
    * for every matching column ``f``: ``1 - df["close"] / df[f]``, named
      ``feature_2_ratio_<type>_<w>_close``.

    ``<type>`` is the third ``_``-separated token of the (first) column name
    and ``<w>`` is the last token, so source columns are expected to look
    like ``feature_1_sma_20``.

    Args:
        df: frame holding the matching feature columns and a ``close`` column.
        feature_prefix: substring used to select the source columns.

    Returns:
        A float32 DataFrame with one column per ratio. When no column matches
        ``feature_prefix`` an empty frame sharing ``df``'s index is returned
        instead of letting ``pd.concat`` raise on an empty list.

    Note:
        Zero denominators are not guarded here; any resulting ``inf``/``NaN``
        values are left in the output for the caller to handle.
        NOTE(review): results are joined with ``pd.concat``, so cudf input is
        presumably not supported despite the annotation -- confirm.
    """
    feature_cols = [col for col in df.columns if feature_prefix in col]

    ratios = []

    # Pairwise ratios between the selected features.
    for first, second in itertools.combinations(feature_cols, 2):
        feature_type = first.split("_")[2]  # sma, ema, macd, rsi, etc.
        first_window = first.split("_")[-1]
        second_window = second.split("_")[-1]

        ratio = 1 - (df[second] / df[first])
        ratio.name = f"feature_2_ratio_{feature_type}_{first_window}_{second_window}"
        ratios.append(ratio)

    # Ratio of the close price against each selected feature.
    for col in feature_cols:
        feature_type = col.split("_")[2]
        window = col.split("_")[-1]

        ratio = 1 - (df["close"] / df[col])
        ratio.name = f"feature_2_ratio_{feature_type}_{window}_close"
        ratios.append(ratio)

    # pd.concat refuses an empty list; hand back an empty, index-aligned frame
    # so callers can still concatenate it with other results.
    if not ratios:
        return pd.DataFrame(index=df.index)

    return pd.concat(ratios, axis=1).astype("float32")

In [4]:
# Column-name fragments selecting the primary features that ratio features are built from.
feature_prefixes = ["feature_1_sma", "feature_1_ema", "feature_1_rsi"]

def calculate_all_secondary_features(
    df: Union[pd.DataFrame, cudf.DataFrame], feature_prefixes: List[str]
):
    """Compute the ratio features for each prefix and join them column-wise.

    For every entry of ``feature_prefixes`` the columns of ``df`` whose name
    contains that entry are selected together with ``close`` and handed to
    ``get_combination_ratio``. The per-prefix results are concatenated along
    the column axis and returned as float32.

    Args:
        df: frame with the source feature columns and a ``close`` column.
        feature_prefixes: substrings identifying the feature groups to process.

    Returns:
        A float32 DataFrame holding the ratio columns of all prefixes.
    """
    per_prefix_frames = []
    for prefix in feature_prefixes:
        matching = [name for name in df.columns if prefix in name]
        # Pass only the columns this prefix needs, plus the close price.
        subset = df.loc[:, [*matching, "close"]]
        per_prefix_frames.append(get_combination_ratio(subset, prefix))
        gc.collect()

    combined = pd.concat(per_prefix_frames, axis=1)
    return combined.astype("float32")


In [5]:
# Controls where the processing loop below starts: at index 0 of the available
# dates when True, otherwise at the last 1000 entries only.
FROM_SCRATCH = True

In [6]:
# Number of dates loaded and processed per loop iteration.
CHUNK_SIZE = 1000

dates = read_available_dates(DAILY_PRIMARY_FEATURES_DIR)

# A full rebuild starts at the first date; otherwise only the most recent chunk
# is processed. Clamp at 0 so a history shorter than one chunk cannot produce a
# negative start index / loader offset.
start_index = 0 if FROM_SCRATCH else max(0, len(dates) - CHUNK_SIZE)

# iterate over all dates in chunks of CHUNK_SIZE
for i in tqdm(range(start_index, len(dates), CHUNK_SIZE)):
    print(i)
    _df = load_recent_data_from_file(
        DAILY_PRIMARY_FEATURES_DIR,
        n_days=CHUNK_SIZE,
        ascending=True,
        offset=i,
        dtype="float32",
    )

    # Not used in this cell; kept as a notebook-level name.
    feat_cols = [f for f in _df if "feature_" in f]

    # calculate_all_secondary_features already returns float32 columns.
    _res = calculate_all_secondary_features(_df, feature_prefixes)

    # combine primary and secondary features
    _res = pd.concat([_df, _res], axis=1)
    # Ratios with a zero denominator produce +/-inf; treat them as missing.
    _res = _res.replace([np.inf, -np.inf], np.nan)
    _res = _res.dropna(axis=0)

    # NOTE(review): rows containing NaN were just dropped, so this check can
    # only trip when the frame ends up empty (the mean of nothing is NaN).
    # Confirm whether the NaN-ratio check was meant to run before dropna.
    assert _res.isna().mean().max() < 0.1, "too many NaN values found"
    save_in_folders(_res, os.path.join(DATA_DIR, "03_secondary_features"))

    gc.collect()


  0%|          | 0/7 [00:00<?, ?it/s]

0


100%|██████████| 1000/1000 [00:41<00:00, 23.87it/s]
 14%|█▍        | 1/7 [01:12<07:15, 72.56s/it]

1000


100%|██████████| 1000/1000 [00:36<00:00, 27.10it/s]
 29%|██▊       | 2/7 [02:24<06:00, 72.07s/it]

2000


100%|██████████| 1000/1000 [00:39<00:00, 25.41it/s]
 43%|████▎     | 3/7 [03:41<04:56, 74.22s/it]

3000


100%|██████████| 1000/1000 [00:38<00:00, 25.95it/s]
 57%|█████▋    | 4/7 [05:03<03:51, 77.31s/it]

4000


100%|██████████| 1000/1000 [00:38<00:00, 26.29it/s]
 71%|███████▏  | 5/7 [06:25<02:38, 79.15s/it]

5000


100%|██████████| 1000/1000 [00:38<00:00, 26.18it/s]
 86%|████████▌ | 6/7 [07:47<01:20, 80.26s/it]

6000


100%|██████████| 117/117 [00:03<00:00, 31.89it/s]
100%|██████████| 7/7 [08:02<00:00, 68.92s/it]
