In [6]:
import numpy as np
import pandas as pd
import os
from pathlib import Path
import gc
import multiprocessing
from joblib import Parallel, delayed
from tqdm import tqdm

from typing import Union, List, Tuple, Optional, Dict, Any
import cudf
import numba
from numba import cuda

import itertools

from signalslite.data_utils import (
    load_recent_data_from_file,
    save_daily_data,
    save_in_folders,
    get_latest_date,
    read_available_dates,
)
from signalslite.constants import DAILY_DATA_DIR, DATA_DIR, DAILY_PRIMARY_FEATURES_DIR, DAILY_SECONDARY_FEATURES_DIR


In [13]:
# Re-point the imported directory constants one level up ("..") when the
# current working directory contains an entry named "data".
# NOTE(review): the constants are rebound in terms of themselves, so running
# this cell a second time without re-running the import cell prepends another
# ".." (and another DATA_DIR) — re-run the imports first.
if "data" in os.listdir("."):
    print("data folder already exists")
    # DAILY_DATA_DIR is first nested under DATA_DIR, then prefixed with "..".
    DAILY_DATA_DIR = DATA_DIR / DAILY_DATA_DIR
    DAILY_DATA_DIR = ".." / DAILY_DATA_DIR
    # NOTE(review): unlike DAILY_DATA_DIR, the two feature directories are not
    # joined with DATA_DIR before the ".." prefix — confirm this is intended.
    DAILY_PRIMARY_FEATURES_DIR = ".." / DAILY_PRIMARY_FEATURES_DIR
    DAILY_SECONDARY_FEATURES_DIR = ".." / DAILY_SECONDARY_FEATURES_DIR

data folder already exists


In [4]:
def get_combination_ratio(
    df: "Union[pd.DataFrame, cudf.DataFrame]", feature_prefix: str
):
    """Build ratio features for the columns whose name contains ``feature_prefix``.

    Two groups of columns are produced, in this order:

    * for every pair ``(f1, f2)`` of matching columns (pairs are taken in
      column order): ``1 - df[f2] / df[f1]``, named
      ``feature_2_ratio_<type>_<w1>_<w2>``;
    * for every matching column ``f``: ``1 - df["close"] / df[f]``, named
      ``feature_2_ratio_<type>_<w>_close``.

    ``<type>`` is the third ``_``-separated token of the column name and
    ``<w>`` is its last token (e.g. ``feature_1_sma_5`` -> ``sma`` / ``5``).

    Zero denominators are not handled here; whatever the division yields is
    returned as-is.

    Args:
        df: frame holding the matching feature columns and a ``close`` column.
            NOTE(review): the result is assembled with ``pd.concat``, so cudf
            inputs are presumably not supported end-to-end — confirm.
        feature_prefix: substring used to select the feature columns.

    Returns:
        A ``float32`` DataFrame with one column per ratio described above.

    Raises:
        ValueError: if no column name contains ``feature_prefix``.
        KeyError: if ``df`` has no ``close`` column.
    """
    feature_cols = [col for col in df.columns if feature_prefix in col]

    # Fail early with a message that names the prefix, instead of letting
    # pd.concat complain about an empty list further down.
    if not feature_cols:
        raise ValueError(
            f"no columns containing {feature_prefix!r} found; cannot build ratios"
        )
    # Check up front so a missing column is reported before the pairwise work.
    if "close" not in df.columns:
        raise KeyError("'close' column is required to build close-price ratios")

    ratios = []

    # Pairwise ratios between windows of the same feature family.
    for first, second in itertools.combinations(feature_cols, 2):
        feature_type = first.split("_")[2]  # sma, ema, macd, rsi, etc.
        first_window = first.split("_")[-1]
        second_window = second.split("_")[-1]

        ratio = 1 - (df[second] / df[first])
        ratio.name = f"feature_2_ratio_{feature_type}_{first_window}_{second_window}"
        ratios.append(ratio)

    # Ratio of the close price against each feature.
    for col in feature_cols:
        feature_type = col.split("_")[2]
        window = col.split("_")[-1]

        ratio = 1 - (df["close"] / df[col])
        ratio.name = f"feature_2_ratio_{feature_type}_{window}_close"
        ratios.append(ratio)

    result = pd.concat(ratios, axis=1).astype("float32")

    # One collection after the intermediates are released is enough; running
    # a full collection on every loop iteration only adds overhead.
    del ratios
    gc.collect()

    return result

In [5]:
# Column-name fragments of the primary feature families for which secondary
# ratio features are derived.
feature_prefixes = [f"feature_1_{family}" for family in ("sma", "ema", "rsi")]


def calculate_all_secondary_features(
    df: Union[pd.DataFrame, cudf.DataFrame], feature_prefixes: List[str]
):
    """Compute the ratio features for every prefix and join them column-wise.

    For each entry of ``feature_prefixes`` the columns whose name contains
    that entry are selected together with ``close`` and handed to
    ``get_combination_ratio``; the per-prefix results are concatenated along
    the column axis and returned as ``float32``.
    """
    per_prefix_frames = []

    for prefix in feature_prefixes:
        # Pass only the columns this prefix needs, plus the close price.
        matching_cols = [col for col in df.columns if prefix in col]
        subset = df.loc[:, matching_cols + ["close"]]

        per_prefix_frames.append(get_combination_ratio(subset, prefix))
        gc.collect()

    combined = pd.concat(per_prefix_frames, axis=1)
    return combined.astype("float32")


In [17]:
# Preview of the backlog: when the secondary-features folder exists, print how
# many dates are available in the primary and secondary folders and the
# resulting number of days to load.
if os.path.exists(DAILY_SECONDARY_FEATURES_DIR):
    primary_data_dates = read_available_dates(DAILY_PRIMARY_FEATURES_DIR)
    print(f"primary_dates: {len(primary_data_dates)}")
    secondary_data_dates = read_available_dates(DAILY_SECONDARY_FEATURES_DIR)
    print(f"secondary_features_dates: {len(secondary_data_dates)}")

    # Difference in date counts plus a fixed margin of 10.
    # NOTE(review): the margin is presumably a safety overlap — confirm; the
    # value is only printed by the cells visible here.
    n_days_to_load = len(primary_data_dates) - len(secondary_data_dates) + 10

    print(f"n_days_to_load: {n_days_to_load}")

primary_dates: 6123
secondary_features_dates: 6117
n_days_to_load: 16


In [22]:
def load_process_save(chunk_size: int = 1000):
    """Compute secondary ratio features chunk by chunk and save them.

    Primary features are loaded ``chunk_size`` days at a time, the secondary
    features are computed with ``calculate_all_secondary_features``, both are
    joined column-wise, infinities are turned into NaN, rows containing any
    NaN are dropped, and the result is handed to ``save_in_folders``.

    When the secondary-features folder already exists, processing resumes
    ``chunk_size`` dates before the number of dates already saved (never
    below 0), so the most recent saved chunk is recomputed. When it does not
    exist yet, every primary date is processed from the beginning.

    Args:
        chunk_size: number of days per chunk and size of the resume rewind.
            Defaults to 1000, the value previously hard-coded.
            NOTE(review): assumes ``offset`` in ``load_recent_data_from_file``
            is a start position into the ascending date list — confirm.

    Raises:
        ValueError: if ``chunk_size`` is not positive.
        AssertionError: if the NaN check on a processed chunk fails.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")

    feature_prefixes = [
        "feature_1_sma",
        "feature_1_ema",
        "feature_1_rsi",
    ]

    # Always enumerate the primary dates: leaving this list empty when the
    # secondary folder is missing would make a first run do nothing at all.
    dates = read_available_dates(DAILY_PRIMARY_FEATURES_DIR)
    print(f"primary_dates: {len(dates)}")
    start_index = 0

    if os.path.exists(DAILY_SECONDARY_FEATURES_DIR):
        secondary_data_dates = read_available_dates(DAILY_SECONDARY_FEATURES_DIR)
        print(f"secondary_features_dates: {len(secondary_data_dates)}")

        # Informational only; the loop below is driven by start_index.
        n_days_to_load = len(dates) - len(secondary_data_dates) + 10
        print(f"n_days_to_load: {n_days_to_load}")

        # Rewind one chunk, clamped so a short history cannot yield a
        # negative offset.
        start_index = max(0, len(secondary_data_dates) - chunk_size)

    # iterate over the dates in chunks of `chunk_size`
    for offset in tqdm(range(start_index, len(dates), chunk_size)):
        print(offset)
        _df = load_recent_data_from_file(
            DAILY_PRIMARY_FEATURES_DIR,
            n_days=chunk_size,
            ascending=True,
            offset=offset,
            dtype="float32",
        )

        # The helper already returns float32, so no extra cast is needed.
        secondary = calculate_all_secondary_features(_df, feature_prefixes)

        # combine primary and secondary features
        combined = pd.concat([_df, secondary], axis=1)
        combined = combined.replace([np.inf, -np.inf], np.nan)
        combined = combined.dropna(axis=0)

        # NOTE(review): this runs after dropna, so for a non-empty frame the
        # NaN share is always 0; it appears to trip only when every row was
        # dropped (the mean of an empty column is NaN) — confirm the intent.
        assert (
            combined.isna().mean().max() < 0.1
        ), "too many NaN values found"
        save_in_folders(combined, DAILY_SECONDARY_FEATURES_DIR)

        # Release the chunk before the next one is loaded.
        del _df, secondary, combined
        gc.collect()


# Run the secondary-feature build; the printed counts and progress bars
# are its output.
load_process_save()


primary_dates: 6123
secondary_features_dates: 6123
n_days_to_load: 10


  0%|          | 0/1 [00:00<?, ?it/s]

5123


100%|██████████| 1000/1000 [00:01<00:00, 556.29it/s]
100%|██████████| 1/1 [00:23<00:00, 23.02s/it]
