In [1]:
import pandas as pd
import os
from pathlib import Path
import gc
import multiprocessing
from joblib import Parallel, delayed
from tqdm import tqdm

import cudf
import numba
from numba import cuda
import numpy as np
import numerapi

# parallelize the process on all columns using joblib
import multiprocessing
from joblib import Parallel, delayed
from tqdm import tqdm

from signals_lite.data.data_utils import (
    load_recent_data_from_file,
    save_daily_data,
    save_in_folders,
    get_latest_date,
    read_available_dates
)
from signals_lite.constants import (
    DAILY_DATA_DIR,
    DATA_DIR,
    DAILY_PRIMARY_FEATURES_DIR,
    DAILY_SECONDARY_FEATURES_DIR,
    DAILY_SCALED_FEATURES_DIR,
)


In [2]:
def update_historical_file():
    """Refresh the local copy of the Numerai Signals historical CSV.

    Reads the CSV found at the ``HISTORICAL_DATA_URL`` attribute of a
    ``numerapi.SignalsAPI`` client and writes it, without the pandas index,
    to ``DATA_DIR / "numerai_signals_historical.csv"``. An existing file at
    that path is overwritten.

    Returns:
        None. The function is used purely for its file-writing side effect.
    """
    signals_api = numerapi.SignalsAPI(verbosity="info")
    # Pull the full remote CSV into memory first, then persist it locally.
    historical = pd.read_csv(signals_api.HISTORICAL_DATA_URL)
    destination = DATA_DIR / "numerai_signals_historical.csv"
    historical.to_csv(destination, index=False)

# update_historical_file()

In [3]:
# Load the locally stored Numerai Signals history and derive a datetime `date`
# column from `friday_date`, which is parsed with the YYYYMMDD format.
historical_df = pd.read_csv(DATA_DIR / "numerai_signals_historical.csv").assign(
    date=lambda frame: pd.to_datetime(frame["friday_date"], format="%Y%m%d")
)

In [None]:
# Chunk size for the loop below. It is used both as `n_days` and as the stride
# of the `offset` values (the original hard-coded 200 in both places).
CHUNK_DAYS = 200

dates = read_available_dates(DAILY_SCALED_FEATURES_DIR)

# The dates present in the historical targets do not change between chunks,
# so compute them once instead of on every iteration.
_historical_dates = historical_df["date"].unique()
_historical_date_set = set(_historical_dates)

# Per-chunk merge results; concatenated into `merged_df` after the loop.
# A separate name keeps `merged_df` from being a list at one point and a
# DataFrame at another.
merged_chunks = []

for i in tqdm(range(0, len(dates), CHUNK_DAYS)):
    _tmp = load_recent_data_from_file(
        DAILY_SCALED_FEATURES_DIR, n_days=CHUNK_DAYS, ascending=True, offset=i
    )
    _tmp = _tmp.reset_index(drop=True)
    _tmp = _tmp.sort_values(["date", "bloomberg_ticker"])
    feature_columns = [f for f in _tmp.columns if f.startswith("feature")]

    # Skip chunks that share no date with the historical targets: every row
    # of such a chunk would be dropped by the `target_20d` filter anyway.
    common_dates = list(_historical_date_set.intersection(_tmp["date"].unique()))
    if len(common_dates) == 0:
        continue

    # Right join keeps every feature row; rows without a matching historical
    # record get NaN targets and are removed by the dropna below.
    _merged = pd.merge(
        historical_df,
        _tmp,
        how="right",
        on=["date", "bloomberg_ticker"],
    )
    _merged = _merged.dropna(subset=["target_20d"], axis=0)

    # Disabled sanity check kept from the original notebook:
    # assert abs(_merged["feature_2_ratio_rsi_50_close"].mean() - 2) < 0.05

    if len(_merged) > 0:
        merged_chunks.append(_merged)

# pd.concat raises an opaque "No objects to concatenate" on an empty list;
# fail with a message that says what actually went wrong.
if not merged_chunks:
    raise ValueError(
        "No feature rows could be matched to historical targets: "
        f"checked {len(dates)} available dates in {DAILY_SCALED_FEATURES_DIR}"
    )

# Each chunk carries its own positional index from the merge, so the labels
# repeat across chunks; ignore_index gives the result one clean RangeIndex.
merged_df = pd.concat(merged_chunks, ignore_index=True)


In [5]:
# Inspect `_merged`, the loop variable left behind by the chunk loop in the
# previous cell (the result of the last chunk that reached the merge step).
_merged

Unnamed: 0,bloomberg_ticker,friday_date,data_type,target_4d,target_20d,target_20d_raw_return,target_20d_factor_neutral,target_20d_factor_feat_neutral,date,feature_1_sma_5,...,feature_2_ratio_rsi_50_100,feature_2_ratio_rsi_50_200,feature_2_ratio_rsi_100_200,feature_2_ratio_rsi_5_close,feature_2_ratio_rsi_10_close,feature_2_ratio_rsi_20_close,feature_2_ratio_rsi_50_close,feature_2_ratio_rsi_100_close,feature_2_ratio_rsi_200_close,date_str
1000743,1332 JP,20030131.0,train,0.75,0.50,0.75,0.50,0.5,2003-01-31,4,...,3,3,2,0,0,0,0,0,0,2003-01-31
1000745,1801 JP,20030131.0,train,0.50,0.75,0.75,0.50,0.5,2003-01-31,4,...,4,4,1,0,0,0,0,0,0,2003-01-31
1000746,1802 JP,20030131.0,train,0.25,0.50,0.75,0.75,0.5,2003-01-31,4,...,4,4,2,0,0,0,0,0,0,2003-01-31
1000747,1803 JP,20030131.0,train,0.50,0.50,0.50,0.50,0.5,2003-01-31,4,...,1,1,2,0,0,0,0,0,0,2003-01-31
1000749,1812 JP,20030131.0,train,0.25,0.50,0.50,0.50,0.5,2003-01-31,4,...,0,0,0,0,0,0,0,0,0,2003-01-31
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1005180,ZC FP,20030131.0,train,0.50,0.50,0.25,0.25,0.5,2003-01-31,1,...,1,1,2,4,3,3,3,3,3,2003-01-31
1005187,ZLC US,20030131.0,train,0.50,0.50,0.50,0.50,0.5,2003-01-31,3,...,2,3,3,1,1,1,1,1,1,2003-01-31
1005189,ZOLL US,20030131.0,train,0.25,0.50,0.50,0.50,0.5,2003-01-31,3,...,3,3,4,2,2,2,1,1,1,2003-01-31
1005193,ZQKSQ US,20030131.0,train,0.50,0.50,0.50,0.50,0.5,2003-01-31,1,...,2,2,2,2,3,3,3,3,3,2003-01-31


In [6]:
def prepare_live():
    """Load the live feature frame from the scaled-features directory.

    Prints the value returned by ``get_latest_date`` for
    ``DAILY_SCALED_FEATURES_DIR`` (informational only; it is not passed to the
    loader), then loads data via ``load_recent_data_from_file`` with
    ``n_days=1`` and ``ascending=False`` -- presumably the single most recent
    day; confirm against the helper's implementation.

    Returns:
        The loaded frame with its index reset to a fresh positional index.
    """
    newest = get_latest_date(DAILY_SCALED_FEATURES_DIR)
    print("latest date: ", newest)

    # Request one day of data and discard whatever index the loader returned.
    live_frame = load_recent_data_from_file(
        DAILY_SCALED_FEATURES_DIR, n_days=1, ascending=False
    ).reset_index(drop=True)
    print("df shape: ", live_frame.shape)

    return live_frame


live_df = prepare_live()

latest date:  2023-07-07
df shape:  (7077, 104)


In [7]:
# Persist both frames as parquet under DATA_DIR; the pandas index is not stored.
historical_parquet_path = DATA_DIR / "merged_data_historical.parquet"
live_parquet_path = DATA_DIR / "merged_data_live.parquet"

merged_df.to_parquet(historical_parquet_path, index=False)
live_df.to_parquet(live_parquet_path, index=False)