In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import matplotlib.pyplot as plt

import os

save_path = "./result/xgboost"
model_path = "./data/model_data/models/xgboost"
os.makedirs(save_path, exist_ok=True)
os.makedirs(model_path, exist_ok=True)

In [2]:
feature_path = "./data/model_data/period_features"

def load_feature(num_period: int, feature_path: str = feature_path) -> pd.DataFrame:
    data = pd.read_parquet(f"{feature_path}/features_period_{num_period}.parquet")

    date_range = data["Date"].sort_values().unique()
    data = data.set_index("Date")

    train_data = data.loc[date_range[:-250]]
    test_data = data.loc[date_range[-250:]]

    return train_data.reset_index(), test_data.reset_index()

In [4]:
performance_stats = {}
feature_importance = {}

In [5]:
def xgboost_single_period(period):
    train_data, test_data = load_feature(period)
    X_train = train_data.drop(columns=["Date", "Ticker", "Return_tomorrow", "Target"])
    y_train = train_data["Target"]
    X_test = test_data.drop(columns=["Date", "Ticker", "Return_tomorrow", "Target"])
    y_test = test_data["Target"]

    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)

    params = {
        'booster': 'gbtree',
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'max_depth': 5,
        'learning_rate': 0.1,
        'lambda': 0.1, # L2 regularization term on weights
        'subsample': 0.75,
        'colsample_bytree': 0.75,
        'min_child_weight': 2,
        'nthread': 8, # Number of threads
    }

    wachlist = [(dtrain, 'train'), (dtest, 'test')]
    num_round = 1000
    model = xgb.train(params, dtrain, num_round, evals=wachlist, early_stopping_rounds=10, verbose_eval=False)

    y_pred_proba = model.predict(dtest)
    y_pred = np.where(y_pred_proba > 0.5, 1, 0)

    result_df = test_data[["Date", "Ticker", "Return_tomorrow", "Target"]].copy()
    result_df["pred"] = y_pred
    result_df["pred_proba"] = y_pred_proba

    performance_stats[period] = {
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred),
        "recall": recall_score(y_test, y_pred),
        "f1": f1_score(y_test, y_pred),
        "roc_auc": roc_auc_score(y_test, y_pred),
        "gini_index": 2 * roc_auc_score(y_test, y_pred) - 1,
    }
    feature_importance[period] = model.get_fscore()
    
    plt.figure(figsize=(12, 6))
    xgb.plot_importance(model, ax=plt.gca(), title=f"XGBoost Feature Importance for Period {period}")
    plt.savefig(f"{save_path}/xgboost_feature_importance_{period}.png")
    plt.close()

    result_df.to_parquet(f"{save_path}/xgboost_period_{period}.parquet")
    model.save_model(f"{model_path}/xgboost_period_{period}.json")
    return model

In [6]:
model = xgboost_single_period(1)
performance_stats, feature_importance[1]

({1: {'accuracy': 0.5180633020835099,
   'precision': 0.51716215953118,
   'recall': 0.45314659069584945,
   'f1': 0.48304267971122544,
   'roc_auc': 0.5176608195666089,
   'gini_index': 0.03532163913321784}},
 {'rtn_1': 62.0,
  'rtn_2': 56.0,
  'rtn_3': 33.0,
  'rtn_4': 27.0,
  'rtn_5': 40.0,
  'rtn_6': 48.0,
  'rtn_7': 29.0,
  'rtn_8': 22.0,
  'rtn_9': 29.0,
  'rtn_10': 39.0,
  'rtn_11': 24.0,
  'rtn_12': 17.0,
  'rtn_13': 18.0,
  'rtn_14': 23.0,
  'rtn_15': 15.0,
  'rtn_16': 27.0,
  'rtn_17': 19.0,
  'rtn_18': 25.0,
  'rtn_19': 23.0,
  'rtn_20': 18.0,
  'rtn_40': 25.0,
  'rtn_60': 29.0,
  'rtn_80': 38.0,
  'rtn_100': 22.0,
  'rtn_120': 31.0,
  'rtn_140': 39.0,
  'rtn_160': 30.0,
  'rtn_180': 25.0,
  'rtn_200': 35.0,
  'rtn_220': 27.0,
  'rtn_240': 41.0})

In [7]:
from tqdm import tqdm

for period in tqdm(range(25)):
    model = xgboost_single_period(period)

100%|██████████| 25/25 [01:02<00:00,  2.51s/it]


In [8]:
performance_stats_df = pd.DataFrame(performance_stats)
performance_stats_df.to_csv(f"{save_path}/xgboost_performance_stats.csv")

In [9]:
feature_importance_df = pd.DataFrame(feature_importance)
feature_importance_df["mean"] = feature_importance_df.mean(axis=1)
feature_importance_df = feature_importance_df.sort_values("mean", ascending=False)
feature_importance_df.to_csv(f"{save_path}/xgboost_feature_importance.csv")

In [10]:
all_df = pd.concat([pd.read_parquet(f"{save_path}/xgboost_period_{i}.parquet") for i in range(25)])
all_df.to_parquet(f"{save_path}/xgboost_all_periods.parquet")

In [11]:
all_df["Date"].nunique()

6250