In [1]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import matplotlib.pyplot as plt

import os

save_path = "./result/lightgbm"
model_path = "./data/model_data/models/lightgbm"
os.makedirs(save_path, exist_ok=True)
os.makedirs(model_path, exist_ok=True)

In [2]:
feature_path = "./data/model_data/period_features"

def load_feature(num_period: int, feature_path: str = feature_path) -> pd.DataFrame:
    data = pd.read_parquet(f"{feature_path}/features_period_{num_period}.parquet")

    date_range = data["Date"].sort_values().unique()
    data = data.set_index("Date")

    train_data = data.loc[date_range[:-250]]
    test_data = data.loc[date_range[-250:]]

    return train_data.reset_index(), test_data.reset_index()

In [3]:
performance_stats = {}
feature_importance = {}

In [4]:
def lightgbm_single_period(period):
    train_data, test_data = load_feature(period)
    X_train = train_data.drop(columns=["Date", "Ticker", "Return_tomorrow", "Target"])
    y_train = train_data["Target"]
    X_test = test_data.drop(columns=["Date", "Ticker", "Return_tomorrow", "Target"])
    y_test = test_data["Target"]

    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

    params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'binary_logloss',
        'num_leaves': 31,
        'learning_rate': 0.05,
        'feature_fraction': 0.9,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'verbose': 0
    }

    num_round = 100
    bst = lgb.train(params, lgb_train, num_round, valid_sets=[lgb_eval])

    y_pred_proba = bst.predict(X_test, num_iteration=bst.best_iteration)
    y_pred = (y_pred_proba >= 0.5).astype(int)

    result_df = test_data[["Date", "Ticker", "Return_tomorrow", "Target"]].copy()
    result_df["pred"] = y_pred
    result_df["pred_proba"] = y_pred_proba

    performance_stats[period] = {
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred),
        "recall": recall_score(y_test, y_pred),
        "f1": f1_score(y_test, y_pred),
        "roc_auc": roc_auc_score(y_test, y_pred),
        "gini_index": 2 * roc_auc_score(y_test, y_pred) - 1,
    }

    importance = bst.feature_importance(importance_type="gain")
    feature_names = bst.feature_name()
    feature_importance[period] = dict(zip(feature_names, importance))

    plt.figure(figsize=(12, 6))
    lgb.plot_importance(bst, importance_type="gain", title=f"LightGBM Feature Importance of Period {period}", ax=plt.gca())
    plt.savefig(f"{save_path}/lightgbm_feature_importance_{period}.png")
    plt.close()

    result_df.to_parquet(f"{save_path}/lightgbm_period_{period}.parquet")
    bst.save_model(f"{model_path}/lightgbm_period_{period}.txt")
    return bst

In [5]:
model = lightgbm_single_period(1)
performance_stats, feature_importance[1]

({1: {'accuracy': 0.5158594266533304,
   'precision': 0.5148697045845783,
   'recall': 0.4439004418362647,
   'f1': 0.4767584602136353,
   'roc_auc': 0.5154132821598137,
   'gini_index': 0.030826564319627492}},
 {'rtn_1': 2764.497874736786,
  'rtn_2': 2170.6170325279236,
  'rtn_3': 1432.3311514854431,
  'rtn_4': 1515.7570304870605,
  'rtn_5': 1396.9743785858154,
  'rtn_6': 1497.5354762077332,
  'rtn_7': 988.5767107009888,
  'rtn_8': 863.6320486068726,
  'rtn_9': 1121.8913469314575,
  'rtn_10': 900.7544021606445,
  'rtn_11': 877.1918439865112,
  'rtn_12': 795.1405396461487,
  'rtn_13': 997.7791471481323,
  'rtn_14': 849.6829509735107,
  'rtn_15': 831.118833065033,
  'rtn_16': 794.8215289115906,
  'rtn_17': 812.9528970718384,
  'rtn_18': 975.5619025230408,
  'rtn_19': 723.0829787254333,
  'rtn_20': 809.5252089500427,
  'rtn_40': 1178.5729813575745,
  'rtn_60': 1172.93270778656,
  'rtn_80': 1201.2683601379395,
  'rtn_100': 1226.7164616584778,
  'rtn_120': 1429.833945274353,
  'rtn_140': 1

In [6]:
from tqdm import tqdm

for period in tqdm(range(25)):
    model = lightgbm_single_period(period)

  0%|          | 0/25 [00:00<?, ?it/s]

100%|██████████| 25/25 [04:56<00:00, 11.88s/it]


In [7]:
performance_stats_df = pd.DataFrame(performance_stats)
performance_stats_df.to_csv(f"{save_path}/lightgbm_performance_stats.csv")

In [8]:
feature_importance_df = pd.DataFrame(feature_importance)
feature_importance_df["mean"] = feature_importance_df.mean(axis=1)
feature_importance_df = feature_importance_df.sort_values("mean", ascending=False)
feature_importance_df.to_csv(f"{save_path}/lightgbm_feature_importance.csv")

In [9]:
all_df = pd.concat([pd.read_parquet(f"{save_path}/lightgbm_period_{i}.parquet") for i in range(25)])
all_df.to_parquet(f"{save_path}/lightgbm_all_periods.parquet")

In [10]:
all_df["Date"].nunique()

6250