In [1]:
import pandas as pd
import numpy as np
import os

# Root directory of the result files: per-model inputs are read from
# sub-folders of it and the combined outputs are written directly into it.
result_path = "./result"

In [2]:
# Model families: the boosting variants are interchangeable, the fixed ones are always used.
boosting_models = ["gbdt", "xgboost", "lightgbm"]
fixeded_models = ["nn", "rf"]

# Default selection: the fixed models followed by the first boosting model.
selected_models = [*fixeded_models, boosting_models[0]]
print(selected_models)

['nn', 'rf', 'gbdt']


In [16]:
# Gather the "gini_index" row of every model's performance-stats CSV into one
# table: one row per period, one column per model.
gini_df = pd.DataFrame()
for model_name in [*boosting_models, *fixeded_models]:
    stats_path = os.path.join(result_path, model_name, f"{model_name}_performance_stats.csv")
    stats = pd.read_csv(stats_path, index_col=0)
    # Cast the column labels to int so sort_index() orders them numerically.
    stats.columns = stats.columns.map(int)
    gini_df[model_name] = stats.loc["gini_index"].sort_index()

gini_df.index.name = "period"
gini_df.head()

Unnamed: 0_level_0,gbdt,xgboost,lightgbm,nn,rf
period,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.030096,0.029776,0.026946,0.040434,0.025083
1,0.033587,0.035322,0.030827,0.043068,0.029817
2,0.024686,0.02539,0.023895,0.035488,0.022454
3,0.020979,0.020007,0.023062,0.034932,0.022003
4,0.014454,0.019216,0.01774,0.025295,0.014702


In [20]:
# Number of per-period gbdt result files to scan (gbdt_period_0 .. gbdt_period_{n_periods - 1}).
n_periods = 25
file_paths = [os.path.join(result_path, "gbdt", f"gbdt_period_{i}.parquet") for i in range(n_periods)]

# Maps period number -> {"start_date": earliest Date, "end_date": latest Date} in that file.
period_date_range = {}

for period, path in enumerate(file_paths):
    # Only the Date column is needed for the min/max, so avoid loading the rest of the file.
    df = pd.read_parquet(path, columns=["Date"])
    period_date_range[period] = {"start_date": df["Date"].min(), "end_date": df["Date"].max()}

# Transpose so that each period becomes a row with start_date / end_date columns.
period_date_range_df = pd.DataFrame(period_date_range).T
period_date_range_df.head()

Unnamed: 0,start_date,end_date
0,1999-06-25,2000-06-20
1,2000-06-21,2001-06-18
2,2001-06-19,2002-06-20
3,2002-06-21,2003-06-18
4,2003-06-19,2004-06-16


In [21]:
# Attach each period's start/end date to the Gini table (index-aligned join).
# Any date columns left over from a previous execution are dropped first, so
# re-running this cell does not fail on overlapping column names.
gini_df = gini_df.drop(columns=period_date_range_df.columns, errors="ignore").join(period_date_range_df)
gini_df.head()

Unnamed: 0_level_0,gbdt,xgboost,lightgbm,nn,rf,start_date,end_date
period,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.030096,0.029776,0.026946,0.040434,0.025083,1999-06-25,2000-06-20
1,0.033587,0.035322,0.030827,0.043068,0.029817,2000-06-21,2001-06-18
2,0.024686,0.02539,0.023895,0.035488,0.022454,2001-06-19,2002-06-20
3,0.020979,0.020007,0.023062,0.034932,0.022003,2002-06-21,2003-06-18
4,0.014454,0.019216,0.01774,0.025295,0.014702,2003-06-19,2004-06-16


In [22]:
# Write the current Gini table to a CSV file directly under the result directory.
gini_index_csv = os.path.join(result_path, "gini_index.csv")
gini_df.to_csv(gini_index_csv)

In [59]:
def ensemble_predict(selected_models):
    """Combine the predicted probabilities of several models into ensembles.

    For every model in ``selected_models`` the file
    ``<result_path>/<model>/<model>_all_periods.parquet`` is read. Each file is
    expected to provide the columns ``Date``, ``Ticker``, ``Return_tomorrow``,
    ``Target``, ``pred`` and ``pred_proba``. The per-model probabilities are
    merged on ``(Date, Ticker)`` and three ensembles are derived:

    * ``mean_*``      -- unweighted mean of the model probabilities;
    * ``gini_*``      -- weights proportional to each model's Gini index in
      the period the row belongs to;
    * ``gini_rank_*`` -- weights proportional to ``1 / rank``, where the model
      with the highest Gini index in the period has rank 1.

    Each ``*_pred`` column is 1 where the matching ``*_proba`` exceeds 0.5,
    otherwise 0.

    Relies on the notebook-level ``result_path`` and ``gini_df`` (indexed by
    period, with one column per model plus ``start_date`` / ``end_date``).

    Parameters
    ----------
    selected_models : list of str
        Names of the models to ensemble; must not be empty.

    Returns
    -------
    pandas.DataFrame
        The merged frame with the per-model ``*_proba`` / ``*_gini`` columns
        and the ensemble columns. The same frame is also written to
        ``<result_path>/<models joined by "_">_ensemble.parquet``.

    Raises
    ------
    ValueError
        If ``selected_models`` is empty.
    """
    if not selected_models:
        raise ValueError("selected_models must contain at least one model name")

    # Column names are needed repeatedly below; build them once.
    proba_cols = [model + "_proba" for model in selected_models]
    gini_cols = [model + "_gini" for model in selected_models]

    # merge the original probabilities
    ensemble_df = None
    for model in selected_models:
        path = os.path.join(result_path, model, model + "_all_periods.parquet")
        df = pd.read_parquet(path).rename(columns={"pred_proba": model + "_proba"})
        if ensemble_df is None:
            # The first model also contributes the shared Return_tomorrow / Target columns.
            ensemble_df = df.drop(columns=["pred"])
        else:
            ensemble_df = pd.merge(
                ensemble_df,
                df.drop(columns=["pred", "Return_tomorrow", "Target"]),
                on=["Date", "Ticker"],
            )
    ensemble_df = ensemble_df.set_index("Date")

    # add period information
    # .loc label slices include both endpoints.
    # NOTE(review): this presumably relies on the merged rows being ordered by Date -- confirm.
    for period, gini_data in gini_df.iterrows():
        start_date = gini_data["start_date"]
        end_date = gini_data["end_date"]
        ensemble_df.loc[start_date:end_date, "period"] = period

    # add gini index of every period
    for model, gini_col in zip(selected_models, gini_cols):
        ensemble_df[gini_col] = ensemble_df["period"].map(gini_df[model])

    proba_values = ensemble_df[proba_cols].values
    gini_values = ensemble_df[gini_cols].values

    # mean ensemble
    ensemble_df["mean_proba"] = ensemble_df[proba_cols].mean(axis=1)
    ensemble_df["mean_pred"] = (ensemble_df["mean_proba"] > 0.5).astype(int)

    # gini ensemble: each model's weight is its share of the row's total Gini
    gini_weights = gini_values / ensemble_df[gini_cols].sum(axis=1).values.reshape(-1, 1)
    ensemble_df["gini_proba"] = (proba_values * gini_weights).sum(axis=1)
    ensemble_df["gini_pred"] = (ensemble_df["gini_proba"] > 0.5).astype(int)

    # gini rank ensemble
    # argsort is ascending, so writing n..1 along that order gives the largest
    # Gini rank 1 and the smallest rank n; done for all rows at once.
    order = np.argsort(gini_values, axis=1)
    ranks = np.zeros_like(order, dtype=float)
    np.put_along_axis(ranks, order, np.arange(len(selected_models), 0, -1), axis=1)
    inverse_ranks = 1 / ranks
    rank_weights = inverse_ranks / inverse_ranks.sum(axis=1).reshape(-1, 1)
    ensemble_df["gini_rank_proba"] = (proba_values * rank_weights).sum(axis=1)
    ensemble_df["gini_rank_pred"] = (ensemble_df["gini_rank_proba"] > 0.5).astype(int)

    # save the ensemble result
    ensemble_df = ensemble_df.drop(columns=["period"]).reset_index()
    save_name = "_".join(selected_models) + "_ensemble.parquet"
    ensemble_df.to_parquet(os.path.join(result_path, save_name))
    return ensemble_df

In [62]:
from tqdm import tqdm

# Build and save one ensemble per boosting model, each combined with the fixed
# models. Iterating the list itself keeps the loop in sync with
# `boosting_models` instead of relying on a hard-coded count.
for boosting_model in tqdm(boosting_models):
    selected_models = fixeded_models + [boosting_model]
    ensemble_df = ensemble_predict(selected_models)

  0%|          | 0/3 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:46<00:00, 15.39s/it]


In [63]:
# Read back the ensemble file that corresponds to the current `selected_models`
# and preview its first rows.
save_name = "_".join(selected_models) + "_ensemble.parquet"
saved_ensemble = pd.read_parquet(os.path.join(result_path, save_name))
saved_ensemble.head()

Unnamed: 0,Date,Ticker,Return_tomorrow,Target,nn_proba,rf_proba,lightgbm_proba,nn_gini,rf_gini,lightgbm_gini,mean_proba,mean_pred,gini_proba,gini_pred,gini_rank_proba,gini_rank_pred
0,1999-06-25,AAPL,0.008848,1,0.521082,0.528359,0.533897,0.040434,0.025083,0.026946,0.527779,1,0.526791,1,0.5259,1
1,1999-06-25,ABMD,-0.027577,0,0.479983,0.483227,0.480927,0.040434,0.025083,0.026946,0.481379,0,0.481138,0,0.48083,0
2,1999-06-25,ABT,0.007118,0,0.496939,0.505681,0.503653,0.040434,0.025083,0.026946,0.502091,1,0.501267,1,0.50036,1
3,1999-06-25,ACGL,-0.037271,0,0.47831,0.490207,0.486377,0.040434,0.025083,0.026946,0.484965,0,0.483888,0,0.482673,0
4,1999-06-25,ADBE,0.026629,1,0.473703,0.446274,0.468821,0.040434,0.025083,0.026946,0.462933,0,0.464839,0,0.467384,0


In [64]:
from sklearn.metrics import accuracy_score, roc_auc_score

# Report accuracy (hard predictions) and ROC AUC (probabilities) of each
# ensembling method against the Target column of the current `ensemble_df`.
y_true = ensemble_df["Target"]
for method in ("mean", "gini", "gini_rank"):
    hard_labels = ensemble_df[f"{method}_pred"]
    probabilities = ensemble_df[f"{method}_proba"]
    acc = accuracy_score(y_true, hard_labels)
    auc = roc_auc_score(y_true, probabilities)
    print(f"{method} accuracy: {acc}, auc: {auc}")

mean accuracy: 0.5110735066328003, auc: 0.5175414724229777
gini accuracy: 0.5112314488827134, auc: 0.5177449877817263
gini_rank accuracy: 0.5113568736105857, auc: 0.5176006937712501
