In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score

import os
from joblib import dump

save_path = "./result/rf"
model_path = "./data/model_data/models/rf"
os.makedirs(save_path, exist_ok=True)
os.makedirs(model_path, exist_ok=True)

In [2]:
feature_path = "./data/model_data/period_features"

def load_feature(num_period: int, feature_path: str = feature_path) -> pd.DataFrame:
    data = pd.read_parquet(f"{feature_path}/features_period_{num_period}.parquet")

    date_range = data["Date"].sort_values().unique()
    data = data.set_index("Date")

    train_data = data.loc[date_range[:-250]]
    test_data = data.loc[date_range[-250:]]

    return train_data.reset_index(), test_data.reset_index()

In [3]:
performance_stats = {}

In [4]:
def randomforest_single_period(period):
    train_data, test_data = load_feature(period)
    X_train = train_data.drop(columns=["Date", "Ticker", "Return_tomorrow", "Target"])
    y_train = train_data["Target"]
    X_test = test_data.drop(columns=["Date", "Ticker", "Return_tomorrow", "Target"])
    y_test = test_data["Target"]

    params = {
        "n_estimators": 1000,
        "max_depth": 20,
        "max_features": "sqrt",
        "n_jobs": -1,
    }

    model = RandomForestClassifier(**params)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    performance_stats[period] = {
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred),
        "recall": recall_score(y_test, y_pred),
        "f1": f1_score(y_test, y_pred),
        "roc_auc": roc_auc_score(y_test, y_pred),
        "gini_index": 2 * roc_auc_score(y_test, y_pred) - 1,
    }

    result_df = test_data[["Date", "Ticker", "Return_tomorrow", "Target"]].copy()
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    result_df["pred"] = y_pred
    result_df["pred_proba"] = y_pred_proba

    result_df.to_parquet(f"{save_path}/rf_period_{period}.parquet")
    dump(model, f"{model_path}/rf_period_{period}.joblib")
    return model

In [5]:
model = randomforest_single_period(1)
print(performance_stats)

{1: {'accuracy': 0.5151982640242765, 'precision': 0.5131552223924964, 'recall': 0.4741295484399256, 'f1': 0.49287107643199146, 'roc_auc': 0.5149436386795702, 'gini_index': 0.02988727735914032}}


In [6]:
from tqdm import tqdm

for period in tqdm(range(25)):
    model = randomforest_single_period(period)

  0%|          | 0/25 [00:00<?, ?it/s]

100%|██████████| 25/25 [1:02:47<00:00, 150.70s/it]


In [7]:
performance_stats_df = pd.DataFrame(performance_stats)
performance_stats_df.to_csv(f"{save_path}/rf_performance_stats.csv")

In [8]:
all_df = pd.concat([pd.read_parquet(f"{save_path}/rf_period_{i}.parquet") for i in range(25)])
all_df.to_parquet(f"{save_path}/rf_all_periods.parquet")

In [9]:
all_df["Date"].nunique()

6250