# Baseline Random Forest Training on CRPWarner Dataset

## Import Libraries

In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import os
import time
import json
import pandas as pd
from pathlib import Path
import joblib

import optuna
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.multioutput import MultiOutputClassifier

In [3]:
PATH = Path.cwd().parents[1]
DATA_PATH = os.path.join(PATH, 'data/processed')
MODEL_PATH = os.path.join(PATH, 'models')

In [4]:
with open(os.path.join(DATA_PATH, 'feature_list.json')) as f:
    feature_list = json.load(f)

with open(os.path.join(DATA_PATH, 'labels.json')) as f:
    labels = json.load(f)

In [5]:
train_df = pd.read_csv(os.path.join(DATA_PATH, 'train.csv'))
test_df = pd.read_csv(os.path.join(DATA_PATH, 'test.csv'))

X_train = train_df[feature_list]
y_train = train_df[labels]

X_test = test_df[feature_list]
y_test = test_df[labels]

In [6]:
models = {
    "Logistic Regression": MultiOutputClassifier(LogisticRegression(max_iter=1000)),
    "Random Forest": MultiOutputClassifier(RandomForestClassifier()),
    "Gradient Boosting": MultiOutputClassifier(GradientBoostingClassifier()),
    "AdaBoost": MultiOutputClassifier(AdaBoostClassifier()),
    "SVM (Linear)": MultiOutputClassifier(SVC(kernel="linear")),
    "KNN": MultiOutputClassifier(KNeighborsClassifier()),
    "Naive Bayes": MultiOutputClassifier(GaussianNB()),
    "MLP Classifier": MultiOutputClassifier(MLPClassifier(max_iter=300)),
    "XGBoost": MultiOutputClassifier(XGBClassifier(use_label_encoder=False, eval_metric='logloss')),
    "LightGBM": MultiOutputClassifier(LGBMClassifier())
}

In [7]:
results = []

for name, model in models.items():
    start = time.time()
    model.fit(X_train, y_train)
    end = time.time()

    y_pred = model.predict(X_test)

    results.append({
        "Classifier": name,
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred, average="macro"),  # change to 'macro' if multi-class
        "Recall": recall_score(y_test, y_pred, average="macro"),
        "F1-Score": f1_score(y_test, y_pred, average="macro"),
        "Training Time": round(end - start, 3)
    })

[LightGBM] [Info] Number of positive: 14, number of negative: 41
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000139 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 946
[LightGBM] [Info] Number of data points in the train set: 55, number of used features: 81
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.254545 -> initscore=-1.074515
[LightGBM] [Info] Start training from score -1.074515
[LightGBM] [Info] Number of positive: 6, number of negative: 49
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000210 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 946
[LightGBM] [Info] Number of data points in the train set: 55, number of used features: 81
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.109091 -> initscore=-2.100061
[LightGBM] [Info] Start

In [8]:
df = pd.DataFrame(results)
df.sort_values(by="F1-Score", ascending=False, inplace=True)
df

Unnamed: 0,Classifier,Accuracy,Precision,Recall,F1-Score,Training Time
3,AdaBoost,0.428571,0.75,0.444444,0.535294,0.367
2,Gradient Boosting,0.428571,0.583333,0.388889,0.451961,0.719
8,XGBoost,0.428571,0.738095,0.351852,0.436905,0.71
5,KNN,0.214286,0.462121,0.425926,0.433333,0.012
9,LightGBM,0.428571,0.722222,0.314815,0.406349,0.055
7,MLP Classifier,0.357143,0.6,0.259259,0.357143,0.642
4,SVM (Linear),0.285714,0.5,0.277778,0.353846,0.025
0,Logistic Regression,0.285714,0.5,0.277778,0.353846,0.859
1,Random Forest,0.357143,0.571429,0.240741,0.303571,0.505
6,Naive Bayes,0.285714,0.444444,0.203704,0.279202,0.017


In [9]:
# 1. Optuna objective with AdaBoost inside MultiOutputClassifier
def objective(trial):
    model = MultiOutputClassifier(AdaBoostClassifier(
        n_estimators=trial.suggest_int("n_estimators", 10, 1000),
        learning_rate=trial.suggest_float("learning_rate", 0.0000001, 1.0),
        random_state=42
    ))
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return f1_score(y_test, y_pred, average="macro", zero_division=0)

# 2. Optimize AdaBoost
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

# 3. Build base classifiers
model = MultiOutputClassifier(AdaBoostClassifier(**study.best_params, random_state=42))
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("Tuned AdaBoostClassifier (MultiOutput):")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, average="macro", zero_division=0))
print("Recall:", recall_score(y_test, y_pred, average="macro", zero_division=0))
print("F1 Score:", f1_score(y_test, y_pred, average="macro", zero_division=0))


[I 2025-07-10 22:16:22,039] A new study created in memory with name: no-name-512d6a9c-1c28-475f-95a4-f93329d0332b


[I 2025-07-10 22:16:26,909] Trial 0 finished with value: 0.44973544973544977 and parameters: {'n_estimators': 639, 'learning_rate': 0.12461638969679269}. Best is trial 0 with value: 0.44973544973544977.
[I 2025-07-10 22:16:33,487] Trial 1 finished with value: 0.4369047619047619 and parameters: {'n_estimators': 912, 'learning_rate': 0.9846978228982194}. Best is trial 0 with value: 0.44973544973544977.
[I 2025-07-10 22:16:39,533] Trial 2 finished with value: 0.46386554621848736 and parameters: {'n_estimators': 849, 'learning_rate': 0.9345938247976716}. Best is trial 2 with value: 0.46386554621848736.
[I 2025-07-10 22:16:46,067] Trial 3 finished with value: 0.4878306878306879 and parameters: {'n_estimators': 924, 'learning_rate': 0.6002921923067416}. Best is trial 3 with value: 0.4878306878306879.
[I 2025-07-10 22:16:49,696] Trial 4 finished with value: 0.4507936507936508 and parameters: {'n_estimators': 511, 'learning_rate': 0.37319667670988516}. Best is trial 3 with value: 0.48783068783

Tuned AdaBoostClassifier (MultiOutput):
Accuracy: 0.35714285714285715
Precision: 0.7037037037037037
Recall: 0.5370370370370371
F1 Score: 0.5925925925925926


In [10]:
NUM_FOLDS = 3
results = []

In [11]:
for fold in range(NUM_FOLDS):
    train_path = os.path.join(DATA_PATH, f'train_fold_{fold}.csv')
    val_path = os.path.join(DATA_PATH, f'val_fold_{fold}.csv')

    train_df = pd.read_csv(train_path)
    val_df   = pd.read_csv(val_path)

    X_train = train_df[feature_list]
    y_train = train_df[labels]

    X_val = val_df[feature_list]
    y_val = val_df[labels]

    # Train model
    model = MultiOutputClassifier(AdaBoostClassifier(**study.best_params, random_state=42))
    model.fit(X_train, y_train)

    # Evaluate
    y_pred = model.predict(X_val)
    report = classification_report(y_val, y_pred, target_names=labels, output_dict=True)
    acc = accuracy_score(y_val, y_pred)

    results.append({'fold': fold, 'accuracy': acc, 'report': report})
    print(f"Accuracy: {acc:.4f}")

    # Save model
    joblib.dump(model, os.path.join(MODEL_PATH, f'model_fold{fold}.pkl'))

## Step 6: Average Performance Summary
print("\n===== Overall Summary =====")
avg_acc = sum([r['accuracy'] for r in results]) / NUM_FOLDS
print(f"Average Accuracy: {avg_acc:.4f}")

Accuracy: 0.5217
Accuracy: 0.3913
Accuracy: 0.5652

===== Overall Summary =====
Average Accuracy: 0.4928
