# Baseline Random Forest Training on CRPWarner Dataset

## Import Libraries

In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import os
import time
import json
import pandas as pd
import numpy as np
from pathlib import Path
import joblib

import optuna
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.multiclass import OneVsRestClassifier


## Const

In [3]:
PATH = Path.cwd().parents[1]
DATA_PATH = os.path.join(PATH, 'data/processed')
MODEL_PATH = os.path.join(PATH, 'models')

In [4]:
with open(os.path.join(DATA_PATH, 'feature-opcode-n-gram_list.json')) as f:
    feature_list = json.load(f)

with open(os.path.join(DATA_PATH, 'labels-opcode-n-gram.json')) as f:
    labels = json.load(f)

In [5]:
train_df = pd.read_csv(os.path.join(DATA_PATH, 'train-opcode-n-gram.csv'))
test_df = pd.read_csv(os.path.join(DATA_PATH, 'test-opcode-n-gram.csv'))

X_train = train_df[feature_list]
y_train = train_df[labels]

X_test = test_df[feature_list]
y_test = test_df[labels]

## Traditional Machine Learning Models

In [6]:
models = {
    "Logistic Regression": OneVsRestClassifier(LogisticRegression(max_iter=1000)),
    "Random Forest": OneVsRestClassifier(RandomForestClassifier()),
    "Gradient Boosting": OneVsRestClassifier(GradientBoostingClassifier()),
    "AdaBoost": OneVsRestClassifier(AdaBoostClassifier()),
    "SVM (Linear)": OneVsRestClassifier(SVC(kernel="linear")),
    "KNN": OneVsRestClassifier(KNeighborsClassifier()),
    "Naive Bayes": OneVsRestClassifier(GaussianNB()),
    "MLP Classifier": OneVsRestClassifier(MLPClassifier(max_iter=300)),
    "XGBoost": OneVsRestClassifier(XGBClassifier(use_label_encoder=False, eval_metric='logloss')),
    "LightGBM": OneVsRestClassifier(LGBMClassifier()),
    "DecisionTree": OneVsRestClassifier(DecisionTreeClassifier())
}

In [7]:
results = []

for name, model in models.items():
    start = time.time()
    model.fit(X_train, y_train)
    end = time.time()

    y_pred = model.predict(X_test)

    results.append({
        "Classifier": name,
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred, average="macro"),  # change to 'macro' if multi-class
        "Recall": recall_score(y_test, y_pred, average="macro"),
        "F1-Score": f1_score(y_test, y_pred, average="macro"),
        "Training Time": round(end - start, 3)
    })

[LightGBM] [Info] Number of positive: 14, number of negative: 41
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002117 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9007
[LightGBM] [Info] Number of data points in the train set: 55, number of used features: 1157
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.254545 -> initscore=-1.074515
[LightGBM] [Info] Start training from score -1.074515
[LightGBM] [Info] Number of positive: 6, number of negative: 49
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000391 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 9007
[LightGBM] [Info] Number of data points in the train set: 55, number of used features: 1157
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.109091 -> initscore=-2.100061
[LightGBM] [Info]

### Result

In [8]:
df = pd.DataFrame(results)
df.sort_values(by="F1-Score", ascending=False, inplace=True)
df

Unnamed: 0,Classifier,Accuracy,Precision,Recall,F1-Score,Training Time
8,XGBoost,0.428571,0.755556,0.425926,0.526263,20.341
9,LightGBM,0.285714,0.607143,0.425926,0.5,0.605
5,KNN,0.214286,0.444444,0.425926,0.422222,0.147
6,Naive Bayes,0.5,0.666667,0.296296,0.404762,0.175
7,MLP Classifier,0.357143,0.458333,0.351852,0.37465,5.069
0,Logistic Regression,0.357143,0.5,0.296296,0.37037,3.503
4,SVM (Linear),0.357143,0.5,0.296296,0.37037,0.247
2,Gradient Boosting,0.5,0.5,0.277778,0.353846,5.276
10,DecisionTree,0.5,0.45,0.277778,0.335664,0.217
3,AdaBoost,0.214286,0.541667,0.240741,0.291317,1.893


### Tuning

In [9]:
# 1. Optuna objective with AdaBoost inside MultiOutputClassifier
def objective(trial):
    model = OneVsRestClassifier(XGBClassifier(
        n_estimators=trial.suggest_int("n_estimators", 10, 1000),
        max_depth=trial.suggest_int("max_depth", 1, 10),
        learning_rate=trial.suggest_float("learning_rate", 0.0000001, 1.0, log=True),
        subsample=trial.suggest_float("subsample", 0.1, 1.0),
        colsample_bytree=trial.suggest_float("colsample_bytree", 0.1, 1.0),
        gamma=trial.suggest_float("gamma", 0, 10),
        reg_alpha=trial.suggest_float("reg_alpha", 0.0, 10.0),
        reg_lambda=trial.suggest_float("reg_lambda", 0.0, 10.0),
        use_label_encoder=False,
        eval_metric="logloss",
        random_state=42,
    ))
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return f1_score(y_test, y_pred, average="macro", zero_division=0)

# 2. Optimize XGBClassifier
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

# 3. Build base classifiers
model = OneVsRestClassifier(XGBClassifier(**study.best_params, random_state=42))
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("Tuned XGBClassifier (MultiOutput):")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, average="macro", zero_division=0))
print("Recall:", recall_score(y_test, y_pred, average="macro", zero_division=0))
print("F1 Score:", f1_score(y_test, y_pred, average="macro", zero_division=0))


[I 2025-07-11 17:00:35,555] A new study created in memory with name: no-name-7f78077c-47c6-46c5-9067-8270b1b1831a
[I 2025-07-11 17:01:45,918] Trial 0 finished with value: 0.0 and parameters: {'n_estimators': 522, 'max_depth': 3, 'learning_rate': 0.00010413723993013363, 'subsample': 0.7208802745262841, 'colsample_bytree': 0.19807703056810977, 'gamma': 3.1427781604835303, 'reg_alpha': 1.8537335026372714, 'reg_lambda': 4.273180569383239}. Best is trial 0 with value: 0.0.
[I 2025-07-11 17:02:33,093] Trial 1 finished with value: 0.0 and parameters: {'n_estimators': 274, 'max_depth': 5, 'learning_rate': 0.11110536454975237, 'subsample': 0.691104300513733, 'colsample_bytree': 0.9567922212325339, 'gamma': 7.22887023943879, 'reg_alpha': 8.40155457914445, 'reg_lambda': 3.7093183301749164}. Best is trial 0 with value: 0.0.
[I 2025-07-11 17:04:18,288] Trial 2 finished with value: 0.0 and parameters: {'n_estimators': 874, 'max_depth': 2, 'learning_rate': 7.200481640313565e-07, 'subsample': 0.550432

Tuned XGBClassifier (MultiOutput):
Accuracy: 0.35714285714285715
Precision: 0.7916666666666666
Recall: 0.46296296296296297
F1 Score: 0.5627450980392158


In [10]:
joblib.dump(model, os.path.join(MODEL_PATH, f'best_xgboost-ovr_model_on_crpwarner_opcode_n_gram.pkl'))

['/Users/napatcholthaipanich/Dev/master/dissertation/workspace/ml/models/best_xgboost-ovr_model_on_crpwarner_opcode_n_gram.pkl']

### K-Fold (K=3)

In [11]:
NUM_FOLDS = 3
results = []
best_model = OneVsRestClassifier(XGBClassifier(**study.best_params, random_state=42))
best_f1 = 0
best_fold = 0

In [12]:
for fold in range(NUM_FOLDS):
    print(f"=========== Fold-{fold} ===========")
    train_path = os.path.join(DATA_PATH, f'train_fold_{fold}-opcode-n-gram.csv')
    val_path = os.path.join(DATA_PATH, f'val_fold_{fold}-opcode-n-gram.csv')

    train_df = pd.read_csv(train_path)
    val_df   = pd.read_csv(val_path)

    X_train = train_df[feature_list]
    y_train = train_df[labels]

    X_val = val_df[feature_list]
    y_val = val_df[labels]

    # Train model
    model = OneVsRestClassifier(XGBClassifier(**study.best_params, random_state=42))
    model.fit(X_train, y_train)

    # Evaluate
    y_pred = model.predict(X_val)
    report = classification_report(y_val, y_pred, target_names=labels, output_dict=True)
    acc = accuracy_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred, average="macro", zero_division=0)

    results.append({'fold': fold, 'accuracy': acc, 'report': report})
    print(f"Accuracy: {acc}")
    print("Precision:", precision_score(y_val, y_pred, average="macro", zero_division=0))
    print("Recall:", recall_score(y_val, y_pred, average="macro", zero_division=0))
    print("F1 Score:", f1)

    if best_f1 < f1:
        best_model = model
        best_fold = fold
## Step 6: Average Performance Summary
print("\n===== Overall Summary =====")
avg_acc = sum([r['accuracy'] for r in results]) / NUM_FOLDS
print(f"Average Accuracy: {avg_acc:.4f}")

# Save model
joblib.dump(best_model, os.path.join(MODEL_PATH, f'best_xgboost-ovr_model_on_crpwarner_opcode_n_gram_from_fold{best_fold}.pkl'))



Accuracy: 0.5652173913043478
Precision: 0.375
Recall: 0.35714285714285715
F1 Score: 0.362962962962963
Accuracy: 0.5217391304347826
Precision: 0.3428571428571428
Recall: 0.4666666666666666
F1 Score: 0.3938461538461538
Accuracy: 0.5652173913043478
Precision: 0.5
Recall: 0.3428571428571428
F1 Score: 0.4038461538461539

===== Overall Summary =====
Average Accuracy: 0.5507


['/Users/napatcholthaipanich/Dev/master/dissertation/workspace/ml/models/best_xgboost-ovr_model_on_crpwarner_opcode_n_gram_from_fold2.pkl']