# Baseline Random Forest Training on CRPWarner Dataset

## Import Libraries

In [10]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
import os
import time
import json
import pandas as pd
import numpy as np
from pathlib import Path
import joblib

import optuna
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.multiclass import OneVsRestClassifier


## Const

In [12]:
PATH = Path.cwd().parents[1]
DATA_PATH = os.path.join(PATH, 'data/processed')
MODEL_PATH = os.path.join(PATH, 'models')

In [13]:
with open(os.path.join(DATA_PATH, 'feature-opcode-freq_list.json')) as f:
    feature_list = json.load(f)

with open(os.path.join(DATA_PATH, 'labels-opcode-freq.json')) as f:
    labels = json.load(f)

In [14]:
train_df = pd.read_csv(os.path.join(DATA_PATH, 'train-opcode-freq.csv'))
test_df = pd.read_csv(os.path.join(DATA_PATH, 'test-opcode-freq.csv'))

X_train = train_df[feature_list]
y_train = train_df[labels]

X_test = test_df[feature_list]
y_test = test_df[labels]

## Traditional Machine Learning Models

In [15]:
models = {
    "Logistic Regression": OneVsRestClassifier(LogisticRegression(max_iter=1000)),
    "Random Forest": OneVsRestClassifier(RandomForestClassifier()),
    "Gradient Boosting": OneVsRestClassifier(GradientBoostingClassifier()),
    "AdaBoost": OneVsRestClassifier(AdaBoostClassifier()),
    "SVM (Linear)": OneVsRestClassifier(SVC(kernel="linear")),
    "KNN": OneVsRestClassifier(KNeighborsClassifier()),
    "Naive Bayes": OneVsRestClassifier(GaussianNB()),
    "MLP Classifier": OneVsRestClassifier(MLPClassifier(max_iter=300)),
    "XGBoost": OneVsRestClassifier(XGBClassifier(use_label_encoder=False, eval_metric='logloss')),
    "LightGBM": OneVsRestClassifier(LGBMClassifier()),
    "DecisionTree": OneVsRestClassifier(DecisionTreeClassifier())
}

In [16]:
results = []

for name, model in models.items():
    start = time.time()
    model.fit(X_train, y_train)
    end = time.time()

    y_pred = model.predict(X_test)

    results.append({
        "Classifier": name,
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred, average="macro"),  # change to 'macro' if multi-class
        "Recall": recall_score(y_test, y_pred, average="macro"),
        "F1-Score": f1_score(y_test, y_pred, average="macro"),
        "Training Time": round(end - start, 3)
    })

[LightGBM] [Info] Number of positive: 14, number of negative: 41
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000061 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 946
[LightGBM] [Info] Number of data points in the train set: 55, number of used features: 81
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.254545 -> initscore=-1.074515
[LightGBM] [Info] Start training from score -1.074515
[LightGBM] [Info] Number of positive: 6, number of negative: 49
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000043 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 946
[LightGBM] [Info] Number of data points in the train set: 55, number of used features: 81
[LightGBM] [Info] [binary:BoostFromScor

### Result

In [17]:
df = pd.DataFrame(results)
df.sort_values(by="F1-Score", ascending=False, inplace=True)
df

Unnamed: 0,Classifier,Accuracy,Precision,Recall,F1-Score,Training Time
10,DecisionTree,0.428571,0.904762,0.407407,0.541667,0.016
3,AdaBoost,0.428571,0.75,0.444444,0.535294,0.241
7,MLP Classifier,0.285714,0.763889,0.407407,0.510893,0.477
1,Random Forest,0.357143,0.904762,0.351852,0.470238,0.317
2,Gradient Boosting,0.428571,0.583333,0.388889,0.451961,0.485
8,XGBoost,0.428571,0.738095,0.351852,0.436905,0.427
5,KNN,0.214286,0.462121,0.425926,0.433333,0.011
9,LightGBM,0.428571,0.722222,0.314815,0.406349,0.04
0,Logistic Regression,0.285714,0.5,0.277778,0.353846,0.422
4,SVM (Linear),0.285714,0.5,0.277778,0.353846,0.021


### Tuning

In [18]:
# 1. Optuna objective with AdaBoost inside MultiOutputClassifier
def objective(trial):
    model = OneVsRestClassifier(AdaBoostClassifier(
        n_estimators = trial.suggest_int("n_estimators", 50, 200),
        learning_rate = trial.suggest_float("learning_rate", 0.00000001, 1.0, log=True),
        random_state=42
    ))
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return f1_score(y_test, y_pred, average="macro", zero_division=0)

# 2. Optimize AdaBoost
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

# 3. Build base classifiers
model = OneVsRestClassifier(AdaBoostClassifier(**study.best_params, random_state=42))
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("Tuned AdaBoostClassifier (OneVsRestClassifier):")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, average="macro", zero_division=0))
print("Recall:", recall_score(y_test, y_pred, average="macro", zero_division=0))
print("F1 Score:", f1_score(y_test, y_pred, average="macro", zero_division=0))


[I 2025-07-11 15:29:15,444] A new study created in memory with name: no-name-88172442-3788-4a70-b3ef-7f4f16048c4d


[I 2025-07-11 15:29:16,312] Trial 0 finished with value: 0.4416433239962652 and parameters: {'n_estimators': 138, 'learning_rate': 0.14476662589428113}. Best is trial 0 with value: 0.4416433239962652.
[I 2025-07-11 15:29:17,124] Trial 1 finished with value: 0.4666666666666666 and parameters: {'n_estimators': 125, 'learning_rate': 1.645810049943014e-07}. Best is trial 1 with value: 0.4666666666666666.
[I 2025-07-11 15:29:17,432] Trial 2 finished with value: 0.4666666666666666 and parameters: {'n_estimators': 56, 'learning_rate': 2.1923830083189858e-07}. Best is trial 1 with value: 0.4666666666666666.
[I 2025-07-11 15:29:17,998] Trial 3 finished with value: 0.38888888888888884 and parameters: {'n_estimators': 106, 'learning_rate': 0.006091898342428539}. Best is trial 1 with value: 0.4666666666666666.
[I 2025-07-11 15:29:18,380] Trial 4 finished with value: 0.4666666666666666 and parameters: {'n_estimators': 69, 'learning_rate': 4.9125266084885585e-05}. Best is trial 1 with value: 0.46666

Tuned AdaBoostClassifier (OneVsRestClassifier):
Accuracy: 0.42857142857142855
Precision: 0.49242424242424243
Recall: 0.46296296296296297
F1 Score: 0.4666666666666666


In [19]:
joblib.dump(model, os.path.join(MODEL_PATH, f'best_ada-ovr_model_on_crpwarner_opcode_freq.pkl'))

['/Users/napatcholthaipanich/Dev/master/dissertation/workspace/ml/models/best_ada-ovr_model_on_crpwarner_opcode_freq.pkl']

### K-Fold (K=3)

In [20]:
NUM_FOLDS = 3
results = []
best_model = OneVsRestClassifier(AdaBoostClassifier(**study.best_params, random_state=42))
best_f1 = 0
best_fold = 0

In [21]:
for fold in range(NUM_FOLDS):
    print(f"=========== Fold-{fold} ===========")
    train_path = os.path.join(DATA_PATH, f'train_fold_{fold}-opcode-freq.csv')
    val_path = os.path.join(DATA_PATH, f'val_fold_{fold}-opcode-freq.csv')

    train_df = pd.read_csv(train_path)
    val_df   = pd.read_csv(val_path)

    X_train = train_df[feature_list]
    y_train = train_df[labels]

    X_val = val_df[feature_list]
    y_val = val_df[labels]

    # Train model
    model = OneVsRestClassifier(AdaBoostClassifier(**study.best_params, random_state=42))
    model.fit(X_train, y_train)

    # Evaluate
    y_pred = model.predict(X_val)
    report = classification_report(y_val, y_pred, target_names=labels, output_dict=True)
    acc = accuracy_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred, average="macro", zero_division=0)

    results.append({'fold': fold, 'accuracy': acc, 'report': report})
    print(f"Accuracy: {acc}")
    print("Precision:", precision_score(y_val, y_pred, average="macro", zero_division=0))
    print("Recall:", recall_score(y_val, y_pred, average="macro", zero_division=0))
    print("F1 Score:", f1)

    if best_f1 < f1:
        best_model = model
        best_fold = fold
## Step 6: Average Performance Summary
print("\n===== Overall Summary =====")
avg_acc = sum([r['accuracy'] for r in results]) / NUM_FOLDS
print(f"Average Accuracy: {avg_acc:.4f}")

# Save model
joblib.dump(best_model, os.path.join(MODEL_PATH, f'best_ada-ovr_model_on_crpwarner_opcode_freq_from_fold{best_fold}.pkl'))

Accuracy: 0.5217391304347826
Precision: 0.47685185185185186
Recall: 0.611111111111111
F1 Score: 0.5142857142857142
Accuracy: 0.5652173913043478
Precision: 0.47142857142857136
Recall: 0.6015873015873016
F1 Score: 0.5238095238095238
Accuracy: 0.43478260869565216
Precision: 0.1851851851851852
Recall: 0.3333333333333333
F1 Score: 0.2380952380952381

===== Overall Summary =====
Average Accuracy: 0.5072


['/Users/napatcholthaipanich/Dev/master/dissertation/workspace/ml/models/best_ada-ovr_model_on_crpwarner_opcode_freq_from_fold2.pkl']