# COMP8325 Group Project - Model 2 and Task 2 Evaluation
This notebook includes:
- Training XGBoost (Model 2) on the BODMAS dataset
- Saving the model
- Evaluating on holdout dataset
- Generating predictions and performance metrics

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import xgboost as xgb
import joblib
print("🚀 Starting Model 2 training and Task 2 evaluation...")

🚀 Starting Model 2 training and Task 2 evaluation...


In [5]:
# Load training data
train_data = np.load('bodmas_train_test.npz')
X_all = train_data['X']
y_binary = train_data['y']

metadata = pd.read_csv('bodmas_metadata_train_test.csv')
category_labels = pd.read_csv('bodmas_malware_category.csv')

# Fix: rename metadata column to match category_labels for merge
metadata = metadata.rename(columns={'sha': 'sha256'})
metadata = metadata.dropna(subset=['sha256'])
category_labels = category_labels.dropna(subset=['sha256'])
merged = metadata.merge(category_labels, on='sha256', how='inner')

X = X_all[merged.index]
y = merged['category'].astype('category').cat.codes
class_names = merged['category'].astype('category').cat.categories

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, stratify=y, random_state=42
)

In [11]:
#  Final XGBoost training block (clean, no warnings)
xgb_clf = xgb.XGBClassifier(
    objective='multi:softmax',
    num_class=len(np.unique(y)),
    max_depth=4,
    learning_rate=0.1,
    n_estimators=100,
    subsample=0.8,
    eval_metric='mlogloss',
    random_state=42
)

# Train the model
xgb_clf.fit(X_train, y_train)

# Save the model
import joblib
joblib.dump(xgb_clf, 'model2_xgboost.joblib')
best_model = xgb_clf

print("✅ Model trained and saved as 'model2_xgboost.joblib'")


✅ Model trained and saved as 'model2_xgboost.joblib'


In [13]:
# Load holdout dataset
holdout_data = np.load('bodmas_holdout.npz')
print("📂 Holdout file keys:", holdout_data.files)

keys = holdout_data.files
if len(keys) < 1:
    raise SystemExit("❌ Holdout file is empty.")
elif len(keys) == 1:
    X_holdout = holdout_data[keys[0]]
    y_holdout = None
    print("⚠️ Only features found. No labels available for evaluation.")
else:
    X_holdout = holdout_data[keys[0]]
    y_holdout = holdout_data[keys[1]]

X_holdout_scaled = scaler.transform(X_holdout)
y_pred = best_model.predict(X_holdout_scaled)
np.savetxt("model2_holdout_predictions.txt", y_pred, fmt='%d')
print("📄 Predictions saved to 'model2_holdout_predictions.txt'")

📂 Holdout file keys: ['X', 'y']
📄 Predictions saved to 'model2_holdout_predictions.txt'


In [27]:
# Evaluation if labels are available
if y_holdout is not None:
    print("\n📊 Evaluation on Holdout Set:\n")
    acc = accuracy_score(y_holdout, y_pred)
    print(f"Accuracy: {acc:.4f}\n")

    from sklearn.utils.multiclass import unique_labels

    labels_in_holdout = unique_labels(y_holdout, y_pred)
    class_names_subset = [class_names[i] for i in labels_in_holdout]

    print("Classification Report:\n")
    print(classification_report(
        y_holdout, y_pred,
        labels=labels_in_holdout,
        target_names=class_names_subset,
        zero_division=0  # ✅ no undefined metric warnings
    ))

    cm = confusion_matrix(y_holdout, y_pred, labels=labels_in_holdout)
    print("Confusion Matrix:\n", cm)

    # ✅ Fix division by zero in TPR
    with np.errstate(divide='ignore', invalid='ignore'):
        TPR = np.nan_to_num(np.diag(cm) / np.sum(cm, axis=1))

    FPR = []
    for i in range(len(cm)):
        FP = np.sum(cm[:, i]) - cm[i, i]
        TN = np.sum(cm) - (np.sum(cm[i, :]) + np.sum(cm[:, i]) - cm[i, i])
        FPR.append(FP / (FP + TN) if (FP + TN) != 0 else 0.0)

    print("\nTPR per class:")
    for i, rate in enumerate(TPR):
        print(f"{class_names_subset[i]}: {rate:.4f}")

    print("\nFPR per class:")
    for i, rate in enumerate(FPR):
        print(f"{class_names_subset[i]}: {rate:.4f}")
else:
    print("⚠️ No labels found in holdout file. Only predictions generated.")




📊 Evaluation on Holdout Set:

Accuracy: 0.0002

Classification Report:

              precision    recall  f1-score   support

    backdoor       1.00      0.00      0.00     18363
 cryptominer       0.00      0.00      0.00     16072
  downloader       0.00      0.00      0.00         0
      trojan       0.00      0.00      0.00         0
        worm       0.00      0.00      0.00         0

    accuracy                           0.00     34435
   macro avg       0.20      0.00      0.00     34435
weighted avg       0.53      0.00      0.00     34435

Confusion Matrix:
 [[    7     0     4 18166   186]
 [    0     0     0 15820   252]
 [    0     0     0     0     0]
 [    0     0     0     0     0]
 [    0     0     0     0     0]]

TPR per class:
backdoor: 0.0004
cryptominer: 0.0000
downloader: 0.0000
trojan: 0.0000
worm: 0.0000

FPR per class:
backdoor: 0.0000
cryptominer: 0.0000
downloader: 0.0001
trojan: 0.9870
worm: 0.0127
