# COMP8325 Group Project - Model 2 and Task 2 Evaluation
This notebook includes:
- Training XGBoost (Model 2) on the BODMAS dataset
- Saving the model
- Evaluating on holdout dataset
- Generating predictions and performance metrics

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import xgboost as xgb
import joblib
print("🚀 Starting Model 2 training and Task 2 evaluation...")

🚀 Starting Model 2 training and Task 2 evaluation...


In [3]:
# Load training data
train_data = np.load('bodmas_train_test.npz')
X_all = train_data['X']
y_binary = train_data['y']

metadata = pd.read_csv('bodmas_metadata_train_test.csv')
category_labels = pd.read_csv('bodmas_malware_category.csv')

# Fix: rename metadata column to match category_labels for merge
metadata = metadata.rename(columns={'sha': 'sha256'})
metadata = metadata.dropna(subset=['sha256'])
category_labels = category_labels.dropna(subset=['sha256'])
merged = metadata.merge(category_labels, on='sha256', how='inner')

X = X_all[merged.index]
y = merged['category'].astype('category').cat.codes
class_names = merged['category'].astype('category').cat.categories

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, stratify=y, random_state=42
)

In [4]:
#  Final XGBoost training block (clean, no warnings)
xgb_clf = xgb.XGBClassifier(
    objective='multi:softmax',
    num_class=len(np.unique(y)),
    max_depth=4,
    learning_rate=0.1,
    n_estimators=100,
    subsample=0.8,
    eval_metric='mlogloss',
    random_state=42
)

# Train the model
xgb_clf.fit(X_train, y_train)

# Save the model
import joblib
joblib.dump(xgb_clf, 'model2_xgboost.joblib')
best_model = xgb_clf

print("✅ Model trained and saved as 'model2_xgboost.joblib'")


✅ Model trained and saved as 'model2_xgboost.joblib'


In [5]:
# Load holdout dataset
holdout_data = np.load('bodmas_holdout.npz')
print("📂 Holdout file keys:", holdout_data.files)

keys = holdout_data.files
if len(keys) < 1:
    raise SystemExit("❌ Holdout file is empty.")
elif len(keys) == 1:
    X_holdout = holdout_data[keys[0]]
    y_holdout = None
    print("⚠️ Only features found. No labels available for evaluation.")
else:
    X_holdout = holdout_data[keys[0]]
    y_holdout = holdout_data[keys[1]]

X_holdout_scaled = scaler.transform(X_holdout)
y_pred = best_model.predict(X_holdout_scaled)
np.savetxt("model2_holdout_predictions.txt", y_pred, fmt='%d')
print("📄 Predictions saved to 'model2_holdout_predictions.txt'")

📂 Holdout file keys: ['X', 'y']
📄 Predictions saved to 'model2_holdout_predictions.txt'


In [6]:
# Evaluation if labels are available
if y_holdout is not None:
    print("\n📊 Evaluation on Holdout Set:\n")
    acc = accuracy_score(y_holdout, y_pred)
    print(f"Accuracy: {acc:.4f}\n")

    from sklearn.utils.multiclass import unique_labels

    labels_in_holdout = unique_labels(y_holdout, y_pred)
    class_names_subset = [class_names[i] for i in labels_in_holdout]

    print("Classification Report:\n")
    print(classification_report(
        y_holdout, y_pred,
        labels=labels_in_holdout,
        target_names=class_names_subset,
        zero_division=0  # ✅ no undefined metric warnings
    ))

    cm = confusion_matrix(y_holdout, y_pred, labels=labels_in_holdout)
    print("Confusion Matrix:\n", cm)

    # ✅ Fix division by zero in TPR
    with np.errstate(divide='ignore', invalid='ignore'):
        TPR = np.nan_to_num(np.diag(cm) / np.sum(cm, axis=1))

    FPR = []
    for i in range(len(cm)):
        FP = np.sum(cm[:, i]) - cm[i, i]
        TN = np.sum(cm) - (np.sum(cm[i, :]) + np.sum(cm[:, i]) - cm[i, i])
        FPR.append(FP / (FP + TN) if (FP + TN) != 0 else 0.0)

    print("\nTPR per class:")
    for i, rate in enumerate(TPR):
        print(f"{class_names_subset[i]}: {rate:.4f}")

    print("\nFPR per class:")
    for i, rate in enumerate(FPR):
        print(f"{class_names_subset[i]}: {rate:.4f}")
else:
    print("⚠️ No labels found in holdout file. Only predictions generated.")




📊 Evaluation on Holdout Set:

Accuracy: 0.0004

Classification Report:

              precision    recall  f1-score   support

    backdoor       1.00      0.00      0.00     18363
 cryptominer       0.00      0.00      0.00     16072
  downloader       0.00      0.00      0.00         0
    p2p-worm       0.00      0.00      0.00         0
      trojan       0.00      0.00      0.00         0
        worm       0.00      0.00      0.00         0

    accuracy                           0.00     34435
   macro avg       0.17      0.00      0.00     34435
weighted avg       0.53      0.00      0.00     34435

Confusion Matrix:
 [[   15     0     2     2 18108   236]
 [    0     0     0     0 15940   132]
 [    0     0     0     0     0     0]
 [    0     0     0     0     0     0]
 [    0     0     0     0     0     0]
 [    0     0     0     0     0     0]]

TPR per class:
backdoor: 0.0008
cryptominer: 0.0000
downloader: 0.0000
p2p-worm: 0.0000
trojan: 0.0000
worm: 0.0000

FPR per clas

In [7]:
import numpy as np
import pandas as pd

# Load features and labels
data = np.load("bodmas_train_test.npz")   # tested with bodmas_train_test.npz and bodmas_holdout.npz
X = data['X']  # feature vectors
y_binary = data['y']  # 0=benign, 1=malware

# Load metadata and category
metadata = pd.read_csv("bodmas_metadata_train_test.csv")  # has 'sha' #tested with bodmas_metadata_train_test.csv and bodmas_metadata_holdout.csv
categories = pd.read_csv("bodmas_malware_category.csv")   # has 'sha256'

# Merge on correct columns
metadata = metadata.merge(categories, left_on='sha', right_on='sha256', how='left')
metadata['category'] = metadata['category'].fillna('benign')
metadata.drop(columns=['sha256'], inplace=True)

# Combine with features
df = pd.DataFrame(X)
df['category'] = metadata['category']

In [8]:
#Check Shape of Features and Labels

print("Feature matrix shape:", X.shape)
print("Binary label shape:", y_binary.shape)

Feature matrix shape: (100000, 2381)
Binary label shape: (100000,)


In [9]:
#Check First Few Rows of Metadata
print(metadata.head())

                                                 sha  \
0  e6d7b4bab32def853ab564410df53fa33172dda1bfd48c...   
1  5af37a058a5bcf2284c183ee98d92b7c66d8f5ce623e92...   
2  5bfbbea150af5cef2d3a93b80ef7c7faea9f564b56045d...   
3  216f592f1e1717d5681b7f5f2b14a28a2f0c603b5b7318...   
4  a1ca76813d2e9e7e23b830c87fbe29bcb51fcbe096e445...   

                   timestamp family category  
0  2007-01-01 08:46:39+00:00    NaN   benign  
1  2007-01-26 17:16:30+00:00    NaN   benign  
2  2007-03-21 02:08:53+00:00    NaN   benign  
3  2007-04-25 12:55:06+00:00    NaN   benign  
4  2007-11-14 15:03:55+00:00    NaN   benign  


In [10]:

# Remove classes with only 1 sample
counts = df['category'].value_counts()
valid_classes = counts[counts > 1].index
df = df[df['category'].isin(valid_classes)]

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Encode category labels to integers
le = LabelEncoder()
y = le.fit_transform(df['category'])
X_features = df.drop(columns=['category'])

# Create train/test sets
X_train, X_test, y_train, y_test = train_test_split(
    X_features, y, test_size=0.2, stratify=y, random_state=42
)

In [12]:
#trying Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)
#print(classification_report(y_test, y_pred, target_names=le.classes_))


print(classification_report(
    y_test,
    y_pred,
    labels=range(len(le.classes_)),  # handle missing class in test set
    target_names=le.classes_,
    zero_division=0  # avoids warnings if precision/recall can't be computed
))

                    precision    recall  f1-score   support

          backdoor       0.92      0.96      0.94       933
            benign       0.99      1.00      0.99     11756
       cryptominer       0.67      0.50      0.57         4
        downloader       0.93      0.81      0.87       195
           dropper       0.77      0.76      0.76       116
           exploit       1.00      1.00      1.00         2
informationstealer       0.93      0.80      0.86        46
          p2p-worm       0.00      0.00      0.00         1
               pua       0.00      0.00      0.00         4
        ransomware       0.99      0.86      0.92       143
           rootkit       1.00      1.00      1.00         1
            trojan       0.95      0.93      0.94      4650
  trojan-gamethief       0.00      0.00      0.00         1
             virus       0.94      0.76      0.84        38
              worm       0.94      0.95      0.94      2110

          accuracy                    

In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
import pandas as pd

# --- Train Random Forest Model ---
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# --- Predict ---
y_pred_rf = rf.predict(X_test)

# --- Print Overall Accuracy ---
print("Random Forest - Overall Accuracy:", accuracy_score(y_test, y_pred_rf))

# --- Function to Evaluate Top 10 Classes ---
def limited_accuracy_by_class(y_true, y_pred, class_names, top_n=10):
    counts = pd.Series(y_true).value_counts().nlargest(top_n)
    top_classes = counts.index.tolist()
    cm = confusion_matrix(y_true, y_pred, labels=top_classes)
    output = []

    for i, class_idx in enumerate(top_classes):
        class_name = class_names[class_idx]
        TP = cm[i, i]
        FN = cm[i].sum() - TP
        FP = cm[:, i].sum() - TP
        TN = cm.sum() - (TP + FN + FP)
        acc = (TP + TN) / cm.sum()
        tpr = TP / (TP + FN) if (TP + FN) > 0 else 0
        fpr = FP / (FP + TN) if (FP + TN) > 0 else 0
        output.append([class_name, round(acc, 2), round(tpr, 2), round(fpr, 2)])

    return pd.DataFrame(output, columns=["Class", "Accuracy", "TPR", "FPR"])

# --- Display Evaluation for Top 10 Classes ---
top10_rf_report = limited_accuracy_by_class(y_test, y_pred_rf, le.classes_, top_n=10)
print("\nTop 10 Class Evaluation - Random Forest:")
print(top10_rf_report)

Random Forest - Overall Accuracy: 0.9701

Top 10 Class Evaluation - Random Forest:
                Class  Accuracy   TPR   FPR
0              benign      0.99  1.00  0.02
1              trojan      0.97  0.93  0.01
2                worm      0.99  0.95  0.01
3            backdoor      0.99  0.96  0.00
4          downloader      1.00  0.81  0.00
5          ransomware      1.00  0.86  0.00
6             dropper      1.00  0.76  0.00
7  informationstealer      1.00  0.80  0.00
8               virus      1.00  0.76  0.00
9                 pua      1.00  0.00  0.00


In [14]:
#RAMDOM FOREST FOR HOLDOUT DATASET

In [15]:
# Load features and labels
data = np.load("bodmas_holdout.npz")   # tested with bodmas_train_test.npz and bodmas_holdout.npz
X = data['X']  # feature vectors
y_binary = data['y']  # 0=benign, 1=malware

# Load metadata and category
metadata = pd.read_csv("bodmas_metadata_holdout.csv")  # has 'sha' #tested with bodmas_metadata_train_test.csv and bodmas_metadata_holdout.csv
categories = pd.read_csv("bodmas_malware_category.csv")   # has 'sha256'

# Merge on correct columns
metadata = metadata.merge(categories, left_on='sha', right_on='sha256', how='left')
metadata['category'] = metadata['category'].fillna('benign')
metadata.drop(columns=['sha256'], inplace=True)

# Combine with features
df = pd.DataFrame(X)
df['category'] = metadata['category']

In [16]:
# Remove classes with only 1 sample
counts = df['category'].value_counts()
valid_classes = counts[counts > 1].index
df = df[df['category'].isin(valid_classes)]

In [17]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Encode category labels to integers
le = LabelEncoder()
y = le.fit_transform(df['category'])
X_features = df.drop(columns=['category'])

# Create train/test sets
X_train, X_test, y_train, y_test = train_test_split(
    X_features, y, test_size=0.2, stratify=y, random_state=42
)

In [18]:
#trying Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)
#print(classification_report(y_test, y_pred, target_names=le.classes_))


print(classification_report(
    y_test,
    y_pred,
    labels=range(len(le.classes_)),  # handle missing class in test set
    target_names=le.classes_,
    zero_division=0  # avoids warnings if precision/recall can't be computed
))

                    precision    recall  f1-score   support

          backdoor       0.98      0.98      0.98       533
            benign       0.99      1.00      1.00      3673
        downloader       0.83      0.45      0.59        11
           dropper       0.86      0.70      0.78        27
informationstealer       0.95      0.93      0.94        43
          p2p-worm       0.00      0.00      0.00         3
               pua       0.00      0.00      0.00         2
        ransomware       0.94      0.81      0.87        21
            trojan       0.97      0.95      0.96      1345
  trojan-gamethief       0.00      0.00      0.00         0
              worm       0.97      0.98      0.97      1229

         micro avg       0.98      0.98      0.98      6887
         macro avg       0.68      0.62      0.64      6887
      weighted avg       0.98      0.98      0.98      6887



In [19]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
import pandas as pd

# --- Train Random Forest Model ---
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# --- Predict ---
y_pred_rf = rf.predict(X_test)

# --- Print Overall Accuracy ---
print("Random Forest - Overall Accuracy:", accuracy_score(y_test, y_pred_rf))

# --- Function to Evaluate Top 10 Classes ---
def limited_accuracy_by_class(y_true, y_pred, class_names, top_n=10):
    counts = pd.Series(y_true).value_counts().nlargest(top_n)
    top_classes = counts.index.tolist()
    cm = confusion_matrix(y_true, y_pred, labels=top_classes)
    output = []

    for i, class_idx in enumerate(top_classes):
        class_name = class_names[class_idx]
        TP = cm[i, i]
        FN = cm[i].sum() - TP
        FP = cm[:, i].sum() - TP
        TN = cm.sum() - (TP + FN + FP)
        acc = (TP + TN) / cm.sum()
        tpr = TP / (TP + FN) if (TP + FN) > 0 else 0
        fpr = FP / (FP + TN) if (FP + TN) > 0 else 0
        output.append([class_name, round(acc, 2), round(tpr, 2), round(fpr, 2)])

    return pd.DataFrame(output, columns=["Class", "Accuracy", "TPR", "FPR"])

# --- Display Evaluation for Top 10 Classes ---
top10_rf_report = limited_accuracy_by_class(y_test, y_pred_rf, le.classes_, top_n=10)
print("\nTop 10 Class Evaluation for holdout dataset - Random Forest:")
print(top10_rf_report)

Random Forest - Overall Accuracy: 0.9814142587483665

Top 10 Class Evaluation for holdout dataset - Random Forest:
                Class  Accuracy   TPR   FPR
0              benign      1.00  1.00  0.01
1              trojan      0.98  0.95  0.01
2                worm      0.99  0.98  0.01
3            backdoor      1.00  0.98  0.00
4  informationstealer      1.00  0.93  0.00
5             dropper      1.00  0.70  0.00
6          ransomware      1.00  0.81  0.00
7          downloader      1.00  0.45  0.00
8            p2p-worm      1.00  0.00  0.00
9                 pua      1.00  0.00  0.00
