In [134]:
# Hello team, this is the source code file for our project.

In [135]:
import numpy as np
import pandas as pd

# Load features and labels
data = np.load("bodmas_holdout.npz")   # tested with bodmas_train_test.npz
X = data['X']  # feature vectors
y_binary = data['y']  # 0=benign, 1=malware

# Load metadata and category
metadata = pd.read_csv("bodmas_metadata_holdout.csv")  # has 'sha' #tested with bodmas_metadata_train_test.csv
categories = pd.read_csv("bodmas_malware_category.csv")   # has 'sha256'

# A left merge emits one output row per matching right-hand row, so a hash that
# appears twice in the category file would duplicate a metadata row and shift
# every label after it. Keep a single category row per hash before merging.
categories = categories.drop_duplicates(subset='sha256')

# Merge on correct columns; samples without a category entry are treated as benign
metadata = metadata.merge(categories, left_on='sha', right_on='sha256', how='left')
metadata['category'] = metadata['category'].fillna('benign')
metadata.drop(columns=['sha256'], inplace=True)

# Categories are attached to the feature rows purely by position, so a row-count
# mismatch would mislabel samples silently. Fail loudly instead.
if len(metadata) != len(X):
    raise ValueError(
        f"metadata has {len(metadata)} rows but the feature matrix has {len(X)}; "
        "labels cannot be aligned with features by position"
    )

# Combine with features (positional assignment, independent of the metadata index)
df = pd.DataFrame(X)
df['category'] = metadata['category'].to_numpy()



In [136]:
# Report the dimensions of the feature matrix and of the binary label vector.
for caption, array in (("Feature matrix shape:", X), ("Binary label shape:", y_binary)):
    print(caption, array.shape)


Feature matrix shape: (34435, 2381)
Binary label shape: (34435,)


In [137]:
# Preview the first five rows of the merged metadata table.
metadata_preview = metadata.head(5)
print(metadata_preview)


                                                 sha  \
0  068b20d7efe271ed3b4fa04b3dd82bfc477905130ded4d...   
1  327b6722386117b6376aa57e4096b831160faa318a7305...   
2  a9125e18ae7e219c01137a36e4c7379e1606f5db8fe7de...   
3  28e951721029f698108dc6a7837621bfe82c0c97c081fc...   
4  cf4b96f93e68db9aca8ae80b471defe4f0e85668299e49...   

                   timestamp family category  
0  2020-06-11 00:00:00+00:00    NaN   benign  
1  2020-06-11 00:00:00+00:00    NaN   benign  
2  2020-06-11 00:00:00+00:00    NaN   benign  
3  2020-06-11 00:00:00+00:00    NaN   benign  
4  2020-06-11 00:00:00+00:00    NaN   benign  


In [138]:
# Discard every category that is represented by a single row only.
counts = df['category'].value_counts()
valid_classes = counts.index[(counts > 1).to_numpy()]  # categories seen at least twice
keep_row = df['category'].isin(valid_classes)
df = df.loc[keep_row]

In [139]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Turn category names into integer ids; `le` is kept so ids can be mapped back
# to names via `le.classes_`.
le = LabelEncoder().fit(df['category'])
y = le.transform(df['category'])
X_features = df.drop('category', axis=1)

# Hold out 20% of the rows, stratified on the encoded category, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_features,
    y,
    random_state=42,
    stratify=y,
    test_size=0.2,
)

In [140]:
# Sanity check: the encoded target and the fitted encoder should report the
# same number of classes.
n_classes_in_y = np.unique(y).size
n_classes_in_encoder = len(le.classes_)
print("Number of unique classes in y:", n_classes_in_y)
print("Number of classes in le.classes_:", n_classes_in_encoder)


Number of unique classes in y: 11
Number of classes in le.classes_: 11


In [141]:
#trying Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Fit a 100-tree forest with a fixed seed, then predict the held-out rows.
rf = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)
y_pred = rf.predict(X_test)

# Every encoded class id is listed explicitly so that a class missing from the
# test split still gets a row in the report; zero_division=0 silences the
# warning raised when precision/recall cannot be computed for such a class.
rf_report_text = classification_report(
    y_test,
    y_pred,
    labels=range(len(le.classes_)),
    target_names=le.classes_,
    zero_division=0,
)
print(rf_report_text)

                    precision    recall  f1-score   support

          backdoor       0.98      0.98      0.98       533
            benign       0.99      1.00      1.00      3673
        downloader       0.83      0.45      0.59        11
           dropper       0.86      0.70      0.78        27
informationstealer       0.95      0.93      0.94        43
          p2p-worm       0.00      0.00      0.00         3
               pua       0.00      0.00      0.00         2
        ransomware       0.94      0.81      0.87        21
            trojan       0.97      0.95      0.96      1345
  trojan-gamethief       0.00      0.00      0.00         0
              worm       0.97      0.98      0.97      1229

         micro avg       0.98      0.98      0.98      6887
         macro avg       0.68      0.62      0.64      6887
      weighted avg       0.98      0.98      0.98      6887



In [142]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
import pandas as pd

# --- Fit the Random Forest and predict the held-out rows ---
rf = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# --- Report the overall accuracy on the test split ---
overall_accuracy_rf = accuracy_score(y_test, y_pred_rf)
print("Random Forest - Overall Accuracy:", overall_accuracy_rf)

# --- Function to Evaluate Top 10 Classes ---
def limited_accuracy_by_class(y_true, y_pred, class_names, top_n=10):
    """Summarise one-vs-rest accuracy, TPR and FPR for the most frequent true classes.

    The ``top_n`` classes with the most samples in ``y_true`` are selected and
    each one is scored against *every* sample. A sample of a selected class
    that is predicted as an unselected class still counts as a false negative,
    and a sample of an unselected class predicted as a selected class still
    counts as a false positive. Restricting the counts to samples whose true
    and predicted labels are both in the selected set would silently drop those
    errors and overstate TPR / understate FPR.

    Parameters
    ----------
    y_true : array-like
        True class ids. Each id is used to index ``class_names``.
    y_pred : array-like
        Predicted class ids, same number of elements as ``y_true``.
    class_names : sequence
        Display name for each class id, looked up as ``class_names[class_id]``.
    top_n : int, default 10
        Number of most frequent true classes to report.

    Returns
    -------
    pandas.DataFrame
        Columns ``Class``, ``Accuracy``, ``TPR``, ``FPR`` (rounded to two
        decimals), one row per selected class in descending frequency order.
        The frame is empty when ``y_true`` is empty.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred`` do not contain the same number of elements.
    """
    true_ids = np.asarray(y_true).ravel()
    pred_ids = np.asarray(y_pred).ravel()
    if true_ids.size != pred_ids.size:
        raise ValueError(
            f"y_true has {true_ids.size} elements but y_pred has {pred_ids.size}"
        )

    total = int(true_ids.size)
    counts = pd.Series(true_ids).value_counts().nlargest(top_n)
    top_classes = counts.index.tolist()
    output = []

    # With no samples there are no classes to report, so the loop is skipped
    # and `total` is never used as a divisor.
    for class_idx in top_classes:
        class_name = class_names[class_idx]
        is_true = true_ids == class_idx
        is_pred = pred_ids == class_idx
        # Plain Python ints keep the ratios and rounding in native float arithmetic.
        TP = int(np.count_nonzero(is_true & is_pred))
        FN = int(np.count_nonzero(is_true & ~is_pred))
        FP = int(np.count_nonzero(~is_true & is_pred))
        TN = total - (TP + FN + FP)
        acc = (TP + TN) / total
        tpr = TP / (TP + FN) if (TP + FN) > 0 else 0
        fpr = FP / (FP + TN) if (FP + TN) > 0 else 0
        output.append([class_name, round(acc, 2), round(tpr, 2), round(fpr, 2)])

    return pd.DataFrame(output, columns=["Class", "Accuracy", "TPR", "FPR"])

# --- Build the per-class table for the ten most frequent test classes and show it ---
top10_rf_report = limited_accuracy_by_class(y_test, y_pred_rf, le.classes_, 10)
print("\nTop 10 Class Evaluation - Random Forest:", top10_rf_report, sep="\n")


Random Forest - Overall Accuracy: 0.9814142587483665

Top 10 Class Evaluation - Random Forest:
                Class  Accuracy   TPR   FPR
0              benign      1.00  1.00  0.01
1              trojan      0.98  0.95  0.01
2                worm      0.99  0.98  0.01
3            backdoor      1.00  0.98  0.00
4  informationstealer      1.00  0.93  0.00
5             dropper      1.00  0.70  0.00
6          ransomware      1.00  0.81  0.00
7          downloader      1.00  0.45  0.00
8            p2p-worm      1.00  0.00  0.00
9                 pua      1.00  0.00  0.00
