In [1]:
#hello team, this is our source code file for our project

In [2]:
import numpy as np
import pandas as pd

# Load features and labels
data = np.load("bodmas_train_test.npz")   # tested with bodmas_train_test.npz and bodmas_holdout.npz
X = data['X']  # feature vectors
y_binary = data['y']  # 0=benign, 1=malware

# Load metadata and category
metadata = pd.read_csv("bodmas_metadata_train_test.csv")  # has 'sha' #tested with bodmas_metadata_train_test.csv and bodmas_metadata_holdout.csv
categories = pd.read_csv("bodmas_malware_category.csv")   # has 'sha256'

# Merge on correct columns
metadata = metadata.merge(categories, left_on='sha', right_on='sha256', how='left')
metadata['category'] = metadata['category'].fillna('benign')
metadata.drop(columns=['sha256'], inplace=True)

# Combine with features
df = pd.DataFrame(X)
df['category'] = metadata['category']



In [3]:
#Check Shape of Features and Labels

print("Feature matrix shape:", X.shape)
print("Binary label shape:", y_binary.shape)


Feature matrix shape: (100000, 2381)
Binary label shape: (100000,)


In [4]:
#Check First Few Rows of Metadata
print(metadata.head())


                                                 sha  \
0  e6d7b4bab32def853ab564410df53fa33172dda1bfd48c...   
1  5af37a058a5bcf2284c183ee98d92b7c66d8f5ce623e92...   
2  5bfbbea150af5cef2d3a93b80ef7c7faea9f564b56045d...   
3  216f592f1e1717d5681b7f5f2b14a28a2f0c603b5b7318...   
4  a1ca76813d2e9e7e23b830c87fbe29bcb51fcbe096e445...   

                   timestamp family category  
0  2007-01-01 08:46:39+00:00    NaN   benign  
1  2007-01-26 17:16:30+00:00    NaN   benign  
2  2007-03-21 02:08:53+00:00    NaN   benign  
3  2007-04-25 12:55:06+00:00    NaN   benign  
4  2007-11-14 15:03:55+00:00    NaN   benign  


In [5]:
# ✅ NEW: Remove classes with only 1 sample
counts = df['category'].value_counts()
valid_classes = counts[counts > 1].index
df = df[df['category'].isin(valid_classes)]

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Encode category labels to integers
le = LabelEncoder()
y = le.fit_transform(df['category'])
X_features = df.drop(columns=['category'])

# Create train/test sets
X_train, X_test, y_train, y_test = train_test_split(
    X_features, y, test_size=0.2, stratify=y, random_state=42
)

In [7]:
#checking if sync
print("Number of unique classes in y:", len(np.unique(y)))
print("Number of classes in le.classes_:", len(le.classes_))


Number of unique classes in y: 15
Number of classes in le.classes_: 15


In [8]:
#trying Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)
#print(classification_report(y_test, y_pred, target_names=le.classes_))


print(classification_report(
    y_test,
    y_pred,
    labels=range(len(le.classes_)),  # handle missing class in test set
    target_names=le.classes_,
    zero_division=0  # avoids warnings if precision/recall can't be computed
))

                    precision    recall  f1-score   support

          backdoor       0.92      0.96      0.94       933
            benign       0.99      1.00      0.99     11756
       cryptominer       0.67      0.50      0.57         4
        downloader       0.93      0.81      0.87       195
           dropper       0.77      0.76      0.76       116
           exploit       1.00      1.00      1.00         2
informationstealer       0.93      0.80      0.86        46
          p2p-worm       0.00      0.00      0.00         1
               pua       0.00      0.00      0.00         4
        ransomware       0.99      0.86      0.92       143
           rootkit       1.00      1.00      1.00         1
            trojan       0.95      0.93      0.94      4650
  trojan-gamethief       0.00      0.00      0.00         1
             virus       0.94      0.76      0.84        38
              worm       0.94      0.95      0.94      2110

          accuracy                    

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
import pandas as pd

# --- Train Random Forest Model ---
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# --- Predict ---
y_pred_rf = rf.predict(X_test)

# --- Print Overall Accuracy ---
print("Random Forest - Overall Accuracy:", accuracy_score(y_test, y_pred_rf))

# --- Function to Evaluate Top 10 Classes ---
def limited_accuracy_by_class(y_true, y_pred, class_names, top_n=10):
    counts = pd.Series(y_true).value_counts().nlargest(top_n)
    top_classes = counts.index.tolist()
    cm = confusion_matrix(y_true, y_pred, labels=top_classes)
    output = []

    for i, class_idx in enumerate(top_classes):
        class_name = class_names[class_idx]
        TP = cm[i, i]
        FN = cm[i].sum() - TP
        FP = cm[:, i].sum() - TP
        TN = cm.sum() - (TP + FN + FP)
        acc = (TP + TN) / cm.sum()
        tpr = TP / (TP + FN) if (TP + FN) > 0 else 0
        fpr = FP / (FP + TN) if (FP + TN) > 0 else 0
        output.append([class_name, round(acc, 2), round(tpr, 2), round(fpr, 2)])

    return pd.DataFrame(output, columns=["Class", "Accuracy", "TPR", "FPR"])

# --- Display Evaluation for Top 10 Classes ---
top10_rf_report = limited_accuracy_by_class(y_test, y_pred_rf, le.classes_, top_n=10)
print("\nTop 10 Class Evaluation - Random Forest:")
print(top10_rf_report)


Random Forest - Overall Accuracy: 0.9701

Top 10 Class Evaluation - Random Forest:
                Class  Accuracy   TPR   FPR
0              benign      0.99  1.00  0.02
1              trojan      0.97  0.93  0.01
2                worm      0.99  0.95  0.01
3            backdoor      0.99  0.96  0.00
4          downloader      1.00  0.81  0.00
5          ransomware      1.00  0.86  0.00
6             dropper      1.00  0.76  0.00
7  informationstealer      1.00  0.80  0.00
8               virus      1.00  0.76  0.00
9                 pua      1.00  0.00  0.00


In [10]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score

# Scale features to non-negative values (required by MultinomialNB)
scaler_nb = MinMaxScaler()
X_train_nb = scaler_nb.fit_transform(X_train)
X_test_nb = scaler_nb.transform(X_test)

# Train Naive Bayes
nb = MultinomialNB()
nb.fit(X_train_nb, y_train)




In [11]:
# Predict
y_pred_nb = nb.predict(X_test_nb)

# Evaluate
print("Multinomial Naive Bayes - Overall Accuracy:", accuracy_score(y_test, y_pred_nb))



Multinomial Naive Bayes - Overall Accuracy: 0.6586


In [12]:
# Top 10 class evaluation
top10_nb_report = limited_accuracy_by_class(y_test, y_pred_nb, le.classes_, top_n=10)
print("\nTop 10 Class Evaluation - Multinomial Naive Bayes:")
print(top10_nb_report)


Top 10 Class Evaluation - Multinomial Naive Bayes:
                Class  Accuracy   TPR   FPR
0              benign      0.79  0.76  0.18
1              trojan      0.84  0.44  0.04
2                worm      0.88  0.56  0.08
3            backdoor      0.91  0.84  0.09
4          downloader      0.94  0.66  0.05
5          ransomware      0.99  0.00  0.00
6             dropper      0.97  0.67  0.03
7  informationstealer      1.00  0.09  0.00
8               virus      1.00  0.29  0.00
9                 pua      1.00  0.00  0.00
