In [2]:
# malware_classification.ipynb

# Import necessary libraries
import os
import numpy as np
import pandas as pd
from pymongo import MongoClient
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import joblib

# If you're using Jupyter, you might need this to display plots inline
%matplotlib inline

# For plotting (optional)
import matplotlib.pyplot as plt
import seaborn as sns

# Load environment variables (if needed)
from dotenv import load_dotenv
load_dotenv()


True

In [5]:
# Step 1: Connect to MongoDB and Load Data

# MongoDB Configuration
MONGO_URI = os.getenv('MONGO_URI')
if not MONGO_URI:
    print("Please set the MONGO_URI environment variable.")
    MONGO_URI = input("Enter your MongoDB URI: ")

client = MongoClient(MONGO_URI)
db = client['cuckoo']
collection = db['malware_analysis']

# Fetch documents with embeddings and labels
cursor = collection.find({
    'graph_embedding': {'$exists': True},
    'malware_type': {'$exists': True}
})

embeddings = []
labels = []

for doc in cursor:
    embeddings.append(doc['graph_embedding'])
    labels.append(doc['malware_type'])

embeddings = np.array(embeddings)
labels = np.array(labels)

print(f"Total samples loaded: {len(embeddings)}")


Total samples loaded: 107


In [6]:
# Step 2: Encode Labels

from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)

# Display the mapping of labels
label_mapping = dict(zip(label_encoder.classes_, range(len(label_encoder.classes_))))
print("Label Mapping:")
for label, index in label_mapping.items():
    print(f"{label}: {index}")


Label Mapping:
Cryptocurrency Miner: 0
Dropper, Adware, Trojan: 1
Phishing: 2
Phishing, Trojan: 3
Ransomware: 4
Trojan: 5
Trojan, Virus: 6
Trojan, Worm, Virus, Dropper: 7
Unknown: 8
Virus, Adware, Trojan: 9
Virus, Backdoor, Trojan, Worm: 10
Virus, Dropper, Trojan: 11
Virus, Dropper, Trojan, Worm: 12
Virus, Trojan: 13


In [11]:
# Step 3: Split Data into Training and Testing Sets

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    embeddings, y, test_size=0.2, random_state=42
)

print(f"Training samples: {len(X_train)}")
print(f"Testing samples: {len(X_test)}")


Training samples: 85
Testing samples: 22


In [12]:
# Step 4: Feature Scaling (Optional but Recommended)

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [13]:
# Step 5: Train the Machine Learning Model

from sklearn.ensemble import RandomForestClassifier

# Initialize the classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the classifier
clf.fit(X_train, y_train)


In [16]:
# # Step 6: Evaluate the Model

# from sklearn.metrics import classification_report, confusion_matrix

# # Make predictions on the test set
# y_pred = clf.predict(X_test)

# # Generate classification report
# print("Classification Report:")
# print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

# # Generate confusion matrix
# conf_matrix = confusion_matrix(y_test, y_pred)
# print("Confusion Matrix:")
# print(conf_matrix)

# # Plot confusion matrix as a heatmap (optional)
# plt.figure(figsize=(10, 7))
# sns.heatmap(conf_matrix, annot=True, fmt='d',
#             xticklabels=label_encoder.classes_,
#             yticklabels=label_encoder.classes_,
#             cmap='Blues')
# plt.xlabel('Predicted')
# plt.ylabel('Actual')
# plt.title('Confusion Matrix')
# plt.show()


Classification Report:


ValueError: Number of classes, 5, does not match size of target_names, 14. Try specifying the labels parameter

In [17]:
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
y = mlb.fit_transform(labels)

# Now, mlb.classes_ contains the list of unique labels
print("Classes:")
print(mlb.classes_)


Classes:
[' ' ',' 'A' 'B' 'C' 'D' 'M' 'P' 'R' 'T' 'U' 'V' 'W' 'a' 'c' 'd' 'e' 'g'
 'h' 'i' 'j' 'k' 'm' 'n' 'o' 'p' 'r' 's' 't' 'u' 'w' 'y']
