In [1]:
# malware_classification.ipynb

# Import necessary libraries
import os
import numpy as np
import pandas as pd
from pymongo import MongoClient
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import joblib

# If you're using Jupyter, you might need this to display plots inline
%matplotlib inline

# For plotting (optional)
import matplotlib.pyplot as plt
import seaborn as sns

# Load environment variables (if needed)
from dotenv import load_dotenv
load_dotenv()


True

In [2]:
# Step 1: Connect to MongoDB and Load Data
#
# Reads every document in `malware_analysis.dataset1` that has both a
# `graph_embedding` and a `malware_type` field and builds two parallel
# sequences:
#   embeddings -> NumPy array with one embedding per document
#   labels     -> list with one list of cleaned label strings per document
#                 (the inner list is empty when the field holds nothing usable)

from getpass import getpass


def _parse_malware_types(raw):
    """Normalise a raw `malware_type` value into a list of label strings.

    Parameters
    ----------
    raw : str | list | Any
        A comma-separated string, a list of labels, or any other value.

    Returns
    -------
    list[str]
        Whitespace-stripped, non-empty labels. Values that are neither a
        string nor a list (including None) yield an empty list.
    """
    if isinstance(raw, str):
        candidates = raw.split(',')
    elif isinstance(raw, list):
        candidates = raw
    else:
        return []
    # Drop non-string entries (which have no .strip()) and blank fragments
    # such as the trailing piece of "Trojan, ".
    return [item.strip() for item in candidates
            if isinstance(item, str) and item.strip()]


# MongoDB Configuration
MONGO_URI = os.getenv('MONGO_URI')
if not MONGO_URI:
    print("Please set the MONGO_URI environment variable.")
    # A MongoDB URI normally embeds credentials; getpass keeps the typed value
    # out of the saved cell output, unlike input().
    MONGO_URI = getpass("Enter your MongoDB URI: ")

client = MongoClient(MONGO_URI)
db = client['malware_analysis']
collection = db['dataset1']

# Fetch documents with embeddings and labels; the projection limits the
# transfer to the two fields that are actually read below.
cursor = collection.find(
    {
        'graph_embedding': {'$exists': True},
        'malware_type': {'$exists': True}
    },
    {'graph_embedding': 1, 'malware_type': 1, '_id': 0}
)

embeddings = []
labels = []

for doc in cursor:
    # Keep embeddings and labels index-aligned: one entry each per document.
    embeddings.append(doc['graph_embedding'])
    labels.append(_parse_malware_types(doc.get('malware_type')))

embeddings = np.array(embeddings)
print(f"Total samples loaded: {len(embeddings)}")

# Verify the labels
print("First few labels after processing:")
for sample_labels in labels[:10]:
    print(sample_labels, type(sample_labels))


Total samples loaded: 327
First few labels after processing:
['Cryptocurrency Miner'] <class 'list'>
['Trojan'] <class 'list'>
['Trojan'] <class 'list'>
['Cryptocurrency Miner'] <class 'list'>
['Trojan'] <class 'list'>
['Trojan'] <class 'list'>
['Trojan'] <class 'list'>
['Adware'] <class 'list'>
['Trojan', 'Backdoor'] <class 'list'>
['Trojan'] <class 'list'>


In [3]:
# Quick sanity check of the loaded data: embedding matrix shape, number of
# label lists, and the label list of the second sample (one value per line).
print(embeddings.shape, len(labels), labels[1], sep="\n")

(327, 128)
327
['Trojan']


In [4]:
# Show the raw embedding matrix (NumPy abbreviates large arrays with "...").
print(embeddings)

[[ 1.02958167  0.08374477 -0.76886296 ... -0.24042591  0.06679002
   0.31208497]
 [ 0.18049081 -0.11231075 -1.16224456 ... -0.24667189  0.80205286
   0.69475812]
 [ 0.87774885 -0.13316268 -0.5334838  ...  0.02148508 -0.46653187
  -0.18944445]
 ...
 [ 0.65718579  1.00533879 -0.13156533 ... -0.14751893  0.48862186
   0.16018519]
 [ 1.20110464  0.09410287 -0.41092643 ... -0.72847563  0.46086186
  -0.34656161]
 [ 0.74030542  0.13440715 -0.51151431 ... -0.35390499 -0.57465577
   0.88250464]]


In [5]:
# Flatten the multi-label data into single-label rows: a sample carrying k
# labels contributes k (embedding, label) pairs, and a sample with no labels
# contributes none.
exploded_pairs = [
    (vector, single_label)
    for vector, label_list in zip(embeddings, labels)
    for single_label in label_list
]

# Split the pairs back into two aligned NumPy arrays
new_embeddings = np.array([vector for vector, _ in exploded_pairs])
new_labels = np.array([single_label for _, single_label in exploded_pairs])

print(new_embeddings)
print(new_labels)


[[ 1.02958167  0.08374477 -0.76886296 ... -0.24042591  0.06679002
   0.31208497]
 [ 0.18049081 -0.11231075 -1.16224456 ... -0.24667189  0.80205286
   0.69475812]
 [ 0.87774885 -0.13316268 -0.5334838  ...  0.02148508 -0.46653187
  -0.18944445]
 ...
 [ 0.65718579  1.00533879 -0.13156533 ... -0.14751893  0.48862186
   0.16018519]
 [ 1.20110464  0.09410287 -0.41092643 ... -0.72847563  0.46086186
  -0.34656161]
 [ 0.74030542  0.13440715 -0.51151431 ... -0.35390499 -0.57465577
   0.88250464]]
['Cryptocurrency Miner' 'Trojan' 'Trojan' 'Cryptocurrency Miner' 'Trojan'
 'Trojan' 'Trojan' 'Adware' 'Trojan' 'Backdoor' 'Trojan' 'Trojan'
 'Dropper' 'Worm' 'Trojan' 'Adware' 'Dropper' 'Unknown' 'Trojan' 'Trojan'
 'Trojan' 'Dropper' 'Worm' 'Trojan' 'Trojan' 'Trojan'
 'Cryptocurrency Miner' 'Unknown' 'Trojan' 'Cryptocurrency Miner' 'Trojan'
 'Trojan' 'Ransomware' 'Trojan' 'Phishing' 'Trojan' 'Ransomware'
 'Cryptocurrency Miner' 'Trojan' 'Trojan' 'Trojan' 'Dropper' 'Worm'
 'Trojan' 'Dropper' 'Worm' 'Troj

In [6]:
import pandas as pd
import numpy as np

# Build a table with one column per embedding dimension
# (embedding_1 ... embedding_<dim>) plus a trailing 'label' column,
# then write it to disk as CSV.
embedding_dim = new_embeddings.shape[1]
embedding_columns = ['embedding_' + str(position + 1) for position in range(embedding_dim)]

df_embeddings = pd.DataFrame(data=new_embeddings, columns=embedding_columns).assign(
    label=new_labels
)

# Preview the first rows
print(df_embeddings.head())

# Persist without the row index
csv_filename = 'graph_embeddings_labels.csv'
df_embeddings.to_csv(csv_filename, index=False)

print(f"Data successfully saved to '{csv_filename}'")



   embedding_1  embedding_2  embedding_3  embedding_4  embedding_5  \
0     1.029582     0.083745    -0.768863    -0.601737    -0.018552   
1     0.180491    -0.112311    -1.162245    -0.379042    -0.182409   
2     0.877749    -0.133163    -0.533484    -0.071782     0.247684   
3     0.905636    -0.208120    -0.192289    -0.290757    -0.393283   
4     1.212977    -0.007017    -0.812970    -0.364216    -0.810991   

   embedding_6  embedding_7  embedding_8  embedding_9  embedding_10  ...  \
0     0.122026     0.711602    -0.308844     0.562217     -0.516533  ...   
1     0.374964     0.303225    -0.439953    -0.230539     -0.893200  ...   
2    -0.369050     0.087727    -0.638314    -0.449704     -0.745737  ...   
3    -0.258404     0.219378    -0.421130    -0.481209     -0.246315  ...   
4    -0.104662     0.606605    -0.025108    -0.133793      0.130473  ...   

   embedding_120  embedding_121  embedding_122  embedding_123  embedding_124  \
0      -0.432067       0.360033      -0.25

In [7]:
# Confirm that the exploded labels and embeddings have matching row counts.
for exploded_array in (new_labels, new_embeddings):
    print(exploded_array.shape)

(404,)
(404, 128)


In [8]:
!pip install tensorflow




In [9]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

# Encode the single-label targets in two stages:
#   1. string label  -> integer class id (LabelEncoder)
#   2. integer id    -> one-hot row      (to_categorical)
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(new_labels)
y = to_categorical(integer_encoded)

# Show which integer id was assigned to each class name
class_names = label_encoder.classes_
class_ids = label_encoder.transform(class_names)
label_mapping = dict(zip(class_names, class_ids))
print("Label Mapping:", label_mapping)


2024-11-12 11:36:01.695992: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-12 11:36:01.737252: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-11-12 11:36:02.000699: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-11-12 11:36:02.003104: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Label Mapping: {'Adware': 0, 'Backdoor': 1, 'Benign': 2, 'Cryptocurrency Miner': 3, 'Dropper': 4, 'Phishing': 5, 'Ransomware': 6, 'Trojan': 7, 'Unknown': 8, 'Worm': 9}


In [10]:
from sklearn.model_selection import train_test_split

# Feature matrix for the single-label formulation (one row per
# (embedding, label) pair).
X = new_embeddings

# `y` is used as it already exists in the kernel; it is expected to be the
# one-hot label matrix produced by the label-encoding cell.

# NOTE: no train/test split is performed here; the split below is disabled,
# so X and y still describe the full dataset.
# X_train, X_test, y_train, y_test = train_test_split(
#     X, y,
#     test_size=0.2,        # 20% for testing
#     random_state=42,      # For reproducibility
#     stratify=integer_encoded  # Ensures the same label distribution in both sets
# )

# These are the shapes of the full feature and label matrices, not of a
# train/test partition, so they are labelled as such.
print(f"Feature matrix shape: {X.shape}")
print(f"Label matrix shape: {y.shape}")
print(y)

Training samples: (404, 128)
Testing samples: (404, 10)
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 1. 0. 0.]
 [0. 0. 0. ... 1. 0. 0.]
 ...
 [0. 0. 0. ... 1. 0. 0.]
 [0. 0. 0. ... 1. 0. 0.]
 [0. 0. 0. ... 1. 0. 0.]]


In [11]:
# import os
# from pymongo import MongoClient
# import numpy as np

# # Assuming Step 1 code is already executed and you have `embeddings` and `labels`

# # Initialize lists to store the processed embeddings and labels
# processed_embeddings = []
# processed_labels = []

# for emb, lbls in zip(embeddings, labels):
#     if lbls:
#         for lbl in lbls:
#             processed_embeddings.append(emb)
#             processed_labels.append(lbl)
#     else:
#         # Optionally handle embeddings without labels
#         # For example, you can skip them or assign a default label
#         # Here, we'll skip embeddings without labels
#         continue

# # Convert the lists to NumPy arrays for efficient processing
# processed_embeddings = np.array(processed_embeddings)
# processed_labels = np.array(processed_labels)

# print(f"Total samples after processing: {len(processed_embeddings)}")

# # Verify the processed labels
# print("First few processed labels:")
# for l in processed_labels[:10]:
#     print(l)


In [12]:
from sklearn.preprocessing import MultiLabelBinarizer

# Multi-label targets: turn each sample's list of labels into a binary
# indicator row with one column per class. This rebinds `y`.
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(labels)

# Column order of the indicator matrix
print("Classes:", mlb.classes_, sep="\n")


Classes:
['Adware' 'Backdoor' 'Benign' 'Cryptocurrency Miner' 'Dropper' 'Phishing'
 'Ransomware' 'Trojan' 'Unknown' 'Worm']


In [13]:
# Step 1: Split Data into Training and Testing Sets using iterative stratification
#
# Inputs expected from earlier cells:
#   embeddings - NumPy array of graph embeddings (features)
#   y          - binary indicator matrix from MultiLabelBinarizer (targets)
#   mlb        - the fitted MultiLabelBinarizer instance

from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

# Only the first of the five stratified folds is used, which holds out
# roughly one fifth of the samples for testing.
mskf = MultilabelStratifiedKFold(n_splits=5, shuffle=True, random_state=42)
fold_iterator = mskf.split(embeddings, y)
train_indices, test_indices = next(fold_iterator)

X_train = embeddings[train_indices]
X_test = embeddings[test_indices]
y_train = y[train_indices]
y_test = y[test_indices]

for split_name, split_features in (("Training", X_train), ("Testing", X_test)):
    print(f"{split_name} samples: {len(split_features)}")


Training samples: 263
Testing samples: 64


In [14]:
# pip install iterative-stratification
!pip install scikit-multilearn





In [15]:
from skmultilearn.model_selection import iterative_train_test_split

# Iteratively stratified 80/20 split of the multi-label data.
# Note the return order: features and targets of the training part first,
# then features and targets of the test part. This rebinds X_train, y_train,
# X_test and y_test.
split_parts = iterative_train_test_split(embeddings, y, test_size=0.2)
X_train, y_train, X_test, y_test = split_parts


In [16]:
# Step 2: Feature Scaling (Optional but Recommended)

from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training features only, then apply
# the same transformation to both partitions.
# NOTE(review): X_train / X_test are overwritten with their scaled versions,
# so re-running this cell without re-running the split first would fit the
# scaler on already-scaled data.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


In [17]:
# Step 3: Train the Machine Learning Model for Multi-Label Classification

from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier

# One random forest is fitted per label column (one-vs-rest); the fixed
# random_state makes the forests reproducible.
base_classifier = RandomForestClassifier(random_state=42, n_estimators=100)
classifier = OneVsRestClassifier(estimator=base_classifier)

# Fit on the scaled training data; the fitted estimator is the cell's output
classifier.fit(X_train, y_train)


In [18]:
# Step 4: Evaluate the Model
#
# Several classes have no true samples and/or no predicted samples in the
# test split, which makes precision/recall/F-score/Jaccard undefined for them.
# scikit-learn's default (zero_division="warn") reports 0 for those cases and
# emits a warning each time; passing zero_division=0 explicitly yields the
# same numbers without flooding the output with warnings.

from sklearn.metrics import classification_report, hamming_loss, jaccard_score, f1_score, accuracy_score

# Make predictions on the test set
y_pred = classifier.predict(X_test)

# Generate classification report (per-class precision / recall / F1 / support)
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=mlb.classes_, zero_division=0))

# Hamming Loss: fraction of individual label assignments that are wrong
hamming = hamming_loss(y_test, y_pred)
print(f"Hamming Loss: {hamming:.4f}")

# Jaccard Score, computed per sample and then averaged
jaccard = jaccard_score(y_test, y_pred, average='samples', zero_division=0)
print(f"Jaccard Score: {jaccard:.4f}")

# F1 Score with different averaging methods
f1_micro = f1_score(y_test, y_pred, average='micro', zero_division=0)
print(f"F1 Score (Micro): {f1_micro:.4f}")

f1_macro = f1_score(y_test, y_pred, average='macro', zero_division=0)
print(f"F1 Score (Macro): {f1_macro:.4f}")

# Exact Match Ratio: a sample counts only if its whole label set is correct
exact_match = accuracy_score(y_test, y_pred)
print(f"Exact Match Ratio: {exact_match:.4f}")


Classification Report:
                      precision    recall  f1-score   support

              Adware       0.00      0.00      0.00         2
            Backdoor       0.00      0.00      0.00         0
              Benign       0.00      0.00      0.00         0
Cryptocurrency Miner       1.00      0.14      0.25         7
             Dropper       0.00      0.00      0.00         5
            Phishing       0.00      0.00      0.00         1
          Ransomware       0.00      0.00      0.00         2
              Trojan       0.79      0.89      0.84        46
             Unknown       0.43      0.25      0.32        12
                Worm       0.00      0.00      0.00         5

           micro avg       0.75      0.56      0.64        80
           macro avg       0.22      0.13      0.14        80
        weighted avg       0.61      0.56      0.55        80
         samples avg       0.70      0.62      0.64        80

Hamming Loss: 0.0781
Jaccard Score: 0.6094
F

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


In [19]:
# Step 5: Save the Model and Encoders for Future Use

import joblib

# Everything needed to reproduce predictions later: the trained classifier,
# the label binarizer (to decode predictions) and the feature scaler.
artifacts_by_path = {
    'malware_multilabel_classifier.joblib': classifier,
    'mlb.joblib': mlb,
    'scaler.joblib': scaler,
}

for artifact_path, artifact in artifacts_by_path.items():
    joblib.dump(artifact, artifact_path)

print("Model, MultiLabelBinarizer, and scaler have been saved.")


Model, MultiLabelBinarizer, and scaler have been saved.


In [20]:
# Inference example: classify a single embedding.
# An existing embedding stands in for new data here; replace it with the
# embedding you actually want to classify.
new_embedding = embeddings[0]

# The scaler expects a 2-D input, so present the vector as a one-row matrix
new_embedding_scaled = scaler.transform(new_embedding.reshape(1, -1))

# Predict the indicator row and decode it back into label names
prediction = classifier.predict(new_embedding_scaled)
predicted_labels = mlb.inverse_transform(prediction)

first_sample_labels = predicted_labels[0]
print(f"Predicted Malware Types: {first_sample_labels}")


Predicted Malware Types: ('Cryptocurrency Miner',)
