In [2]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'

import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow.keras.datasets import mnist
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.utils import to_categorical
from mpl_toolkits.mplot3d import Axes3D
from sklearn.model_selection import train_test_split
import pickle
from tqdm.notebook import tqdm 
import pyarrow as pa
import pyarrow.parquet as pq
import torch
from sklearn.metrics import confusion_matrix
import seaborn as sns
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import sys
sys.path.append('..')
from utils.preprocess import NDF



## Data Loading and Preprocessing

In [3]:
# Initial hyper-parameter values. NOTE: LEARNING_RATE is reassigned to 0.000238
# (and BATCH_SIZE / EPOCHS are re-declared with the same values) in the
# training cell before the optimizer or any DataLoader is built.
LEARNING_RATE = 0.001
BATCH_SIZE = 64
EPOCHS = 15


# All preprocessing is delegated to NDF (utils.preprocess). According to the
# original author's notes it performs the following steps
# (NOTE(review): not verifiable from this notebook -- confirm in utils/preprocess.py):
#   * missing values are set to -1
#   * outliers are removed
#   * categorical variables are encoded
#   * min-max scaling, plus a sigmoid transformation (only if "cnn" is used)
# Paths to the two input parquet files, keyed by class.
input_data = {
    'benign': '../feature-extraction/floor/benign_2312.parquet',
    'malign': '../feature-extraction/floor/misp_2402.parquet'
}

# The result is indexed below by 'feature_names', and by 'features' / 'labels'
# in the following cell.
# NOTE(review): the meaning of the positional arguments ("cnn", True) is not
# visible here -- presumably the model type and a flag; verify against NDF.
dataset = NDF("cnn", True, input_data=input_data, one_line_processing=False)
print(dataset['feature_names'])

2024-05-24 22:48:11,322 - utils.preprocess - INFO - Benign dataset path: ../feature-extraction/floor/benign_2312.parquet
2024-05-24 22:48:11,326 - utils.preprocess - INFO - Malign dataset path: ../feature-extraction/floor/misp_2402.parquet


Malign dataset path: ../feature-extraction/floor/misp_2402.parquet
Benign dataset path: ../feature-extraction/floor/benign_2312.parquet


2024-05-24 22:48:12,362 - utils.preprocess - INFO - Number of records in benign dataset: 462192
2024-05-24 22:48:12,366 - utils.preprocess - INFO - Number of records in malign dataset: 110311
2024-05-24 22:48:13,220 - utils.preprocess - INFO - Total percentage of missing values in benign dataset: 0.19%
2024-05-24 22:48:13,227 - utils.preprocess - INFO - Total percentage of missing values in malign dataset: 0.33%
2024-05-24 22:48:14,903 - utils.preprocess - INFO - Decision tree model saved to models/malware_decision_tree_model.joblib
2024-05-24 22:48:14,990 - utils.preprocess - INFO - New feature 'dtree_prob' created from decision tree predictions.
2024-05-24 22:48:15,091 - utils.preprocess - INFO - Decision Tree Train Accuracy: 0.95
2024-05-24 22:48:15,093 - utils.preprocess - INFO - Decision Tree Test Accuracy: 0.92
2024-05-24 22:48:17,330 - utils.preprocess - INFO - Decision Tree Cross-Validation Scores: [0.91448334 0.91553134 0.91478881]
2024-05-24 22:48:17,335 - utils.preprocess - 


Dataset Subset:
Name: dataset_../feature-extraction/floor/benign2312_../feature-extraction/floor/misp2402_2024-05-24.parquet
Features:
   Feature_0  Feature_1  Feature_2  Feature_3  Feature_4  Feature_5  \
0        0.5   0.531209        0.5   0.500000   0.582570   0.517850   
1        0.5   0.531209        0.5   0.520821   0.562177   0.570947   
2        0.5   0.531209        0.5   0.500000   0.500000   0.500000   
3        0.5   0.531209        0.5   0.500000   0.500000   0.500000   
4        0.5   0.531209        0.5   0.520821   0.541570   0.535654   
5        0.5   0.562177        0.5   0.500000   0.541570   0.500000   
6        0.5   0.500000        0.5   0.500000   0.500000   0.500000   
7        0.5   0.622459        0.5   0.500000   0.500000   0.500000   
8        0.5   0.531209        0.5   0.541570   0.562177   0.500000   
9        0.5   0.531209        0.5   0.520821   0.562177   0.517850   

   Feature_6  Feature_7  Feature_8  Feature_9  ...  Feature_167  Feature_168  \
0 

## SMOTE Class Imbalance Handling

In [4]:
from collections import Counter
# Imports for class-imbalance handling. Only SMOTE is used in this cell;
# RandomUnderSampler and Pipeline are imported but not referenced below.
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline


# Hold out 20% of the samples for testing; the fixed random_state makes the
# split repeatable.
x_train, x_test, y_train, y_test = train_test_split(np.array(dataset['features']), np.array(dataset['labels']), test_size=0.2, random_state=42)

# Apply SMOTE to the training split only -- the test split is left untouched,
# so evaluation is done on non-resampled data.
smote = SMOTE(random_state=42)
x_train_smote, y_train_smote = smote.fit_resample(x_train, y_train)

# Check the new class distribution
new_label_counts = Counter(y_train_smote)
print("New class distribution after SMOTE:")
for label, count in new_label_counts.items():
    print(f"Class {label}: {count}")

# Convert the data to PyTorch tensors.
# NOTE: from here on x_train / y_train refer to the SMOTE-resampled training
# data (the original, non-resampled arrays are overwritten).
x_train = torch.tensor(x_train_smote, dtype=torch.float32)
x_test = torch.tensor(x_test, dtype=torch.float32)  # Convert x_test to tensor
y_train = torch.tensor(y_train_smote, dtype=torch.long)
y_test = torch.tensor(y_test, dtype=torch.long) 
# Number of feature columns per sample (recomputed in the Data Alignment cell).
feature_size = x_train.shape[1]


New class distribution after SMOTE:
Class 0.0: 17427
Class 1.0: 17427


## Data Alignment

In [5]:
import torch.nn.functional as F
import math

# Smallest perfect square that is greater than or equal to a given number
def next_perfect_square(n):
    """Return the smallest perfect square that is greater than or equal to ``n``.

    A value that already is a perfect square is returned unchanged
    (e.g. 16 -> 16, 17 -> 25).

    The computation uses exact integer arithmetic (``math.isqrt``), so it stays
    correct for arbitrarily large integers, where a floating-point square root
    can round down and yield a square smaller than ``n``.

    Parameters:
        n (int or float): Non-negative number. Non-integer values are rounded
            up to the next integer first, which gives the same answer as
            taking the ceiling of the real square root.

    Returns:
        int: The smallest ``k * k`` (``k`` a non-negative integer) with ``k * k >= n``.

    Raises:
        ValueError: If ``n`` is negative.
    """
    target = math.ceil(n)  # no-op for ints; keeps float inputs working
    root = math.isqrt(target)  # floor of the exact square root
    if root * root < target:
        # target is not itself a perfect square: move up to the next root
        root += 1
    return root * root

# Work on a 2-D (num_samples, num_features) view of the data. On a fresh run
# this is a no-op. If this cell is re-run after x_train / x_test have already
# been reshaped to (N, 1, side, side), flattening restores the 2-D layout, so
# the cell produces the same result again instead of reading a feature size of
# 1 from the channel dimension and silently reshaping everything to 1x1 images.
x_train_flat = x_train.flatten(start_dim=1)
x_test_flat = x_test.flatten(start_dim=1)
assert x_train_flat.shape[1] == x_test_flat.shape[1], "train/test feature counts differ"

feature_size = x_train_flat.shape[1]
desired_size = next_perfect_square(feature_size)
# desired_size is a perfect square, so the integer square root is exact
side_size = math.isqrt(desired_size)

# Number of zero-valued columns to append so each sample fills a side_size x side_size grid
padding = desired_size - feature_size

# Pad on the right of the feature dimension only (no padding is added on the left)
if padding > 0:
    x_train_padded = F.pad(x_train_flat, (0, padding), 'constant', 0)
    x_test_padded = F.pad(x_test_flat, (0, padding), 'constant', 0)
else:
    x_train_padded = x_train_flat
    x_test_padded = x_test_flat

# Reshape to (num_samples, 1 channel, side_size, side_size) as expected by Conv2d
x_train = x_train_padded.view(-1, 1, side_size, side_size)
x_test = x_test_padded.view(-1, 1, side_size, side_size)

## CNN Architecture

In [6]:
class Net(nn.Module):
    """Convolutional classifier for feature vectors laid out as square, single-channel grids.

    Architecture: three unpadded 3x3 convolutions (1 -> 32 -> 64 -> 128
    channels), each followed by batch normalization and ReLU, then four fully
    connected layers (1024 -> 512 -> 128 -> 2). Dropout (p=0.3) is applied to
    the input of the first and of the second fully connected layer.

    Parameters:
        side_size (int): Height and width of the square input grid. Must be at
            least 7, because each of the three unpadded 3x3 convolutions
            shrinks the grid by 2 in each dimension.

    Raises:
        ValueError: If ``side_size`` is smaller than 7.
    """

    def __init__(self, side_size):
        # Spatial size left after the three convolutions (each removes 2 from
        # height and width). Validate before building any layer so that an
        # unusable network is never constructed.
        conv_out_side = side_size - 6
        if conv_out_side < 1:
            raise ValueError(
                f"side_size must be at least 7 for three unpadded 3x3 convolutions, got {side_size}"
            )

        super().__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, stride=1)
        self.bn1 = nn.BatchNorm2d(32)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1)
        self.bn2 = nn.BatchNorm2d(64)
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, stride=1)
        self.bn3 = nn.BatchNorm2d(128)

        # Flattened size of the last feature map: 128 channels x conv_out_side^2
        self.fc1 = nn.Linear(128 * conv_out_side ** 2, 1024)
        self.fc2 = nn.Linear(1024, 512)
        self.fc3 = nn.Linear(512, 128)
        self.fc4 = nn.Linear(128, 2)

        self.dropout1 = nn.Dropout(0.3)
        self.dropout2 = nn.Dropout(0.3)

        print("CNN model created")

    def forward(self, x):
        """Map a batch of shape (N, 1, side_size, side_size) to raw logits of shape (N, 2)."""
        x = F.relu(self.bn1(self.conv1(x)))
        x = F.relu(self.bn2(self.conv2(x)))
        x = F.relu(self.bn3(self.conv3(x)))

        x = torch.flatten(x, 1)
        x = self.dropout1(x)  # Dropout applied after flattening

        x = F.relu(self.fc1(x))
        x = self.dropout2(x)  # Dropout applied after first fully connected layer
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        return self.fc4(x)  # No softmax here, as CrossEntropyLoss applies LogSoftmax internally

In [7]:
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

## Model Training

In [8]:
# Hyper-parameters actually used for training (these override the values set
# in the data-loading cell).
LEARNING_RATE = 0.000238
BATCH_SIZE = 64
EPOCHS = 15
SEED = 42  # same value as the random_state used for the train/test split and SMOTE

# Seed PyTorch's global RNG before the model and the shuffling DataLoader are
# created, so weight initialization and batch order are repeatable across
# "Restart & Run All" (some GPU kernels may still be non-deterministic).
torch.manual_seed(SEED)

model = Net(side_size).to(device)  # Convolutional network (class Net defined above)

# The network returns raw logits: nn.CrossEntropyLoss applies log-softmax
# internally, so the model must not apply softmax itself.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

# Prepare DataLoader (batches are reshuffled every epoch)
train_data = TensorDataset(x_train, y_train.long())
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)

# Accuracy / F1 helper used inside the training loop
def compute_metrics(data_loader, model):
    """Score ``model`` on every batch of ``data_loader``.

    The model is switched to evaluation mode (and left in it); gradients are
    disabled while predicting. Batches are moved to the notebook-level
    ``device`` before the forward pass.

    Note: a later cell defines another ``compute_metrics`` with a different
    signature, which replaces this one once that cell has run.

    Parameters:
        data_loader: Iterable yielding ``(inputs, labels)`` tensor pairs.
        model (torch.nn.Module): Network returning one row of class scores per sample.

    Returns:
        tuple: ``(accuracy, f1)`` as computed by scikit-learn's
        ``accuracy_score`` and ``f1_score`` with their default settings.
    """
    model.eval()
    gold, guessed = [], []

    with torch.no_grad():
        for batch_inputs, batch_labels in data_loader:
            batch_inputs = batch_inputs.to(device)
            batch_labels = batch_labels.to(device)
            scores = model(batch_inputs)
            # Highest-scoring class per sample, shaped like the label tensor
            batch_pred = scores.argmax(dim=1, keepdim=True).view_as(batch_labels)
            # Bring everything back to the CPU as plain Python lists for scikit-learn
            gold += batch_labels.cpu().tolist()
            guessed += batch_pred.cpu().tolist()

    return accuracy_score(gold, guessed), f1_score(gold, guessed)

# Per-epoch history (one entry appended per epoch): mean batch loss, accuracy and F1.
epoch_losses = []
epoch_accuracies = []
epoch_f1s = []

# Training loop
for epoch in range(EPOCHS):
    # compute_metrics() below leaves the model in eval mode, so training mode
    # (dropout / batch-norm updates) has to be re-enabled at the start of every epoch.
    model.train()
    running_loss = 0.0
    for data, target in train_loader:
        data, target = data.to(device), target.to(device)  # Move data to the device
        optimizer.zero_grad()  # clear gradients left over from the previous step
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    # Compute average loss (mean of the per-batch losses of this epoch)
    avg_loss = running_loss / len(train_loader)
    epoch_losses.append(avg_loss)

    # Evaluate model and store metrics.
    # NOTE: these are measured on the training data itself (train_loader, i.e.
    # the SMOTE-resampled split), not on held-out data.
    train_accuracy, train_f1 = compute_metrics(train_loader, model)
    epoch_accuracies.append(train_accuracy)
    epoch_f1s.append(train_f1)

    # One summary line per epoch
    print(f"Epoch {epoch+1}/{EPOCHS} - Loss: {avg_loss:.4f}, Accuracy: {train_accuracy:.4f}, F1 Score: {train_f1:.4f}")

# Save the model weights only (state_dict), not the full module object.
# NOTE(review): assumes a 'models/' directory already exists relative to the
# working directory -- confirm, torch.save does not create it.
torch.save(model.state_dict(), 'models/phishing_cnn_model_state_dict.pth')
print("Model training complete and saved.")

CNN model created
Epoch 1/3 - Loss: 0.1125, Accuracy: 0.9731, F1 Score: 0.9725
Epoch 2/3 - Loss: 0.0750, Accuracy: 0.9798, F1 Score: 0.9796
Epoch 3/3 - Loss: 0.0666, Accuracy: 0.9838, F1 Score: 0.9838
Model training complete and saved.


In [9]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
import numpy as np
from sklearn.metrics import roc_auc_score

def compute_metrics(data_loader, model, criterion):
    """
    Computes a range of performance metrics for the given model evaluated on the provided DataLoader.

    Each batch is moved to the device that holds the model's parameters before
    the forward pass, so the function works for models on the CPU or on a GPU.
    The reported loss is the per-sample average over the whole loader: every
    batch loss is weighted by its batch size, so a smaller final batch does not
    skew the result.

    Note: this definition replaces the two-argument ``compute_metrics`` from the
    training cell, which returns an ``(accuracy, f1)`` tuple. Re-run the
    training cell before training again.

    Parameters:
        data_loader (DataLoader): Iterable yielding ``(inputs, labels)`` batches.
        model (torch.nn.Module): The neural network model to evaluate. It is
            switched to evaluation mode and left in it.
        criterion (torch.nn.Module): Loss function used for the model. Assumed
            to return the mean loss over a batch (the default reduction of
            ``nn.CrossEntropyLoss``).

    Returns:
        dict: Keys ``accuracy``, ``f1``, ``precision``, ``recall`` (the last
        three weighted by class support), ``roc_auc_score`` (0 when only one
        class is present in the labels, where ROC AUC is undefined) and
        ``cross_entropy_loss`` (NaN when the loader yields no samples).
    """
    model.eval()  # Switch model to evaluation mode

    # Evaluate on whatever device the weights live on; a parameter-free model
    # gets its inputs unchanged.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else None

    true_labels, predictions, probs = [], [], []
    total_loss, n_samples = 0.0, 0

    with torch.no_grad():
        for data, target in data_loader:
            if model_device is not None:
                data, target = data.to(model_device), target.to(model_device)
            output = model(data)

            # Undo the per-batch averaging so batches of different size are
            # weighted correctly in the final mean.
            batch_size = target.size(0)
            total_loss += criterion(output, target).item() * batch_size
            n_samples += batch_size

            pred = output.argmax(dim=1).view_as(target)
            prob = torch.nn.functional.softmax(output, dim=1)[:, 1]  # Probability for class 1
            true_labels.extend(target.cpu().tolist())
            predictions.extend(pred.cpu().tolist())
            probs.extend(prob.cpu().tolist())

    # Compute metrics
    metrics = {
        'accuracy': accuracy_score(true_labels, predictions),
        'f1': f1_score(true_labels, predictions, average='weighted'),
        'precision': precision_score(true_labels, predictions, average='weighted', zero_division=0),
        'recall': recall_score(true_labels, predictions, average='weighted', zero_division=0),
        'roc_auc_score': roc_auc_score(true_labels, probs) if len(np.unique(true_labels)) > 1 else 0,
        'cross_entropy_loss': total_loss / n_samples if n_samples else float('nan'),
    }

    return metrics

def evaluate_model(model, x_test, y_test, batch_size, criterion):
    """
    Runs the model over the test set, prints each resulting metric and returns them.

    Parameters:
        model (torch.nn.Module): The neural network model to evaluate.
        x_test (Tensor): Test dataset features.
        y_test (Tensor): Test dataset labels (converted to long before batching).
        batch_size (int): Batch size for data loading.
        criterion (torch.nn.Module): Loss function used for the model.

    Returns:
        dict: The metrics dictionary produced by ``compute_metrics``.
    """
    # Batches are served in dataset order (no shuffling is requested).
    loader = DataLoader(TensorDataset(x_test, y_test.long()), batch_size=batch_size)
    results = compute_metrics(loader, model, criterion)

    # Display the metrics: a 'confusion_matrix' entry is printed on its own
    # lines, every other entry as a number with four decimals.
    print("Test Metrics:")
    for key, value in results.items():
        line = f"{key}:\n{value}\n" if key == 'confusion_matrix' else f"{key}: {value:.4f}"
        print(line)

    return results

# Loss used to report the test cross-entropy (same loss class as in the training cell).
criterion = torch.nn.CrossEntropyLoss()

# Evaluate the trained `model` from the training cell on the held-out test
# split, which was not SMOTE-resampled.
metrics = evaluate_model(model, x_test, y_test, BATCH_SIZE, criterion)

Test Metrics:
accuracy: 0.9793
f1: 0.9794
precision: 0.9795
recall: 0.9793
roc_auc_score: 0.9903
cross_entropy_loss: 0.0765
