In [None]:
# Import required libraries
import pandas as pd
import tensorflow as tf
import numpy as np

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold

# Data Preparation

In [5]:
folder_name = '../../data/preprocessed_phishing'
folder = folder_name + '/mlp/'
data_path = folder + 'mlp.csv'
# Load the preprocessed dataset
df = pd.read_csv(data_path)

# Separate features (X) and target (y)
X = df.drop(columns=['label_encoded'])
y = df['label_encoded']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Inspect the encoded target column
processed_df = pd.read_csv(data_path)
print("Unique values in the 'label_encoded' column after encoding:")
print(processed_df['label_encoded'].unique())

Unique values in the 'label_encoded' column after encoding:
[1 0]


In [2]:
# Define the MLP model
model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),  # Input layer + first hidden layer
    Dropout(0.2),  # Dropout for regularization
    Dense(32, activation='relu'),  # Second hidden layer
    Dropout(0.2),  # Dropout for regularization
    Dense(1, activation='sigmoid')  # Output layer (binary classification)
])

# Compile the model
model.compile(optimizer='adam', 
              loss='binary_crossentropy', 
              metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, 
                    validation_split=0.2, 
                    epochs=20, 
                    batch_size=32, 
                    verbose=1)

# Evaluate the model
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f"Test Accuracy: {test_accuracy:.2f}")

# Generate a classification report
y_pred = (model.predict(X_test) > 0.5).astype(int)
print(classification_report(y_test, y_pred))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20


I0000 00:00:1732069927.686279      62 service.cc:145] XLA service 0x7ecbd0005450 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1732069927.686344      62 service.cc:153]   StreamExecutor device (0): Tesla P100-PCIE-16GB, Compute Capability 6.0


[1m 113/2000[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m2s[0m 1ms/step - accuracy: 0.8487 - loss: 0.4212

I0000 00:00:1732069930.739858      62 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m2000/2000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - accuracy: 0.9801 - loss: 0.0718 - val_accuracy: 0.9999 - val_loss: 0.0010
Epoch 2/20
[1m2000/2000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.9999 - loss: 6.9592e-04 - val_accuracy: 0.9999 - val_loss: 0.0012
Epoch 3/20
[1m2000/2000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 1.0000 - loss: 1.4457e-04 - val_accuracy: 0.9999 - val_loss: 0.0013
Epoch 4/20
[1m2000/2000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.9998 - loss: 9.5260e-04 - val_accuracy: 0.9999 - val_loss: 0.0014
Epoch 5/20
[1m2000/2000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 1.0000 - loss: 7.0525e-05 - val_accuracy: 0.9999 - val_loss: 0.0014
Epoch 6/20
[1m2000/2000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.9999 - loss: 1.6552e-04 - val_accuracy: 0.9999 - val_loss: 0.0016
Epoch 7

### Multi-Layer Perceptron (MLP) Results for Larger Dataset

- **Model Architecture:**
  - Two hidden layers with 64 and 32 neurons.
  - ReLU activation for non-linear transformations.
  - Dropout layers for regularization.
  - Sigmoid activation in the output layer for binary classification.

- **Performance:**
  - Achieved **100% accuracy** on training, validation, and test datasets.
  - Extremely low loss values.
  - Perfect precision, recall, and F1-scores for both classes (`legitimate` and `phishing`).

- **Concerns:**
  - Potential **overfitting**: The model may be memorizing patterns rather than generalizing.
  - Dataset might be simple or features could be highly separable, making classification easy.

- **Next Steps for Validation:**
  - Use **k-fold cross-validation** to assess generalization across multiple data splits.
  - Introduce **feature noise** to test the model's robustness to imperfect data.
  - These steps will help confirm whether the results are reliable or highlight potential limitations.

In [3]:
# Define a function to create the MLP model
def create_model(input_shape):
    model = Sequential([
        Dense(64, activation='relu', input_shape=(input_shape,)),
        Dropout(0.2),
        Dense(32, activation='relu'),
        Dropout(0.2),
        Dense(1, activation='sigmoid')  # Binary classification
    ])
    model.compile(optimizer=Adam(),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

# Set up K-Fold Cross-Validation
k = 10
kf = KFold(n_splits=k, shuffle=True, random_state=42)
fold_accuracies = []

# Loop through each fold
for fold, (train_idx, test_idx) in enumerate(kf.split(X)):
    print(f"Training Fold {fold+1}/{k}")
    
    # Split the data into training and testing sets for this fold
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    
    # Create a new instance of the model for each fold
    model = create_model(X_train.shape[1])
    
    # Train the model
    history = model.fit(X_train, y_train,
                        validation_split=0.2,
                        epochs=10,
                        batch_size=32,
                        verbose=0)  # Set verbose=1 if you want to see detailed training logs
    
    # Evaluate the model
    loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
    print(f"Fold {fold+1} Accuracy: {accuracy:.4f}")
    fold_accuracies.append(accuracy)

# Calculate the average accuracy across all folds
mean_accuracy = np.mean(fold_accuracies)
print(f"\nAverage Accuracy Across {k} Folds: {mean_accuracy:.4f}")

Training Fold 1/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Fold 1 Accuracy: 0.9999
Training Fold 2/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Fold 2 Accuracy: 1.0000
Training Fold 3/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Fold 3 Accuracy: 1.0000
Training Fold 4/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Fold 4 Accuracy: 1.0000
Training Fold 5/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Fold 5 Accuracy: 1.0000
Training Fold 6/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Fold 6 Accuracy: 1.0000
Training Fold 7/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Fold 7 Accuracy: 1.0000
Training Fold 8/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Fold 8 Accuracy: 0.9999
Training Fold 9/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Fold 9 Accuracy: 1.0000
Training Fold 10/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Fold 10 Accuracy: 1.0000

Average Accuracy Across 10 Folds: 1.0000


### K-Fold Cross-Validation Results

The MLP model performed exceptionally well during 10-fold cross-validation, achieving an average accuracy of **100%**. Each fold's accuracy was consistently either `1.0000` or `0.9999`, showing that the model's performance is robust and reliable across different splits of the data. This suggests that the model has learned meaningful patterns from the dataset and generalizes well to unseen subsets. 

The slightly lower accuracy in Fold 8 (`0.9999`) could be due to minor differences in the data split or randomness in training, but it does not significantly affect the overall performance. These results indicate that the model is both accurate and consistent. To further validate its robustness, we will now introduce noise into the data and observe how well the model adapts to imperfections.

In [4]:
# Introduce noise into the dataset
noise_factor = 0.05  # Adjust the noise level as needed
X_noisy = X + np.random.normal(0, noise_factor, X.shape)

# Split the noisy dataset into training and testing sets
X_train_noisy, X_test_noisy, y_train_noisy, y_test_noisy = train_test_split(
    X_noisy, y, test_size=0.2, random_state=42, stratify=y
)

# Recreate the model
model_noisy = create_model(X_train_noisy.shape[1])

# Train the model on noisy data
history_noisy = model_noisy.fit(
    X_train_noisy, y_train_noisy,
    validation_split=0.2,
    epochs=20,
    batch_size=32,
    verbose=1
)

# Evaluate the model on noisy test data
loss_noisy, accuracy_noisy = model_noisy.evaluate(X_test_noisy, y_test_noisy, verbose=0)
print(f"Test Accuracy with Noise: {accuracy_noisy:.4f}")

# Generate a classification report for noisy data
y_pred_noisy = (model_noisy.predict(X_test_noisy) > 0.5).astype(int)
print(classification_report(y_test_noisy, y_pred_noisy))

Epoch 1/20


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m2000/2000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - accuracy: 0.9660 - loss: 0.0936 - val_accuracy: 0.9999 - val_loss: 9.1327e-04
Epoch 2/20
[1m2000/2000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.9998 - loss: 9.7052e-04 - val_accuracy: 0.9999 - val_loss: 9.6338e-04
Epoch 3/20
[1m2000/2000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.9999 - loss: 7.3495e-04 - val_accuracy: 0.9999 - val_loss: 0.0011
Epoch 4/20
[1m2000/2000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.9999 - loss: 3.2172e-04 - val_accuracy: 0.9999 - val_loss: 0.0011
Epoch 5/20
[1m2000/2000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 1.0000 - loss: 8.4742e-05 - val_accuracy: 0.9999 - val_loss: 0.0013
Epoch 6/20
[1m2000/2000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 1.0000 - loss: 1.8077e-05 - val_accuracy: 0.9999 - val_loss: 0.0014

### Analysis of Results After Adding Noise
The MLP model performed exceptionally well even after introducing noise into the dataset. With a test accuracy of **99.99%** and perfect precision, recall, and F1-scores, the model demonstrates strong robustness to noisy data. The slight increase in validation loss over epochs indicates the presence of noise, but it did not significantly affect the model’s performance.

This result suggests that the model has learned meaningful and generalizable patterns in the data rather than overfitting to specific details. The features appear to be highly distinguishable even under noisy conditions, further validating the model’s reliability.

In [5]:
# Add noisy labels (flip 5% of the labels)
def add_label_noise(labels, noise_level=0.05):
    np.random.seed(42)  # Ensure reproducibility
    noisy_labels = labels.copy()
    n_noisy = int(noise_level * len(labels))
    noisy_indices = np.random.choice(labels.index, n_noisy, replace=False)
    noisy_labels.loc[noisy_indices] = 1 - noisy_labels.loc[noisy_indices]  # Flip 0 to 1, and 1 to 0
    return noisy_labels

# Introduce noisy labels
y_noisy = add_label_noise(y, noise_level=0.05)

# Split the dataset with noisy labels
X_train, X_test, y_train, y_test = train_test_split(X, y_noisy, test_size=0.2, random_state=42, stratify=y_noisy)

# Increase feature noise (stress testing)
def add_feature_noise(features, noise_level=0.1):
    np.random.seed(42)  # Ensure reproducibility
    noisy_features = features + np.random.normal(0, noise_level, features.shape)
    return noisy_features

X_train_noisy = add_feature_noise(X_train, noise_level=0.1)
X_test_noisy = add_feature_noise(X_test, noise_level=0.1)

# Train and evaluate the model with noisy labels and features
model = create_model(X_train_noisy.shape[1])
history = model.fit(X_train_noisy, y_train,
                    validation_split=0.2,
                    epochs=20,
                    batch_size=32,
                    verbose=1)

# Evaluate the model on the noisy test set
test_loss, test_accuracy = model.evaluate(X_test_noisy, y_test, verbose=0)
y_pred = (model.predict(X_test_noisy) > 0.5).astype(int)

# Display results
print(f"Test Accuracy with Noisy Labels and Features: {test_accuracy:.4f}")
print(classification_report(y_test, y_pred))

Epoch 1/20


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m2000/2000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - accuracy: 0.9261 - loss: 0.2743 - val_accuracy: 0.9511 - val_loss: 0.1998
Epoch 2/20
[1m2000/2000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.9497 - loss: 0.2112 - val_accuracy: 0.9513 - val_loss: 0.1974
Epoch 3/20
[1m2000/2000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.9494 - loss: 0.2101 - val_accuracy: 0.9511 - val_loss: 0.1987
Epoch 4/20
[1m2000/2000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.9492 - loss: 0.2076 - val_accuracy: 0.9513 - val_loss: 0.1962
Epoch 5/20
[1m2000/2000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.9503 - loss: 0.2047 - val_accuracy: 0.9514 - val_loss: 0.1958
Epoch 6/20
[1m2000/2000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.9493 - loss: 0.2073 - val_accuracy: 0.9515 - val_loss: 0.1981
Epoch 7/20
[1m2000/2000[0

### Results with Noisy Labels and Features

After introducing noise to both the labels and features, the MLP model achieved a test accuracy of **94.91%**. Precision, recall, and F1-scores for both classes (`legitimate` and `phishing`) were approximately **0.95**, demonstrating that the model remains robust under noisy conditions.

#### Observations:
1. **Performance Impact**:
   - The test accuracy dropped from near-perfect levels (100%) to **94.91%**, reflecting the challenge introduced by noisy data.
   - The increased validation loss suggests that noise in both features and labels affected the model’s ability to perfectly classify data.

2. **Generalization**:
   - Despite the noise, the model retained strong generalization capabilities, with balanced metrics across both classes.

3. **Class Balance**:
   - Precision, recall, and F1-scores were equally high for both classes, indicating that the noise did not skew the model’s predictions.

4. **Robustness**:
   - The model demonstrated excellent robustness, maintaining nearly 95% accuracy even under challenging noisy conditions.

#### Key Takeaway:
The MLP model is highly reliable and robust, even when faced with noisy data. However, the drop in performance highlights the importance of clean data for achieving optimal results.