Imports

In [61]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler, LabelEncoder
import numpy as np

# Load the dataset

In [62]:
data = pd.read_csv("PhiUSIIL_Phishing_URL_Dataset.csv")

# Data Preprocessing
Drop irrelevant columns (e.g., 'FILENAME', 'URL', 'Domain', 'Title', etc.)

In [63]:
columns_to_drop = ['FILENAME', 'URL', 'Domain', 'TLD', 'Title']
data_cleaned = data.drop(columns=columns_to_drop, axis=1)

Encode categorical variables if any

In [64]:
label_encoder = LabelEncoder()
data_cleaned['IsDomainIP'] = label_encoder.fit_transform(data_cleaned['IsDomainIP'])

Separate features and target

In [65]:
X = data_cleaned.drop('label', axis=1)
y = data_cleaned['label']

Standardize numerical features

In [66]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

Split data into training and testing sets

In [67]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Train and Evaluate Models 1
Model 1: Random Forest

In [68]:
rf_model = RandomForestClassifier(
    n_estimators=100, 
    max_depth=None, 
    min_samples_split=2, 
    min_samples_leaf=1, 
    random_state=42
)
rf_model.fit(X_train, y_train)
rf_predictions = rf_model.predict(X_test)
print("Random Forest Classification Report:\n", classification_report(y_test, rf_predictions))

Random Forest Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     20124
           1       1.00      1.00      1.00     27035

    accuracy                           1.00     47159
   macro avg       1.00      1.00      1.00     47159
weighted avg       1.00      1.00      1.00     47159



Fine-tuning of Random Forest with GridSearchCV which was used to find the best parameters.
(Commented out to speed up process)

In [69]:
"""
rf_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}
rf_grid_search = GridSearchCV(estimator=rf_model, param_grid=rf_param_grid, cv=5, scoring='accuracy', verbose=2)
rf_grid_search.fit(X_train, y_train)
print("Best Random Forest Parameters:", rf_grid_search.best_params_)
print("Best Random Forest Cross-Validation Accuracy:", rf_grid_search.best_score_)
"""

'\nrf_param_grid = {\n    \'n_estimators\': [50, 100, 200],\n    \'max_depth\': [None, 10, 20],\n    \'min_samples_split\': [2, 5],\n    \'min_samples_leaf\': [1, 2]\n}\nrf_grid_search = GridSearchCV(estimator=rf_model, param_grid=rf_param_grid, cv=5, scoring=\'accuracy\', verbose=2)\nrf_grid_search.fit(X_train, y_train)\nprint("Best Random Forest Parameters:", rf_grid_search.best_params_)\nprint("Best Random Forest Cross-Validation Accuracy:", rf_grid_search.best_score_)\n'

# Train and Evaluate Models 2
Model 2: Multilayer Perceptron

In [70]:
mlp_model = MLPClassifier(
    hidden_layer_sizes=(50, 50),
    activation='relu',
    alpha=0.001,
    learning_rate_init=0.01,
    max_iter=300,
    random_state=42
)
mlp_model.fit(X_train, y_train)
mlp_predictions = mlp_model.predict(X_test)
print("MLP Classification Report:\n", classification_report(y_test, mlp_predictions))

MLP Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     20124
           1       1.00      1.00      1.00     27035

    accuracy                           1.00     47159
   macro avg       1.00      1.00      1.00     47159
weighted avg       1.00      1.00      1.00     47159



Fine-tuning of MLP with GridSearchCV which was used to find the best parameters
(Commented out to speed up process)

In [71]:
"""
mlp_param_grid = {
    'hidden_layer_sizes': [(100,), (50, 50), (30, 30, 30)],
    'activation': ['relu', 'tanh'],
    'alpha': [0.0001, 0.001],
    'learning_rate_init': [0.001, 0.01]
}
mlp_grid_search = GridSearchCV(estimator=mlp_model, param_grid=mlp_param_grid, cv=5, scoring='accuracy', verbose=2)
mlp_grid_search.fit(X_train, y_train)
print("Best MLP Parameters:", mlp_grid_search.best_params_)
print("Best MLP Cross-Validation Accuracy:", mlp_grid_search.best_score_)
"""

'\nmlp_param_grid = {\n    \'hidden_layer_sizes\': [(100,), (50, 50), (30, 30, 30)],\n    \'activation\': [\'relu\', \'tanh\'],\n    \'alpha\': [0.0001, 0.001],\n    \'learning_rate_init\': [0.001, 0.01]\n}\nmlp_grid_search = GridSearchCV(estimator=mlp_model, param_grid=mlp_param_grid, cv=5, scoring=\'accuracy\', verbose=2)\nmlp_grid_search.fit(X_train, y_train)\nprint("Best MLP Parameters:", mlp_grid_search.best_params_)\nprint("Best MLP Cross-Validation Accuracy:", mlp_grid_search.best_score_)\n'

# Comparison and Insights
Output confusion matrices for detailed comparison


In [None]:
rf_cm = confusion_matrix(y_test, rf_predictions)
mlp_cm = confusion_matrix(y_test, mlp_predictions)

# Display confusion matrices with percentages
rf_total = rf_cm.sum()
mlp_total = mlp_cm.sum()

print("Random Forest Confusion Matrix:")
print(rf_cm)
print(f"Percentages: Legitimate: {rf_cm[0, 0] / rf_total * 100:.2f}%, Phishing: {rf_cm[1, 1] / rf_total * 100:.2f}%")

rf_legitimate_fail_percentage = rf_cm[0, 1] / rf_cm[0, 0] * 100 if rf_cm[0, 0] != 0 else 0
rf_phishing_fail_percentage = rf_cm[1, 0] / rf_cm[1, 1] * 100 if rf_cm[1, 1] != 0 else 0
print(f"Failure Rate for Legitimate URLs in Random Forest: {rf_legitimate_fail_percentage:.2f}%")
print(f"Failure Rate for Phishing URLs in Random Forest: {rf_phishing_fail_percentage:.2f}%")

print("")

print("MLP Confusion Matrix:")
print(mlp_cm)
print(f"Percentages: Legitimate: {mlp_cm[0, 0] / mlp_total * 100:.2f}%, Phishing: {mlp_cm[1, 1] / mlp_total * 100:.2f}%")

mlp_legitimate_fail_percentage = mlp_cm[0, 1] / mlp_cm[0, 0] * 100 if mlp_cm[0, 0] != 0 else 0
mlp_phishing_fail_percentage = mlp_cm[1, 0] / mlp_cm[1, 1] * 100 if mlp_cm[1, 1] != 0 else 0
print(f"Failure Rate for Legitimate URLs in MLP: {mlp_legitimate_fail_percentage:.2f}%")
print(f"Failure Rate for Phishing URLs in MLP: {mlp_phishing_fail_percentage:.2f}%")

# Include results from fine-tuning
#print("Random Forest Fine-Tuned Cross-Validation Accuracy:", rf_grid_search.best_score_)
#print("MLP Fine-Tuned Cross-Validation Accuracy:", mlp_grid_search.best_score_)


Random Forest Confusion Matrix:
[[20124     0]
 [    0 27035]]
Percentages: Legitimate: 42.67%, Phishing: 57.33%
Failure Rate for Legitimate URLs in Random Forest: 0.00%
Failure Rate for Phishing URLs in Random Forest: 0.00%
MLP Confusion Matrix:
[[20118     6]
 [    0 27035]]
Percentages: Legitimate: 42.66%, Phishing: 57.33%
Failure Rate for Legitimate URLs in MLP: 0.03%
Failure Rate for Phishing URLs in MLP: 0.00%
