Imports

In [98]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler, LabelEncoder
import numpy as np

# Load the dataset

In [99]:
data = pd.read_csv("PhiUSIIL_Phishing_URL_Dataset.csv")

# Data Preprocessing
Drop irrelevant columns (e.g., 'FILENAME', 'URL', 'Domain', 'Title', etc.)

In [100]:
columns_to_drop = ['FILENAME', 'URL', 'Domain', 'TLD', 'Title']
data_cleaned = data.drop(columns=columns_to_drop, axis=1)

Encode categorical variables if any

In [101]:
label_encoder = LabelEncoder()
data_cleaned['IsDomainIP'] = label_encoder.fit_transform(data_cleaned['IsDomainIP'])

Separate features and target

In [102]:
X = data_cleaned.drop('label', axis=1)
y = data_cleaned['label']

Standardize numerical features

In [103]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

Split data into training and testing sets

In [104]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Train and Evaluate Models 1
Model 1: Random Forest

In [105]:
rf_model = RandomForestClassifier(
    n_estimators=100, 
    max_depth=None, 
    min_samples_split=2, 
    min_samples_leaf=1, 
    random_state=42
)
rf_model.fit(X_train, y_train)
rf_predictions = rf_model.predict(X_test)
print("Random Forest Classification Report:\n", classification_report(y_test, rf_predictions))

Random Forest Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     20124
           1       1.00      1.00      1.00     27035

    accuracy                           1.00     47159
   macro avg       1.00      1.00      1.00     47159
weighted avg       1.00      1.00      1.00     47159



Fine-tuning of Random Forest with GridSearchCV which was used to find the best parameters.
(Commented out to speed up process)

In [106]:
"""
rf_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}
rf_grid_search = GridSearchCV(estimator=rf_model, param_grid=rf_param_grid, cv=5, scoring='accuracy', verbose=2)
rf_grid_search.fit(X_train, y_train)
print("Best Random Forest Parameters:", rf_grid_search.best_params_)
print("Best Random Forest Cross-Validation Accuracy:", rf_grid_search.best_score_)
"""

'\nrf_param_grid = {\n    \'n_estimators\': [50, 100, 200],\n    \'max_depth\': [None, 10, 20],\n    \'min_samples_split\': [2, 5],\n    \'min_samples_leaf\': [1, 2]\n}\nrf_grid_search = GridSearchCV(estimator=rf_model, param_grid=rf_param_grid, cv=5, scoring=\'accuracy\', verbose=2)\nrf_grid_search.fit(X_train, y_train)\nprint("Best Random Forest Parameters:", rf_grid_search.best_params_)\nprint("Best Random Forest Cross-Validation Accuracy:", rf_grid_search.best_score_)\n'

# Train and Evaluate Models 2
Model 2: Multilayer Perceptron

In [107]:
mlp_model = MLPClassifier(
    hidden_layer_sizes=(50, 50),
    activation='relu',
    alpha=0.001,
    learning_rate_init=0.01,
    max_iter=300,
    random_state=42
)
mlp_model.fit(X_train, y_train)
mlp_predictions = mlp_model.predict(X_test)
print("MLP Classification Report:\n", classification_report(y_test, mlp_predictions))

MLP Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     20124
           1       1.00      1.00      1.00     27035

    accuracy                           1.00     47159
   macro avg       1.00      1.00      1.00     47159
weighted avg       1.00      1.00      1.00     47159



Fine-tuning of MLP with GridSearchCV which was used to find the best parameters
(Commented out to speed up process)

In [108]:
"""
mlp_param_grid = {
    'hidden_layer_sizes': [(100,), (50, 50), (30, 30, 30)],
    'activation': ['relu', 'tanh'],
    'alpha': [0.0001, 0.001],
    'learning_rate_init': [0.001, 0.01]
}
mlp_grid_search = GridSearchCV(estimator=mlp_model, param_grid=mlp_param_grid, cv=5, scoring='accuracy', verbose=2)
mlp_grid_search.fit(X_train, y_train)
print("Best MLP Parameters:", mlp_grid_search.best_params_)
print("Best MLP Cross-Validation Accuracy:", mlp_grid_search.best_score_)
"""

'\nmlp_param_grid = {\n    \'hidden_layer_sizes\': [(100,), (50, 50), (30, 30, 30)],\n    \'activation\': [\'relu\', \'tanh\'],\n    \'alpha\': [0.0001, 0.001],\n    \'learning_rate_init\': [0.001, 0.01]\n}\nmlp_grid_search = GridSearchCV(estimator=mlp_model, param_grid=mlp_param_grid, cv=5, scoring=\'accuracy\', verbose=2)\nmlp_grid_search.fit(X_train, y_train)\nprint("Best MLP Parameters:", mlp_grid_search.best_params_)\nprint("Best MLP Cross-Validation Accuracy:", mlp_grid_search.best_score_)\n'

# Comparison and Insights
Output confusion matrices for detailed comparison


In [109]:
print("Random Forest Confusion Matrix:\n", confusion_matrix(y_test, rf_predictions))
print("MLP Confusion Matrix:\n", confusion_matrix(y_test, mlp_predictions))
print("Random Forest Fine-Tuned Cross-Validation Accuracy:", rf_grid_search.best_score_)
print("MLP Fine-Tuned Cross-Validation Accuracy:", mlp_grid_search.best_score_)

Random Forest Confusion Matrix:
 [[20124     0]
 [    0 27035]]
MLP Confusion Matrix:
 [[20118     6]
 [    0 27035]]
Random Forest Fine-Tuned Cross-Validation Accuracy: 1.0
MLP Fine-Tuned Cross-Validation Accuracy: 0.9999416866064677
