In [2]:
# Import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_selection import VarianceThreshold
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LassoCV
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report, confusion_matrix)

In [3]:
# Read data
df = pd.read_csv('dataset_phishing.csv')

In [4]:
features = ['shortest_word_path',
 'ratio_intMedia',
 'links_in_tags',
 'nb_hyphens',
 'page_rank',
 'avg_word_path',
 'ratio_extHyperlinks',
 'longest_words_raw',
 'google_index',
 'length_hostname',
 'longest_word_host',
 'domain_registration_length',
 'nb_www',
 'nb_underscore',
 'nb_dots',
 'ratio_extMedia',
 'phish_hints',
 'domain_in_title',
 'web_traffic',
 'safe_anchor',
 'nb_space',
 'shortening_service',
 'ip',
 'domain_age',
 'nb_qm',
 'nb_hyperlinks',
 'nb_slash']

In [5]:
X = df[features]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

le = LabelEncoder()
y = le.fit_transform(df["status"])

# Step 1: Split data into 70% train and 30% temp (validation + test)
X_train, X_temp, y_train, y_temp = train_test_split(X_scaled, y, test_size=0.3, random_state=42, stratify=y)

# Step 2: Split the temp set into 50% validation and 50% test (15% each of the original data)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

### TRADITIONAL METHOD 1: RANDOM FOREST MODEL

In [6]:
# Hyperparameter tuning for Random Forest
rf_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

rf_grid_search = GridSearchCV(RandomForestClassifier(random_state=42), rf_param_grid, cv=5)
rf_grid_search.fit(X_train, y_train)

# Best parameters for Random Forest
print("Best parameters for Random Forest:", rf_grid_search.best_params_)

# Validate and test the best Random Forest model
rf_model = rf_grid_search.best_estimator_
y_val_pred_rf = rf_model.predict(X_val)

# Compute metrics for validation set
rf_val_accuracy = accuracy_score(y_val, y_val_pred_rf)
rf_val_precision = precision_score(y_val, y_val_pred_rf)
rf_val_recall = recall_score(y_val, y_val_pred_rf)
rf_val_f1_score = f1_score(y_val, y_val_pred_rf)

# Print validation results
print("Random Forest Validation Results:")
print("Validation Accuracy:", rf_val_accuracy)
print("Validation Precision:", rf_val_precision)
print("Validation Recall:", rf_val_recall)
print("Validation F1 Score:", rf_val_f1_score)

y_pred_rf = rf_model.predict(X_test)
rf_conf_matrix = confusion_matrix(y_test, y_pred_rf)

# Compute metrics for Random Forest
rf_tn, rf_fp, rf_fn, rf_tp = rf_conf_matrix.ravel()
rf_accuracy = (rf_tp + rf_tn) / (rf_tp + rf_tn + rf_fp + rf_fn)
rf_precision = rf_tp / (rf_tp + rf_fp)
rf_recall = rf_tp / (rf_tp + rf_fn)
rf_f1_score = (2 * (rf_precision * rf_recall)) / (rf_precision + rf_recall)

# Print Random Forest results
print("Random Forest Test Results:")
print("Confusion Matrix:\n", rf_conf_matrix)
print("Accuracy:", rf_accuracy)
print("Precision:", rf_precision)
print("Recall:", rf_recall)
print("F1 Score:", rf_f1_score)

Best parameters for Random Forest: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Random Forest Validation Results:
Validation Accuracy: 0.9574095682613769
Validation Precision: 0.9507042253521126
Validation Recall: 0.9631391200951248
Validation F1 Score: 0.9568812758417011
Random Forest Test Results:
Confusion Matrix:
 [[812  30]
 [ 25 848]]
Accuracy: 0.967930029154519
Precision: 0.9658314350797267
Recall: 0.9713631156930126
F1 Score: 0.9685893774985722


### Traditional Method 2: SVM Model

In [7]:
from sklearn.svm import SVC  # Import SVM classifier

# Hyperparameter tuning for SVM
svm_param_grid = {
    'C': [0.1, 1, 10],
    'gamma': ['scale', 'auto'],
    'kernel': ['linear', 'rbf']
}

svm_grid_search = GridSearchCV(SVC(probability=True, random_state=42), svm_param_grid, cv=5)
svm_grid_search.fit(X_train, y_train)

# Best parameters for SVM
print("Best parameters for SVM:", svm_grid_search.best_params_)

# Validate and test the best SVM model
svm_model = svm_grid_search.best_estimator_
y_val_pred_svm = svm_model.predict(X_val)

# Compute metrics for validation set
svm_val_accuracy = accuracy_score(y_val, y_val_pred_svm)
svm_val_precision = precision_score(y_val, y_val_pred_svm)
svm_val_recall = recall_score(y_val, y_val_pred_svm)
svm_val_f1_score = f1_score(y_val, y_val_pred_svm)

# Print validation results
print("SVM Validation Results:")
print("Validation Accuracy:", svm_val_accuracy)
print("Validation Precision:", svm_val_precision)
print("Validation Recall:", svm_val_recall)
print("Validation F1 Score:", svm_val_f1_score)

y_pred_svm = svm_model.predict(X_test)
svm_conf_matrix = confusion_matrix(y_test, y_pred_svm)

# Compute metrics for SVM
svm_tn, svm_fp, svm_fn, svm_tp = svm_conf_matrix.ravel()
svm_accuracy = (svm_tp + svm_tn) / (svm_tp + svm_tn + svm_fp + svm_fn)
svm_precision = svm_tp / (svm_tp + svm_fp)
svm_recall = svm_tp / (svm_tp + svm_fn)
svm_f1_score = (2 * (svm_precision * svm_recall)) / (svm_precision + svm_recall)

# Print SVM results
print("Confusion Matrix:\n", svm_conf_matrix)
print("Accuracy:", svm_accuracy)
print("Precision:", svm_precision)
print("Recall:", svm_recall)
print("F1 Score:", svm_f1_score)

Best parameters for SVM: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
SVM Validation Results:
Validation Accuracy: 0.9574095682613769
Validation Precision: 0.951764705882353
Validation Recall: 0.9619500594530321
Validation F1 Score: 0.9568302779420461
Confusion Matrix:
 [[813  29]
 [ 38 835]]
Accuracy: 0.960932944606414
Precision: 0.9664351851851852
Recall: 0.9564719358533792
F1 Score: 0.9614277489925158


### Traditional Method 3: Decision Tree

In [10]:
from sklearn.tree import DecisionTreeClassifier
# Hyperparameter tuning for Decision Tree
dt_param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

dt_grid_search = GridSearchCV(DecisionTreeClassifier(random_state=42), dt_param_grid, cv=5)
dt_grid_search.fit(X_train, y_train)

# Best parameters for Decision Tree
print("Best parameters for Decision Tree:", dt_grid_search.best_params_)

# Validate and test the best Decision Tree model
dt_model = dt_grid_search.best_estimator_
y_val_pred_dt = dt_model.predict(X_val)

# Compute metrics for validation set
dt_val_accuracy = accuracy_score(y_val, y_val_pred_dt)
dt_val_precision = precision_score(y_val, y_val_pred_dt)
dt_val_recall = recall_score(y_val, y_val_pred_dt)
dt_val_f1_score = f1_score(y_val, y_val_pred_dt)

# Print validation results
print("Decision Tree Validation Results:")
print("Validation Accuracy:", dt_val_accuracy)
print("Validation Precision:", dt_val_precision)
print("Validation Recall:", dt_val_recall)
print("Validation F1 Score:", dt_val_f1_score)

y_pred_dt = dt_model.predict(X_test)
dt_conf_matrix = confusion_matrix(y_test, y_pred_dt)

# Compute metrics for Decision Tree
dt_tn, dt_fp, dt_fn, dt_tp = dt_conf_matrix.ravel()
dt_accuracy = (dt_tp + dt_tn) / (dt_tp + dt_tn + dt_fp + dt_fn)
dt_precision = dt_tp / (dt_tp + dt_fp)
dt_recall = dt_tp / (dt_tp + dt_fn)
dt_f1_score = (2 * (dt_precision * dt_recall)) / (dt_precision + dt_recall)

# Print Decision Tree results
print("Decision Tree Test Results:")
print("Confusion Matrix:\n", dt_conf_matrix)
print("Accuracy:", dt_accuracy)
print("Precision:", dt_precision)
print("Recall:", dt_recall)
print("F1 Score:", dt_f1_score)

Best parameters for Decision Tree: {'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2}
Decision Tree Validation Results:
Validation Accuracy: 0.9323220536756126
Validation Precision: 0.9239766081871345
Validation Recall: 0.93935790725327
Validation F1 Score: 0.9316037735849056
Decision Tree Test Results:
Confusion Matrix:
 [[796  46]
 [ 42 831]]
Accuracy: 0.9486880466472303
Precision: 0.9475484606613455
Recall: 0.9518900343642611
F1 Score: 0.9497142857142856
