In [1]:
# Import packages
import pandas as pd
import numpy as np
from sklearn.feature_selection import VarianceThreshold
from sklearn.svm import SVC 
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report, confusion_matrix)
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import StratifiedKFold

# Read data
df = pd.read_csv('dataset_phishing.csv')
features = ['shortest_word_path','ratio_intMedia','links_in_tags','nb_hyphens','page_rank','avg_word_path',
 'ratio_extHyperlinks','longest_words_raw','google_index','length_hostname','longest_word_host','domain_registration_length',
 'nb_www','nb_underscore','nb_dots','ratio_extMedia','phish_hints','domain_in_title','web_traffic','safe_anchor',
 'nb_space','shortening_service','ip','domain_age','nb_qm','nb_hyperlinks','nb_slash']

X = df[features]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

le = LabelEncoder()
y = le.fit_transform(df["status"])

# Step 1: Split data into 70% train and 30% temp (validation + test)
X_train, X_temp, y_train, y_temp = train_test_split(X_scaled, y, test_size=0.3, random_state=42, stratify=y)

# Step 2: Split the temp set into 50% validation and 50% test (15% each of the original data)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Traditional Methods

## Traditional Method 1: SVM

In [3]:
# ======================================= METHOD 1: SVM ================================
# Hyperparameter tuning for SVM
svm_param_grid = {
    'C': [0.1, 1, 10],
    'gamma': ['scale', 'auto'],
    'kernel': ['linear', 'rbf']
}

svm_grid_search = GridSearchCV(SVC(probability=True, random_state=42), svm_param_grid, cv=5)
svm_grid_search.fit(X_train, y_train)

# Best parameters for SVM
print("Best parameters for SVM:", svm_grid_search.best_params_)

# Validate and test the best SVM model
svm_model = svm_grid_search.best_estimator_
y_val_pred_svm = svm_model.predict(X_val)

# Compute metrics for validation set
svm_val_accuracy = accuracy_score(y_val, y_val_pred_svm)
svm_val_precision = precision_score(y_val, y_val_pred_svm)
svm_val_recall = recall_score(y_val, y_val_pred_svm)
svm_val_f1_score = f1_score(y_val, y_val_pred_svm)

# Print validation results
print("SVM Validation Results:")
print("Validation Accuracy:", svm_val_accuracy)
print("Validation Precision:", svm_val_precision)
print("Validation Recall:", svm_val_recall)
print("Validation F1 Score:", svm_val_f1_score)

y_pred_svm = svm_model.predict(X_test)
svm_conf_matrix = confusion_matrix(y_test, y_pred_svm)

# Compute metrics for SVM
svm_accuracy = accuracy_score(y_test, y_pred_svm)
svm_precision = precision_score(y_test, y_pred_svm)
svm_recall = recall_score(y_test, y_pred_svm)
svm_f1_score = f1_score(y_test, y_pred_svm)

# Print SVM results
print("Confusion Matrix:\n", svm_conf_matrix)
print("Accuracy:", svm_accuracy)
print("Precision:", svm_precision)
print("Recall:", svm_recall)
print("F1 Score:", svm_f1_score)

Best parameters for SVM: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
SVM Validation Results:
Validation Accuracy: 0.9574095682613769
Validation Precision: 0.951764705882353
Validation Recall: 0.9619500594530321
Validation F1 Score: 0.9568302779420461
Confusion Matrix:
 [[813  29]
 [ 38 835]]
Accuracy: 0.960932944606414
Precision: 0.9664351851851852
Recall: 0.9564719358533792
F1 Score: 0.9614277489925158


## Traditional Method 2: Decision Tree

In [3]:
# ================================ METHOD 2: Traditional Tree ==================

# Hyperparameter tuning for Decision Tree
dt_param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

dt_grid_search = GridSearchCV(DecisionTreeClassifier(random_state=42), dt_param_grid, cv=5)
dt_grid_search.fit(X_train, y_train)

# Best parameters for Decision Tree
print("Best parameters for Decision Tree:", dt_grid_search.best_params_)

# Validate and test the best Decision Tree model
dt_model = dt_grid_search.best_estimator_
y_val_pred_dt = dt_model.predict(X_val)

# Compute metrics for validation set
dt_val_accuracy = accuracy_score(y_val, y_val_pred_dt)
dt_val_precision = precision_score(y_val, y_val_pred_dt)
dt_val_recall = recall_score(y_val, y_val_pred_dt)
dt_val_f1_score = f1_score(y_val, y_val_pred_dt)

# Print validation results
print("Decision Tree Validation Results:")
print("Validation Accuracy:", dt_val_accuracy)
print("Validation Precision:", dt_val_precision)
print("Validation Recall:", dt_val_recall)
print("Validation F1 Score:", dt_val_f1_score)

y_pred_dt = dt_model.predict(X_test)
dt_conf_matrix = confusion_matrix(y_test, y_pred_dt)

# Compute metrics for Decision Tree
dt_accuracy = accuracy_score(y_test, y_pred_dt)
dt_precision = precision_score(y_test, y_pred_dt)
dt_recall = recall_score(y_test, y_pred_dt)
dt_f1_score = f1_score(y_test, y_pred_dt)

# Print Decision Tree results
print("Decision Tree Test Results:")
print("Confusion Matrix:\n", dt_conf_matrix)
print("Accuracy:", dt_accuracy)
print("Precision:", dt_precision)
print("Recall:", dt_recall)
print("F1 Score:", dt_f1_score)

Best parameters for Decision Tree: {'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2}
Decision Tree Validation Results:
Validation Accuracy: 0.9323220536756126
Validation Precision: 0.9239766081871345
Validation Recall: 0.93935790725327
Validation F1 Score: 0.9316037735849056
Decision Tree Test Results:
Confusion Matrix:
 [[796  46]
 [ 42 831]]
Accuracy: 0.9486880466472303
Precision: 0.9475484606613455
Recall: 0.9518900343642611
F1 Score: 0.9497142857142857


## Traditional Method 3: KNN

In [4]:
# Define KNN model and hyperparameter grid
knn = KNeighborsClassifier()
param_grid = {
    'n_neighbors': range(1, 21),           # Test k values from 1 to 20
    'weights': ['uniform', 'distance'],    # Uniform or distance-weighted voting
    'metric': ['euclidean', 'manhattan', 'minkowski'],  # Different distance metrics
    'p': [1, 2]                            # Power parameter for Minkowski (p=1 is Manhattan, p=2 is Euclidean)
}

# Perform GridSearchCV with validation set
grid_search = GridSearchCV(knn, param_grid, cv=5, scoring='recall', verbose=1)
grid_search.fit(X_train, y_train)

# Get the best model from the grid search
best_knn = grid_search.best_estimator_
print(best_knn)

# Evaluate on the validation set
y_val_pred = best_knn.predict(X_val)
y_val_pred_prob = best_knn.predict_proba(X_val)[:, 1]

# Calculate validation metrics
val_accuracy = accuracy_score(y_val, y_val_pred)
val_precision = precision_score(y_val, y_val_pred)
val_recall = recall_score(y_val, y_val_pred)
val_f1 = f1_score(y_val, y_val_pred)
val_conf_matrix = confusion_matrix(y_val, y_val_pred)

# Display validation results
print("\nValidation Results:")
print(f"Accuracy: {val_accuracy:.4f}")
print(f"Precision: {val_precision:.4f}")
print(f"Recall: {val_recall:.4f}")
print(f"F1 Score: {val_f1:.4f}")
print(f"Confusion Matrix:\n{val_conf_matrix}")

# Final evaluation on the test set
y_test_pred = best_knn.predict(X_test)

# Calculate test metrics
test_conf_matrix = confusion_matrix(y_test, y_test_pred)
test_accuracy = accuracy_score(y_test, y_pred_svm)
test_precision = precision_score(y_test, y_pred_svm)
test_recall = recall_score(y_test, y_pred_svm)
test_f1 = f1_score(y_test, y_pred_svm)

# Display final test results
print("\nFinal Test Results:")
print(f"Accuracy: {test_accuracy:.4f}")
print(f"Precision: {test_precision:.4f}")
print(f"Recall: {test_recall:.4f}")
print(f"F1 Score: {test_f1:.4f}")
print(f"Confusion Matrix:\n{test_conf_matrix}")

Fitting 5 folds for each of 240 candidates, totalling 1200 fits
KNeighborsClassifier(metric='manhattan', n_neighbors=16, p=1,
                     weights='distance')

Validation Results:
Accuracy: 0.9516
Precision: 0.9397
Recall: 0.9631
F1 Score: 0.9513
Confusion Matrix:
[[821  52]
 [ 31 810]]

Final Test Results:
Accuracy: 0.9609
Precision: 0.9664
Recall: 0.9565
F1 Score: 0.9614
Confusion Matrix:
[[813  29]
 [ 33 840]]


## Traditional Method 4: Logistic Regression

In [5]:
# Define the Logistic Regression model and a hyperparameter grid for tuning
logreg = LogisticRegression(max_iter=1000)  # Default Logistic Regression
param_grid = {
    'C': [0.1, 1, 10, 100],  # Regularization parameter
    'solver': ['liblinear', 'lbfgs']  # Solvers
}

# Perform GridSearchCV with validation set
grid_search = GridSearchCV(logreg, param_grid, cv=5, scoring='recall', verbose=1)
grid_search.fit(X_train, y_train)

# Get the best model from the grid search
best_logreg = grid_search.best_estimator_
print(best_logreg)

# Evaluate on the validation set
y_val_pred = best_logreg.predict(X_val)
y_val_pred_prob = best_logreg.predict_proba(X_val)[:, 1]

# Calculate validation metrics
val_accuracy = accuracy_score(y_val, y_val_pred)
val_precision = precision_score(y_val, y_val_pred)
val_recall = recall_score(y_val, y_val_pred)
val_f1 = f1_score(y_val, y_val_pred)
val_conf_matrix = confusion_matrix(y_val, y_val_pred)

# Display validation results
print("\nValidation Results:")
print(f"Accuracy: {val_accuracy:.4f}")
print(f"Precision: {val_precision:.4f}")
print(f"Recall: {val_recall:.4f}")
print(f"F1 Score: {val_f1:.4f}")
print(f"Confusion Matrix:\n{val_conf_matrix}")

# Final evaluation on the test set
y_test_pred = best_logreg.predict(X_test)
y_test_pred_prob = best_logreg.predict_proba(X_test)[:, 1]

# Calculate test metrics
test_accuracy = accuracy_score(y_test, y_test_pred)
test_precision = precision_score(y_test, y_test_pred)
test_recall = recall_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred)
test_conf_matrix = confusion_matrix(y_test, y_test_pred)

# Display final test results
print("\nFinal Test Results:")
print(f"Accuracy: {test_accuracy:.4f}")
print(f"Precision: {test_precision:.4f}")
print(f"Recall: {test_recall:.4f}")
print(f"F1 Score: {test_f1:.4f}")
print(f"Confusion Matrix:\n{test_conf_matrix}")


Fitting 5 folds for each of 8 candidates, totalling 40 fits
LogisticRegression(C=100, max_iter=1000, solver='liblinear')

Validation Results:
Accuracy: 0.9347
Precision: 0.9224
Recall: 0.9465
F1 Score: 0.9343
Confusion Matrix:
[[806  67]
 [ 45 796]]

Final Test Results:
Accuracy: 0.9388
Precision: 0.9486
Recall: 0.9301
F1 Score: 0.9393
Confusion Matrix:
[[798  44]
 [ 61 812]]


## Traditional Method 5: Naives Bayes

In [6]:
# Define the models and hyperparameter grids
models = {
    'GaussianNB': (GaussianNB(), {'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5]}),
    'BernoulliNB': (BernoulliNB(), {'alpha': [0.5, 1.0, 1.5, 2.0], 'binarize': [0.0, 0.5, 1.0]})
}

# Initialize results dictionary
results = {}

# Perform GridSearchCV and evaluate model on validation set function
def grid_search_model(model, param_grid, X_train, X_val, y_train, y_val):
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='recall', verbose=1)
    grid_search.fit(X_train, y_train)
    
    best_model = grid_search.best_estimator_
    print(best_model)
    y_val_pred = best_model.predict(X_val)
    
    accuracy = accuracy_score(y_val, y_val_pred)
    precision = precision_score(y_val, y_val_pred)
    recall = recall_score(y_val, y_val_pred)
    f1 = f1_score(y_val, y_val_pred)
    conf_matrix = confusion_matrix(y_val, y_val_pred)

    return {
        'best_model': best_model,
        'best_params': grid_search.best_params_,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'confusion_matrix': conf_matrix
    }

# Perform GridSearchCV and validate each model
for model_name, (model, param_grid) in models.items():
    print(f"Performing Grid Search on {model_name} with validation set...")
    results[model_name] = grid_search_model(model, param_grid, X_train, X_val, y_train, y_val)

# Test the best models on the test set
for model_name, metrics in results.items():
    best_model = metrics['best_model']
    y_test_pred = best_model.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_test_pred)
    precision = precision_score(y_test, y_test_pred)
    recall = recall_score(y_test, y_test_pred)
    f1 = f1_score(y_test, y_test_pred)
    conf_matrix = confusion_matrix(y_test, y_test_pred)

    # Display final results for the test set
    print(f"\nFinal Test Results for {model_name}:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"Confusion Matrix:\n{conf_matrix}")

Performing Grid Search on GaussianNB with validation set...
Fitting 5 folds for each of 5 candidates, totalling 25 fits
GaussianNB()
Performing Grid Search on BernoulliNB with validation set...
Fitting 5 folds for each of 12 candidates, totalling 60 fits
BernoulliNB(alpha=0.5)

Final Test Results for GaussianNB:
Accuracy: 0.8880
Precision: 0.9316
Recall: 0.8419
F1 Score: 0.8845
Confusion Matrix:
[[788  54]
 [138 735]]

Final Test Results for BernoulliNB:
Accuracy: 0.9213
Precision: 0.9222
Recall: 0.9233
F1 Score: 0.9227
Confusion Matrix:
[[774  68]
 [ 67 806]]
