In [6]:
# Import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_selection import VarianceThreshold
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LassoCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix)
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, accuracy_score


In [7]:
# Read data
df = pd.read_csv('dataset_phishing.csv')

In [8]:
features = ['shortest_word_path',
 'ratio_intMedia',
 'links_in_tags',
 'nb_hyphens',
 'page_rank',
 'avg_word_path',
 'ratio_extHyperlinks',
 'longest_words_raw',
 'google_index',
 'length_hostname',
 'longest_word_host',
 'domain_registration_length',
 'nb_www',
 'nb_underscore',
 'nb_dots',
 'ratio_extMedia',
 'phish_hints',
 'domain_in_title',
 'web_traffic',
 'safe_anchor',
 'nb_space',
 'shortening_service',
 'ip',
 'domain_age',
 'nb_qm',
 'nb_hyperlinks',
 'nb_slash']

In [9]:
X = df[features]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

le = LabelEncoder()
y = le.fit_transform(df["status"])

# Step 1: Split data into 70% train and 30% temp (validation + test)
X_train, X_temp, y_train, y_temp = train_test_split(X_scaled, y, test_size=0.3, random_state=42, stratify=y)

# Step 2: Split the temp set into 50% validation and 50% test (15% each of the original data)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [11]:
# Define the base estimators to use
base_estimators = [
    DecisionTreeClassifier(),                   # Decision Tree
    LogisticRegression(max_iter=1000),          # Logistic Regression
    SVC(probability=True),                      # Support Vector Classifier
    GaussianNB()                                # Naive Bayes
]

adaboost = AdaBoostClassifier()

# Define the parameter grid to search
param_grid = {
    'estimator': base_estimators,                 # Different base estimators
    'n_estimators': [50, 100, 150],               # Number of estimators
    'learning_rate': [0.01, 0.1, 1, 10]           # Learning rate
}

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=adaboost, param_grid=param_grid, 
                           scoring='accuracy', cv=5, n_jobs=-1, verbose=1)

# Fit the model on the training data
grid_search.fit(X_train, y_train)

# Print the best parameters and score
print("Best Parameters:", grid_search.best_params_)
print("Best Training Score:", grid_search.best_score_)

# Evaluate on the validation set
best_adaboost = grid_search.best_estimator_
y_val_pred = best_adaboost.predict(X_val)
print("\nValidation Accuracy:", accuracy_score(y_val, y_val_pred))
print("\nClassification Report (Validation):\n", classification_report(y_val, y_val_pred))

# Evaluate on the test set
y_test_pred = best_adaboost.predict(X_test)
print("\nTest Accuracy:", accuracy_score(y_test, y_test_pred))
print("\nClassification Report (Test):\n", classification_report(y_test, y_test_pred))


Fitting 5 folds for each of 48 candidates, totalling 240 fits




Best Parameters: {'estimator': GaussianNB(), 'learning_rate': 0.1, 'n_estimators': 50}
Best Training Score: 0.9392571049344159

Validation Accuracy: 0.9381563593932322

Classification Report (Validation):
               precision    recall  f1-score   support

           0       0.95      0.93      0.94       873
           1       0.93      0.95      0.94       841

    accuracy                           0.94      1714
   macro avg       0.94      0.94      0.94      1714
weighted avg       0.94      0.94      0.94      1714


Test Accuracy: 0.9381924198250728

Classification Report (Test):
               precision    recall  f1-score   support

           0       0.93      0.95      0.94       842
           1       0.95      0.93      0.94       873

    accuracy                           0.94      1715
   macro avg       0.94      0.94      0.94      1715
weighted avg       0.94      0.94      0.94      1715



In [12]:
# Compute performance metrics
accuracy = accuracy_score(y_test, y_test_pred)
precision = precision_score(y_test, y_test_pred)
recall = recall_score(y_test, y_test_pred)
f1 = f1_score(y_test, y_test_pred)
conf_matrix = confusion_matrix(y_test, y_test_pred)

# Display final results for the test set
model_name = "AdaBoost"
print(f"\nFinal Test Results for {model_name}:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Confusion Matrix:\n{conf_matrix}")


Final Test Results for AdaBoost:
Accuracy: 0.9382
Precision: 0.9506
Recall: 0.9267
F1 Score: 0.9385
Confusion Matrix:
[[800  42]
 [ 64 809]]
