In [30]:
# Import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_selection import VarianceThreshold
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LassoCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix)
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.base import clone

In [31]:
# Read data
df = pd.read_csv('dataset_phishing.csv')

In [32]:
features = ['shortest_word_path',
 'ratio_intMedia',
 'links_in_tags',
 'nb_hyphens',
 'page_rank',
 'avg_word_path',
 'ratio_extHyperlinks',
 'longest_words_raw',
 'google_index',
 'length_hostname',
 'longest_word_host',
 'domain_registration_length',
 'nb_www',
 'nb_underscore',
 'nb_dots',
 'ratio_extMedia',
 'phish_hints',
 'domain_in_title',
 'web_traffic',
 'safe_anchor',
 'nb_space',
 'shortening_service',
 'ip',
 'domain_age',
 'nb_qm',
 'nb_hyperlinks',
 'nb_slash']

In [33]:
X = df[features]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

le = LabelEncoder()
y = le.fit_transform(df["status"])

# Step 1: Split data into 70% train and 30% temp (validation + test)
X_train, X_temp, y_train, y_temp = train_test_split(X_scaled, y, test_size=0.3, random_state=42, stratify=y)

# Step 2: Split the temp set into 50% validation and 50% test (15% each of the original data)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

## Just Decision Tree

In [34]:
class CustomBoostingClassifier:
    def __init__(self, n_estimators=50, learning_rate=0.1, max_depth=3):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.models = []
        self.model_weights = []

    def fit(self, X, y):
        # Convert class labels to {-1, 1} for binary classification
        y_transformed = np.where(y == 0, -1, 1)
        
        # Initialize with zero predictions
        predictions = np.zeros(len(y))
        
        for i in range(self.n_estimators):
            # Compute residuals as the negative gradient
            residuals = y_transformed - predictions
            
            # Fit a weak learner to the residuals
            model = DecisionTreeRegressor(max_depth=self.max_depth)
            model.fit(X, residuals)
            
            # Predict the residuals
            residual_preds = model.predict(X)
            
            # Update the predictions with the learning rate
            predictions += self.learning_rate * residual_preds
            
            # Store the model and its weight
            self.models.append(model)
            self.model_weights.append(self.learning_rate)
    
    def predict(self, X):
        # Aggregate predictions from all models
        predictions = np.zeros(X.shape[0])
        
        for model, weight in zip(self.models, self.model_weights):
            predictions += weight * model.predict(X)
        
        # Convert predictions back to class labels
        return np.where(predictions >= 0, 1, 0)

# Initialize the custom boosting model
custom_boosting = CustomBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3)

# Fit the custom boosting model on the training data
custom_boosting.fit(X_train, y_train)

# Predict on the validation data
y_val_pred = custom_boosting.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
print("Validation Accuracy with custom boosting:", val_accuracy)
print("Validation Classification Report:\n", classification_report(y_val, y_val_pred))

# Predict on the test data
y_test_pred = custom_boosting.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print("Test Accuracy with custom boosting:", test_accuracy)
print("Test Classification Report:\n", classification_report(y_test, y_test_pred))


Validation Accuracy with custom boosting: 0.941073512252042
Validation Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.94      0.94       873
           1       0.93      0.95      0.94       841

    accuracy                           0.94      1714
   macro avg       0.94      0.94      0.94      1714
weighted avg       0.94      0.94      0.94      1714

Test Accuracy with custom boosting: 0.9498542274052478
Test Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.95      0.95       842
           1       0.95      0.95      0.95       873

    accuracy                           0.95      1715
   macro avg       0.95      0.95      0.95      1715
weighted avg       0.95      0.95      0.95      1715



In [35]:
# Compute performance metrics
accuracy = accuracy_score(y_test, y_test_pred)
precision = precision_score(y_test, y_test_pred)
recall = recall_score(y_test, y_test_pred)
f1 = f1_score(y_test, y_test_pred)
conf_matrix = confusion_matrix(y_test, y_test_pred)

# Display final results for the test set
model_name = "DT Custom Boosting"
print(f"\nFinal Test Results for {model_name}:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Confusion Matrix:\n{conf_matrix}")


Final Test Results for DT Custom Boosting:
Accuracy: 0.9499
Precision: 0.9518
Recall: 0.9496
F1 Score: 0.9507
Confusion Matrix:
[[800  42]
 [ 44 829]]


## Optimised Custom Mix

In [36]:
class EnhancedMixedBoostingClassifier:
    def __init__(self, n_estimators=50, learning_rate=0.1, max_depth=3, model_sequence=None, min_improvement=1e-3, early_stopping_rounds=10):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.initial_learning_rate = learning_rate
        self.max_depth = max_depth
        self.model_sequence = model_sequence if model_sequence else ['tree', 'logistic', 'knn', 'random_forest']
        self.models = []
        self.model_weights = []
        self.min_improvement = min_improvement
        self.early_stopping_rounds = early_stopping_rounds

    def _get_model(self, model_type):
        """Return a model based on the type specified with optimized parameters."""
        if model_type == 'tree':
            return DecisionTreeRegressor(max_depth=self.max_depth, min_samples_split=10, min_samples_leaf=5)
        elif model_type == 'logistic':
            return LogisticRegression(C=1.0, max_iter=200)
        elif model_type == 'knn':
            return KNeighborsRegressor(n_neighbors=5, weights='distance')
        elif model_type == 'random_forest':
            return RandomForestRegressor(n_estimators=50, max_depth=5, random_state=42)
        else:
            raise ValueError(f"Model type {model_type} is not supported")

    def fit(self, X, y, X_val=None, y_val=None):
        # Convert class labels to {-1, 1} for binary classification
        y_transformed = np.where(y == 0, -1, 1)

        # Initialize predictions and setup early stopping
        predictions = np.zeros(len(y))
        no_improvement_count = 0
        best_val_recall = -np.inf

        for i in range(self.n_estimators):
            # Adapt learning rate as we progress
            self.learning_rate = self.initial_learning_rate / np.sqrt(1 + i)

            # Determine the model type for this iteration
            model_type = self.model_sequence[i % len(self.model_sequence)]
            model = self._get_model(model_type)

            # Compute residuals as the negative gradient
            residuals = y_transformed - predictions

            # Fit the model to the residuals
            if model_type == 'logistic':
                # Logistic regression requires binary labels
                model.fit(X, np.where(residuals >= 0, 1, 0))
                residual_preds = model.predict_proba(X)[:, 1] * 2 - 1  # Map to residual space
            else:
                # Decision trees, KNN, and Random Forest directly fit on residuals
                model.fit(X, residuals)
                residual_preds = model.predict(X)

            # Update predictions
            predictions += self.learning_rate * residual_preds

            # Store the model and its weight
            self.models.append(model)
            self.model_weights.append(self.learning_rate)

            # Early stopping check with validation set
            if X_val is not None and y_val is not None:
                y_val_pred = self.predict(X_val)
                val_recall = recall_score(y_val, y_val_pred)
                if val_recall > best_val_recall + self.min_improvement:
                    best_val_recall = val_recall
                    no_improvement_count = 0
                else:
                    no_improvement_count += 1

                # Stop if there's no improvement over several rounds
                if no_improvement_count >= self.early_stopping_rounds:
                    print(f"Early stopping at iteration {i+1} with best validation recall: {best_val_recall}")
                    break

    def predict(self, X):
        # Aggregate predictions from all models
        predictions = np.zeros(X.shape[0])

        for model, weight in zip(self.models, self.model_weights):
            if isinstance(model, LogisticRegression):
                # Logistic regression outputs probabilities; map to residual space
                model_preds = model.predict_proba(X)[:, 1] * 2 - 1
            else:
                # Decision trees, KNN, and Random Forest predict directly
                model_preds = model.predict(X)
            predictions += weight * model_preds

        return np.where(predictions >= 0, 1, 0)


# Define possible sequences and hyperparameters for the search
model_sequences = [
    ['tree', 'logistic', 'knn', 'random_forest'],
    ['tree', 'knn', 'logistic', 'random_forest'],
    ['logistic', 'tree', 'random_forest', 'knn'],
    ['random_forest', 'logistic', 'knn', 'tree'],
    ['random_forest', 'knn', 'tree'],
    ['random_forest', 'tree', 'knn'],
    ['knn', 'tree', 'random_forest'],
    ['knn', 'random_forest', 'tree'],
    ['tree', 'knn', 'random_forest'],
    ['tree', 'random_forest', 'knn'],
    ['logistic', 'knn', 'tree'],
    ['logistic', 'tree', 'knn'],
    ['knn', 'tree', 'logistic'],
    ['knn', 'logistic', 'tree'],
    ['tree', 'knn', 'logistic'],
    ['tree', 'logistic', 'knn'],

]
param_grid = {
    'n_estimators': [50, 100],
    'learning_rate': [0.05, 0.1],
    'max_depth': [3, 5],
    'min_improvement': [1e-3, 1e-4],
    'early_stopping_rounds': [5, 10]
}

# Initialize variables to track the best model and score
best_recall = -np.inf
best_model = None
best_params = {}

# Iterate over all combinations of model sequences and hyperparameters
for sequence in model_sequences:
    for n_estimators in param_grid['n_estimators']:
        for learning_rate in param_grid['learning_rate']:
            for max_depth in param_grid['max_depth']:
                for min_improvement in param_grid['min_improvement']:
                    for early_stopping_rounds in param_grid['early_stopping_rounds']:
                        # Initialize and fit the model with the current parameters
                        model = EnhancedMixedBoostingClassifier(
                            n_estimators=n_estimators,
                            learning_rate=learning_rate,
                            max_depth=max_depth,
                            model_sequence=sequence,
                            min_improvement=min_improvement,
                            early_stopping_rounds=early_stopping_rounds
                        )
                        
                        # Train with validation set for early stopping
                        model.fit(X_train, y_train, X_val=X_val, y_val=y_val)
                        
                        # Predict on validation data and calculate recall
                        y_val_pred = model.predict(X_val)
                        recall = recall_score(y_val, y_val_pred)
                        
                        # Update best model if recall has improved
                        if recall > best_recall:
                            best_recall = recall
                            best_model = model
                            best_params = {
                                'model_sequence': sequence,
                                'n_estimators': n_estimators,
                                'learning_rate': learning_rate,
                                'max_depth': max_depth,
                                'min_improvement': min_improvement,
                                'early_stopping_rounds': early_stopping_rounds
                            }
                            print(f"New best recall: {best_recall} with params: {best_params}")

# Final output
print("\nBest Model Configuration:")
print("Model Sequence:", best_params['model_sequence'])
print("Number of Estimators:", best_params['n_estimators'])
print("Learning Rate:", best_params['learning_rate'])
print("Max Depth:", best_params['max_depth'])
print("Min Improvement:", best_params['min_improvement'])
print("Early Stopping Rounds:", best_params['early_stopping_rounds'])
print("Best Validation Recall:", best_recall)

# Evaluate the best model on the test data
y_test_pred = best_model.predict(X_test)
test_recall = recall_score(y_test, y_test_pred)
print("Test Recall with best model:", test_recall)


Early stopping at iteration 16 with best validation recall: 0.9500594530321046
New best recall: 0.9500594530321046 with params: {'model_sequence': ['tree', 'logistic', 'knn', 'random_forest'], 'n_estimators': 50, 'learning_rate': 0.05, 'max_depth': 3, 'min_improvement': 0.001, 'early_stopping_rounds': 5}
Early stopping at iteration 21 with best validation recall: 0.9500594530321046
Early stopping at iteration 16 with best validation recall: 0.9500594530321046
Early stopping at iteration 21 with best validation recall: 0.9500594530321046
Early stopping at iteration 8 with best validation recall: 0.9464922711058263
Early stopping at iteration 13 with best validation recall: 0.9464922711058263
Early stopping at iteration 8 with best validation recall: 0.9464922711058263
Early stopping at iteration 13 with best validation recall: 0.9464922711058263
Early stopping at iteration 13 with best validation recall: 0.9500594530321046
Early stopping at iteration 18 with best validation recall: 0.95

In [37]:
# Compute performance metrics
accuracy = accuracy_score(y_test, y_test_pred)
precision = precision_score(y_test, y_test_pred)
recall = recall_score(y_test, y_test_pred)
f1 = f1_score(y_test, y_test_pred)
conf_matrix = confusion_matrix(y_test, y_test_pred)

# Display final results for the test set
model_name = "Optimised Mix Custom Boosting"
print(f"\nFinal Test Results for {model_name}:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Confusion Matrix:\n{conf_matrix}")


Final Test Results for Optimised Mix Custom Boosting:
Accuracy: 0.9691
Precision: 0.9756
Recall: 0.9633
F1 Score: 0.9695
Confusion Matrix:
[[821  21]
 [ 32 841]]
