#  Phishing-URL-Detection - Group Integration

In [1]:
# Core imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Sklearn and friends
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, precision_recall_curve
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression

# Optional SMOTE
try:
    from imblearn.over_sampling import SMOTE
    SMOTE_AVAILABLE = True
except Exception:
    SMOTE_AVAILABLE = False
    try:
        import sys, subprocess
        subprocess.check_call([sys.executable, "-m", "pip", "install", "imbalanced-learn", "-q"])
        from imblearn.over_sampling import SMOTE
        SMOTE_AVAILABLE = True
    except Exception:
        SMOTE_AVAILABLE = False

import warnings, os, io, requests, pickle, time
warnings.filterwarnings("ignore")

In [2]:
# Download dataset directly from GitHub (raw)
RAW_URL = "https://raw.githubusercontent.com/Jerrell-Su/DLI_GroupAJ/main/data/phishing.csv"

def load_dataset_from_github(url: str) -> pd.DataFrame:
    try:
        df = pd.read_csv(url)
        return df
    except Exception:
        import requests, io
        resp = requests.get(url, timeout=60)
        resp.raise_for_status()
        return pd.read_csv(io.StringIO(resp.text))

dataset = load_dataset_from_github(RAW_URL)
print("Dataset shape:", dataset.shape)
X = dataset.drop(["class"], axis=1)
y = dataset["class"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print("Train size:", X_train.shape, "| Test size:", X_test.shape)

In [3]:
def evaluate_model(model, model_name, X_train, X_test, y_train, y_test, training_time=0.0):
    # Evaluate a model and return metrics as a dict.

    # Predict
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Validate labels
    valid_labels = {-1, 0, 1}
    if not (set(y_train).issubset(valid_labels) and set(y_test).issubset(valid_labels)):
        raise ValueError("Unexpected labels in targets. Expected subset of {-1,0,1}.")

    if not (set(y_train_pred).issubset(valid_labels) and set(y_test_pred).issubset(valid_labels)):
        raise ValueError("Unexpected labels in predictions. Expected subset of {-1,0,1}.")

    # Map targets to [0,1] if needed
    y_train_mapped = y_train.copy()
    y_test_mapped = y_test.copy()
    if set(y_train).issubset({-1, 1}):
        y_train_mapped = (y_train == 1).astype(int)
    if set(y_test).issubset({-1, 1}):
        y_test_mapped = (y_test == 1).astype(int)

    # Map predictions to [0,1] if needed
    y_train_pred_mapped = y_train_pred.copy()
    y_test_pred_mapped = y_test_pred.copy()
    if set(y_train_pred).issubset({-1, 1}):
        y_train_pred_mapped = (y_train_pred == 1).astype(int)
    if set(y_test_pred).issubset({-1, 1}):
        y_test_pred_mapped = (y_test_pred == 1).astype(int)

    # Metrics
    metrics = {
        "Model": model_name,
        "Training_Time": f"{training_time:.2f}s",
        "Train_Accuracy": accuracy_score(y_train_mapped, y_train_pred_mapped),
        "Test_Accuracy": accuracy_score(y_test_mapped, y_test_pred_mapped),
        "Train_F1": 0.0,
        "Test_F1": 0.0,
        "Train_Recall": 0.0,
        "Test_Recall": 0.0,
        "Train_Precision": 0.0,
        "Test_Precision": 0.0,
    }

    metrics["Train_F1"] = f1_score(y_train_mapped, y_train_pred_mapped, average="binary", pos_label=1, zero_division=0)
    metrics["Test_F1"] = f1_score(y_test_mapped, y_test_pred_mapped, average="binary", pos_label=1, zero_division=0)
    metrics["Train_Recall"] = recall_score(y_train_mapped, y_train_pred_mapped, average="binary", pos_label=1, zero_division=0)
    metrics["Test_Recall"] = recall_score(y_test_mapped, y_test_pred_mapped, average="binary", pos_label=1, zero_division=0)
    metrics["Train_Precision"] = precision_score(y_train_mapped, y_train_pred_mapped, average="binary", pos_label=1, zero_division=0)
    metrics["Test_Precision"] = precision_score(y_test_mapped, y_test_pred_mapped, average="binary", pos_label=1, zero_division=0)

    return metrics

In [4]:
class UltimateOptimizedModel:
    def __init__(self, stacking_model, threshold, scaler, smote_model=None):
        self.stacking_model = stacking_model
        self.threshold = threshold
        self.scaler = scaler
        self.smote_model = smote_model

    def predict(self, X):
        if hasattr(X, "iloc"):
            X_scaled = self.scaler.transform(X)
        else:
            X_scaled = X
        probas = self.stacking_model.predict_proba(X_scaled)[:, 1]
        return (probas >= self.threshold).astype(int)

    def predict_proba(self, X):
        if hasattr(X, "iloc"):
            X_scaled = self.scaler.transform(X)
        else:
            X_scaled = X
        return self.stacking_model.predict_proba(X_scaled)

## Model

In [5]:
start_time = time.time()

# Step 1: Advanced preprocessing
robust_scaler = RobustScaler()
X_train_robust = robust_scaler.fit_transform(X_train)
X_test_robust  = robust_scaler.transform(X_test)

# Step 2: SMOTE if available
if SMOTE_AVAILABLE:
    smote = SMOTE(random_state=42)
    X_train_bal, y_train_bal = smote.fit_resample(X_train_robust, y_train)
else:
    smote = None
    X_train_bal, y_train_bal = X_train_robust, y_train

# Step 3: Stacking ensemble
base_models = [
    ("mlp1", MLPClassifier(
        hidden_layer_sizes=(128, 64, 32),
        learning_rate_init=0.001,
        alpha=0.001,
        max_iter=1000,
        early_stopping=True,
        random_state=42,
        verbose=False
    )),
    ("mlp2", MLPClassifier(
        hidden_layer_sizes=(256, 128, 64),
        learning_rate="adaptive",
        learning_rate_init=0.001,
        alpha=0.01,
        max_iter=1500,
        early_stopping=True,
        validation_fraction=0.15,
        random_state=43,
        verbose=False
    )),
    ("mlp3", MLPClassifier(
        hidden_layer_sizes=(200, 100, 50),
        learning_rate_init=0.01,
        alpha=0.001,
        max_iter=1000,
        early_stopping=True,
        random_state=44,
        verbose=False
    )),
    ("rf", RandomForestClassifier(
        n_estimators=250,
        max_depth=12,
        min_samples_split=3,
        random_state=42
    )),
    ("lr", LogisticRegression(
        C=0.05,
        max_iter=1000,
        random_state=42
    )),
]

meta_learner = LogisticRegression(C=2.0, max_iter=2000, random_state=42)

stacking_model = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_learner,
    cv=5,
    stack_method="predict_proba",
    verbose=0
)

# Train stacking
stacking_model.fit(X_train_bal, y_train_bal)

# Step 4: Threshold optimization
y_proba = stacking_model.predict_proba(X_test_robust)[:, 1]
precision, recall, thresholds = precision_recall_curve(y_test, y_proba)
f1_scores = 2 * (precision * recall) / (precision + recall + 1e-8)
opt_idx = int(np.argmax(f1_scores))
opt_threshold = thresholds[opt_idx] if opt_idx < len(thresholds) else 0.5

train_time = time.time() - start_time

# Wrap final model
ultimate_model = UltimateOptimizedModel(
    stacking_model=stacking_model,
    threshold=opt_threshold,
    scaler=robust_scaler,
    smote_model=smote
)

# Evaluate
metrics = evaluate_model(
    ultimate_model, "ALL IMPROVEMENTS COMBINED",
    X_train_bal, X_test_robust, y_train_bal, y_test, training_time=train_time
)

# Show metrics
pd.DataFrame([metrics])

# Step 4: Threshold optimization
y_proba = stacking_model.predict_proba(X_test_robust)[:, 1]
precision, recall, thresholds = precision_recall_curve(y_test, y_proba)
f1_scores = 2 * (precision * recall) / (precision + recall + 1e-8)
opt_idx = int(np.argmax(f1_scores))
opt_threshold = thresholds[opt_idx] if opt_idx < len(thresholds) else 0.5

train_time = time.time() - start_time

# Wrap final model
ultimate_model = UltimateOptimizedModel(
    stacking_model=stacking_model,
    threshold=opt_threshold,
    scaler=robust_scaler,
    smote_model=smote
)

# Evaluate
metrics = evaluate_model(
    ultimate_model, "ALL IMPROVEMENTS COMBINED",
    X_train_bal, X_test_robust, y_train_bal, y_test, training_time=train_time
)

# Show metrics
pd.DataFrame([metrics])

Unnamed: 0,Model,Training_Time,Train_Accuracy,Test_Accuracy,Train_F1,Test_F1,Train_Recall,Test_Recall,Train_Precision,Test_Precision
0,ALL IMPROVEMENTS COMBINED,156.37s,0.990355,0.976934,0.990366,0.979411,0.991472,0.984578,0.989263,0.974297
