In [5]:
# ==============================================================
# URL PHISHING DETECTION — CONFERENCE-GRADE NOTEBOOK
# ==============================================================

import pandas as pd
import numpy as np
import re
import joblib
import requests
import tldextract
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt

from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix
)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier


In [6]:
def count_digits(s):
    """Return how many characters of ``s`` are digits (per ``str.isdigit``)."""
    return sum(1 for ch in s if ch.isdigit())

def count_letters(s):
    """Return how many characters of ``s`` are alphabetic (per ``str.isalpha``)."""
    return sum(1 for ch in s if ch.isalpha())

def count_special(s): 
    return sum(not c.isalnum() for c in s)

def get_html(url, timeout=7):
    """Fetch ``url`` with a browser-like User-Agent.

    Returns a ``(text, status_code, history)`` tuple taken from the
    ``requests`` response. This is a best-effort helper: if anything goes
    wrong while fetching, it returns ``("", 0, [])`` instead of raising.
    """
    request_headers = {"User-Agent": "Mozilla/5.0"}
    try:
        response = requests.get(url, timeout=timeout, headers=request_headers)
        return response.text, response.status_code, response.history
    except Exception:
        # Deliberate catch-all: callers treat a failed fetch as "no page".
        return "", 0, []


In [7]:
def extract_features(url: str, fetch_html=False) -> dict:
    """Build the feature dictionary for a single URL.

    Lexical features (lengths, character counts and ratios, scheme, domain
    and TLD lengths, ``=`` / ``?`` counts, sub-domain count) are computed from
    the URL string alone. Page features (title, meta description, script /
    stylesheet / image counts, password field, redirect count) are only
    populated when ``fetch_html`` is true; otherwise they are all 0.

    Parameters
    ----------
    url : str
        The URL to featurize.
    fetch_html : bool, default False
        When true, the page is downloaded through ``get_html`` (a network
        request) and parsed with BeautifulSoup.

    Returns
    -------
    dict
        Feature name -> numeric value. The keys ``URLSimilarityIndex``,
        ``CharContinuationRate``, ``TLDLegitimateProb``, ``URLCharProb``,
        ``HasObfuscation`` and ``ObfuscationRatio`` are not computed here and
        are always 0.
    """
    features = {}

    # ---- character statistics over the raw URL ---------------------------
    url_length = len(url)
    ratio_denominator = max(1, url_length)  # avoids ZeroDivisionError on ""

    features["URLLength"] = url_length
    features["NoOfLettersInURL"] = count_letters(url)
    features["NoOfDegitsInURL"] = count_digits(url)

    features["LetterRatioInURL"] = features["NoOfLettersInURL"] / ratio_denominator
    features["DegitRatioInURL"] = features["NoOfDegitsInURL"] / ratio_denominator

    features["NoOfOtherSpecialCharsInURL"] = count_special(url)
    features["SpacialCharRatioInURL"] = (
        features["NoOfOtherSpecialCharsInURL"] / ratio_denominator
    )

    # ---- domain / scheme -------------------------------------------------
    ext = tldextract.extract(url)
    # Join only the non-empty parts so hosts without a public suffix
    # (IP addresses, "localhost") do not get a trailing "." counted.
    domain_full = ".".join(part for part in (ext.domain, ext.suffix) if part)

    features["DomainLength"] = len(domain_full)
    features["TLDLength"] = len(ext.suffix)
    # Require the full scheme separator and ignore case, so "HTTPS://x" counts
    # and "httpsfoo.com" does not.
    features["IsHTTPS"] = 1 if url.lower().startswith("https://") else 0
    features["IsDomainIP"] = 1 if re.match(r"^\d{1,3}(\.\d{1,3}){3}$", ext.domain) else 0

    # ---- optional page download -----------------------------------------
    soup = None
    history = []

    if fetch_html:
        html, status, history = get_html(url)
        soup = BeautifulSoup(html, "html.parser") if html else None

    # ``title.string`` is None when <title> is empty or has nested markup;
    # treat that the same as "no title" instead of relying on an exception.
    title_tag = soup.title if soup is not None else None
    title = (title_tag.string or "").strip() if title_tag is not None else ""
    features["HasTitle"] = 1 if title else 0
    features["Title"] = len(title)

    desc = soup.find("meta", {"name": "description"}) if soup is not None else None
    features["HasDescription"] = 1 if desc is not None and desc.get("content") else 0

    features["NoOfJS"] = len(soup.find_all("script")) if soup is not None else 0
    features["NoOfCSS"] = len(soup.find_all("link", rel="stylesheet")) if soup is not None else 0
    features["NoOfImage"] = len(soup.find_all("img")) if soup is not None else 0
    features["HasPasswordField"] = (
        1 if soup is not None and soup.find("input", {"type": "password"}) else 0
    )
    features["NoOfURLRedirect"] = len(history)

    # Number of dot-separated labels left of the registered domain
    # ("www.example.com" -> 1, "a.b.example.com" -> 2).
    # NOTE(review): assumes the training data counts sub-domains the same
    # way -- confirm against the dataset's NoOfSubDomain column.
    subdomain_count = len(ext.subdomain.split(".")) if ext.subdomain else 0

    features.update({
        # Not computable from a single URL here; kept at 0 for inference.
        "URLSimilarityIndex": 0,
        "CharContinuationRate": 0,
        "TLDLegitimateProb": 0,
        "URLCharProb": 0,
        "NoOfSubDomain": subdomain_count,
        "HasObfuscation": 0,
        "ObfuscationRatio": 0,
        # Simple query-string statistics.
        "NoOfEqualsInURL": url.count("="),
        "NoOfQMarkInURL": url.count("?"),
    })

    return features


In [8]:
# Identifier / free-text columns that are not used as model inputs.
DROP_COLS = ["FILENAME", "URL", "Domain", "TLD", "Title"]

# Load the CSV, drop the unused columns (silently skipping any that are
# absent) and replace missing values with 0.
df = (
    pd.read_csv("data/PhiUSIIL_Phishing_URL_Dataset.csv")
    .drop(columns=DROP_COLS, errors="ignore")
    .fillna(0)
)

# Target vector and feature matrix.
y = df["label"]
X = df.drop(columns="label")

# Force every feature to numeric; values that cannot be parsed become 0.
X = X.apply(pd.to_numeric, errors="coerce").fillna(0)

print("Dataset Shape:", X.shape)


Dataset Shape: (235795, 50)


In [9]:
# Stratified 80/20 hold-out split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [10]:


# Single-step pipeline wrapping the XGBoost classifier; later cells refer to
# this object by the name ``pipeline``.
pipeline = Pipeline(steps=[
    (
        "model",
        XGBClassifier(
            tree_method="hist",
            eval_metric="logloss",
            random_state=42,
            n_estimators=350,
            learning_rate=0.09,
            max_depth=6,
            subsample=0.85,
            colsample_bytree=0.85,
        ),
    ),
])

# Fit on the training split and score the held-out split.
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

for heading, metric in (
    ("XGBoost Accuracy:", accuracy_score),
    ("\nConfusion Matrix:\n", confusion_matrix),
    ("\nClassification Report:\n", classification_report),
):
    print(heading, metric(y_test, y_pred))



XGBoost Accuracy: 1.0

Confusion Matrix:
 [[20189     0]
 [    0 26970]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     20189
           1       1.00      1.00      1.00     26970

    accuracy                           1.00     47159
   macro avg       1.00      1.00      1.00     47159
weighted avg       1.00      1.00      1.00     47159



In [11]:
from sklearn.base import clone

# 5-fold stratified cross-validation of the XGBoost pipeline.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = []

for tr, val in skf.split(X, y):
    # Fit an unfitted copy for each fold. Refitting ``pipeline`` itself would
    # replace the model trained on X_train with the last-fold model (which has
    # seen part of X_test), and that is the object later saved and used for
    # prediction.
    fold_model = clone(pipeline)
    fold_model.fit(X.iloc[tr], y.iloc[tr])
    preds = fold_model.predict(X.iloc[val])
    cv_scores.append(accuracy_score(y.iloc[val], preds))

print("XGBoost CV Accuracy: %.4f ± %.4f" % (np.mean(cv_scores), np.std(cv_scores)))


XGBoost CV Accuracy: 1.0000 ± 0.0000


In [12]:
from sklearn.pipeline import make_pipeline

# Baseline: standardize the features, then fit a logistic regression on the
# training split (``fit`` returns the fitted pipeline itself).
lr = make_pipeline(
    StandardScaler(),
    LogisticRegression(n_jobs=-1, max_iter=1000),
).fit(X_train, y_train)

# Score the baseline on the held-out split.
lr_preds = lr.predict(X_test)

print("\nLogistic Regression Accuracy:", accuracy_score(y_test, lr_preds))
print(classification_report(y_test, lr_preds))



Logistic Regression Accuracy: 0.9998727708390763
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     20189
           1       1.00      1.00      1.00     26970

    accuracy                           1.00     47159
   macro avg       1.00      1.00      1.00     47159
weighted avg       1.00      1.00      1.00     47159



In [13]:
# Second baseline: a 200-tree random forest trained on all cores
# (``fit`` returns the fitted estimator itself).
rf = RandomForestClassifier(
    n_jobs=-1,
    random_state=42,
    n_estimators=200,
).fit(X_train, y_train)

# Score the forest on the held-out split.
rf_preds = rf.predict(X_test)

print("\nRandom Forest Accuracy:", accuracy_score(y_test, rf_preds))
print(classification_report(y_test, rf_preds))



Random Forest Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     20189
           1       1.00      1.00      1.00     26970

    accuracy                           1.00     47159
   macro avg       1.00      1.00      1.00     47159
weighted avg       1.00      1.00      1.00     47159



In [14]:
# Persist the current ``pipeline`` object.
joblib.dump(pipeline, "urlphishing_model.pkl")

# Persist the training column order so inference can rebuild the same layout.
joblib.dump(X.columns.tolist(), "urlfeature_list.pkl")

print("Model and feature list saved successfully.")


Model and feature list saved successfully.


In [15]:
# Column order the model was trained on, as written by the save cell.
# NOTE: joblib.load unpickles the file; only load artifacts you created yourself.
feature_list = joblib.load("urlfeature_list.pkl")

def predict_url(url):
    """Classify a single URL with the in-memory ``pipeline``.

    The URL is featurized with ``extract_features(url, fetch_html=True)``
    (which downloads the page), the resulting row is aligned to
    ``feature_list`` -- any training column that was not produced is filled
    with 0 -- and the row is passed to the model.

    Parameters
    ----------
    url : str
        URL to classify.

    Returns
    -------
    tuple
        ``(verdict, proba)`` where ``verdict`` is ``"Legitimate"`` or
        ``"Phishing"`` and ``proba`` is the model's class-probability row
        (index 0 = phishing, index 1 = legitimate).
    """
    feats = extract_features(url, fetch_html=True)

    # Align to the training layout in one step: keep the columns the model
    # knows, in the trained order, and fill the ones we could not compute.
    df_test = pd.DataFrame([feats]).reindex(columns=feature_list, fill_value=0)
    df_test = df_test.apply(pd.to_numeric, errors="coerce").fillna(0)

    pred = pipeline.predict(df_test)[0]
    proba = pipeline.predict_proba(df_test)[0]

    # In the PhiUSIIL dataset label 1 is the legitimate class and label 0 is
    # phishing (the class supports printed during evaluation match the
    # documented legitimate and phishing counts), so a positive prediction
    # means "Legitimate".
    return ("Legitimate" if pred == 1 else "Phishing"), proba
