# 🪝 Multi‑Source Phishing URL Classifier (URL extractor edition)

Each data source supplies **four pieces**:
1. **path** to the file.
2. **loader** – function that turns the file into a DataFrame.
3. **url_extractor** – function that returns a `pd.Series` of URLs from that DataFrame.
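4. **label_extractor** – function that returns a binary phishing label (`1`/`True` = phish, `0`/`False` = benign), either one value per row or a single scalar that applies to every row of that source.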

This makes heterogeneous schemas painless: you explicitly say where the URL and label live for every source.

---

In [5]:
#!pip install pandas scikit-learn tldextract pyarrow tqdm  # uncomment if needed
#!pip install tldextract
#!pip install -q tldextract pandas scikit-learn pyarrow tqdm

import json, ipaddress, pathlib
from typing import Callable, Union, List, Tuple

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from tqdm import tqdm
import tldextract
import base64, binascii
from urllib.parse import urlparse, parse_qs
from sklearn.preprocessing import OneHotEncoder          # <-- one-hot
from sklearn.feature_extraction import FeatureHasher     # <-- hashing trick
from sklearn.ensemble import RandomForestClassifier      # <-- your model

!pip install --quiet m2cgen

pd.set_option('display.max_columns', None)

ModuleNotFoundError: No module named 'tldextract'

In [None]:
# 🔗 DATA SOURCES -----------------------------------------------------------------
# Each tuple: (path, loader, url_extractor, label_extractor)

def load_json(path: Union[str, pathlib.Path]):
    """Read a UTF-8 JSON file and return its parsed content as a DataFrame."""
    with open(path, 'r', encoding='utf-8') as f:
        records = json.load(f)
    return pd.DataFrame(records)

def phishtank_urls(df: pd.DataFrame):
    """URL extractor for the PhishTank source: the frame's ``'url'`` column."""
    url_column = df['url']
    return url_column

def phishtank_label(df: pd.DataFrame):
    """Label extractor for the PhishTank source.

    Returns the constant ``True`` (phish) regardless of the frame's content.
    """
    # Per-row alternative, currently disabled:
    # return df['verified'].astype(str).str.lower().isin({'yes','true','1'})
    is_phish = True
    return is_phish

# Generic CSV loader (shared by the phish-score and benign sources)
def load_csv(path):
    """Read the CSV file at `path` into a DataFrame using pandas' defaults."""
    frame = pd.read_csv(path)
    return frame

def phish_score_url(df: pd.DataFrame):
    """URL extractor for the phish-score CSV: the frame's ``'URL'`` column."""
    url_column = df['URL']  # adjust to your column name
    return url_column

def phish_score_label(df: pd.DataFrame):
    """Label extractor for the phish-score CSV.

    Returns the constant ``True`` (phish) regardless of the frame's content.
    """
    # Score-threshold alternative, currently disabled:
    # return df['Score'].astype(int) > 4
    is_phish = True
    return is_phish

def benign_urls(df: pd.DataFrame):
    """URL extractor for the benign shards: the frame's ``'url'`` column."""
    return df['url']


def benign_label(df: pd.DataFrame):
    """Label extractor for the benign shards: constant ``False`` (benign)."""
    return False


# Benign samples are stored as numbered CSV shards:
# benign_dataframe_0.csv … benign_dataframe_{N_BENIGN_SHARDS - 1}.csv
BENIGN_DIR = '/content/drive/MyDrive/Benign Samples'
N_BENIGN_SHARDS = 21

# Each entry: (path, loader, url_extractor, label_extractor)
SOURCES: List[Tuple[str, Callable, Callable, Callable]] = [
    ('online-valid.json', load_json, phishtank_urls, phishtank_label),
    ('phish_score.csv', load_csv, phish_score_url, phish_score_label),
    # One entry per benign shard, generated instead of copy-pasted
    *(
        (f'{BENIGN_DIR}/benign_dataframe_{i}.csv', load_csv, benign_urls, benign_label)
        for i in range(N_BENIGN_SHARDS)
    ),
]
print(f'Registered {len(SOURCES)} data source(s).')

Registered 23 data source(s).


In [None]:
# Mount Google Drive at /content/drive; the benign-sample paths in SOURCES
# point below this mount point.
# NOTE(review): google.colab is presumably only importable inside Colab — confirm
# before running this notebook elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# 📂 Load & merge --------------------------------------------------------------
# Build one two-column (url, label) frame per registered source
frames = []
for path, loader, url_fn, label_fn in SOURCES:
    df = loader(path)
    urls, labels = url_fn(df), label_fn(df)
    # a scalar label is broadcast across every URL of the source
    frame = pd.DataFrame({'url': urls, 'label': labels})
    frames.append(frame)

# Stack all sources, then drop rows without a URL and exact duplicate rows
df_all = pd.concat(frames, ignore_index=True)
df_all = df_all.dropna(subset=['url'])
df_all = df_all.drop_duplicates()
print(f'Combined dataset: {df_all.shape[0]:,} URLs')
df_all.head()

In [None]:
# ✨ Feature engineering -------------------------------------------------------
def url_length(u):
    """Number of characters in the URL string."""
    n_chars = len(u)
    return n_chars

def num_dashes(u):
    """Count the ``-`` characters anywhere in the URL string."""
    dash = '-'
    return u.count(dash)

def num_dots(u):
    """Count the ``.`` characters anywhere in the URL string."""
    dot = '.'
    return u.count(dot)

def num_subdirs(u):
    """Count the ``/`` characters in the path component of the URL."""
    path = urlparse(u).path
    return path.count('/')

def has_https(u):
    """Return 1 if the URL uses the ``https`` scheme (case-insensitive), else 0.

    The full ``https://`` prefix is required, so a scheme-less URL whose host
    merely begins with "https" (e.g. ``https-login.example.com``) is not
    counted as HTTPS.
    """
    return int(u.lower().startswith('https://'))

def domain_name(u):
    """Return the ``domain`` field of ``tldextract.extract(u)``."""
    parts = tldextract.extract(u)
    return parts.domain

def domain_length(u):
    """Length of the string returned by `domain_name` for this URL."""
    name = domain_name(u)
    return len(name)

def tld(u):
    """Return the ``suffix`` field of ``tldextract.extract(u)``."""
    parts = tldextract.extract(u)
    return parts.suffix

def sub_domain(u):
    """Return the ``subdomain`` field of ``tldextract.extract(u)``."""
    parts = tldextract.extract(u)
    return parts.subdomain

def starts_with_tld(u):
    """Return 1 if any sub-domain label or the domain starts with com/org/net.

    The labels checked are the dot-separated pieces of `sub_domain(u)` plus
    `domain_name(u)`.
    """
    prefixes = ('com', 'org', 'net')
    labels = sub_domain(u).split('.')
    labels.append(domain_name(u))
    # str.startswith accepts a tuple and matches if any prefix matches
    return int(any(label.startswith(prefixes) for label in labels))

def num_digits(u):
    """Count the characters of the URL for which ``str.isdigit`` is true."""
    digits = [ch for ch in u if ch.isdigit()]
    return len(digits)

def num_letters(u):
    """Count the characters of the URL for which ``str.isalpha`` is true."""
    letters = [ch for ch in u if ch.isalpha()]
    return len(letters)

def params_length(u):
    """Length of the raw query string (the part after ``?``) of the URL."""
    query = urlparse(u).query
    return len(query)

def num_params(u):
    """Number of keys in ``parse_qs`` of the URL's query string.

    Uses ``parse_qs`` defaults, so repeated keys count once and keys with a
    blank value are not counted.
    """
    query = urlparse(u).query
    parsed = parse_qs(query)
    return len(parsed.keys())

def _looks_like_base64(s: str) -> bool:
    """
    True if the entire string is valid Base-64.
    - Must be non-empty and length a multiple of 4.
    - `validate=True` rejects non-alphabet chars.
    """
    try:
        base64.b64decode(s, validate=True)
        return True
    except Exception:
        return False


def has_b64_param(u: str) -> int:
    """
    Return 1 if any query-string value passes `_looks_like_base64`; else 0.
    Blank values are kept when parsing, so they are checked too.
    Example: https://example.com/?img=aGVsbG8=  → 1
    """
    query = urlparse(u).query
    for values in parse_qs(query, keep_blank_values=True).values():
        for value in values:
            if _looks_like_base64(value):
                return 1
    return 0

def _looks_url(s):
  indicators = ['com', 'org', 'net', 'http', 'www']
  return any(indicator in s for indicator in indicators)

def has_url_in_params(u: str) -> int:
    """Return 1 if any query-string value passes `_looks_url`; else 0.

    Blank values are kept when parsing the query string.
    """
    query = urlparse(u).query
    for values in parse_qs(query, keep_blank_values=True).values():
        for value in values:
            if _looks_url(value):
                return 1
    return 0


def uses_ip_address(u):
    """Return 1 if the URL's host is a literal IPv4/IPv6 address, else 0.

    A URL with no parsable host, a non-IP host, or a malformed host that
    `urlparse` itself rejects (e.g. an unbalanced ``[``) yields 0 instead of
    raising.
    """
    try:
        # urlparse raises ValueError on malformed bracketed hosts, so it must
        # sit inside the try block alongside the IP check.
        host = urlparse(u).hostname or ''
        ipaddress.ip_address(host)
    except ValueError:
        return 0
    return 1

# Registry of numeric feature extractors: column name -> function(url) -> number.
# Insertion order determines the column order of the numeric feature matrix.
FEATURES = dict(
    # ── length / composition ────────────────────────────────────────────────
    url_length=url_length,
    num_dashes=num_dashes,
    num_dots=num_dots,
    num_subdirs=num_subdirs,
    num_digits=num_digits,
    num_letters=num_letters,
    domain_length=domain_length,
    params_length=params_length,
    num_params=num_params,

    # ── boolean / flag features ─────────────────────────────────────────────
    has_https=has_https,
    uses_ip_address=uses_ip_address,
    starts_with_tld=starts_with_tld,
    has_b64_param=has_b64_param,
    has_url_in_params=has_url_in_params,
)

In [None]:
# ---------------------------------------------------------------------
def build_X(df, funcs):
    """Apply every feature function to ``df['url']`` and return a DataFrame.

    `funcs` maps column name -> callable taking one URL; the result has one
    column per entry, in `funcs` order, and a fresh 0..n-1 index.
    """
    urls = list(df['url'])
    columns = {}
    for name, f in funcs.items():
        columns[name] = [f(u) for u in urls]
    return pd.DataFrame(columns)

# ---------------------------------------------------------------------
# assume df_all with columns ['url','label'] already exists
# Width of each hashed block (domain and sub-domain are hashed separately)
N_HASH_FEATURES = 1024


def _hashed_frame(hasher, token_lists, prefix, index):
    """Hash one token list per row and return the dense result as a DataFrame.

    Columns are named ``{prefix}_0 … {prefix}_{k-1}`` where k is the hasher's
    output width; rows are labelled with `index`.
    """
    dense = hasher.transform(token_lists).toarray()
    names = [f"{prefix}_{i}" for i in range(dense.shape[1])]
    return pd.DataFrame(dense, index=index, columns=names)


# assume df_all with columns ['url','label'] already exists
df_all = df_all.reset_index(drop=True)
X_num = build_X(df_all, FEATURES)

# raw categorical columns
cat_df = pd.DataFrame({
    "tld":         [tld(u)         for u in df_all["url"]],
    "domain_name": [domain_name(u) for u in df_all["url"]],
    "sub_domain":  [sub_domain(u)  for u in df_all["url"]],
})

# 1) one-hot for TLD -----------------------------------------------------------
ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
X_tld = pd.DataFrame(
    ohe.fit_transform(cat_df[["tld"]]),
    index=df_all.index,
    columns=ohe.get_feature_names_out(["tld"]),
)

# 2) hash *domain_name* into N_HASH_FEATURES dims -------------------------------
hasher_dom = FeatureHasher(n_features=N_HASH_FEATURES, input_type="string")
dom_iter   = [[d] for d in cat_df["domain_name"].astype(str)]
X_dom_hash = _hashed_frame(hasher_dom, dom_iter, "dom_hash", df_all.index)

# 3) hash *sub_domain* into N_HASH_FEATURES dims --------------------------------
hasher_sub = FeatureHasher(n_features=N_HASH_FEATURES, input_type="string")
# an empty sub-domain is hashed as the literal token "EMPTY"
sub_iter   = [[s if s else "EMPTY"] for s in cat_df["sub_domain"].astype(str)]
X_sub_hash = _hashed_frame(hasher_sub, sub_iter, "sub_hash", df_all.index)

# 4) final matrix -------------------------------------------------------------
X_full = pd.concat([X_num, X_tld, X_dom_hash, X_sub_hash], axis=1)
y      = df_all["label"]
print("Final shape:", X_full.shape)   # (rows, numeric + one-hot + 2 * N_HASH_FEATURES hashed)


In [None]:
# ⚙️  Algorithms ----------------------------------------------------------------
# Candidate estimators, keyed by display name. Each gets a fixed random_state
# (42, matching the splits elsewhere in this notebook) so reruns give the same
# scores.
MODELS = {
    #'LogReg': LogisticRegression(max_iter=1000),
    'RandomForest': RandomForestClassifier(n_estimators=200, random_state=42),
    'GradientBoosting': GradientBoostingClassifier(random_state=42)
    #'SVM_rbf': SVC(kernel='rbf', probability=True),
}

In [None]:
# 🧪 Hold-out evaluation ------------------------------------------------------
from sklearn.metrics import (
    accuracy_score, f1_score, roc_auc_score,
    confusion_matrix
)
from sklearn.model_selection import train_test_split  # ← make sure this is imported

def evaluate(model, X_train, X_test, y_train, y_test):
    """Fit `model` on the training split and score it on the test split.

    Returns a dict with keys accuracy, f1, roc_auc, tpr, fpr. `roc_auc` is
    NaN when the model has no `predict_proba`; `tpr` / `fpr` are 0 when their
    denominator is zero. The model is fitted in place.
    """
    model.fit(X_train, y_train)
    preds = model.predict(X_test)

    if hasattr(model, "predict_proba"):
        probs = model.predict_proba(X_test)[:, 1]   # second probability column
    else:
        probs = None

    # rows = true class (0, 1), columns = predicted class (0, 1)
    (tn, fp), (fn, tp) = confusion_matrix(y_test, preds, labels=[0, 1])
    actual_pos = tp + fn
    actual_neg = fp + tn

    scores = {
        "accuracy": accuracy_score(y_test, preds),
        "f1": f1_score(y_test, preds),
    }
    scores["roc_auc"] = np.nan if probs is None else roc_auc_score(y_test, probs)
    scores["tpr"] = tp / actual_pos if actual_pos else 0
    scores["fpr"] = fp / actual_neg if actual_neg else 0
    return scores

# Stratified 70/30 hold-out split of the full feature matrix (X_full, not X)
X_tr, X_te, y_tr, y_te = train_test_split(
    X_full,
    y,
    test_size=0.30,
    stratify=y,
    random_state=42,
)

# Score every registered model on the same split
results = {}
for name, mdl in MODELS.items():
    results[name] = evaluate(mdl, X_tr, X_te, y_tr, y_te)

# one row per model, one column per metric
pd.DataFrame(results).T


In [None]:
# 📊 5-fold cross-validation ---------------------------------------------------
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

rows = []
for fold, (tr, te) in enumerate(skf.split(X_full, y)):
    # Fold-local names: the loop must not overwrite X_tr / X_te / y_tr / y_te,
    # which other cells assign and read.
    X_fold_tr, X_fold_te = X_full.iloc[tr], X_full.iloc[te]
    y_fold_tr, y_fold_te = y.iloc[tr], y.iloc[te]

    for name, mdl in MODELS.items():
        rows.append({
            "model": name,
            "fold": fold,
            # accuracy, f1, roc_auc, tpr, fpr
            **evaluate(mdl, X_fold_tr, X_fold_te, y_fold_tr, y_fold_te)
        })

cv_df = pd.DataFrame(rows)

# mean of each metric across the folds, per model
metrics = ["accuracy", "f1", "roc_auc", "tpr", "fpr"]
cv_df.groupby("model")[metrics].mean()



In [None]:
# 🌳 Feature importance --------------------------------------------------------
from sklearn.ensemble import RandomForestClassifier
import pandas as pd

# Fit a 300-tree forest on the full data set to rank features
rf = RandomForestClassifier(n_estimators=300, random_state=42)
rf.fit(X_full, y)

# importance per column, largest first
importances = pd.Series(rf.feature_importances_, index=X_full.columns)
importances = importances.sort_values(ascending=False)

importances.head(20)    # show the top 20

## 🚀 Next Steps
* Add more `(path, loader, url_extractor, label_extractor)` tuples to `SOURCES`.
* Engineer more `FEATURES`.
* Try additional algorithms or hyper‑parameter tuning.
* Address class imbalance if needed.


In [None]:
# Depth of every fitted tree in `rf`; report the largest
max_depths = [fitted_tree.tree_.max_depth for fitted_tree in rf.estimators_]
deepest = max(max_depths)
print("deepest tree:", deepest)


In [None]:
%pip install sklearn-porter
%pip install m2cgen
%pip install skl2onnx==1.14 onnx==1.14.1 onnxmltools==1.11


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Found existing installation: onnx 1.14.1
Uninstalling onnx-1.14.1:
  Successfully uninstalled onnx-1.14.1
Found existing installation: skl2onnx 1.14.0
Uninstalling skl2onnx-1.14.0:
  Successfully uninstalled skl2onnx-1.14.0
Note: you may need to restart the kernel to use updated packages.
Collecting skl2onnx==1.14
  Using cached skl2onnx-1.14.0-py2.py3-none-any.whl.metadata (2.1 kB)
Collecting onnx==1.14.1
  Using cached onnx-1.14.1-cp310-cp310-win_amd64.whl.metadata (15 kB)
Collecting onnxmltools==1.11
  Downloading onnxmltools-1.11.0-py2.py3-none-any.whl.metadata (8.3 kB)
Using cached skl2onnx-1.14.0-py2.py3-none-any.whl (294 kB)
Using cached onnx-1.14.1-cp310-cp310-win_amd64.whl (13.3 MB)
Downloading onnxmltools-1.11.0-py2.py3-none-any.whl (302 kB)
Installing collected packages: onnx, skl2onnx, onnxmltools
Successfully installed onnx-1.14.1 onnxmltools-1.11.0 skl2onnx-1.14.0
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn_porter import Porter
import m2cgen as m2c, sys
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType
import joblib

# Raise Python's recursion limit before exporting
# NOTE(review): presumably needed by the model exporters — confirm.
sys.setrecursionlimit(20000)                # still raise the limit

# Depth-limited forest intended for export
# NOTE(review): X_tr / y_tr come from an earlier cell; this fit uses whatever
# those names held when this cell ran — confirm it is the intended training split.
rf_shallow = RandomForestClassifier(
    n_estimators=200,        # you can keep 300 if you like
    max_depth=12,           # <- key change: limit depth
    random_state=42,
    n_jobs=-1
)
rf_shallow.fit(X_tr, y_tr)


# Convert model to JavaScript
porter = Porter(rf_shallow, language='js')
js_code = porter.export(embed_data=True)

# Save to file
with open('rf_shallow.js', 'w') as f:
    f.write(js_code)

# joblib.dump(rf_shallow, "rf_model.pkl")

# model = joblib.load("rf_model.pkl")

# # Set input type (adjust input dimension)
# n_features = X_tr.shape[1]
# initial_type = [('float_input', FloatTensorType([None, n_features]))]

# # Convert to ONNX
# onnx_model = convert_sklearn(model, initial_types=initial_type)

# # Save ONNX model
# with open("rf_model.onnx", "wb") as f:
#     f.write(onnx_model.SerializeToString())

# I'm commenting to try another porter
# js_model_code = m2c.export_to_javascript(rf_shallow,
#                                          function_name="predictRF")

# with open("rf_model.js", "w") as f:
#     f.write(js_model_code)

# print("JS model saved, size ≈", len(js_model_code)//1024, "KB")

# print(list(ohe.get_feature_names_out(['tld'])))




ImportError: cannot import name 'split_complex_to_pairs' from 'onnx.helper' (c:\Users\omer penso\AppData\Local\Programs\Python\Python310\lib\site-packages\onnx\helper.py)