In [None]:
from torch.utils.data import TensorDataset, DataLoader
import torch

In [None]:
# GPU CHECK
# -----------------------------------------
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

In [None]:
"""
Phishing Link Detection - End-to-End Pipeline
File: phishing_detection_pipeline.py

Usage:
1. Place the dataset CSV in the same folder as this script and name it `phishing_dataset.csv` (or change the path below).
2. Run training & evaluation:
   python phishing_detection_pipeline.py --train
3. Export trained model:
   python phishing_detection_pipeline.py --train --save_model
4. Run a simple Flask API to serve predictions (after training):
   python phishing_detection_pipeline.py --serve

What this pipeline does:
- Loads a processed CSV of URL-based features (41 features + target)
- Basic EDA checks (shape, missing values, class balance)
- Optional feature selection
- Train/test split, scaling where needed
- Trains multiple classifiers (LogisticRegression, RandomForest, XGBoost if available)
- Evaluates using accuracy, precision, recall, F1, ROC-AUC
- Saves the best model to disk (joblib)
- Provides a simple Flask app to serve predictions

Notes:
- The script assumes the CSV already contains numeric/boolean features and a target column named 'label' or 'target'.
- If your CSV column names differ, update the `TARGET_COL` or `FEATURE_EXCLUDE` variables below.

Author: ChatGPT (for Mitra Karn) - forward-thinking, production-ready template
"""

import argparse
import os
import sys
import joblib
import json
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             roc_auc_score, confusion_matrix, classification_report)
import sys

# 🩹 Fix for Jupyter/IPython environments
if 'ipykernel_launcher' in sys.argv[0]:
    sys.argv = ['phishing_detection_pipeline.py', '--train', '--data', 'D:/downloads/Dataset.csv']


# optional: XGBoost if installed
try:
    from xgboost import XGBClassifier
    HAS_XGB = True
except Exception:
    HAS_XGB = False

# CONFIG
DATA_PATH = 'D:/downloads/Dataset.csv'  # change if needed
TARGET_COL = 'Type'  # common names: 'label', 'target', 'class' -- change if needed
MODEL_OUT = 'best_phishing_detector.joblib'
RANDOM_STATE = 42
TEST_SIZE = 0.2


def load_data(path=DATA_PATH):
    df = pd.read_csv(path)
    print(f"Loaded data: {df.shape[0]} rows, {df.shape[1]} columns")
    return df


def basic_eda(df):
    print('\n=== Basic EDA ===')
    print('Columns:', df.columns.tolist())
    print('Head:')
    print(df.head(3).T)
    print('\nMissing values per column:')
    print(df.isna().sum())
    if TARGET_COL in df.columns:
        print('\nTarget value counts:')
        print(df[TARGET_COL].value_counts())
    else:
        print(f"Warning: target column '{TARGET_COL}' not found. Please set TARGET_COL correctly.")


def preprocess(df):
    # Drop obviously non-feature columns
    df_proc = df.copy()

    # If there are textual URL columns or raw url, drop them from features
    drop_candidates = [c for c in df_proc.columns if c.lower() in ('url', 'raw_url', 'link')]
    if drop_candidates:
        print('Dropping URL columns from features:', drop_candidates)
        df_proc = df_proc.drop(columns=drop_candidates)

    # If target col not present, try common alternatives
    if TARGET_COL not in df_proc.columns:
        for alt in ('target', 'class', 'Label'):
            if alt in df_proc.columns:
                print(f"Using '{alt}' as target (renaming to '{TARGET_COL}')")
                df_proc = df_proc.rename(columns={alt: TARGET_COL})
                break

    # split
    if TARGET_COL not in df_proc.columns:
        raise ValueError(f"Target column '{TARGET_COL}' not found in dataset columns.")

    X = df_proc.drop(columns=[TARGET_COL])
    y = df_proc[TARGET_COL]

    # Basic imputation for numeric
    numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
    non_numeric = [c for c in X.columns if c not in numeric_cols]
    if non_numeric:
        print('Note: non-numeric feature columns detected. Attempting to convert to numeric or drop them:', non_numeric)
        # try to coerce to numeric
        for c in non_numeric:
            try:
                X[c] = pd.to_numeric(X[c], errors='coerce')
                numeric_cols.append(c)
            except Exception:
                print(f"Dropping non-numeric column: {c}")
                X = X.drop(columns=[c])

    # Final numeric X
    X = X[numeric_cols]

    # Imputer
    imputer = SimpleImputer(strategy='median')
    X_imputed = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

    return X_imputed, y, imputer


def train_and_evaluate(X_train, X_test, y_train, y_test, save_model=True):
    results = {}

    # Define a standard scaler to help some models
    scaler = StandardScaler()

    # Candidate models
    models = {
        'logreg': Pipeline([('scaler', scaler), ('clf', LogisticRegression(max_iter=1000, random_state=RANDOM_STATE))]),
        'rf': Pipeline([('clf', RandomForestClassifier(n_estimators=200, random_state=RANDOM_STATE))])
    }
    if HAS_XGB:
        models['xgb'] = Pipeline([('clf', XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=RANDOM_STATE))])

    for name, pipe in models.items():
        print(f"\nTraining {name}...")
        pipe.fit(X_train, y_train)
        y_pred = pipe.predict(X_test)
        y_proba = pipe.predict_proba(X_test)[:, 1] if hasattr(pipe, 'predict_proba') or hasattr(pipe.named_steps['clf'], 'predict_proba') else None

        acc = accuracy_score(y_test, y_pred)
        prec = precision_score(y_test, y_pred, zero_division=0)
        rec = recall_score(y_test, y_pred, zero_division=0)
        f1 = f1_score(y_test, y_pred, zero_division=0)
        roc = roc_auc_score(y_test, y_proba) if y_proba is not None else None

        results[name] = {
            'model': pipe,
            'accuracy': acc,
            'precision': prec,
            'recall': rec,
            'f1': f1,
            'roc_auc': roc
        }

        print(f"{name} -> ACC: {acc:.4f}, PREC: {prec:.4f}, REC: {rec:.4f}, F1: {f1:.4f}, ROC_AUC: {roc if roc is not None else 'N/A'}")
        print('Confusion matrix:')
        print(confusion_matrix(y_test, y_pred))
        print('Classification report:')
        print(classification_report(y_test, y_pred, zero_division=0))

    # pick best by f1
    best_name = max(results.keys(), key=lambda k: results[k]['f1'])
    best = results[best_name]['model']
    print(f"\nBest model is: {best_name} with F1={results[best_name]['f1']:.4f}")

    if save_model:
        joblib.dump({'model': best}, MODEL_OUT)
        print(f"Saved best model to {MODEL_OUT}")

    return results, best_name


# Simple Flask app to serve predictions
def run_flask_server(model_path=MODEL_OUT, imputer=None):
    try:
        from flask import Flask, request, jsonify
    except Exception:
        print('Flask is not installed. Install with: pip install flask')
        return

    obj = joblib.load(model_path)
    model = obj['model'] if isinstance(obj, dict) and 'model' in obj else obj

    app = Flask('phishing_api')

    @app.route('/predict', methods=['POST'])
    def predict():
        payload = request.json
        # Expect JSON with 'features': {col: value, ...} or list of values in same order as training
        if payload is None:
            return jsonify({'error': 'no json payload provided'}), 400

        if 'features' in payload and isinstance(payload['features'], dict):
            feature_dict = payload['features']
            df = pd.DataFrame([feature_dict])
        elif 'features' in payload and isinstance(payload['features'], list):
            df = pd.DataFrame(payload['features'])
        else:
            return jsonify({'error': "payload should contain 'features' as dict or list"}), 400

        # Coerce numeric and impute
        df = df.apply(pd.to_numeric, errors='coerce')
        if imputer is not None:
            df = pd.DataFrame(imputer.transform(df), columns=df.columns)

        pred = model.predict(df)
        proba = None
        try:
            proba = model.predict_proba(df)[:, 1].tolist()
        except Exception:
            proba = None

        return jsonify({'prediction': pred.tolist(), 'probability': proba})

    print('Starting Flask server on http://127.0.0.1:5000')
    app.run(debug=True)


def main_train_flow(path=DATA_PATH):
    df = load_data(path)
    basic_eda(df)
    X, y, imputer = preprocess(df)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y)
    print(f"Train: {X_train.shape}, Test: {X_test.shape}")

    results, best_name = train_and_evaluate(X_train, X_test, y_train, y_test, save_model=True)

    # Save imputer for later use in production
    joblib.dump({'imputer': imputer}, 'imputer.joblib')
    print('Saved imputer to imputer.joblib')

    return results


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--train', action='store_true')
    parser.add_argument('--data', type=str, default=DATA_PATH)
    parser.add_argument('--serve', action='store_true')
    parser.add_argument('--save_model', action='store_true')
    args = parser.parse_args()

    if args.train:
        DATA_PATH = args.data
        main_train_flow(DATA_PATH)

    if args.serve:
        # requires that MODEL_OUT and imputer.joblib exist
        imputer_obj = None
        if os.path.exists('imputer.joblib'):
            imputer_obj = joblib.load('imputer.joblib').get('imputer')
        run_flask_server(MODEL_OUT, imputer_obj)

    if not args.train and not args.serve:
        print('Run with --train to train or --serve to run the API. Example:')
        print('  python phishing_detection_pipeline.py --train --data phishing_dataset.csv')


Loaded data: 247950 rows, 42 columns

=== Basic EDA ===
Columns: ['Type', 'url_length', 'number_of_dots_in_url', 'having_repeated_digits_in_url', 'number_of_digits_in_url', 'number_of_special_char_in_url', 'number_of_hyphens_in_url', 'number_of_underline_in_url', 'number_of_slash_in_url', 'number_of_questionmark_in_url', 'number_of_equal_in_url', 'number_of_at_in_url', 'number_of_dollar_in_url', 'number_of_exclamation_in_url', 'number_of_hashtag_in_url', 'number_of_percent_in_url', 'domain_length', 'number_of_dots_in_domain', 'number_of_hyphens_in_domain', 'having_special_characters_in_domain', 'number_of_special_characters_in_domain', 'having_digits_in_domain', 'number_of_digits_in_domain', 'having_repeated_digits_in_domain', 'number_of_subdomains', 'having_dot_in_subdomain', 'having_hyphen_in_subdomain', 'average_subdomain_length', 'average_number_of_dots_in_subdomain', 'average_number_of_hyphens_in_subdomain', 'having_special_characters_in_subdomain', 'number_of_special_characters_i

In [10]:
model, imputer = main_train_flow("D:/downloads/Dataset.csv")
joblib.dump(model, "phishing_model.pkl")
joblib.dump(imputer, "imputer.joblib")
print("✅ Model and imputer saved successfully!")


Loaded data: 247950 rows, 42 columns

=== Basic EDA ===
Columns: ['Type', 'url_length', 'number_of_dots_in_url', 'having_repeated_digits_in_url', 'number_of_digits_in_url', 'number_of_special_char_in_url', 'number_of_hyphens_in_url', 'number_of_underline_in_url', 'number_of_slash_in_url', 'number_of_questionmark_in_url', 'number_of_equal_in_url', 'number_of_at_in_url', 'number_of_dollar_in_url', 'number_of_exclamation_in_url', 'number_of_hashtag_in_url', 'number_of_percent_in_url', 'domain_length', 'number_of_dots_in_domain', 'number_of_hyphens_in_domain', 'having_special_characters_in_domain', 'number_of_special_characters_in_domain', 'having_digits_in_domain', 'number_of_digits_in_domain', 'having_repeated_digits_in_domain', 'number_of_subdomains', 'having_dot_in_subdomain', 'having_hyphen_in_subdomain', 'average_subdomain_length', 'average_number_of_dots_in_subdomain', 'average_number_of_hyphens_in_subdomain', 'having_special_characters_in_subdomain', 'number_of_special_characters_i

In [12]:
pip install flask

Collecting flask
  Using cached flask-3.1.2-py3-none-any.whl.metadata (3.2 kB)
Collecting itsdangerous>=2.2.0 (from flask)
  Using cached itsdangerous-2.2.0-py3-none-any.whl.metadata (1.9 kB)
Using cached flask-3.1.2-py3-none-any.whl (103 kB)
Using cached itsdangerous-2.2.0-py3-none-any.whl (16 kB)
Installing collected packages: itsdangerous, flask

   -------------------- ------------------- 1/2 [flask]
   -------------------- ------------------- 1/2 [flask]
   -------------------- ------------------- 1/2 [flask]
   ---------------------------------------- 2/2 [flask]

Successfully installed flask-3.1.2 itsdangerous-2.2.0
Note: you may need to restart the kernel to use updated packages.


In [1]:
pip install streamlit pyngrok


Collecting pyngrok
  Downloading pyngrok-7.4.0-py3-none-any.whl.metadata (8.1 kB)
Downloading pyngrok-7.4.0-py3-none-any.whl (25 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.4.0
Note: you may need to restart the kernel to use updated packages.


In [1]:
!pip install streamlit joblib pandas scikit-learn




In [17]:
import joblib
imputer = joblib.load("imputer.joblib")
print("Type of imputer:", type(imputer))
print(imputer)


Type of imputer: <class 'str'>
rf


In [20]:
!streamlit run app.py --server.headless true --browser.serverAddress localhost --server.port 8503



Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.



2025-10-22 20:33:19.383 Port 8503 is already in use
