In [None]:
import imaplib
import email
import re
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import smtplib
from email.message import EmailMessage
from email.utils import parsedate_to_datetime
from email.header import decode_header
import os
import pickle
import torch
import torch.nn as nn


# Disable pandas' row and column truncation so rendered frames are shown in full.
for option_name in ('display.max_rows', 'display.max_columns'):
    pd.set_option(option_name, None)


In [None]:
import torch.nn as nn

class PhishingDetector(nn.Module):
    """Feed-forward binary classifier used for phishing detection.

    Four hidden stages (Linear -> BatchNorm1d -> ReLU -> Dropout) of widths
    256, 128, 64 and 32 are followed by a single-unit Linear layer and a
    sigmoid, so ``forward`` returns one value in [0, 1] per input row.

    Args:
        input_dim: Number of features in each input row.
    """

    def __init__(self, input_dim):
        super().__init__()

        # Stage 1: input_dim -> 256
        self.fc1 = nn.Linear(input_dim, 256)
        self.bn1 = nn.BatchNorm1d(256)
        self.dropout1 = nn.Dropout(0.4)

        # Stage 2: 256 -> 128
        self.fc2 = nn.Linear(256, 128)
        self.bn2 = nn.BatchNorm1d(128)
        self.dropout2 = nn.Dropout(0.3)

        # Stage 3: 128 -> 64
        self.fc3 = nn.Linear(128, 64)
        self.bn3 = nn.BatchNorm1d(64)
        self.dropout3 = nn.Dropout(0.3)

        # Stage 4: 64 -> 32
        self.fc4 = nn.Linear(64, 32)
        self.bn4 = nn.BatchNorm1d(32)
        self.dropout4 = nn.Dropout(0.2)

        # Output head: 32 -> 1
        self.fc5 = nn.Linear(32, 1)

        # Activations shared by all stages
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Run the four hidden stages in order, then the sigmoid output head."""
        hidden_stages = (
            (self.fc1, self.bn1, self.dropout1),
            (self.fc2, self.bn2, self.dropout2),
            (self.fc3, self.bn3, self.dropout3),
            (self.fc4, self.bn4, self.dropout4),
        )
        for linear, batch_norm, dropout in hidden_stages:
            x = dropout(self.relu(batch_norm(linear(x))))

        return self.sigmoid(self.fc5(x))

In [None]:
# Mailbox credentials are read from the environment so they never live in the
# notebook; each value is None when its variable is not set.
EMAIL = os.getenv("PHISHING_EMAIL")
APP_PASSWORD = os.getenv("PHISHING_APP_PASSWORD")

# Upper bound on how many unread messages are pulled per run.
N = 5


## Helper functions

In [None]:


def get_header(msg, name):
    """Return header ``name`` from ``msg``, or "" when the header is absent."""
    header_value = msg.get(name, "")
    return header_value

def clean_html(html):
    """Return the visible text of ``html`` with whitespace collapsed.

    ``<script>`` and ``<style>`` elements are removed before text extraction,
    and every run of whitespace in the result is reduced to a single space.
    """
    document = BeautifulSoup(html, "html.parser")
    for element in document.find_all(["script", "style"]):
        element.decompose()
    visible_text = document.get_text(separator=" ")
    return " ".join(visible_text.split())

def extract_links(text):
    """Return every http(s) URL in ``text``, in order of appearance.

    A URL runs from its scheme to the next whitespace character, so trailing
    punctuation that touches the URL is kept as part of it.
    """
    url_pattern = r'https?://[^\s]+'
    return [found.group(0) for found in re.finditer(url_pattern, text)]

def decode_mime_header(value):
    """Decode a possibly RFC 2047-encoded header value into plain text.

    Args:
        value: Raw header value; any falsy value (None, "") yields "".

    Returns:
        The decoded pieces joined together, with surrounding whitespace
        stripped. Byte pieces are decoded with their declared charset (UTF-8
        when none is given); if that attempt raises, UTF-8 is used instead.
        Undecodable bytes are dropped.
    """
    if not value:
        return ""

    pieces = []
    for chunk, charset in decode_header(value):
        if not isinstance(chunk, bytes):
            # Already text: keep as is.
            pieces.append(chunk)
            continue
        try:
            text = chunk.decode(charset or "utf-8", errors="ignore")
        except Exception:
            # e.g. an unknown charset label
            text = chunk.decode("utf-8", errors="ignore")
        pieces.append(text)

    return "".join(pieces).strip()



## Read Email

In [None]:


def _decode_part_text(part):
    """Return a MIME part's decoded payload as text ("" when it has none).

    The part's declared charset is honoured; a missing charset or an unknown
    charset label falls back to UTF-8. Undecodable bytes are dropped.
    """
    payload = part.get_payload(decode=True)
    if not payload:
        return ""
    charset = part.get_content_charset() or "utf-8"
    try:
        return payload.decode(charset, errors="ignore")
    except LookupError:
        # Bogus charset label in the message: best-effort UTF-8 decode.
        return payload.decode("utf-8", errors="ignore")


# Fail early with a readable message instead of an obscure error inside imaplib.
if not EMAIL or not APP_PASSWORD:
    raise RuntimeError(
        "PHISHING_EMAIL and PHISHING_APP_PASSWORD must be set in the environment."
    )

# NOTE(review): the connection is intentionally left open, as in the original
# cell, because later cells may still use `mail`; call mail.logout() when done.
mail = imaplib.IMAP4_SSL("imap.gmail.com")
mail.login(EMAIL, APP_PASSWORD)
mail.select("inbox")

status, messages = mail.search(None, 'UNSEEN')
if status != "OK":
    raise RuntimeError(f"IMAP search for unseen messages failed: {status!r}")

# Keep only the last N ids; a plain [-N:] would return *everything* for N == 0.
unseen_ids = messages[0].split() if messages and messages[0] else []
email_ids = unseen_ids[-N:] if N > 0 else []

emails_data = []

for e_id in email_ids:
    fetch_status, msg_data = mail.fetch(e_id, "(RFC822)")
    if fetch_status != "OK" or not msg_data or not isinstance(msg_data[0], tuple):
        # Nothing usable came back; skip without marking the message as read.
        continue
    msg = email.message_from_bytes(msg_data[0][1])

    subject = decode_mime_header(get_header(msg, "Subject"))
    sender  = decode_mime_header(get_header(msg, "From"))
    to      = decode_mime_header(get_header(msg, "To"))
    cc      = decode_mime_header(get_header(msg, "Cc"))
    bcc     = decode_mime_header(get_header(msg, "Bcc"))

    body = ""

    if msg.is_multipart():
        for part in msg.walk():
            # Check the type first so non-text parts are never decoded.
            content_type = part.get_content_type()
            if content_type == "text/plain":
                body += _decode_part_text(part)
            elif content_type == "text/html":
                html_text = _decode_part_text(part)
                if html_text:
                    body += clean_html(html_text)
    else:
        single_text = _decode_part_text(msg)
        if single_text:
            # Single-part bodies always go through clean_html, whatever their type.
            body = clean_html(single_text)

    urls = extract_links(body)

    emails_data.append({
        "subject": subject,
        "from": sender,
        "to": to,
        "cc": cc,
        "bcc": bcc,
        "body": body,
        "date": get_header(msg, "Date"),
        "url": " ".join(urls)
    })

    mail.store(e_id, '+FLAGS', '\\Seen')

# Explicit columns keep the schema stable even when no unread mail was found.
df = pd.DataFrame(
    emails_data,
    columns=["subject", "from", "to", "cc", "bcc", "body", "date", "url"],
)


## Keyword lists

In [None]:


# Urgency words and phrases. extract_features counts how many of these occur
# as substrings of the lower-cased subject and of the lower-cased body.
URGENT_KEYWORDS = [
    "urgent" , "urgently" , "immediately" , "immediate" , "immediate action" ,
    "respond immediately" , "important" , "high importance" , "high priority" ,
    "priority" , "action required" , "act now" , "respond now" , "asap" ,
    "verify now" , "confirm now" , "update now" , "review now" ,
    "expire" , "expires" , "expires today" , "expiring soon" , "last warning" ,
    "limited time" , "within 24 hours" , "within 48 hours" , "within hours" ,
    "time is running out" , "offer expires" , "today only" ,
    "urgent response needed" , "critical update" , "attention required" ,
    "final reminder" , "final warning" , "immediate response needed" ,
    "deadline" , "due today" , "due now" , "time-sensitive" , "time sensitive" ,
    "respond asap" , "quickly" , "fast action required" , "take action now" ,
    "do this immediately" , "important notice" , "emergency action" ,
    "critical issue" , "must act now" , "respond without delay" ,
    "urgent attention" , "requires immediate action"
]

# Threat / account-penalty wording. extract_features counts how many of these
# occur as substrings of the lower-cased subject and of the lower-cased body.
THREAT_KEYWORDS = [
    "suspend" , "suspended" , "locked" , "locked out" , "close your account" ,
    "account closure" , "legal action" , "legal notice" , "final notice" ,
    "termination" , "deactivate" , "deactivation" , "restricted" , "restriction" ,
    "blocked" , "frozen" , "compromised" , "breach" , "security breach" ,
    "hacked" , "unauthorized" , "unauthorized access" , "unusual activity" ,
    "violation" , "policy violation" , "will be closed" , "forced closure" ,
    "take action or" , "lose access" , "permanently removed" ,
    "account disabled" , "security suspension" , "threat detected" ,
    "security risk" , "security warning" , "final attempt" , "failure to respond" ,
    "account will be terminated" , "forced deactivation" , "breach detected" ,
    "critical security issue" , "account compromised" , "service interruption" ,
    "security alert" , "high-risk login" , "multiple failed attempts"
]

# Money / prize bait wording. extract_features counts how many of these occur
# as substrings of the lower-cased subject and of the lower-cased body.
MONEY_PRIZE_KEYWORDS = [
    "win" , "winner" , "won" , "prize" , "grand prize" , "cash" , "cash prize" ,
    "reward" , "bonus" , "lottery" , "jackpot" , "million" , "million dollars" ,
    "refund" , "tax refund" , "tax rebate" , "rebate" , "unclaimed" ,
    "compensation" , "inheritance" , "beneficiary" , "grant" , "funds" ,
    "payment released" , "claim your" , "claim now" , "redeem now" ,
    "congratulations you" , "free money" , "you've been selected" ,
    "$$$" , "financial award" , "monetary reward" , "payout" , "transfer" ,
    "urgent refund" , "deposit available" , "funds available" ,
    "cash transfer" , "unexpected funds" , "reward waiting" , "prize awaiting" ,
    "selected as winner" , "lucky draw" , "special payout" ,
    "exclusive reward" , "free bonus" , "instant winnings"
]

# Pressure phrases. extract_features counts how many of these occur as
# substrings of the lower-cased body (body_urgency_phrases).
# "before it's too late" is listed with both the ASCII apostrophe and the
# typographic one (U+2019): the previous entry was a mis-encoded copy of the
# latter and could never match real text.
URGENCY_PHRASES = [
    "act now" , "don't wait" , "hurry" , "last chance" , "time is running out" ,
    "offer expires" , "today only" , "now or never" , "don't miss out" ,
    "limited offer" , "limited-time" , "while supplies last" ,
    "urgent response needed" , "expires soon" , "limited time" , "dont miss" ,
    "final opportunity" , "respond quickly" , "immediate attention required" ,
    "before it's too late" , "before it\u2019s too late" , "claim before expiry" ,
    "offer ends soon" , "only hours left" , "only today" ,
    "limited quantity" , "final hours" , "respond before deadline"
]

# Generic call-to-action / account-maintenance wording. extract_features counts
# how many of these occur as substrings of the lower-cased subject and body.
GENERIC_SUSPICIOUS = [
    "click here" , "click below" , "click link" , "open link" , "login" ,
    "log in" , "update" , "update account" , "verify" , "verify your" ,
    "confirm" , "confirm your" , "validate" , "review" , "reactivate" ,
    "re-activate" , "password" , "security alert" , "dear customer" ,
    "dear user" , "valued customer" , "account holder" , "update payment" ,
    "billing information" , "payment method" , "credit card" , "card details" ,
    "reset password" , "identity verification" , "authentication required" ,
    "verify immediately" , "important update" , "account review" ,
    "confirm identity" , "provide details" , "submit information" ,
    "resolve issue" , "confirm account" , "login required" ,
    "reset your account" , "unlock account" , "secure login" ,
    "update credentials" , "verification process" , "restore access" ,
    "protect your account" , "account verification"
]

# Brand / institution names. extract_features counts how many of these occur as
# substrings of the lower-cased subject and body; a non-zero count combined
# with a free-mail sender sets sender_company_mismatch.
COMPANY_IMPERSONATION = [
    "paypal" , "amazon" , "apple" , "microsoft" , "google" , "facebook" ,
    "instagram" , "twitter" , "linkedin" , "bank" , "irs" , "fedex" , "usps" ,
    "ups" , "dhl" , "netflix" , "ebay" , "ssa" , "social security" ,
    "wells fargo" , "chase" , "bank of america" , "citibank" , "hsbc" ,
    "capital one" , "barclays" , "royal mail" , "revolut" , "stripe" ,
    "coinbase" , "binance" , "mcdonalds" , "spotify" , "icloud" ,
    "adobe" , "dropbox" , "onedrive" , "office365" , "outlook" , "teams" ,
    "zoho" , "intuit" , "quickbooks" ,
    "amazon support" , "microsoft support" , "google security" ,
    "bank security" , "visa" , "mastercard" , "amex" ,
    "national lottery" , "telecom" , "azure" , "aws" , "gmail team" ,
    "facebook security"
]

# Role / automated-mailbox fragments. extract_features sets
# sender_has_spoofing_indicator when any of these is a substring of the
# lower-cased From header.
SPOOFING_INDICATORS = [
    "noreply" , "no-reply" , "donotreply" , "do-not-reply" ,
    "support@" , "admin@" , "security@" , "verification@" , "alert@" ,
    "service@" , "info@" , "notification@" , "update@" , "mailer@" , "robot@" ,
    "support-team@" , "supportdesk@" , "helpdesk@" , "accounts@" ,
    "billing@" , "customerservice@" , "compliance@" , "system@" , "system-mail@" ,
    "alerts@" , "noreplymail@" , "auto-mailer@" , "webmaster@" , "it-support@"
]

# Requests for credentials or personal data. extract_features counts how many
# of these occur as substrings of the lower-cased body
# (body_credential_requests). Matching is plain substring containment, so
# short entries such as "pin" also match inside longer words.
CREDENTIAL_REQUESTS = [
    "enter your password" , "provide your password" , "confirm password" ,
    "password" , "username and password" , "username" , "user id" ,
    "social security number" , "ssn" , "account number" , "credit card" ,
    "credit card number" , "card details" , "cvv" , "pin number" , "pin" ,
    "date of birth" , "dob" , "mother's maiden name" , "security question" ,
    "full name and address" , "bank details" , "routing number" ,
    "id card" , "passport number" , "driver license" , "two-factor code" ,
    "otp" , "one-time password" , "authentication code" , "verify identity" ,
    "enter your credentials" , "provide login details" , "reauthenticate" ,
    "input verification code" , "enter 2fa code" , "banking password" ,
    "enter digits" , "identity confirmation" , "submit credentials"
]

# Free e-mail provider domains. extract_features sets sender_is_freemail when
# any of these is a substring of the lower-cased From header.
FREE_EMAIL_DOMAINS = [
    "gmail.com" , "yahoo.com" , "hotmail.com" , "outlook.com" , "aol.com" ,
    "mail.com" , "protonmail.com" , "yandex.com" , "gmx.com" , "icloud.com" ,
    "live.com" , "msn.com" , "inbox.com" , "fastmail.com" , "zoho.com" ,
    "hushmail.com" , "tutanota.com" , "yahoo.co.uk" , "outlook.co.uk" ,
    "hotmail.co.uk" , "googlemail.com" , "mail.ru" , "gmx.net" ,
    "yandex.ru" , "usa.com" , "europe.com"
]

# Personal pronouns and informal variants. Every entry is padded with a space
# on each side; extract_features tests them against the lower-cased body
# wrapped in spaces, and counts how many distinct entries occur
# (body_personal_pronoun_count).
personal_pronouns = [
    " i " , " me " , " myself " , " my " , " mine " ,
    " you " , " your " , " yours " , " yourself " , " yourselves " ,
    " we " , " us " , " our " , " ours " , " ourselves " ,
    " he " , " him " , " his " , " himself " ,
    " she " , " her " , " hers " , " herself " ,
    " they " , " them " , " their " , " theirs " , " themselves " ,
    " i'm " , " you're " , " we're " , " they're " ,
    " i'd " , " you'd " , " we'd " , " they'd " ,
    " i'll " , " you'll " , " we'll " , " they'll " ,
    " i've " , " you've " , " we've " , " they've " ,
    " u " , " ur " , " im " , " id " , " youll " , " weve " ,
    " ya " , " u r " , " u're "
]

# Action requests. extract_features counts how many of these occur as
# substrings of the lower-cased body (body_action_request_count).
action_words = [
    "click here" , "verify now" , "update now" , "confirm now" , "act now" ,
    "login" , "log in" , "sign in" , "sign-in" , "reset password" ,
    "unlock account" , "reactivate account" , "open attachment" ,
    "open file" , "download" , "run file" , "install" , "enable macros" ,
    "enable content" , "open this link" , "visit link" , "review document" ,
    "access portal" , "complete form" , "submit info" , "submit information" ,
    "provide details" , "authorize" , "approve request" , "authenticate" ,
    "verify identity" , "take action" , "respond now" , "reply now" ,
    "urgent action required" , "redeem now" , "claim reward" , "claim prize" ,
    "claim bonus" , "authenticate now" , "fix account" , "resolve issue" ,
    "update details" , "check status" , "check your account" , "security check" ,
    "identity check" , "invoice due" , "payment required" , "confirm payment" ,
    "review invoice" , "download invoice" , "open invoice" , "final warning" ,
    "last notice" , "immediate attention" , "time-sensitive" , "expires today" ,
    "expires soon" , "within 24 hours" , "within 48 hours" , "verify details" ,
    "confirm identity" , "update credentials" , "reactivate immediately" ,
    "urgent verification" , "review activity" ,
    "review payment" , "open secure message" , "open secure file" ,
    "download secure document" , "complete verification" ,
    "follow instructions" , "press the button" , "tap to verify" ,
    "accept request" , "approve payment" , "review balance"
]

# Attachment-related words and file extensions. extract_features sets
# mentions_attachment when any of these is a substring of the lower-cased body.
attachment_keywords = [
    "attached" , "attachment" , "find attached" , "see attached" , "enclosed" ,
    "included" , "attached file" , "attached document" , "attached invoice" ,
    "file" , "document" , "invoice" , "statement" , "report" , "form" , "pdf" ,
    "doc" , "docx" , "xls" , "xlsx" , "ppt" , "pptx" , "zip" , "rar" , "7z" , "tar" ,
    "gz" , "invoice.pdf" , "statement.pdf" , "payment.docx" , "document.pdf" ,
    "remittance.pdf" , "receipt.pdf" , "scan.pdf" , "scanned document" ,
    "secure document" , "protected file" , "important file" ,
    "download attachment" , "download file" , "open document" , "open report" ,
    "review attachment" , "package attached" ,
    "secure pdf" , "encrypted file" , "password-protected file" ,
    "confidential document" , "urgent document" , "delivery note" ,
    "shipping label" , "invoice copy" , "wire details" , "account statement"
]

# Misspelled and character-substituted words. extract_features sets
# body_has_misspellings when any of these is a substring of the lower-cased
# body. A few entries are repeated; that has no effect on an any() test.
common_misspellings = [
    "urgnet" , "accout" , "pasword" , "verifiy" , "confirim" , "securty" ,
    "recieve" , "bussiness" , "offical" , "adress" , "addres" , "identiy" ,
    "identitiy" , "authenticaion" , "updatte" , "verfication" , "verificatoin" ,
    "verifiction" , "logn" , "loggin" , "loggin in" , "passwrod" , "paswrod" ,
    "pssword" , "confrim" , "confim" , "confurm" , "securrity" , "securitty" ,
    "acount" , "acoount" , "accuont" , "accoubt" , "acc0unt" , "passw0rd" ,
    "ver1fy" , "conf1rm" , "secur1ty" , "verlfy" , "veriry" , "authentlcate" ,
    "l0gin" , "resp0nd" , "paymnet" , "invocie" , "docuemnt" , "statment" ,
    "recipt" , "notcie" , "activatoin" , "restircted" , "suspened" , "prizee" ,
    "reawrd" , "bannk" , "accuont" , "micorsoft" , "microsofft" ,
    "protction" , "verificaiton" , "confurmation" , "identificatoin" ,
    "autheticatoin" ,
    "passowrd" , "passworld" , "loggin" , "verifcation" , "immediatly" ,
    "urjent" , "supension" , "restrction" , "invoie" , "documant" ,
    "statemant" , "bankk" , "paymnent" , "invocie" , "accpunt" ,
    "vaccant" , "authentcation"
]


## Date parsing function

In [None]:

def parse_date_text(date_str):
    """Turn an e-mail Date header into cyclically encoded time features.

    Extraction happens in three passes:
      1. Regex/prefix parsing of the lower-cased text (weekday from the first
         three characters, "D Mon YYYY" for the date, "HH:MM[:SS]" for time).
      2. Anything still missing is filled from ``parsedate_to_datetime``;
         a failure there is ignored.
      3. Remaining gaps get defaults (weekday 0, day 1, month 1, year 2025,
         time 00:00:00).

    Args:
        date_str: Date header value; it is converted with ``str()`` first.

    Returns:
        pd.Series with sin/cos pairs for weekday (period 7), day (31),
        month (12), hour (24), minute (60) and second (60), followed by the
        raw ``year``.
    """
    month_numbers = {
        'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6,
        'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12
    }
    weekday_names = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']

    text = str(date_str).strip().lower()

    # Every component starts out unknown (None).
    parts = dict.fromkeys(
        ["weekday", "day", "month", "year", "hour", "minute", "second"]
    )

    # Pass 1a: weekday abbreviation at the very start of the string.
    if len(text) >= 3:
        prefix = text[:3].title()
        if prefix in weekday_names:
            parts["weekday"] = weekday_names.index(prefix)

    # Pass 1b: "<day> <mon> <year>".
    date_match = re.search(r'(\d{1,2})\s+([a-z]{3})\s+(\d{4})', text)
    if date_match:
        parts["day"] = int(date_match.group(1))
        parts["month"] = month_numbers.get(date_match.group(2))
        parts["year"] = int(date_match.group(3))

    # Pass 1c: "HH:MM" with optional ":SS" (seconds default to 0).
    time_match = re.search(r'(\d{2}):(\d{2})(?::(\d{2}))?', text)
    if time_match:
        parts["hour"] = int(time_match.group(1))
        parts["minute"] = int(time_match.group(2))
        seconds_text = time_match.group(3)
        parts["second"] = int(seconds_text) if seconds_text else 0

    # Pass 2: let the stdlib parser fill whatever is still missing.
    if any(value is None for value in parts.values()):
        try:
            dt = parsedate_to_datetime(text)
            if dt:
                parsed = {
                    "weekday": dt.weekday(),
                    "day": dt.day,
                    "month": dt.month,
                    "year": dt.year,
                    "hour": dt.hour,
                    "minute": dt.minute,
                    "second": dt.second,
                }
                for key, fallback_value in parsed.items():
                    if parts[key] is None:
                        parts[key] = fallback_value
        except Exception:
            pass

    # Pass 3: fixed defaults for anything neither pass could determine.
    defaults = {
        "weekday": 0, "day": 1, "month": 1, "year": 2025,
        "hour": 0, "minute": 0, "second": 0,
    }
    for key, default_value in defaults.items():
        if parts[key] is None:
            parts[key] = default_value

    # Cyclical encoding: map each component onto the unit circle.
    periods = [
        ("weekday", 7), ("day", 31), ("month", 12),
        ("hour", 24), ("minute", 60), ("second", 60),
    ]
    values = []
    labels = []
    for key, period in periods:
        values.append(np.sin(2 * np.pi * parts[key] / period))
        values.append(np.cos(2 * np.pi * parts[key] / period))
        labels.append(key + "_sin")
        labels.append(key + "_cos")

    values.append(parts["year"])
    labels.append('year')

    return pd.Series(values, index=labels)


## Email Alert

In [None]:


def send_alert_email(subject, body):
    """Send a plain-text alert from the configured account to itself.

    Both the From and To headers are set to ``EMAIL``. The message is sent
    through smtp.gmail.com on port 465 (SMTP over SSL) after logging in with
    ``EMAIL`` / ``APP_PASSWORD``.

    Args:
        subject: Subject line of the alert.
        body: Plain-text content of the alert.
    """
    alert = EmailMessage()
    headers = (("From", EMAIL), ("To", EMAIL), ("Subject", subject))
    for header_name, header_value in headers:
        alert[header_name] = header_value
    alert.set_content(body)

    with smtplib.SMTP_SSL("smtp.gmail.com", 465) as smtp:
        smtp.login(EMAIL, APP_PASSWORD)
        smtp.send_message(alert)


## Feature Extractor

In [None]:

def extract_features(df):
    """Derive hand-crafted phishing indicators from parsed e-mail fields.

    Args:
        df: DataFrame with "subject", "body", "from", "cc" and "bcc" columns
            plus a URL column named "url" (or "urls" when "url" is absent).

    Returns:
        A copy of ``df`` with the feature columns appended; ``df`` itself is
        left untouched.

    Notes:
        * Keyword features use plain substring containment on lower-cased
          text, and each list entry contributes at most 1 to a count.
        * ``has_cc`` / ``has_bcc`` / ``has_url`` are 0 for missing values AND
          for blank strings. Testing only ``isna()`` reported 1 for every row
          whose missing header or URL list was stored as "".
        * The function also works on a frame with zero rows.
    """
    features_df = df.copy()

    def count_terms(text, terms):
        # How many entries of `terms` occur somewhere in `text`.
        return sum(1 for term in terms if term in text)

    def contains_any(text, terms):
        # 1 when at least one entry of `terms` occurs in `text`, else 0.
        return int(any(term in text for term in terms))

    def is_present(column):
        # 1 where the cell holds non-blank text, 0 for NaN/None/""/whitespace.
        return column.fillna("").astype(str).str.strip().ne("").astype(int)

    subject_str = features_df["subject"].astype(str)
    body_str = features_df["body"].astype(str)
    from_str = features_df["from"].astype(str)

    subject_lower = subject_str.str.lower()
    body_lower = body_str.str.lower()
    from_lower = from_str.str.lower()

    url_col = "url" if "url" in features_df.columns else "urls"
    url_str_col = features_df[url_col].fillna("").astype(str)
    urls_lower = url_str_col.str.lower()

    # --- Recipients -------------------------------------------------------
    features_df["has_cc"] = is_present(features_df["cc"])
    features_df["has_bcc"] = is_present(features_df["bcc"])
    features_df["has_url"] = is_present(features_df[url_col])

    def count_addresses(raw):
        # Addresses are separated by "," or ";"; a piece counts if it has "@".
        return len([e for e in re.split(r"[,;]", raw) if "@" in e])

    features_df["cc_count"] = features_df["cc"].fillna("").astype(str).apply(count_addresses)
    features_df["bcc_count"] = features_df["bcc"].fillna("").astype(str).apply(count_addresses)

    # "+ 1" stands for the primary recipient.
    features_df["total_recipients"] = features_df["cc_count"] + features_df["bcc_count"] + 1
    features_df["is_mass_email"] = (features_df["total_recipients"] > 5).astype(int)

    # --- Subject ----------------------------------------------------------
    features_df["subject_length"] = subject_str.apply(len)
    features_df["subject_word_count"] = subject_str.apply(lambda x: len(x.split()))

    shared_keyword_sets = [
        ("urgent", URGENT_KEYWORDS),
        ("threat", THREAT_KEYWORDS),
        ("money", MONEY_PRIZE_KEYWORDS),
        ("generic", GENERIC_SUSPICIOUS),
        ("company", COMPANY_IMPERSONATION),
    ]
    for label, terms in shared_keyword_sets:
        features_df[f"subject_{label}_keywords"] = subject_lower.apply(
            lambda x, terms=terms: count_terms(x, terms)
        )

    punctuation_marks = [("exclamation", "!"), ("question", "?"), ("dollar", "$")]

    features_df["subject_excessive_punctuation"] = subject_str.apply(
        lambda x: int(bool(re.search(r"!{2,}|\?{2,}|\${2,}", x)))
    )
    for label, mark in punctuation_marks:
        features_df[f"subject_{label}_count"] = subject_str.apply(
            lambda x, mark=mark: x.count(mark)
        )

    features_df["subject_all_caps_ratio"] = subject_str.apply(
        lambda x: sum(1 for c in x if c.isupper()) / len(x) if len(x) > 0 else 0
    )
    features_df["subject_has_re_fwd"] = subject_lower.apply(
        lambda x: int(bool(re.match(r"^(re:|fw:|fwd:)", x)))
    )

    # --- Sender -----------------------------------------------------------
    features_df["sender_is_freemail"] = from_lower.apply(
        lambda x: contains_any(x, FREE_EMAIL_DOMAINS)
    )
    features_df["sender_has_numbers"] = from_str.apply(
        lambda x: int(bool(re.search(r"\d", x)))
    )
    features_df["sender_number_count"] = from_str.apply(
        lambda x: len(re.findall(r"\d", x))
    )
    features_df["sender_length"] = from_str.apply(len)

    features_df["sender_has_spoofing_indicator"] = from_lower.apply(
        lambda x: contains_any(x, SPOOFING_INDICATORS)
    )

    features_df["sender_multiple_separators"] = from_str.apply(
        lambda x: int(bool(re.search(r"\.{2,}|-{2,}|_{2,}", x)))
    )

    # --- Body -------------------------------------------------------------
    features_df["body_length"] = body_str.apply(len)
    features_df["body_word_count"] = body_str.apply(lambda x: len(x.split()))

    for label, terms in shared_keyword_sets:
        features_df[f"body_{label}_keywords"] = body_lower.apply(
            lambda x, terms=terms: count_terms(x, terms)
        )
    features_df["body_urgency_phrases"] = body_lower.apply(
        lambda x: count_terms(x, URGENCY_PHRASES)
    )
    features_df["body_credential_requests"] = body_lower.apply(
        lambda x: count_terms(x, CREDENTIAL_REQUESTS)
    )

    features_df["body_excessive_punctuation"] = body_str.apply(
        lambda x: int(bool(re.search(r"!{2,}|\?{2,}|\${2,}", x)))
    )
    for label, mark in punctuation_marks:
        features_df[f"body_{label}_count"] = body_str.apply(
            lambda x, mark=mark: x.count(mark)
        )

    features_df["body_has_html"] = body_str.apply(
        lambda x: int(bool(re.search(r"<[^>]+>", x)))
    )

    features_df["body_has_misspellings"] = body_lower.apply(
        lambda x: contains_any(x, common_misspellings)
    )

    # --- URLs -------------------------------------------------------------
    features_df["url_count"] = url_str_col.apply(
        lambda x: len(re.findall(r"https?://", x))
    )

    shorteners = ["bit.ly" , "tinyurl" , "goo.gl" , "t.co" , "ow.ly" , "is.gd" , "buff.ly"]
    features_df["url_is_shortened"] = urls_lower.apply(
        lambda x: contains_any(x, shorteners)
    )

    # Total URL characters divided by the number of scheme prefixes (min 1).
    features_df["url_avg_length"] = url_str_col.apply(
        lambda x: sum(len(url) for url in re.findall(r"https?://[^\s]+", x)) / max(1 , len(re.findall(r"https?://", x)))
    )

    # split("/")[2] is the host part of "scheme://host/...".
    features_df["url_many_subdomains"] = url_str_col.apply(
        lambda x: int(any(len(re.findall(r"\.", url.split("/")[2])) > 3
                         for url in re.findall(r"https?://[^\s]+", x) if "/" in url))
    )

    features_df["url_has_at_symbol"] = url_str_col.apply(
        lambda x: int("@" in x and "http" in x)
    )

    # --- Combined signals -------------------------------------------------
    features_df["sender_company_mismatch"] = (
        (features_df["sender_is_freemail"] == 1) &
        ((features_df["subject_company_keywords"] > 0) |
         (features_df["body_company_keywords"] > 0))
    ).astype(int)

    for label, _ in shared_keyword_sets:
        features_df[f"total_{label}_keywords"] = (
            features_df[f"subject_{label}_keywords"] + features_df[f"body_{label}_keywords"]
        )

    # "+ 1" in the denominators avoids division by zero for empty bodies.
    for label in ("urgent", "threat", "money", "generic"):
        features_df[f"{label}_keyword_ratio"] = (
            features_df[f"total_{label}_keywords"] / (features_df["body_word_count"] + 1)
        )

    features_df["url_to_text_ratio"] = features_df["url_count"] / (features_df["body_word_count"] + 1)

    features_df["is_reply_or_forward"] = features_df["subject_has_re_fwd"]

    features_df["avg_word_length"] = body_str.apply(
        lambda x: sum(len(word) for word in x.split()) / max(1 , len(x.split()))
    )

    features_df["body_uppercase_ratio"] = body_str.apply(
        lambda x: sum(1 for c in x if c.isupper()) / max(1 , len(x))
    )

    features_df["body_special_char_count"] = body_str.apply(
        lambda x: len(re.findall(r"[^a-zA-Z0-9\s]" , x))
    )

    features_df["mentions_attachment"] = body_lower.apply(
        lambda x: contains_any(x, attachment_keywords)
    )

    # Pronoun entries are space-padded, so the body is wrapped in spaces too.
    features_df["body_personal_pronoun_count"] = body_lower.apply(
        lambda x: count_terms(" " + x + " ", personal_pronouns)
    )

    features_df["body_action_request_count"] = body_lower.apply(
        lambda x: count_terms(x, action_words)
    )

    features_df["has_signature"] = body_lower.apply(
        lambda x: int(bool(re.search(r"(regards|sincerely|best wishes|thanks|thank you)" , x)))
    )

    features_df["body_has_phone"] = body_str.apply(
        lambda x: int(bool(re.search(r"\b\d{3}[-.]?\d{3}[-.]?\d{4}\b" , x)))
    )

    features_df["body_is_very_short"] = (features_df["body_word_count"] < 10).astype(int)

    # Share of subject words that also appear in the body. Built row by row as
    # a list because DataFrame.apply(axis=1) returns a frame on empty input.
    features_df["subject_body_overlap"] = [
        len(set(subj.lower().split()) & set(text.lower().split())) /
        max(1 , len(subj.split()))
        for subj, text in zip(subject_str, body_str)
    ]

    features_df["link_text_no_url"] = (
        (features_df["url_count"] == 0) &
        (body_lower.apply(lambda x: "click" in x or "link" in x or "http" in x))
    ).astype(int)

    return features_df


In [None]:
# Keyword/structure features first, then the cyclical date encodings.
features_df = extract_features(df)

# parse_date_text returns one Series per row, so apply() yields a frame with
# exactly these columns.
date_feature_columns = [
    'weekday_sin', 'weekday_cos',
    'day_sin', 'day_cos',
    'month_sin', 'month_cos',
    'hour_sin', 'hour_cos',
    'minute_sin', 'minute_cos',
    'second_sin', 'second_cos',
    'year',
]
features_df[date_feature_columns] = features_df['date'].apply(parse_date_text)

features_df

## Phishing Detection Pipeline

In [None]:
class PhishingDetectionPipeline:
    """Inference pipeline bundling fitted preprocessing steps with the classifier.

    The pipeline holds already-fitted components (subject/body TF-IDF
    vectorizers, a scaler, an SVD reducer, a K-Means model) together with a
    PyTorch model. ``transform_features`` rebuilds the feature matrix from a
    feature DataFrame and ``predict`` runs the model on it.

    Attributes:
        subject_weight: multiplier applied to the subject TF-IDF matrix.
        numeric_features: fallback list of columns to scale when the scaler
            does not expose ``feature_names_in_``.
        device: torch device the model and input tensors are moved to.
        verbose: when True (default) the diagnostic ``DEBUG`` lines are printed.
    """

    def __init__(self):
        self.tfidf_subject = None
        self.tfidf_body = None
        self.scaler = None
        self.svd = None
        self.kmeans = None
        self.model = None
        self.subject_weight = 2
        self.numeric_features = [
            "subject_length", "subject_word_count", "body_length", "body_word_count",
            "cc_count", "bcc_count", "total_recipients", "sender_length", "sender_number_count",
            "subject_exclamation_count", "subject_question_count", "subject_dollar_count",
            "body_exclamation_count", "body_question_count", "body_dollar_count", "avg_word_length",
            "url_count", "url_avg_length", "total_urgent_keywords", "total_threat_keywords",
            "total_money_keywords", "total_generic_keywords", "total_company_keywords"
        ]
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # Diagnostic printing stays on by default so existing output is unchanged.
        self.verbose = True

    def _debug(self, message):
        """Print a diagnostic line unless ``verbose`` has been switched off."""
        # getattr keeps this safe for instances created without __init__.
        if getattr(self, "verbose", True):
            print(message)

    @staticmethod
    def _as_text(series):
        """Return ``series`` as strings, with missing values replaced by ''."""
        return series.where(series.notna(), "").astype(str)

    def fit(self, tfidf_subject, tfidf_body, scaler, svd, kmeans, model):
        """Store already-fitted components; nothing is trained here.

        The model is moved to ``self.device`` and put into eval mode.

        Returns:
            self, to allow chaining.
        """
        self.tfidf_subject = tfidf_subject
        self.tfidf_body = tfidf_body
        self.scaler = scaler
        self.svd = svd
        self.kmeans = kmeans
        self.model = model
        self.model.to(self.device)
        self.model.eval()
        return self

    def transform_features(self, df):
        """Build the feature matrix K-Means (and then the model) consumes.

        Steps: TF-IDF on ``subject`` (weighted) and ``body``, prepend
        ``link_text_no_url``, reduce with SVD, append the remaining scaled
        features, align columns to ``kmeans.feature_names_in_`` when available,
        and assign a cluster to every row.

        Args:
            df: DataFrame with at least ``subject`` and ``body`` columns plus
                the engineered feature columns. It is not modified.

        Returns:
            ``(X_combined, cluster_labels)`` -- the aligned feature DataFrame
            (indexed like ``df``) and the K-Means cluster label per row.
        """
        # Missing text would make the vectorizers fail, so treat it as empty.
        subject_text = self._as_text(df["subject"])
        body_text = self._as_text(df["body"])

        # TF-IDF for subject, up-weighted relative to the body.
        X_subject_weighted = self.tfidf_subject.transform(subject_text) * self.subject_weight
        df_subject_tfidf = pd.DataFrame(
            X_subject_weighted.toarray(),
            columns=[f"subj_{w}" for w in self.tfidf_subject.get_feature_names_out()],
            index=df.index
        )

        # TF-IDF for body
        X_body = self.tfidf_body.transform(body_text)
        df_body_tfidf = pd.DataFrame(
            X_body.toarray(),
            columns=[f"body_{w}" for w in self.tfidf_body.get_feature_names_out()],
            index=df.index
        )

        X_tfidf = pd.concat([df_subject_tfidf, df_body_tfidf], axis=1)

        # link_text_no_url travels with the TF-IDF block (it was included during training).
        if 'link_text_no_url' in df.columns:
            X_tfidf.insert(0, 'link_text_no_url', df['link_text_no_url'].values)
        else:
            X_tfidf.insert(0, 'link_text_no_url', 0)

        # Align to the columns the SVD was fitted on: missing ones become 0,
        # unexpected ones are dropped, order follows the fitted order.
        if hasattr(self.svd, 'feature_names_in_'):
            expected_features = list(self.svd.feature_names_in_)
            X_tfidf = X_tfidf.reindex(columns=expected_features, fill_value=0)

        X_tfidf_reduced = np.asarray(self.svd.transform(X_tfidf))

        # Name the SVD columns from the actual output width instead of assuming
        # 100 components; a wider SVD would otherwise produce duplicate columns.
        tfidf_cols = [f"tfidf_svd_{i}" for i in range(X_tfidf_reduced.shape[1])]

        if hasattr(self.kmeans, 'feature_names_in_'):
            kmeans_features = list(self.kmeans.feature_names_in_)

            self._debug(f"DEBUG: K-Means expects {len(kmeans_features)} total features")

            # Everything K-Means expects that is not an SVD component must come from df.
            svd_col_set = set(tfidf_cols)
            other_feature_names = [f for f in kmeans_features if f not in svd_col_set]

            self._debug(f"DEBUG: Extracting {len(other_feature_names)} 'other' features")
            self._debug(f"DEBUG: First 10 other features: {other_feature_names[:10]}")
            self._debug(f"DEBUG: Last 10 other features: {other_feature_names[-10:]}")

            missing_in_df = [f for f in other_feature_names if f not in df.columns]
            if missing_in_df:
                self._debug(f"DEBUG: Missing {len(missing_in_df)} features in df, added with zeros:")
                for feat in missing_in_df[:10]:
                    self._debug(f"  - {feat}")

            # Select the required features; absent ones are filled with zeros.
            X_other = df.reindex(columns=other_feature_names, fill_value=0)

            self._debug(f"DEBUG: X_other shape: {X_other.shape}")

            # Scale only the features the scaler knows about that are present here.
            if hasattr(self.scaler, 'feature_names_in_'):
                scalable_features = [f for f in self.scaler.feature_names_in_ if f in X_other.columns]
            else:
                scalable_features = [f for f in self.numeric_features if f in X_other.columns]

            self._debug(f"DEBUG: Scaling {len(scalable_features)} features")

            X_other_scaled = X_other.copy()
            if scalable_features:
                X_other_scaled[scalable_features] = self.scaler.transform(X_other[scalable_features])
        else:
            # Fallback: K-Means carries no feature names, so use the scaler's features only.
            if hasattr(self.scaler, 'feature_names_in_'):
                other_feature_names = list(self.scaler.feature_names_in_)
            else:
                other_feature_names = self.numeric_features

            X_other = df[other_feature_names].copy()
            X_other_scaled = X_other.copy()
            X_other_scaled[other_feature_names] = self.scaler.transform(X_other[other_feature_names])

        # Combine SVD-reduced TF-IDF with all other features
        other_cols = X_other_scaled.columns.tolist()

        X_combined = pd.DataFrame(
            np.hstack([X_tfidf_reduced, X_other_scaled.values]),
            index=df.index,
            columns=tfidf_cols + other_cols
        )

        self._debug(f"DEBUG: After combining SVD + other features: {X_combined.shape[1]} features")
        self._debug(f"  - SVD components: {len(tfidf_cols)}")
        self._debug(f"  - Other features: {len(other_cols)}")

        # Final check: match the K-Means column set and order exactly.
        if hasattr(self.kmeans, 'feature_names_in_'):
            expected_kmeans_features = list(self.kmeans.feature_names_in_)
            expected_set = set(expected_kmeans_features)

            self._debug(f"DEBUG: K-Means expects: {len(expected_kmeans_features)} features")

            missing_cols = [c for c in expected_kmeans_features if c not in X_combined.columns]
            if missing_cols:
                self._debug(f"DEBUG: Adding {len(missing_cols)} missing columns:")
                for col in missing_cols[:10]:
                    self._debug(f"  - {col}")
                if len(missing_cols) > 10:
                    self._debug(f"  ... and {len(missing_cols) - 10} more")

            extra_cols = [c for c in X_combined.columns if c not in expected_set]
            if extra_cols:
                self._debug(f"DEBUG: Removing {len(extra_cols)} extra columns:")
                for col in extra_cols[:10]:
                    self._debug(f"  - {col}")
                if len(extra_cols) > 10:
                    self._debug(f"  ... and {len(extra_cols) - 10} more")

            # Adds missing columns as zeros, drops extras, and reorders in one step.
            X_combined = X_combined.reindex(columns=expected_kmeans_features, fill_value=0)

        self._debug(f"DEBUG: Final shape before model: {X_combined.shape}")

        cluster_labels = self.kmeans.predict(X_combined)

        return X_combined, cluster_labels

    def predict(self, df, return_proba=False):
        """Score every row of ``df`` with the model.

        Args:
            df: feature DataFrame accepted by ``transform_features``.
            return_proba: when True return the raw probabilities; otherwise
                return 0/1 labels using a 0.5 cut-off.

        Returns:
            ``(probabilities_or_predictions, cluster_labels)`` as NumPy arrays.
        """
        X_combined, cluster_labels = self.transform_features(df)

        self._debug(f"DEBUG: X_combined shape after transform: {X_combined.shape}")

        # The cluster label is appended as the last model input; the original
        # author notes training used 181 features + 1 cluster column = 182.
        X_combined['cluster'] = cluster_labels

        self._debug(f"DEBUG: X_combined shape after adding cluster: {X_combined.shape}")

        # Explicit float32 conversion so mixed int/float columns do not trip the tensor constructor.
        X_tensor = torch.tensor(
            X_combined.to_numpy(dtype=np.float32), dtype=torch.float32, device=self.device
        )

        with torch.no_grad():
            outputs = self.model(X_tensor)
            # The model's forward pass already ends in a sigmoid (see
            # PhishingDetector.forward), so its outputs ARE probabilities.
            # Applying sigmoid again would squash them into ~[0.5, 0.73] and
            # make every e-mail look like phishing at the 0.5 threshold.
            probs = outputs.cpu().numpy().flatten()

        if return_proba:
            return probs, cluster_labels

        predictions = (probs >= 0.5).astype(int)
        return predictions, cluster_labels

    def save(self, filepath):
        """Pickle the fitted components and the model's state dict to ``filepath``."""
        pipeline_data = {
            'tfidf_subject': self.tfidf_subject,
            'tfidf_body': self.tfidf_body,
            'scaler': self.scaler,
            'svd': self.svd,
            'kmeans': self.kmeans,
            'model_state_dict': self.model.state_dict(),
            'model_class': self.model.__class__,
            'subject_weight': self.subject_weight,
            'numeric_features': self.numeric_features
        }

        with open(filepath, 'wb') as f:
            pickle.dump(pipeline_data, f)

        print(f"Pipeline saved to {filepath}")

    @classmethod
    def load(cls, filepath, model_instance=None):
        """Rebuild a pipeline from a file written by ``save``.

        Args:
            filepath: path to the pickle file.
            model_instance: an un-loaded model with the same architecture as
                the saved one; its weights are overwritten from the file.

        Returns:
            A ready-to-use pipeline with the model on ``device`` in eval mode.

        Raises:
            ValueError: if ``model_instance`` is not supplied.
        """
        # SECURITY: pickle.load can execute arbitrary code. Only load pipeline
        # files that you created yourself or otherwise fully trust.
        with open(filepath, 'rb') as f:
            pipeline_data = pickle.load(f)

        pipeline = cls()
        pipeline.tfidf_subject = pipeline_data['tfidf_subject']
        pipeline.tfidf_body = pipeline_data['tfidf_body']
        pipeline.scaler = pipeline_data['scaler']
        pipeline.svd = pipeline_data['svd']
        pipeline.kmeans = pipeline_data['kmeans']
        pipeline.subject_weight = pipeline_data['subject_weight']
        pipeline.numeric_features = pipeline_data['numeric_features']

        if model_instance is None:
            raise ValueError("Please provide a model_instance with the same architecture")

        pipeline.model = model_instance
        pipeline.model.load_state_dict(pipeline_data['model_state_dict'])
        pipeline.model.to(pipeline.device)
        pipeline.model.eval()

        print(f"Pipeline loaded from {filepath}")
        return pipeline


#  USAGE SCRIPT

if __name__ == "__main__":
    # Build the network with the input width used here (182 features).
    model = PhishingDetector(182)

    # Restore the saved preprocessing components and load the weights into `model`.
    pipeline = PhishingDetectionPipeline.load(
        'phishing_detection_pipeline.pkl',
        model_instance=model,
    )

    print("Making predictions...")

    # features_df must already hold subject, body and the engineered feature columns.
    predictions, clusters = pipeline.predict(features_df)

    print(f"\n‚úì Predictions completed successfully!")
    print(f"  - Predictions shape: {predictions.shape}")
    print(f"  - Cluster assignments shape: {clusters.shape}")
    print(f"\nFirst 10 predictions: {predictions[:10]}")
    print(f"First 10 clusters: {clusters[:10]}")

    # Second pass, this time asking for the raw probabilities.
    probabilities, _ = pipeline.predict(features_df, return_proba=True)
    print(f"\nFirst 10 probabilities: {probabilities[:10]}")

    # Summary statistics
    banner = "=" * 70
    total_emails = len(predictions)
    phishing_count = predictions.sum()
    legitimate_count = (predictions == 0).sum()

    print("\n" + banner)
    print("PREDICTION SUMMARY")
    print(banner)
    print(f"Total emails analyzed: {total_emails}")
    print(f"Predicted as phishing: {phishing_count} ({phishing_count/total_emails*100:.1f}%)")
    print(f"Predicted as legitimate: {legitimate_count} ({legitimate_count/total_emails*100:.1f}%)")
    print("\nCluster distribution:")

    unique, counts = np.unique(clusters, return_counts=True)
    total_clustered = len(clusters)
    for cluster_id, count in zip(unique, counts):
        print(f"  Cluster {cluster_id}: {count} emails ({count/total_clustered*100:.1f}%)")
    print(banner)

## Call The Model

In [None]:
# Back-fill with zeros any numeric feature the pipeline lists but the frame lacks.
absent_numeric_features = [
    name for name in pipeline.numeric_features if name not in features_df.columns
]
for name in absent_numeric_features:
    features_df[name] = 0

# Probabilities (not hard labels) are needed for the risk thresholds applied later.
ml_probs, clusters = pipeline.predict(features_df, return_proba=True)


## Run Detection

In [None]:
# Probability cut-offs: at or above HIGH -> high-risk alert, at or above
# MEDIUM -> suspicious alert, below MEDIUM -> no alert is sent.
HIGH_THRESHOLD = 0.85
MEDIUM_THRESHOLD = 0.6

# ml_probs is a positional array, one entry per row of features_df.
if len(ml_probs) != len(features_df):
    raise ValueError(
        f"Got {len(ml_probs)} probabilities for {len(features_df)} emails"
    )

# Pair rows with probabilities by position. iterrows() yields index *labels*,
# which only coincide with array positions for a default 0..n-1 index.
for (_, row), ml_prob in zip(features_df.iterrows(), ml_probs):

    if ml_prob < MEDIUM_THRESHOLD:
        continue

    if ml_prob >= HIGH_THRESHOLD:
        subject = "üö® HIGH RISK PHISHING EMAIL"
        risk_level = "HIGH RISK"
    else:
        subject = "‚ö†Ô∏è SUSPICIOUS EMAIL DETECTED"
        risk_level = "MEDIUM RISK"

    body = f"""
From: {row['from']}
Subject: {row['subject']}

Phishing Probability: {ml_prob:.3f}
Risk Level: {risk_level}
Model Decision: POSSIBLE_PHISHING
"""

    send_alert_email(subject, body)
