In [8]:
import os
import re
import html
import string
import unicodedata
from email import policy
from email.parser import BytesParser
from typing import Tuple, List, Dict, Any, Optional

import pandas as pd
import numpy as np


In [9]:
SAFE_PUNCTS = set("!?.:,;")
WORD_RE = re.compile(r"[A-Za-z']+")
URL_RE = re.compile(r"(https?://\S+|www\.\S+)", re.IGNORECASE)
EMAIL_RE = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")
MONEY_RE = re.compile(r"(\$|€|£)\s?\d[\d,\.]*")
NUM_RE = re.compile(r"\b\d+(?:[\.,]\d+)?\b")
HTML_TAG_RE = re.compile(r"<[^>]+>")
WHITESPACE_RE = re.compile(r"\s+")

REPLY_PREFIX_RE = re.compile(r"^\s*(re|fw|fwd)\s*[:\-]\s*", re.IGNORECASE)

STOPWORDS = {
    "the","and","is","in","to","of","a","for","on","with","that","this","it","as","at","an","be","by","from","or",
    "are","was","were","will","would","can","could","should","has","have","had","not","no","but","if","then","so",
    "do","does","did","we","you","he","she","they","them","his","her","their","our","us","i","me","my","your","yours"
}

SPAM_WORDS = {
    "free","click","offer","winner","viagra","earn","credit","urgent","limited","deal","bonus","buy","cheap",
    "win","guarantee","money","investment","casino","loan","mortgage","million","billion"
}

In [10]:
def safe_lower(s):
    return s.lower() if isinstance(s, str) else ""

In [11]:
# ç yi c
def strip_accents(s):
    if not isinstance(s, str):
        return ''
    return ''.join(
        ch for ch in unicodedata.normalize('NFKD', s)
        if not unicodedata.combining(ch)
    )

In [12]:
# delete spaces
def normalize_spaces(s):
    return WHITESPACE_RE.sub(" ", s).strip()

def html_to_text(s):
    unescaped = html.unescape(s)
    no_tags = HTML_TAG_RE.sub(" ", unescaped)
    return normalize_spaces(no_tags)


In [13]:
def replace_patterns(s: str) -> Tuple[str, Dict[str, int]]:
    counts = {
        "url_count": 0,
        "email_count": 0,
        "money_count": 0,
        "num_count": 0
    }
    def _count_and_replace(pattern: re.Pattern, repl: str, text: str, key: str) -> str:
        matches = pattern.findall(text)
        counts[key] += len(matches)
        return pattern.sub(repl, text)

    out = s
    out = _count_and_replace(URL_RE, " URLTOK ", out, "url_count")
    out = _count_and_replace(EMAIL_RE, " EMAILTOK ", out, "email_count")
    out = _count_and_replace(MONEY_RE, " MONEYTOK ", out, "money_count")
    out = _count_and_replace(NUM_RE, " NUMTOK ", out, "num_count")
    return normalize_spaces(out), counts



In [14]:
def remove_punct_keep_basic(s):
    if not isinstance(s, str):
        return ""
    keep = "".join(SAFE_PUNCTS)
    table = str.maketrans("", "", "".join(ch for ch in string.punctuation if ch not in keep))
    return s.translate(table)


In [15]:
def calc_upper_ratio(s):
    if not s:
        return 0.0
    upp = sum(1 for ch in s if ch.isalpha() and ch.isupper())
    letters = sum(1 for ch in s if ch.isalpha())
    return upp / letters if letters else 0.0

In [16]:
def non_ascii_ratio(s):
    if not s:
        return 0.0
    total = len(s)
    non_ascii = sum(1 for ch in s if ord(ch) > 127)
    return non_ascii / total if total else 0.0


In [17]:
def avg_word_len(words: List[str]) -> float:
    if not words:
        return 0.0
    return float(np.mean([len(w) for w in words if len(w) > 0])) if any(len(w)>0 for w in words) else 0.0

def lexical_diversity(words: List[str]) -> float:
    if not words:
        return 0.0
    uniq = len(set(words))
    return uniq / len(words) if len(words) else 0.0

def stopword_ratio(words: List[str]) -> float:
    if not words:
        return 0.0
    sw = sum(1 for w in words if w in STOPWORDS)
    return sw / len(words)

def spam_word_count(words: List[str]) -> int:
    return sum(1 for w in words if w in SPAM_WORDS)


In [18]:
def parse_email_file(fp: str) -> Tuple[str, str, Dict[str, Any]]:
    with open(fp, "rb") as f:
        msg = BytesParser(policy=policy.default).parse(f)

    subject = msg["subject"] or ""
    meta = {
        "is_multipart": bool(msg.is_multipart()),
        "num_parts": 1,
        "has_html_part": False
    }

    body = ""
    if msg.is_multipart():
        parts = list(msg.walk())
        meta["num_parts"] = len([p for p in parts if p.get_content_maintype() != "multipart"])
        for part in parts:
            ctype = part.get_content_type()
            if ctype == "text/plain":
                try:
                    body += part.get_content()
                except Exception:
                    try:
                        body += part.get_payload(decode=True).decode(errors="ignore")
                    except Exception:
                        pass
            elif ctype == "text/html":
                meta["has_html_part"] = True
                try:
                    body_html = part.get_content()
                except Exception:
                    try:
                        body_html = part.get_payload(decode=True).decode(errors="ignore")
                    except Exception:
                        body_html = ""
                body += " " + html_to_text(body_html)
    else:
        ctype = msg.get_content_type()
        if ctype == "text/plain":
            try:
                body = msg.get_content()
            except Exception:
                body = msg.get_payload(decode=True).decode(errors="ignore")
        elif ctype == "text/html":
            meta["has_html_part"] = True
            try:
                body_html = msg.get_content()
            except Exception:
                body_html = msg.get_payload(decode=True).decode(errors="ignore")
            body = html_to_text(body_html)
        else:
            # Fallback
            try:
                body = msg.get_content()
            except Exception:
                try:
                    body = msg.get_payload(decode=True).decode(errors="ignore")
                except Exception:
                    body = ""

    return subject or "", body or "", meta


In [19]:
def build_clean_text(subject: str, body: str) -> Tuple[str, Dict[str, Any]]:
    subj = subject if isinstance(subject, str) else ""
    subj_unescaped = html.unescape(subj)
    subj_no_tags = HTML_TAG_RE.sub(" ", subj_unescaped)
    subj_norm = normalize_spaces(subj_no_tags)

    body_raw = body if isinstance(body, str) else ""
    
    body_unescaped = html.unescape(body_raw)
    body_no_tags = HTML_TAG_RE.sub(" ", body_unescaped)
    body_norm = normalize_spaces(body_no_tags)
    merged = normalize_spaces(f"{subj_norm} . {body_norm}")
    replaced, counts = replace_patterns(merged)
    no_punct = remove_punct_keep_basic(replaced)
    lowered = safe_lower(no_punct)
    deaccent = strip_accents(lowered)
    cleaned = normalize_spaces(deaccent)
    return cleaned, counts


In [20]:
def tokenize_words(text: str) -> List[str]:
    if not text:
        return []
    return [m.group(0).lower() for m in WORD_RE.finditer(text)]

def subject_features(subject: str) -> Dict[str, Any]:
    s = subject or ""
    feats = {
        "subject_len": len(s),
        "subject_upper_ratio": calc_upper_ratio(s),
        "subject_exclaim": s.count("!"),
        "subject_has_reply_prefix": 1 if REPLY_PREFIX_RE.search(s or "") else 0,
    }
    return feats


In [21]:
def structural_features(text: str, meta: Dict[str, Any]) -> Dict[str, Any]:
    s = text or ""
    lines = s.splitlines()
    num_lines = len(lines)
    mean_line_len = float(np.mean([len(li) for li in lines])) if num_lines else 0.0
    exclam_count = s.count("!")
    punct_count = sum(1 for ch in s if ch in string.punctuation)
    feats = {
        "is_multipart": int(bool(meta.get("is_multipart", False))),
        "num_parts": int(meta.get("num_parts", 1)),
        "has_html_part": int(bool(meta.get("has_html_part", False))),
        "num_lines": num_lines,
        "mean_line_len": mean_line_len,
        "exclam_count": exclam_count,
        "punct_count": punct_count,
        "upper_ratio": calc_upper_ratio(s),
        "non_ascii_ratio": non_ascii_ratio(s),
        "char_count": len(s),
    }
    return feats


In [22]:
def word_based_features(clean_text: str) -> Dict[str, Any]:
    words = tokenize_words(clean_text)
    feats = {
        "word_count": len(words),
        "avg_word_len": avg_word_len(words),
        "lexical_diversity": lexical_diversity(words),
        "stopword_ratio": stopword_ratio(words),
        "spam_word_count": spam_word_count(words),
    }
    return feats


In [23]:
def header_domain_features(from_header: Optional[str]) -> Dict[str, Any]:
    dom = ""
    tld = ""
    if from_header:
        m = EMAIL_RE.search(from_header)
        if m:
            addr = m.group(0)
            try:
                dom = addr.split("@", 1)[1].lower()
            except Exception:
                dom = ""
    if dom:
        parts = dom.split(".")
        if len(parts) >= 2:
            tld = parts[-1]
    return {
        "from_domain": dom,
        "from_tld": tld
    }


In [24]:
def iter_dataset_files(data_dir: str) -> List[Tuple[str, str]]:
    label_map = {
        "easy_ham": "ham",
        "easy_ham_2": "ham",
        "hard_ham": "ham",
        "spam": "spam",
        "spam_2": "spam",
    }
    pairs: List[Tuple[str, str]] = []
    for sub in sorted(os.listdir(data_dir)):
        sub_path = os.path.join(data_dir, sub)
        if not os.path.isdir(sub_path):
            continue
        if sub not in label_map:
            continue
        lab = label_map[sub]
        for root, _, files in os.walk(sub_path):
            for fn in files:
                fp = os.path.join(root, fn)
                if os.path.isfile(fp) and not fn.startswith("."):
                    pairs.append((lab, fp))
    return pairs

In [25]:
def process_row(label: str, file_path: str) -> Dict[str, Any]:
    subj, body, meta = parse_email_file(file_path)
    clean_text, counts = build_clean_text(subj, body)

    subject_feats = subject_features(subj)
    structural_feats = structural_features(f"{subj} . {body}", meta)
    word_feats = word_based_features(clean_text)
    from_header = ""
    try:
        with open(file_path, "rb") as f:
            msg = BytesParser(policy=policy.default).parse(f)
        from_header = msg["from"] or ""
    except Exception:
        from_header = ""
    header_feats = header_domain_features(from_header)

    out = {
        "path": file_path,
        "subject": subj,
        "body": body,
        "text_all": f"{subj} . {body}",
        "clean_text": clean_text,
        "label": label,
        "label_encoded": 1 if label == "spam" else 0,
    }
    out.update(counts)
    out.update(subject_feats)
    out.update(structural_feats)
    out.update(word_feats)
    out.update(header_feats)

    return out

In [26]:
def run_processing(
    data_dir: str = "data",
    output_file: str = "emails_prepared.csv",
) -> pd.DataFrame:
    pairs = iter_dataset_files(data_dir)
    records: List[Dict[str, Any]] = []

    for lab, fp in pairs:
        try:
            rec = process_row(lab, fp)
            records.append(rec)
        except Exception as e:
            print(f"[WARN] Parse/Process error: {fp} -> {e}")

    df = pd.DataFrame.from_records(records)

    for col in ["subject", "body", "text_all", "clean_text", "from_domain", "from_tld"]:
        if col in df.columns:
            df[col] = df[col].fillna("").astype(str)

    numeric_cols = [c for c in df.columns if c not in ["path","subject","body","text_all","clean_text","label","from_domain","from_tld"]]
    for c in numeric_cols:
        df[c] = pd.to_numeric(df[c], errors="coerce").fillna(0)

    save_path = os.path.join(os.getcwd(), output_file)
    df.to_csv(save_path, index=False)
    print(f"[OK] saved: {save_path} (shape={df.shape})")

    return df


In [27]:
def process_incoming_email(subject: str, body: str) -> pd.DataFrame:
    meta = {"is_multipart": False, "num_parts": 1, "has_html_part": ("<html" in (body or "").lower())}
    clean_text, counts = build_clean_text(subject, body)
    subject_feats = subject_features(subject)
    structural_feats = structural_features(f"{subject} . {body}", meta)
    word_feats = word_based_features(clean_text)
    header_feats = {"from_domain": "", "from_tld": ""}

    out = {
        "path": "",
        "subject": subject or "",
        "body": body or "",
        "text_all": f"{subject or ''} . {body or ''}",
        "clean_text": clean_text,
        "label": "",
        "label_encoded": 0,
    }
    out.update(counts)
    out.update(subject_feats)
    out.update(structural_feats)
    out.update(word_feats)
    out.update(header_feats)

    return pd.DataFrame([out])


In [28]:
def process_incoming_rfc822(raw_bytes: bytes) -> pd.DataFrame:
    with open("_tmp_incoming.eml", "wb") as f:
        f.write(raw_bytes)
    try:
        subj, body, meta = parse_email_file("_tmp_incoming.eml")
        df = process_incoming_email(subj, body)
        df["is_multipart"] = int(bool(meta.get("is_multipart", False)))
        df["num_parts"] = int(meta.get("num_parts", 1))
        df["has_html_part"] = int(bool(meta.get("has_html_part", False)))
        return df
    finally:
        try:
            os.remove("_tmp_incoming.eml")
        except Exception:
            pass


In [29]:
run_processing(data_dir="data", output_file="emails_prepared.csv")

[OK] Kaydedildi: /home/kahir/Desktop/E-MAIL-REAL/emails_prepared.csv (shape=(9353, 32))


Unnamed: 0,path,subject,body,text_all,clean_text,label,label_encoded,url_count,email_count,money_count,...,upper_ratio,non_ascii_ratio,char_count,word_count,avg_word_len,lexical_diversity,stopword_ratio,spam_word_count,from_domain,from_tld
0,data/easy_ham/00001.7c53336b37003a9286aba55d29...,Re: New Sequences Window,"Date: Wed, 21 Aug 2002 10:54:46 -05...",Re: New Sequences Window . Date: We...,"re: new sequences window . date: wed, numtok a...",ham,0,1,1,0,...,0.047527,0.000000,1631,242,4.805785,0.516529,0.214876,0,munnari.oz.au,au
1,data/easy_ham/00002.9c4069e25e1ef370c078db7ee8...,[zzzzteana] RE: Alexander,"Martin A posted:\nTassos Papadopoulos, the Gre...",[zzzzteana] RE: Alexander . Martin A posted:\n...,zzzzteana re: alexander . martin a posted: tas...,ham,0,2,1,0,...,0.077441,0.000000,927,110,4.927273,0.745455,0.327273,1,cursor-system.com,com
2,data/easy_ham/00003.860e3c3cee1b42ead714c5c874...,[zzzzteana] Moscow bomber,Man Threatens Explosion In Moscow \n\nThursday...,[zzzzteana] Moscow bomber . Man Threatens Expl...,zzzzteana moscow bomber . man threatens explos...,ham,0,2,1,0,...,0.073933,0.000000,1779,252,5.023810,0.595238,0.309524,1,2ubh.com,com
3,data/easy_ham/00004.864220c5b6930b209cc287c361...,[IRR] Klez: The Virus That Won't Die,Klez: The Virus That Won't Die\n \nAlready the...,[IRR] Klez: The Virus That Won't Die . Klez: ...,irr klez: the virus that wont die . klez: the ...,ham,0,2,1,0,...,0.059929,0.000000,1167,169,4.887574,0.644970,0.295858,0,roscom.com,com
4,data/easy_ham/00005.bf27cdeaf0b8c4647ecd61b1d0...,Re: [zzzzteana] Nothing like mama used to make,"> in adding cream to spaghetti carbonara, whi...",Re: [zzzzteana] Nothing like mama used to make...,re: zzzzteana nothing like mama used to make ....,ham,0,3,1,0,...,0.060893,0.000000,1101,145,4.531034,0.703448,0.344828,1,ee.ed.ac.uk,uk
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9348,data/spam_2/01397.f75f0dd0dd923faefa3e9cc5ecb8...,Preferred Non-Smoker Rates for Smokers,\t Preferred Non-Smoker\n \t\n Just what the ...,Preferred Non-Smoker Rates for Smokers . \t P...,preferred nonsmoker rates for smokers . prefer...,spam,1,2,0,8,...,0.103601,0.000000,2246,317,5.309148,0.321767,0.198738,1,insiq.us,us
9349,data/spam_2/01400.b444b69845db2fa0a4693ca04e6a...,[ILUG] WILSON KAMELA,ATTN:SIR/MADAN \n\n ...,[ILUG] WILSON KAMELA . ATTN:SIR/MADAN \n...,ilug wilson kamela . attn:sirmadan strictly co...,spam,1,1,3,1,...,0.055612,0.000000,2575,394,4.928934,0.510152,0.403553,11,netscape.net,net
9350,data/spam_2/01398.8ca7045aae4184d56e8509dc5ad6...,"How to get 10,000 FREE hits per day to any web...","Dear Subscriber,\n\nIf I could show you a way ...","How to get 10,000 FREE hits per day to any web...",how to get numtok free hits per day to any web...,spam,1,7,1,0,...,0.026490,0.000000,2526,349,4.550143,0.530086,0.409742,10,yahoo.lv,lv
9351,data/spam_2/01399.2319643317e2c5193d574e40a718...,Cannabis Difference,****Mid-Summer Customer Appreciation SALE!****...,Cannabis Difference . ****Mid-Summer Customer ...,cannabis difference . midsummer customer appre...,spam,1,0,1,80,...,0.077989,0.000252,23803,3417,5.578578,0.282411,0.240269,22,dialix.oz.au,au
