# Phishing Email Dataset – Cleaning & Split
This notebook cleans the raw CSV, performs a **stratified 70/15/15 train/val/test split**, and optionally builds a token **vocabulary** from the training split for the CNN‑BiLSTM model.

It produces JSONL files ready for:
- **DeBERTa‑v3‑small fine‑tuning** (uses the cleaned text directly)
- **CNN‑BiLSTM** (uses same text + vocab)

⚠️ **Update `DATA_PATH` below if your CSV lives elsewhere.**

In [None]:
import os, re, json, html, unicodedata, collections
from pathlib import Path
import pandas as pd
from sklearn.model_selection import train_test_split
from bs4 import BeautifulSoup
from tqdm.auto import tqdm

In [None]:
# 🔧 Notebook-wide settings: reproducibility knobs first, then file locations.
RANDOM_SEED = 42    # passed as random_state to every train_test_split call
VOCAB_SIZE = 20000  # total vocabulary entries, including the <pad>/<unk> slots

DATA_PATH = Path('data/raw/phishing_emails.csv')  # <-- change if needed
OUTPUT_DIR = Path('data/processed')
# Create the output folder (and any missing parents) up front; no-op if present.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

In [None]:
# Read the raw dataset and show its size plus the first rows for a sanity check.
# The later cells read a `text` column and a `label` column (expected encoding:
# 0=legit, 1=phish); adjust those cells if your CSV uses different names.
df = pd.read_csv(DATA_PATH)
print(f'Loaded {df.shape[0]:,} rows')
print(df.head(5))

In [None]:
def clean_email(text: str) -> str:
    """Return a lowercase, ASCII-only, single-spaced plain-text email body.

    Steps, in order: strip HTML markup, decode any remaining HTML entities,
    decompose Unicode (NFKD) and drop every non-ASCII character, lowercase,
    then collapse whitespace runs into single spaces.

    Args:
        text: Raw email body; a missing value (NaN/None) is accepted.

    Returns:
        The cleaned text, or '' when `text` is missing.
    """
    if pd.isna(text):
        return ''

    # Keep only the visible text; fragments are joined with a single space.
    plain = BeautifulSoup(text, 'html.parser').get_text(' ', strip=True)
    # Decode any entities still present in the extracted text.
    plain = html.unescape(plain)

    # NFKD splits accented letters into base letter + combining mark; the
    # ASCII round-trip then discards everything that is not plain ASCII.
    decomposed = unicodedata.normalize('NFKD', plain)
    ascii_only = decomposed.encode('ascii', 'ignore').decode('ascii')

    lowered = ascii_only.lower()
    return re.sub(r'\s+', ' ', lowered).strip()

In [None]:
# Clean every email body. Missing values are replaced with '' *before* the
# string cast: `astype(str)` on its own turns NaN into the literal text 'nan',
# which would then be kept (and later tokenised) as if it were a real email.
df['text_clean'] = df['text'].fillna('').astype(str).apply(clean_email)
# Standardise the label column name and keep only the columns later cells use.
df = df.rename(columns={'label': 'target'})
df = df[['text_clean', 'target']]
df.head()

In [None]:
# Two-stage stratified split: 70% train, then the remaining 30% is halved into
# validation and test (15% each). Both stages stratify on `target` and share
# the same seed so the split is reproducible.
train_df, tmp_df = train_test_split(
    df,
    test_size=0.30,
    random_state=RANDOM_SEED,
    stratify=df['target'],
)
val_df, test_df = train_test_split(
    tmp_df,
    test_size=0.50,
    random_state=RANDOM_SEED,
    stratify=tmp_df['target'],
)
# Report how many rows ended up in each split.
print(dict(train=len(train_df), val=len(val_df), test=len(test_df)))

In [None]:
# Write each split to `<OUTPUT_DIR>/<split>.jsonl`, one JSON record per line,
# with the cleaned text exposed under the key `text`.
for split, d in [('train', train_df), ('val', val_df), ('test', test_df)]:
    out_path = OUTPUT_DIR / f'{split}.jsonl'
    # The method chain is wrapped in parentheses so it can span two lines;
    # without them the continuation line starting with `.to_json` is a
    # syntax error and the cell cannot run.
    (
        d.rename(columns={'text_clean': 'text'})[['text', 'target']]
        .to_json(out_path, orient='records', lines=True, force_ascii=False)
    )
    print(f'Saved {out_path} ({len(d)})')

In [None]:
# A token starts with a word character and may continue with word characters,
# apostrophes or hyphens; it must begin and end on a word boundary.
_TOKEN_PATTERN = re.compile(r"\b\w[\w'-]*\b")


def basic_tokenize(text):
    """Split `text` into word tokens, keeping inner apostrophes and hyphens.

    Returns a list of the matched substrings in order of appearance; an empty
    string yields an empty list.
    """
    return _TOKEN_PATTERN.findall(text)

# Count token frequencies over the training split only.
counter = collections.Counter()
for doc in tqdm(train_df['text_clean'], desc='Building vocab'):
    counter.update(basic_tokenize(doc))

# Two of the VOCAB_SIZE slots are reserved for the special <pad>/<unk> entries,
# so only the (VOCAB_SIZE - 2) most frequent tokens are kept.
n_regular = VOCAB_SIZE - 2
most_common = [token for token, _count in counter.most_common(n_regular)]

# One entry per line, special tokens first, no trailing newline.
vocab_path = OUTPUT_DIR / 'vocab.txt'
vocab_lines = ['<pad>', '<unk>'] + most_common
vocab_path.write_text('\n'.join(vocab_lines), encoding='utf8')
print(f'Vocab saved ➜ {vocab_path} (size={len(most_common)+2})')

In [None]:
# Show the share of each label per split (rounded to 3 decimals) so the
# stratification can be checked by eye.
print('\nClass distribution:')
for name, d in (('train', train_df), ('val', val_df), ('test', test_df)):
    proportions = d['target'].value_counts(normalize=True)
    print(name, proportions.round(3).to_dict())