# Phishing Email Dataset – Preprocessing Notebook
This notebook cleans the raw CSV of phishing / legitimate e‑mails and writes stratified **train**, **val**, and **test** JSONL splits (70 % / 15 % / 15 %), ready for deep‑learning models.

➡️ **Update the `DATA_PATH` below if your CSV has a different filename or location.**

In [None]:
import os, re, json, unicodedata, html
from pathlib import Path
import pandas as pd
from sklearn.model_selection import train_test_split
from bs4 import BeautifulSoup

In [None]:
# ---- Configuration ----
# Seed shared by every random operation so the splits are reproducible.
RANDOM_SEED = 42

# Input CSV and destination folder for the generated files.
DATA_PATH = Path('data/raw/phishing_emails.csv')  # 👉 change if needed
OUTPUT_DIR = Path('data/processed')

# Create the output folder (and any missing parents); a no-op if it already exists.
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [None]:
# Load the raw CSV into a DataFrame and report how many rows were read.
df = pd.read_csv(DATA_PATH)
print(f'Loaded {len(df)} rows from {DATA_PATH}')
# Last expression of the cell: the notebook renders the first rows as a preview.
df.head()

In [None]:
def clean_email(text: str) -> str:
    """Return a plain, ASCII-only, lowercase version of an e-mail body.

    Processing steps, in order:

    1. Strip HTML markup, joining text fragments with a single space.
    2. Decode HTML entities that are still present after parsing.
    3. Decompose characters with NFKD and drop everything that is not ASCII
       (accents are removed; characters without an ASCII form disappear).
    4. Lowercase.
    5. Collapse every run of whitespace into one space and trim the ends.

    Args:
        text: Raw e-mail text. Missing values (``None``, ``NaN``, ``pd.NA``)
            produce ``''``. Any other non-string scalar, such as a numeric
            cell, is converted with ``str()`` before cleaning instead of
            failing inside the HTML parser.

    Returns:
        The cleaned text, or ``''`` when the input is missing.
    """
    if not isinstance(text, (str, bytes)):
        # Only non-text values can be "missing"; str/bytes go straight through.
        if pd.isna(text):
            return ''
        # e.g. an int/float cell produced by mixed dtype inference in read_csv.
        text = str(text)
    text = BeautifulSoup(text, 'html.parser').get_text(' ', strip=True)
    text = html.unescape(text)
    # NFKD splits accented letters into base letter + combining mark; the
    # ASCII round-trip then discards the marks and any other non-ASCII char.
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('ascii')
    text = text.lower()
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [None]:
# 🔄 Edit the two source column names below if your CSV schema differs
# 'text' ➜ original column; its cleaned version is stored in 'clean_text'
df['clean_text'] = df['text'].map(clean_email)
# 'label' ➜ 0/1 class column, exposed as 'target' from here on
df = df.rename(columns={'label': 'target'})
# Preview the two columns that the later cells use.
df[['clean_text', 'target']].head()

In [None]:
# Stratified 70 / 15 / 15 split: hold out 30 % first, then cut that hold-out
# in half to obtain the validation and test sets. Both steps stratify on
# 'target' and reuse RANDOM_SEED.
train_df, temp_df = train_test_split(
    df, test_size=0.30, stratify=df['target'], random_state=RANDOM_SEED)
val_df, test_df = train_test_split(
    temp_df, test_size=0.50, stratify=temp_df['target'], random_state=RANDOM_SEED)

# Write each split as JSON Lines with the fields 'text' (cleaned body) and 'target'.
for split_name, split_df in [('train', train_df), ('val', val_df), ('test', test_df)]:
    out_path = OUTPUT_DIR / f'{split_name}.jsonl'
    # The chain is wrapped in parentheses so it can span several lines;
    # without them the indented continuation lines are a syntax error.
    (
        split_df[['clean_text', 'target']]
        .rename(columns={'clean_text': 'text'})
        .to_json(out_path, orient='records', lines=True, force_ascii=False)
    )
    print(f'Saved {len(split_df)} records ➜ {out_path}')

In [None]:
# Sanity check: print the per-class share of 'target' in each split,
# rounded to three decimals.
print('\nClass distribution by split:')
for label, frame in (('train', train_df), ('val', val_df), ('test', test_df)):
    shares = frame['target'].value_counts(normalize=True).round(3)
    print(label.ljust(5), shares.to_dict())