In [1]:
"""
Deep Learning Training Pipeline for Phishing Email Sentinel (PES)

- Trains MiniLM using supervised classification (SoftmaxLoss)
- Uses balanced dataset prepared in ml/data/processed/
- CPU-only, free, deployable
"""

import json
import logging
from pathlib import Path
from typing import List

import torch
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split

# Emit INFO-level progress messages; `logger` is the module-level logger used
# by the training pipeline below.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# =========================
# Configuration
# =========================
# Base model identifier handed to SentenceTransformer(...) in the pipeline.
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
# Directory that must contain train.jsonl and val.jsonl. The path is relative,
# so it is resolved against the current working directory at run time.
DATA_DIR = Path("ml/data/processed")
# Directory the fine-tuned model is saved to (created if missing).
MODEL_OUT_DIR = Path("ml/models/phishing-minilm")

BATCH_SIZE = 16          # CPU-safe; batch size of the training DataLoader
EPOCHS = 3               # Enough for mini project; passed to model.fit(epochs=...)
LEARNING_RATE = 2e-5     # Passed to model.fit via optimizer_params={"lr": ...}
MAX_SAMPLES = None       # Set to int for quick testing: keeps only the first N train/val examples


# =========================
# Dataset Loader
# =========================
def load_jsonl(path: Path) -> List[InputExample]:
    examples = []

    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            record = json.loads(line)
            text = record["text"]
            label = int(record["label"])

            examples.append(
                InputExample(
                    texts=[text],
                    label=label
                )
            )

    return examples


# =========================
# Training Pipeline
# =========================
class _SingleTextSoftmaxLoss(torch.nn.Module):
    """Softmax (cross-entropy) classification loss over ONE sentence embedding.

    ``losses.SoftmaxLoss`` is designed for sentence pairs: it combines the
    embeddings of two texts before classifying. The examples in this pipeline
    carry a single text each, so this module puts a linear classification
    head directly on the single sentence embedding instead.
    """

    def __init__(self, model, sentence_embedding_dimension: int, num_labels: int):
        super().__init__()
        # Registering the encoder as a sub-module makes its parameters part
        # of this module's parameters, so the optimizer fine-tunes it too.
        self.model = model
        self.classifier = torch.nn.Linear(sentence_embedding_dimension, num_labels)
        self.loss_fct = torch.nn.CrossEntropyLoss()

    def forward(self, sentence_features, labels):
        """Return the cross-entropy loss for one batch of single-text inputs."""
        embeddings = [
            self.model(features)["sentence_embedding"]
            for features in sentence_features
        ]
        if len(embeddings) != 1:
            raise ValueError(
                f"Expected exactly 1 text per example, got {len(embeddings)}"
            )
        logits = self.classifier(embeddings[0])
        return self.loss_fct(logits, labels.view(-1))


def train_phishing_model():
    """Fine-tune the base model as a binary classifier and save it.

    Steps: load ``train.jsonl`` / ``val.jsonl`` from ``DATA_DIR``, optionally
    truncate both to ``MAX_SAMPLES``, fine-tune ``MODEL_NAME`` on CPU with a
    softmax classification loss for ``EPOCHS`` epochs, then save the model to
    ``MODEL_OUT_DIR``.

    Note: only the sentence encoder is written by ``model.save``; the linear
    classification head lives in the loss object and is not persisted here.

    Raises:
        FileNotFoundError: If ``train.jsonl`` or ``val.jsonl`` is missing.
        ValueError: If the training set is empty.
    """
    logger.info("Starting DL training pipeline")

    # Load datasets
    train_path = DATA_DIR / "train.jsonl"
    val_path = DATA_DIR / "val.jsonl"

    # DATA_DIR is relative, so it depends on the current working directory.
    # Fail early with the resolved path instead of a bare errno from open().
    for required in (train_path, val_path):
        if not required.is_file():
            raise FileNotFoundError(
                f"Dataset file not found: {required.resolve()} "
                f"(current working directory: {Path.cwd()}). "
                "Run from the project root after preparing the dataset."
            )

    train_examples = load_jsonl(train_path)
    val_examples = load_jsonl(val_path)

    if MAX_SAMPLES:
        train_examples = train_examples[:MAX_SAMPLES]
        val_examples = val_examples[:MAX_SAMPLES]

    logger.info("Training samples: %d", len(train_examples))
    logger.info("Validation samples: %d", len(val_examples))

    if not train_examples:
        raise ValueError(f"No training examples loaded from {train_path}")

    # Load base model (CPU only)
    model = SentenceTransformer(MODEL_NAME, device="cpu")

    # Only a training DataLoader is needed: no evaluator is passed to fit(),
    # so the validation examples are loaded just for the count logged above.
    train_dataloader = DataLoader(
        train_examples,
        shuffle=True,
        batch_size=BATCH_SIZE
    )

    # Classification loss over single-text examples (two labels)
    train_loss = _SingleTextSoftmaxLoss(
        model=model,
        sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
        num_labels=2
    )

    logger.info("Beginning fine-tuning (SoftmaxLoss)")

    model.fit(
        train_objectives=[(train_dataloader, train_loss)],
        evaluator=None,  # evaluation handled separately
        epochs=EPOCHS,
        optimizer_params={"lr": LEARNING_RATE},
        show_progress_bar=True
    )

    # Save fine-tuned model
    MODEL_OUT_DIR.mkdir(parents=True, exist_ok=True)
    model.save(str(MODEL_OUT_DIR))

    logger.info("Model saved to %s", MODEL_OUT_DIR)
    logger.info("DL training completed successfully")


# =========================
# Entry Point
# =========================
# Run the full training pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    train_phishing_model()




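# Recorded output of the run above:
# FileNotFoundError: [Errno 2] No such file or directory: 'ml/data/processed/train.jsonl'
# (DATA_DIR is a relative path, so it is resolved against the current working
# directory; run from the project root after the processed dataset exists.)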