In [None]:
!pip install textstat

Collecting textstat
  Downloading textstat-0.7.12-py3-none-any.whl.metadata (15 kB)
Collecting pyphen (from textstat)
  Downloading pyphen-0.17.2-py3-none-any.whl.metadata (3.2 kB)
Downloading textstat-0.7.12-py3-none-any.whl (176 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m176.6/176.6 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyphen-0.17.2-py3-none-any.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m49.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyphen, textstat
Successfully installed pyphen-0.17.2 textstat-0.7.12


In [None]:
import os
import re
import hashlib
import warnings
from typing import List, Dict, Tuple, Optional, Protocol
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from collections import Counter

import numpy as np
import pandas as pd
from tqdm import tqdm
from scipy import stats, sparse

# ML Stack
from sklearn.model_selection import GroupKFold, train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    classification_report, accuracy_score, f1_score,
    confusion_matrix, precision_recall_fscore_support
)
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.utils.class_weight import compute_class_weight
from xgboost import XGBClassifier

# NLP
import spacy
import textstat
from sentence_transformers import SentenceTransformer

import joblib
import json

# Suppress every Python warning for the rest of the session (library
# deprecation noise included).
warnings.filterwarnings("ignore")

# =============================================================================
# CONFIGURATION
# =============================================================================
# Root folder for everything this notebook writes, with sub-folders for models
# and outputs.
# NOTE(review): "/content" looks like a Colab-specific path — confirm before
# running in another environment.
BASE_DIR = "/content/paper_artifacts_unified_stacking"
MODELS_DIR = f"{BASE_DIR}/models"
OUT_DIR = f"{BASE_DIR}/outputs"
# NOTE(review): EnsembleTrainer and BalancedDatasetBuilder default to the
# literals 42 / 5 rather than reading these two constants — presumably callers
# pass them in explicitly; verify.
RANDOM_SEED = 42
NUM_FOLDS = 5
# TF-IDF vocabulary cap and n-gram range passed to TfidfVectorizer in the
# cross-validation loop.
MAX_FEATURES = 15000
NGRAM_RANGE = (1, 2)
# Model name handed to SentenceTransformer when the trainer is constructed.
SBERT_MODEL_NAME = "all-mpnet-base-v2"
# Class display names; the list index is the integer label used in the dataset
# (0 = AI Phishing, 1 = Enron Ham, 2 = Manual Spam, 3 = Nigerian Scam).
TARGET_NAMES = ["AI Phishing", "Enron Ham", "Manual Spam", "Nigerian Scam"]

# Create the artifact folders up front; exist_ok makes re-running the cell safe.
for d in [BASE_DIR, MODELS_DIR, OUT_DIR]:
    os.makedirs(d, exist_ok=True)

# =============================================================================
# DOMAIN MODEL - Value Objects (IDENTICAL TO DEBERTA)
# =============================================================================

@dataclass(frozen=True)
class EmailText:
    """Immutable pairing of a normalized email body and the SHA-256 of that body.

    Build instances through :meth:`create`, which normalizes the raw text and
    returns ``None`` for input that fails the quality checks.
    """
    content: str
    content_hash: str

    @classmethod
    def create(cls, raw_text: str) -> Optional['EmailText']:
        """Normalize ``raw_text`` and wrap it, or return ``None`` if it is unusable.

        Rejected inputs: non-strings, strings shorter than 10 characters once
        stripped, and normalized text that fails :meth:`_is_valid`.
        """
        is_text = isinstance(raw_text, str)
        if not is_text or len(raw_text.strip()) < 10:
            return None

        normalized = TextNormalizer.normalize(raw_text)
        if cls._is_valid(normalized):
            # The hash is taken over the *normalized* text, so two raw emails
            # that normalize to the same string share a content_hash.
            digest = hashlib.sha256(normalized.encode()).hexdigest()
            return cls(content=normalized, content_hash=digest)
        return None

    @staticmethod
    def _is_valid(text: str) -> bool:
        """Accept text of 10..10000 characters in which at least 10% are letters."""
        length = len(text)
        if not 10 <= length <= 10000:
            return False
        letters = sum(1 for ch in text if ch.isalpha())
        return letters / max(1, length) >= 0.1


@dataclass
class ValidationMetrics:
    """Container for classification metrics plus optional spread estimates.

    The three optional fields default to ``None``; :meth:`summary` omits any
    of them that is missing or NaN.
    """
    accuracy: float
    f1_macro: float
    f1_weighted: float
    precision_per_class: np.ndarray
    recall_per_class: np.ndarray
    f1_per_class: np.ndarray
    confusion_matrix: np.ndarray

    # Statistical measures
    accuracy_std: Optional[float] = None
    f1_macro_std: Optional[float] = None
    confidence_interval_95: Optional[Tuple[float, float]] = None

    @staticmethod
    def _format_std(std: Optional[float]) -> str:
        """Render `` ± x.xxxx`` for a usable std, or an empty string otherwise."""
        # Test for None explicitly: a std of exactly 0.0 is a real value and
        # must not be dropped by a truthiness check.
        if std is None or np.isnan(std):
            return ""
        return f" ± {std:.4f}"

    def summary(self) -> str:
        """Return a three-line, human-readable summary of the headline metrics.

        Standard deviations and the 95% confidence interval are appended only
        when they are set and not NaN, so the method is safe to call on an
        instance built with the default ``None`` values.
        """
        ci_str = ""
        ci = self.confidence_interval_95
        if ci and not np.isnan(ci[0]):
            ci_str = f" (95% CI: [{ci[0]:.4f}, {ci[1]:.4f}])"

        acc_std_str = self._format_std(self.accuracy_std)
        # Previously f1_macro_std was formatted unconditionally, which raised
        # TypeError whenever it was left at its default of None.
        f1_std_str = self._format_std(self.f1_macro_std)

        return (
            f"Accuracy: {self.accuracy:.4f}{acc_std_str}{ci_str}\n"
            f"F1-Macro: {self.f1_macro:.4f}{f1_std_str}\n"
            f"F1-Weighted: {self.f1_weighted:.4f}"
        )


# =============================================================================
# DOMAIN SERVICES - Text Processing (IDENTICAL TO DEBERTA)
# =============================================================================

class TextNormalizer:
    """Masks URLs, email addresses, phone-like digit runs, long numbers and
    dollar amounts with placeholder tokens, then collapses whitespace."""

    URL_RE = re.compile(r'https?://\S+|www\.\S+', flags=re.IGNORECASE)
    EMAIL_RE = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
    PHONE_RE = re.compile(r'\+?\d[\d\-\s]{6,}\d')

    @classmethod
    def normalize(cls, text: str) -> str:
        """Return ``text`` with entities masked and whitespace squeezed.

        Non-string input yields an empty string. Substitutions run in a fixed
        order, so an earlier pattern can consume text a later one would have
        matched (the phone pattern runs before the long-number and money
        patterns).
        """
        if not isinstance(text, str):
            return ""

        # (pattern, replacement) pairs, applied top to bottom. The first three
        # replacements are space-padded; the last two are not.
        substitutions = (
            (cls.URL_RE, " [URL] "),
            (cls.EMAIL_RE, " [EMAIL] "),
            (cls.PHONE_RE, " [PHONE] "),
            (re.compile(r'\d{10,}'), '[NUMBER]'),
            (re.compile(r'\$\d+(?:,\d{3})*(?:\.\d{2})?'), '[MONEY]'),
        )
        masked = text
        for pattern, placeholder in substitutions:
            masked = pattern.sub(placeholder, masked)

        # Collapse every whitespace run to a single space and trim the ends.
        squeezed = re.sub(r'\s+', ' ', masked)
        return squeezed.strip()


class PhishingLexicon:
    """Keyword lists used for phishing/scam heuristics, plus a scam scorer."""

    IMPERATIVES = frozenset({
        "verify", "confirm", "update", "click", "respond", "login",
        "transfer", "act", "download", "open", "validate", "check",
        "reactivate", "restore", "unlock", "accept", "submit", "review",
    })

    URGENCY_MARKERS = frozenset({
        "urgent", "immediately", "now", "asap", "expire", "expires",
        "suspension", "suspended", "limited", "deadline", "final",
        "warning", "alert", "action required", "time sensitive",
    })

    FINANCIAL_TERMS = frozenset({
        "account", "bank", "card", "credit", "debit", "payment",
        "transaction", "money", "fund", "transfer", "refund",
        "owed", "debt", "paypal", "invoice", "billing",
    })

    SCAM_KEYWORDS = [
        "inheritance", "beneficiary", "fund transfer", "million",
        "attorney", "diplomat", "widow", "lottery", "barrister",
        "deceased", "contract", "compensation", "unclaimed"
    ]

    @classmethod
    def compute_scam_score(cls, text: str) -> int:
        """Count how many distinct SCAM_KEYWORDS occur in ``text``.

        Matching is a case-insensitive substring test, so a keyword embedded
        in a longer word also counts, and repeated occurrences of the same
        keyword count once.
        """
        haystack = text.lower()
        hits = 0
        for keyword in cls.SCAM_KEYWORDS:
            if keyword in haystack:
                hits += 1
        return hits


# =============================================================================
# DOMAIN SERVICES - Stylometric Feature Extraction
# =============================================================================

class StylometricExtractor:
    """Extracts per-document stylometric features with a spaCy pipeline.

    The pipeline is loaded with the ``ner`` component disabled; POS tags and
    sentence boundaries are still read by :meth:`_extract_single`.
    """

    def __init__(self, spacy_model: str = "en_core_web_sm"):
        """Load ``spacy_model``, downloading it first if it is not installed.

        Raises:
            OSError: if the model still cannot be loaded after the download
                attempt (raised by ``spacy.load``).
        """
        try:
            self.nlp = spacy.load(spacy_model, disable=["ner"])
        except OSError:
            # Local imports keep this block self-contained.
            import subprocess
            import sys

            print(f"⚠ Downloading {spacy_model}...")
            # Use the running interpreter (not whatever `python` is on PATH)
            # and pass arguments as a list so the model name is never parsed
            # by a shell.
            result = subprocess.run(
                [sys.executable, "-m", "spacy", "download", spacy_model],
                check=False,
            )
            if result.returncode != 0:
                print(f"⚠ spaCy download of {spacy_model} exited with code {result.returncode}")
            self.nlp = spacy.load(spacy_model, disable=["ner"])

    def extract(self, texts: List[str]) -> pd.DataFrame:
        """Return one row of stylometric features per input text, in input order."""
        docs = self.nlp.pipe(texts, batch_size=50)
        rows = [
            self._extract_single(doc)
            for doc in tqdm(docs, total=len(texts), desc="Stylometry")
        ]
        return pd.DataFrame(rows)

    def _extract_single(self, doc) -> Dict[str, float]:
        """Compute the feature dict for one parsed spaCy document.

        Ratios use ``max(count, 1)`` denominators so empty documents yield
        zeros instead of raising. The key order of the returned dict fixes the
        column order of the DataFrame built by :meth:`extract`.
        """
        text = doc.text
        tokens = [t for t in doc if not t.is_space]
        # Alphabetic tokens are the "words" for every word-level statistic;
        # computed once and reused below.
        alpha_tokens = [t for t in tokens if t.is_alpha]
        n_tokens = max(len(tokens), 1)

        # Basic metrics
        word_count = len(alpha_tokens)
        sentence_count = max(len(list(doc.sents)), 1)

        # Lexical diversity (case-insensitive types / word tokens)
        unique_words = len({t.lower_ for t in alpha_tokens})
        type_token_ratio = unique_words / max(word_count, 1)

        # Word and sentence length
        avg_word_len = np.mean([len(t.text) for t in alpha_tokens]) if alpha_tokens else 0
        avg_sent_len = word_count / sentence_count

        # POS distribution over all non-space tokens
        pos_counts = Counter(t.pos_ for t in tokens)
        noun_ratio = pos_counts.get("NOUN", 0) / n_tokens
        verb_ratio = pos_counts.get("VERB", 0) / n_tokens
        adj_ratio = pos_counts.get("ADJ", 0) / n_tokens
        adv_ratio = pos_counts.get("ADV", 0) / n_tokens

        # Punctuation
        punct_count = sum(1 for t in tokens if t.is_punct)
        punct_ratio = punct_count / n_tokens
        exclamation_count = text.count('!')
        question_count = text.count('?')

        # Capitalization: all-caps words of more than one letter
        all_caps_words = sum(
            1 for t in alpha_tokens if t.text.isupper() and len(t.text) > 1
        )
        caps_ratio = all_caps_words / max(word_count, 1)

        # Readability
        flesch_reading_ease = textstat.flesch_reading_ease(text)
        flesch_kincaid_grade = textstat.flesch_kincaid_grade(text)

        # Domain-specific lexicon hits. Each count is the number of distinct
        # lexicon entries found as a case-insensitive *substring* of the text
        # (so "act" also matches inside "contact"); kept as-is because the
        # trained models depend on these exact feature values.
        text_lower = text.lower()

        def _lexicon_hits(terms) -> int:
            return sum(1 for term in terms if term in text_lower)

        imperative_count = _lexicon_hits(PhishingLexicon.IMPERATIVES)
        urgency_count = _lexicon_hits(PhishingLexicon.URGENCY_MARKERS)
        financial_count = _lexicon_hits(PhishingLexicon.FINANCIAL_TERMS)
        scam_score = PhishingLexicon.compute_scam_score(text)

        return {
            'word_count': word_count,
            'sentence_count': sentence_count,
            'type_token_ratio': type_token_ratio,
            'avg_word_len': avg_word_len,
            'avg_sent_len': avg_sent_len,
            'noun_ratio': noun_ratio,
            'verb_ratio': verb_ratio,
            'adj_ratio': adj_ratio,
            'adv_ratio': adv_ratio,
            'punct_ratio': punct_ratio,
            'exclamation_count': exclamation_count,
            'question_count': question_count,
            'caps_ratio': caps_ratio,
            'flesch_reading_ease': flesch_reading_ease,
            'flesch_kincaid_grade': flesch_kincaid_grade,
            'imperative_count': imperative_count,
            'urgency_count': urgency_count,
            'financial_count': financial_count,
            'scam_score': scam_score
        }


def compute_light_stylometry(text: str) -> np.ndarray:
    """Return nine cheap surface features of ``text`` as a 1-D array.

    Order: word count, characters per word, uppercase share, digit share,
    share of ``.,!?;:`` characters, ``!`` count, ``?`` count, ``[URL]`` count,
    ``[EMAIL]`` count. "Characters per word" divides the full string length
    (whitespace included) by the number of whitespace-separated words.
    """
    n_chars = len(text)
    n_words = len(text.split())
    char_denominator = max(n_chars, 1)

    def _char_share(predicate) -> float:
        # Fraction of characters satisfying `predicate`; 0 for empty text.
        return sum(1 for ch in text if predicate(ch)) / char_denominator

    feature_values = [
        n_words,
        n_chars / max(n_words, 1),
        _char_share(str.isupper),
        _char_share(str.isdigit),
        _char_share(lambda ch: ch in '.,!?;:'),
        text.count('!'),
        text.count('?'),
        # Placeholder tokens inserted by the text normalizer.
        text.count('[URL]'),
        text.count('[EMAIL]'),
    ]
    return np.array(feature_values)


# =============================================================================
# DATA LOADING - UNIFIED WITH DEBERTA
# =============================================================================

class BalancedDatasetBuilder:
    """
    Enhanced dataset builder - IDENTICAL to DeBERTa's approach.
    Creates canonical schema: text, headers, html, attachments, label, content_hash

    Labels follow the index order of ``TARGET_NAMES``:
    0 = AI phishing (from the CSV), 1 = Enron ham, 2 = Enron spam with a low
    scam score, 3 = Enron spam with a high scam score.
    """

    TARGET_NAMES = TARGET_NAMES

    # First "Subject:" header line, case-insensitive, anywhere in the headers blob.
    _SUBJECT_RE = re.compile(r'(?i)^Subject:\s*(.+)$', re.MULTILINE)

    def __init__(self, random_seed: int = 42, max_samples_per_class: int = 2000,
                 scam_threshold: int = 3):
        """
        Args:
            random_seed: seed for the per-class down-sampling and the final shuffle.
            max_samples_per_class: cap applied to each class after cleaning.
            scam_threshold: Enron spam whose scam score is >= this value is
                labelled "Nigerian Scam"; the rest is "Manual Spam".
        """
        self.random_seed = random_seed
        self.max_samples_per_class = max_samples_per_class
        self.scam_threshold = scam_threshold

    def build(self, ai_csv_path: str) -> pd.DataFrame:
        """Build dataset with proper deduplication and quality checks.

        Args:
            ai_csv_path: path to the CSV of AI-generated phishing emails.

        Returns:
            A shuffled DataFrame with a fresh 0..n-1 index and the columns
            text, headers, html, attachments, content_hash and label.
        """
        print("\n" + "="*70)
        print("BUILDING CANONICAL DATASET (UNIFIED WITH DEBERTA)")
        print("="*70)

        # Load AI emails with FULL metadata (same as DeBERTa)
        ai_df = self._process_ai_emails(ai_csv_path)

        # Load Enron. Imported here so the dependency is only needed when a
        # dataset is actually built.
        from datasets import load_dataset
        enron = load_dataset("SetFit/enron_spam", split="train").to_pandas()

        ham_all = enron[enron.label == 0][["text"]].copy()
        spam_all = enron[enron.label == 1][["text"]].copy()

        # Classify spam subtypes (score is computed on the raw, un-normalized text)
        spam_all["scam_score"] = spam_all["text"].apply(PhishingLexicon.compute_scam_score)
        is_nigerian = spam_all.scam_score >= self.scam_threshold
        nigerian_df = spam_all[is_nigerian][["text"]].copy()
        manual_spam_df = spam_all[~is_nigerian][["text"]].copy()

        print(f"\nRaw counts:")
        print(f"  AI Phishing: {len(ai_df)}")
        print(f"  Enron Ham: {len(ham_all)}")
        print(f"  Manual Spam: {len(manual_spam_df)}")
        print(f"  Nigerian Scam: {len(nigerian_df)}")

        # Clean and deduplicate each class using EmailText value objects
        per_class = {
            0: self._clean_and_dedup(ai_df, has_metadata=True),
            1: self._clean_and_dedup(ham_all, has_metadata=False),
            2: self._clean_and_dedup(manual_spam_df, has_metadata=False),
            3: self._clean_and_dedup(nigerian_df, has_metadata=False)
        }

        print(f"\nAfter cleaning & deduplication:")
        for label, df in per_class.items():
            print(f"  {self.TARGET_NAMES[label]}: {len(df)}")

        # Smart balancing (reasonable cap, not aggressive downsampling)
        max_samples_per_class = self.max_samples_per_class

        balanced_parts = []
        for label, df in per_class.items():
            df["label"] = label

            if len(df) > max_samples_per_class:
                sampled = df.sample(n=max_samples_per_class, random_state=self.random_seed)
                print(f"  ⚠ {self.TARGET_NAMES[label]}: Capped at {max_samples_per_class}")
            else:
                sampled = df

            balanced_parts.append(sampled)

        # Combine and shuffle
        df_final = pd.concat(balanced_parts, ignore_index=True)
        df_final = df_final.sample(frac=1.0, random_state=self.random_seed).reset_index(drop=True)

        # Global deduplication (across classes). Re-index afterwards so the
        # returned frame has no gaps where duplicates were dropped.
        before = len(df_final)
        df_final = df_final.drop_duplicates("text", keep='first').reset_index(drop=True)
        after = len(df_final)

        if before > after:
            print(f"\n✓ Removed {before - after} global duplicates")

        print(f"\n{'='*70}")
        print("FINAL CANONICAL DATASET SCHEMA")
        print(f"{'='*70}")
        print(f"Total samples: {len(df_final)}")
        print(f"\nCanonical columns present:")
        for col in df_final.columns:
            print(f"  ✓ {col}")

        print(f"\n⚠️ LOGGING: Columns after preprocessing (before model training):")
        print(f"  Columns: {list(df_final.columns)}")
        print(f"  Shape: {df_final.shape}")

        print(f"\nClass distribution:")
        class_dist = df_final["label"].value_counts().sort_index()
        for label, count in class_dist.items():
            print(f"  {label} - {self.TARGET_NAMES[label]}: {count}")

        # Compute class weights (reported only; nothing here consumes them)
        present_classes = np.unique(df_final["label"])
        class_weights = compute_class_weight(
            'balanced',
            classes=present_classes,
            y=df_final["label"]
        )
        print(f"\nClass weights (for training):")
        # Pair each weight with its actual class label: the weights follow the
        # order of `present_classes`, which differs from 0..n-1 positions
        # whenever a class ended up empty.
        for label, weight in zip(present_classes, class_weights):
            print(f"  {self.TARGET_NAMES[label]}: {weight:.3f}")

        return df_final

    @classmethod
    def _extract_subject(cls, headers) -> str:
        """Return the stripped value of the first Subject header line, or ""."""
        match = cls._SUBJECT_RE.search(str(headers))
        return match.group(1).strip() if match else ""

    def _process_ai_emails(self, csv_path: str) -> pd.DataFrame:
        """
        Process AI emails with metadata extraction - IDENTICAL to DeBERTa.
        Returns: text, headers, html, attachments columns

        If ``csv_path`` does not exist, 100 placeholder rows are generated
        instead so the rest of the pipeline can still run.
        """
        try:
            df = pd.read_csv(csv_path)
            print(f"✓ Loaded {len(df)} AI emails")
        except FileNotFoundError:
            print("⚠ AI emails CSV not found. Using dummy data.")
            df = pd.DataFrame({
                "plain": [f"Test phishing {i}" for i in range(100)],
                "headers": [f"Subject: Urgent {i}\nFrom: test@example.com" for i in range(100)],
                "html": ["<html><body>Test</body></html>" for _ in range(100)],
                "attachments": ["[]" for _ in range(100)]
            })

        # Flexible column handling (IDENTICAL to DeBERTa)
        if "headers" not in df.columns and "header" in df.columns:
            df = df.rename(columns={"header": "headers"})
        if "plain" not in df.columns and "text" in df.columns:
            df = df.rename(columns={"text": "plain"})

        # Fill missing values; create absent columns as empty strings
        for col in ["headers", "plain", "html", "attachments"]:
            if col not in df.columns:
                df[col] = ""
            else:
                df[col] = df[col].fillna("")

        # Extract subject (IDENTICAL to DeBERTa) with a single regex search per row
        df["subject"] = df["headers"].apply(self._extract_subject)

        # Combine subject + body (IDENTICAL to DeBERTa)
        df["text"] = ("Subject: " + df["subject"] + "\n" + df["plain"]).str.strip()

        print(f"  ✓ Created canonical columns: text, headers, html, attachments")

        return df[["text", "headers", "html", "attachments"]]

    def _clean_and_dedup(self, df: pd.DataFrame, has_metadata: bool = False) -> pd.DataFrame:
        """
        Clean text using EmailText value objects and remove duplicates.
        UNIFIED: Preserves metadata columns even if not used for feature extraction.

        Args:
            df: frame with at least a ``text`` column; not modified.
            has_metadata: when False, headers/html/attachments are set to "".

        Returns:
            A re-indexed frame whose ``text`` is the normalized content, with a
            ``content_hash`` column and all canonical columns present.
        """
        df = df.copy()

        # Create EmailText value objects for validation; rejected rows are None
        df["email_obj"] = [EmailText.create(t) for t in df["text"]]
        df = df[df["email_obj"].notna()].copy()
        df["text"] = df["email_obj"].apply(lambda e: e.content)
        df["content_hash"] = df["email_obj"].apply(lambda e: e.content_hash)

        # Preserve metadata columns (even if Enron doesn't have them)
        if not has_metadata:
            # Enron data has no metadata - add empty columns to maintain schema
            df["headers"] = ""
            df["html"] = ""
            df["attachments"] = ""

        # Deduplicate on the normalized text
        df = df.drop_duplicates("text")

        # Ensure all required canonical columns
        for col in ["text", "headers", "html", "attachments", "content_hash"]:
            if col not in df.columns:
                df[col] = ""

        return df.drop(columns=["email_obj"]).reset_index(drop=True)


# =============================================================================
# DOMAIN SERVICES - GroupKFold Cross-Validation (IDENTICAL TO DEBERTA)
# =============================================================================

class GroupedCrossValidator:
    """GroupKFold wrapper that derives the group of each sample from its text."""

    def __init__(self, n_splits: int = 5):
        """Store the fold count and build the underlying GroupKFold splitter."""
        self.n_splits = n_splits
        self.gkf = GroupKFold(n_splits=n_splits)

    def create_groups(self, texts: np.ndarray) -> np.ndarray:
        """Map each text to the first 8 hex characters of its SHA-256 digest.

        Identical texts therefore always receive the same group id.
        """
        group_ids = []
        for text in texts:
            digest = hashlib.sha256(str(text).encode()).hexdigest()
            group_ids.append(digest[:8])
        return np.array(group_ids)

    def split(self, X: np.ndarray, y: np.ndarray) -> List[Tuple[np.ndarray, np.ndarray]]:
        """Return the materialized list of (train_idx, val_idx) pairs.

        Groups are computed from ``X`` via :meth:`create_groups`; a short
        summary of the split is printed.
        """
        group_ids = self.create_groups(X)
        folds = list(self.gkf.split(X, y, groups=group_ids))
        n_unique_groups = len(np.unique(group_ids))

        print(f"\n✓ Created {len(folds)} GroupKFold splits")
        print(f"  Unique groups: {n_unique_groups}")

        return folds


# =============================================================================
# APPLICATION SERVICE - Stacking Ensemble Trainer
# =============================================================================

class EnsembleTrainer:
    """
    Complete training pipeline with class weights and proper statistics.
    DIFFERENT FEATURE EXTRACTION from DeBERTa (stylometry + TF-IDF + SBERT).
    """

    TARGET_NAMES = TARGET_NAMES

    def __init__(self, random_seed: int = 42, n_folds: int = 5):
        """Store the run settings and load the feature extractors.

        Args:
            random_seed: seed passed to the train/test split and to the models.
            n_folds: number of cross-validation folds.
        """
        self.random_seed, self.n_folds = random_seed, n_folds
        # Fold generator shared by every base model.
        self.cv = GroupedCrossValidator(n_splits=self.n_folds)
        # Both extractors load a pretrained model when constructed.
        self.stylo_extractor = StylometricExtractor()
        self.sbert = SentenceTransformer(SBERT_MODEL_NAME)

    def train(self, df: pd.DataFrame) -> Dict:
        """Run the full pipeline: split, featurize, cross-validate, stack, report.

        Args:
            df: dataset with a ``text`` column (model input) and a ``label``
                column (integer class ids); other columns are ignored.

        Returns:
            Dict with keys ``metrics``, ``fold_metrics``, ``ablation`` and
            ``test_data`` (the held-out ``(X_test, y_test)`` pair).
        """
        banner = "=" * 70
        for line in (
            "\n" + banner,
            "TRAINING STACKING ENSEMBLE",
            banner,
            "Feature extraction strategy:",
            "  ✓ Text → TF-IDF (for Naive Bayes)",
            "  ✓ Text → Stylometry (enhanced linguistic features)",
            "  ✓ Text → Light Stylometry (fast features for Random Forest)",
            "  ✓ Text → SBERT embeddings (semantic features)",
            "  ✗ Headers/HTML/Attachments → NOT used (different from DeBERTa)",
            "\nNote: DeBERTa uses headers/html/attachments for 24 engineered features.",
            "      Stacking focuses on textual + stylometric essence instead.",
        ):
            print(line)

        # Only the text column feeds feature extraction.
        texts = df["text"].values
        labels = df["label"].values

        print("\n⚠️ LOGGING: Input to model training:")
        print("  Using column: 'text' (normalized email content)")
        print(f"  Shape: {texts.shape}")
        print("  Other canonical columns present but not used: headers, html, attachments")

        # Stratified 80/20 hold-out split.
        X_train, X_test, y_train, y_test = train_test_split(
            texts, labels,
            test_size=0.2,
            stratify=labels,
            random_state=self.random_seed
        )

        print(f"\n✓ Train: {len(X_train)}, Test: {len(X_test)}")
        print("\nTest set distribution:")
        per_class_counts = pd.Series(y_test).value_counts().sort_index()
        for label, count in per_class_counts.items():
            print(f"  {self.TARGET_NAMES[label]}: {count}")

        # Feature matrices for both partitions.
        features = self._extract_features(X_train, X_test)

        # Out-of-fold predictions plus the fitted per-fold base models.
        fold_metrics, oof_preds, fold_models = self._cross_validate(
            X_train, y_train, features["train"]
        )

        # Meta-learner fitted on the out-of-fold predictions, scored on the test set.
        final_metrics, y_pred, y_proba = self._train_final_model(
            oof_preds, y_train,
            fold_models, y_test,
            features
        )

        # Post-processing steps; these helpers are defined further down the class.
        final_metrics = self._add_bootstrap_ci(y_pred, y_test, y_proba, final_metrics)
        ablation_results = self._ablation_study(fold_models, y_test, features)
        self._save_classification_report(y_test, y_pred)

        return {
            "metrics": final_metrics,
            "fold_metrics": fold_metrics,
            "ablation": ablation_results,
            "test_data": (X_test, y_test)
        }

    def _extract_features(self, X_train, X_test) -> Dict:
        """Compute every feature family for the train and test texts.

        Returns:
            ``{"train": {...}, "test": {...}}`` where each inner dict holds
            ``stylo`` (DataFrame from the spaCy extractor), ``light`` (array
            from ``compute_light_stylometry``), ``sbert`` (sentence embeddings)
            and ``texts`` (the raw inputs, passed through unchanged).
        """
        rule = "=" * 60
        print("\n" + rule)
        print("FEATURE EXTRACTION FROM TEXT COLUMN")
        print(rule)

        def _light_matrix(texts):
            # One row of lightweight features per text.
            return np.array([compute_light_stylometry(t) for t in tqdm(texts, desc="Light")])

        def _embed(texts):
            return self.sbert.encode(list(texts), show_progress_bar=True, batch_size=32)

        train_feats = {"texts": X_train}
        test_feats = {"texts": X_test}

        # Each family is computed for train first, then test.
        print("\n1. Enhanced Stylometry...")
        train_feats["stylo"] = self.stylo_extractor.extract(list(X_train))
        test_feats["stylo"] = self.stylo_extractor.extract(list(X_test))

        print("\n2. Light Stylometry...")
        train_feats["light"] = _light_matrix(X_train)
        test_feats["light"] = _light_matrix(X_test)

        print("\n3. SBERT Embeddings...")
        train_feats["sbert"] = _embed(X_train)
        test_feats["sbert"] = _embed(X_test)

        print("\n✓ Feature extraction complete:")
        print(f"  Stylometry: {train_feats['stylo'].shape}")
        print(f"  Light Stylometry: {train_feats['light'].shape}")
        print(f"  SBERT: {train_feats['sbert'].shape}")

        return {"train": train_feats, "test": test_feats}

    def _cross_validate(self, X_train, y_train, features_train) -> Tuple:
        """Run GroupKFold cross-validation with class weights.

        Four base models are fitted per fold: Naive Bayes on TF-IDF, Random
        Forest on light stylometry, XGBoost on TF-IDF + scaled stylometry, and
        Gradient Boosting on SBERT embeddings. The tree models receive
        per-sample weights derived from balanced class weights; Naive Bayes is
        fitted unweighted.

        Args:
            X_train: training texts (used only to build the folds).
            y_train: integer labels aligned with ``X_train``.
            features_train: the ``"train"`` feature dict with keys
                ``texts``, ``light``, ``stylo`` and ``sbert``.

        Returns:
            ``(fold_metrics, oof_preds, test_preds_accum)`` where
            ``fold_metrics`` is a list of per-fold dicts, ``oof_preds`` maps
            model key to an out-of-fold probability matrix, and
            ``test_preds_accum`` maps model key to the list of fitted per-fold
            objects (``(tfidf, nb)``, ``rf``, ``(scaler, xgb)``, ``sbt``).
        """
        print("\n" + "="*70)
        print(f"CROSS-VALIDATION ({self.n_folds} FOLDS WITH CLASS WEIGHTS)")
        print("="*70)

        splits = self.cv.split(X_train, y_train)

        model_keys = ("nb", "rf", "xgb", "sbt")
        # Width of the probability matrices follows the declared class list
        # rather than a hard-coded 4.
        n_classes = len(self.TARGET_NAMES)

        # OOF predictions storage
        oof_preds = {key: np.zeros((len(y_train), n_classes)) for key in model_keys}

        # Fitted per-fold models, reused later to score the test set
        test_preds_accum = {key: [] for key in model_keys}

        fold_metrics = []

        for fold, (train_idx, val_idx) in enumerate(splits):
            print(f"\nFold {fold + 1}/{self.n_folds}")
            print(f"  Train: {len(train_idx)}, Val: {len(val_idx)}")

            y_fold_tr = y_train[train_idx]
            y_fold_val = y_train[val_idx]

            # Compute class weights for this fold. compute_class_weight returns
            # weights in the order of `classes`, so key the lookup by the class
            # label itself instead of by position.
            fold_classes = np.unique(y_fold_tr)
            class_weights = compute_class_weight(
                'balanced',
                classes=fold_classes,
                y=y_fold_tr
            )
            class_weights_dict = dict(zip(fold_classes, class_weights))

            # Sample weights for tree models
            sample_weights = np.array([class_weights_dict.get(y, 1.0) for y in y_fold_tr])

            # ===== 1. Naive Bayes (TF-IDF) =====
            # The vectorizer is fitted on the fold's training texts only.
            tfidf = TfidfVectorizer(max_features=MAX_FEATURES, ngram_range=NGRAM_RANGE, stop_words="english")
            X_tfidf_tr = tfidf.fit_transform(features_train["texts"][train_idx])
            X_tfidf_val = tfidf.transform(features_train["texts"][val_idx])

            nb = MultinomialNB(alpha=0.1)
            nb.fit(X_tfidf_tr, y_fold_tr)
            oof_preds["nb"][val_idx] = nb.predict_proba(X_tfidf_val)
            test_preds_accum["nb"].append((tfidf, nb))

            # ===== 2. Random Forest (Light Stylometry) =====
            rf = RandomForestClassifier(
                n_estimators=200,
                max_depth=30,
                min_samples_split=5,
                random_state=self.random_seed,
                n_jobs=-1
            )
            rf.fit(features_train["light"][train_idx], y_fold_tr, sample_weight=sample_weights)
            oof_preds["rf"][val_idx] = rf.predict_proba(features_train["light"][val_idx])
            test_preds_accum["rf"].append(rf)

            # ===== 3. XGBoost (TF-IDF + Stylometry) =====
            # with_mean=False: scale only, no centering, before stacking the
            # stylometry columns next to the sparse TF-IDF block.
            scaler = StandardScaler(with_mean=False)
            stylo_values = features_train["stylo"].values
            stylo_scaled = scaler.fit_transform(stylo_values[train_idx])
            X_combined_tr = sparse.hstack([X_tfidf_tr, stylo_scaled])

            stylo_val_scaled = scaler.transform(stylo_values[val_idx])
            X_combined_val = sparse.hstack([X_tfidf_val, stylo_val_scaled])

            xgb = XGBClassifier(
                n_estimators=100,
                max_depth=6,
                learning_rate=0.1,
                subsample=0.8,
                colsample_bytree=0.8,
                random_state=self.random_seed,
                eval_metric='mlogloss',
                tree_method='hist',
                enable_categorical=False
            )
            xgb.fit(X_combined_tr, y_fold_tr, sample_weight=sample_weights, verbose=False)
            oof_preds["xgb"][val_idx] = xgb.predict_proba(X_combined_val)
            test_preds_accum["xgb"].append((scaler, xgb))

            # ===== 4. Gradient Boosting (SBERT) =====
            sbt = GradientBoostingClassifier(
                n_estimators=100,
                max_depth=5,
                learning_rate=0.1,
                random_state=self.random_seed
            )
            sbt.fit(features_train["sbert"][train_idx], y_fold_tr, sample_weight=sample_weights)
            oof_preds["sbt"][val_idx] = sbt.predict_proba(features_train["sbert"][val_idx])
            test_preds_accum["sbt"].append(sbt)

            # Validation metrics for this fold: unweighted average of the four
            # base models' probabilities (the meta-learner is not involved here).
            fold_val_preds = np.mean(
                [oof_preds[key][val_idx] for key in model_keys],
                axis=0
            )
            fold_val_y = np.argmax(fold_val_preds, axis=1)

            fold_acc = accuracy_score(y_fold_val, fold_val_y)
            fold_f1 = f1_score(y_fold_val, fold_val_y, average='macro')

            fold_metrics.append({
                "fold": fold + 1,
                "accuracy": float(fold_acc),
                "f1_macro": float(fold_f1)
            })

            print(f"  Val Acc: {fold_acc:.4f}, F1: {fold_f1:.4f}")

        return fold_metrics, oof_preds, test_preds_accum

    def _train_final_model(self, oof_preds, y_train, test_preds_accum, y_test, features) -> Tuple:
        """Fit the meta-learner on out-of-fold predictions and score it on the test set.

        The meta-learner input is the column-wise concatenation of the
        out-of-fold class probabilities of the four base models (``nb``,
        ``rf``, ``xgb``, ``sbt``) followed by standardized light stylometry
        features. For the test set, every fold's fitted base model predicts
        on the test features and the per-fold probabilities are averaged.

        Side effects: prints a performance summary and writes
        ``final_model_artifacts.joblib`` into ``OUT_DIR``.

        Args:
            oof_preds: Mapping with keys ``"nb"``, ``"rf"``, ``"xgb"``, ``"sbt"``
                holding the out-of-fold probability arrays of the training rows.
            y_train: Training labels used to fit the meta-learner.
            test_preds_accum: Mapping from base-model key to a per-fold list of
                fitted objects: ``(tfidf, nb)`` for ``"nb"``, a classifier for
                ``"rf"`` and ``"sbt"``, and ``(scaler, xgb)`` for ``"xgb"``.
            y_test: Test labels.
            features: Nested mapping; this method reads
                ``features["train"]["texts"]`` and, under ``features["test"]``,
                the entries ``"texts"``, ``"light"``, ``"stylo"`` and ``"sbert"``.

        Returns:
            Tuple ``(metrics, y_pred, y_proba)``: a ``ValidationMetrics``
            instance, the predicted test labels and the meta-learner's test
            probabilities.
        """
        print("\n" + "="*60)
        print("TRAINING META-LEARNER")
        print("="*60)

        base_names = ("nb", "rf", "xgb", "sbt")
        # Class ids are the positions in TARGET_NAMES (the per-class printout
        # below indexes scores by that position), so pin them explicitly:
        # per-class arrays then always have one entry per target name, even
        # when a class is missing from both y_test and y_pred.
        class_labels = list(range(len(self.TARGET_NAMES)))

        # Stack OOF predictions
        X_meta_train = np.hstack([oof_preds[name] for name in base_names])

        # Add lightweight engineered features
        # NOTE(review): the train side recomputes light features from raw text
        # while the test side reuses features["test"]["light"]; presumably both
        # come from compute_light_stylometry - confirm they are identical.
        light_features = np.array([compute_light_stylometry(t) for t in features["train"]["texts"]])
        scaler_final = StandardScaler()
        light_scaled = scaler_final.fit_transform(light_features)
        X_meta_train = np.hstack([X_meta_train, light_scaled])

        print(f"Meta-learner input shape: {X_meta_train.shape}")

        # Train meta-model
        meta_model = LogisticRegression(
            max_iter=1000,
            random_state=self.random_seed,
            class_weight='balanced'
        )
        meta_model.fit(X_meta_train, y_train)

        # Per-fold test probabilities of every base model
        test_feats = features["test"]
        fold_probas = {name: [] for name in base_names}

        for fold_idx in range(self.n_folds):
            # NB and XGB share the fold's TF-IDF vectorizer, so the test
            # matrix is built once per fold and reused by both.
            tfidf, nb = test_preds_accum["nb"][fold_idx]
            X_test_tfidf = tfidf.transform(test_feats["texts"])
            fold_probas["nb"].append(nb.predict_proba(X_test_tfidf))

            # RF
            rf = test_preds_accum["rf"][fold_idx]
            fold_probas["rf"].append(rf.predict_proba(test_feats["light"]))

            # XGB: TF-IDF columns followed by the fold-scaled stylometry columns
            scaler, xgb = test_preds_accum["xgb"][fold_idx]
            stylo_test_scaled = scaler.transform(test_feats["stylo"].values)
            X_test_combined = sparse.hstack([X_test_tfidf, stylo_test_scaled])
            fold_probas["xgb"].append(xgb.predict_proba(X_test_combined))

            # SBERT
            sbt = test_preds_accum["sbt"][fold_idx]
            fold_probas["sbt"].append(sbt.predict_proba(test_feats["sbert"]))

        # Average fold predictions, keeping the column order used for training
        X_meta_test = np.hstack([np.mean(fold_probas[name], axis=0) for name in base_names])
        light_test_scaled = scaler_final.transform(test_feats["light"])
        X_meta_test = np.hstack([X_meta_test, light_test_scaled])

        # Final predictions: predict_proba columns follow meta_model.classes_,
        # so decode the argmax through it instead of assuming column i == label i.
        y_proba = meta_model.predict_proba(X_meta_test)
        y_pred = meta_model.classes_[np.argmax(y_proba, axis=1)]

        # Metrics
        acc = accuracy_score(y_test, y_pred)
        f1_macro = f1_score(y_test, y_pred, average='macro')
        f1_weighted = f1_score(y_test, y_pred, average='weighted')
        precision, recall, f1_per_class, _support = precision_recall_fscore_support(
            y_test, y_pred, labels=class_labels, average=None, zero_division=0
        )
        conf_mat = confusion_matrix(y_test, y_pred, labels=class_labels)

        print(f"\n✓ Test Set Performance:")
        print(f"  Accuracy: {acc:.4f}")
        print(f"  F1-Macro: {f1_macro:.4f}")
        print(f"  F1-Weighted: {f1_weighted:.4f}")

        print("\nPer-Class F1 Scores:")
        for i, name in enumerate(self.TARGET_NAMES):
            print(f"  {name}: {f1_per_class[i]:.4f}")

        print("\nConfusion Matrix:")
        print(conf_mat)

        # Save artifacts
        artifacts = {
            "meta_model": meta_model,
            "scaler_final": scaler_final,
            "fold_models": test_preds_accum,
            "y_test": y_test,
            "y_pred": y_pred,
            "y_proba": y_proba
        }
        joblib.dump(artifacts, f"{OUT_DIR}/final_model_artifacts.joblib")
        print(f"\n✓ Saved model artifacts to {OUT_DIR}")

        return ValidationMetrics(
            accuracy=acc,
            f1_macro=f1_macro,
            f1_weighted=f1_weighted,
            precision_per_class=precision,
            recall_per_class=recall,
            f1_per_class=f1_per_class,
            confusion_matrix=conf_mat
        ), y_pred, y_proba

    def _add_bootstrap_ci(self, y_pred, y_test, y_proba, metrics: ValidationMetrics, n_bootstrap: int = 1000) -> ValidationMetrics:
        """Attach bootstrap spread estimates of accuracy and macro-F1 to ``metrics``.

        The test set is resampled with replacement ``n_bootstrap`` times; the
        accuracy and macro-F1 of every resample are collected and summarized.

        Args:
            y_pred: Predicted test labels.
            y_test: True test labels, aligned with ``y_pred``.
            y_proba: Accepted for interface compatibility; not used here.
            metrics: Object that receives ``accuracy_std``, ``f1_macro_std``
                and ``confidence_interval_95`` (the accuracy percentile CI).
            n_bootstrap: Number of bootstrap resamples.

        Returns:
            The same ``metrics`` object, updated in place.
        """
        print("\n" + "="*60)
        print("COMPUTING BOOTSTRAP CONFIDENCE INTERVALS")
        print("="*60)

        # Work on plain arrays: indexing a pandas Series with an integer array
        # is label-based and would not be a positional resample.
        y_test = np.asarray(y_test)
        y_pred = np.asarray(y_pred)
        n_samples = len(y_test)

        # Private generator instead of np.random.seed(): a RandomState seeded
        # with the same value draws the same index sequence as the seeded
        # global generator, without resetting global RNG state for other code.
        rng = np.random.RandomState(self.random_seed)

        accs = []
        f1s = []

        for _ in tqdm(range(n_bootstrap), desc="Bootstrap"):
            # Resample with replacement
            idx = rng.choice(n_samples, n_samples, replace=True)
            y_test_boot = y_test[idx]
            y_pred_boot = y_pred[idx]

            # Compute metrics
            accs.append(accuracy_score(y_test_boot, y_pred_boot))
            f1s.append(f1_score(y_test_boot, y_pred_boot, average='macro'))

        # Compute confidence intervals (percentile method)
        ci_95_acc = np.percentile(accs, [2.5, 97.5])
        ci_95_f1 = np.percentile(f1s, [2.5, 97.5])

        acc_std = np.std(accs)
        f1_std = np.std(f1s)

        metrics.accuracy_std = acc_std
        metrics.f1_macro_std = f1_std
        # Only the accuracy interval is stored; the F1 interval is printed below.
        metrics.confidence_interval_95 = tuple(ci_95_acc)

        print(f"✓ Bootstrap complete (n={n_bootstrap})")
        print(f"  Accuracy: {np.mean(accs):.4f} ± {acc_std:.4f}")
        print(f"  95% CI: [{ci_95_acc[0]:.4f}, {ci_95_acc[1]:.4f}]")
        print(f"  F1-Macro: {np.mean(f1s):.4f} ± {f1_std:.4f}")
        print(f"  95% CI: [{ci_95_f1[0]:.4f}, {ci_95_f1[1]:.4f}]")

        return metrics

    def _ablation_study(self, test_preds_accum, y_test, features) -> Dict:
        """Score every base model on its own against the test set.

        For each base model (``nb``, ``rf``, ``xgb``, ``sbt``) the fitted
        per-fold models predict on the test features, the fold probabilities
        are averaged, and the argmax is scored against ``y_test``.

        Side effects: prints a report per model and writes
        ``ablation_results.json`` into ``OUT_DIR``.

        Args:
            test_preds_accum: Mapping from base-model key to a per-fold list of
                fitted objects: ``(tfidf, nb)`` for ``"nb"``, a classifier for
                ``"rf"`` and ``"sbt"``, and ``(scaler, xgb)`` for ``"xgb"``.
            y_test: True test labels.
            features: Nested mapping; ``features["test"]`` must provide
                ``"texts"``, ``"light"``, ``"stylo"`` and ``"sbert"``.

        Returns:
            Dict keyed by base-model name with ``accuracy``, ``f1_macro`` and
            ``per_class_f1`` (one entry per name in ``TARGET_NAMES``).
        """
        print("\n" + "="*70)
        print("ABLATION STUDY: Base Models Performance")
        print("="*70)

        # Explicit label ids (positions in TARGET_NAMES): without them
        # classification_report raises when the classes observed in
        # y_test/y_pred do not match the number of target names, and the
        # per-class F1 list would silently shrink.
        class_labels = list(range(len(self.TARGET_NAMES)))
        test_feats = features["test"]
        tfidf_cache = {}

        def tfidf_test_matrix(fold_idx):
            """Return the fold's TF-IDF test matrix, built once and shared by NB and XGB."""
            if fold_idx not in tfidf_cache:
                tfidf, _ = test_preds_accum["nb"][fold_idx]
                tfidf_cache[fold_idx] = tfidf.transform(test_feats["texts"])
            return tfidf_cache[fold_idx]

        ablation_results = {}

        # Average fold predictions for each base model
        for model_name in ["nb", "rf", "xgb", "sbt"]:
            preds_list = []

            for fold_idx in range(self.n_folds):
                if model_name == "nb":
                    _, nb = test_preds_accum[model_name][fold_idx]
                    preds_list.append(nb.predict_proba(tfidf_test_matrix(fold_idx)))
                elif model_name == "rf":
                    rf = test_preds_accum[model_name][fold_idx]
                    preds_list.append(rf.predict_proba(test_feats["light"]))
                elif model_name == "xgb":
                    # XGB input: the fold's TF-IDF columns plus scaled stylometry
                    scaler, xgb = test_preds_accum[model_name][fold_idx]
                    stylo_scaled = scaler.transform(test_feats["stylo"].values)
                    X_combined = sparse.hstack([tfidf_test_matrix(fold_idx), stylo_scaled])
                    preds_list.append(xgb.predict_proba(X_combined))
                elif model_name == "sbt":
                    sbt = test_preds_accum[model_name][fold_idx]
                    preds_list.append(sbt.predict_proba(test_feats["sbert"]))

            # Average predictions
            avg_proba = np.mean(preds_list, axis=0)
            y_pred_base = np.argmax(avg_proba, axis=1)

            # Metrics
            acc = accuracy_score(y_test, y_pred_base)
            f1 = f1_score(y_test, y_pred_base, average='macro')
            _precision, _recall, f1_per_class, _support = precision_recall_fscore_support(
                y_test, y_pred_base, labels=class_labels, average=None, zero_division=0
            )

            ablation_results[model_name] = {
                'accuracy': float(acc),
                'f1_macro': float(f1),
                'per_class_f1': f1_per_class.tolist()
            }

            print(f"\n{model_name.upper()}:")
            print(f"  Accuracy: {acc:.4f}")
            print(f"  F1-Macro: {f1:.4f}")
            print(classification_report(y_test, y_pred_base, labels=class_labels,
                                       target_names=self.TARGET_NAMES,
                                       digits=4, zero_division=0))

        # Save ablation results
        with open(f"{OUT_DIR}/ablation_results.json", "w") as f:
            json.dump(ablation_results, f, indent=4)

        return ablation_results

    def _save_classification_report(self, y_test, y_pred):
        """Write the classification report and confusion matrix to a text file.

        The file is ``classification_report.txt`` inside ``OUT_DIR``.

        Args:
            y_test: True test labels.
            y_pred: Predicted test labels, aligned with ``y_test``.
        """
        # Pin label ids to the positions in TARGET_NAMES: without `labels`,
        # classification_report raises when the observed classes do not match
        # the number of target names. zero_division=0 matches the other
        # report calls in this class.
        class_labels = list(range(len(self.TARGET_NAMES)))
        report = classification_report(
            y_test, y_pred, labels=class_labels, target_names=self.TARGET_NAMES,
            digits=4, zero_division=0
        )
        conf_mat = confusion_matrix(y_test, y_pred, labels=class_labels)

        with open(f"{OUT_DIR}/classification_report.txt", "w") as f:
            f.write("="*70 + "\n")
            f.write("UNIFIED STACKING ENSEMBLE CLASSIFICATION REPORT\n")
            f.write("="*70 + "\n\n")
            f.write(report)
            f.write("\n\n")
            f.write("Confusion Matrix:\n")
            f.write(str(conf_mat))
            f.write("\n")

        print(f"\n✓ Saved classification report to {OUT_DIR}/classification_report.txt")


# =============================================================================
# MAIN EXECUTION
# =============================================================================

def main():
    """Build the canonical dataset, train the stacking ensemble and print a run summary.

    Writes ``dataset_info.json`` into ``OUT_DIR`` before training starts.
    """
    rule = "="*70

    def banner(title):
        """Print a section title framed by two rule lines, preceded by a blank line."""
        print("\n" + rule)
        print(title)
        print(rule)

    # Build canonical dataset (IDENTICAL to DeBERTa)
    df = BalancedDatasetBuilder(random_seed=RANDOM_SEED).build(ai_csv_path="/content/ai_emails.csv")

    # Dataset provenance, saved next to the other run outputs
    label_counts = df["label"].value_counts().sort_index().to_dict()
    mean_text_length = float(df["text"].str.len().mean())
    dataset_info = {
        "total_samples": len(df),
        "class_distribution": label_counts,
        "unique_texts": df["text"].nunique(),
        "avg_text_length": mean_text_length,
        "target_names": TARGET_NAMES,
        "canonical_columns": list(df.columns),
        "columns_used_for_features": ["text"],
        "columns_preserved_but_unused": ["headers", "html", "attachments", "content_hash"],
        "timestamp": datetime.now().isoformat(),
        "preprocessing": "unified_with_deberta",
        "feature_extraction": "stylometry_tfidf_sbert_different_from_deberta"
    }

    with open(f"{OUT_DIR}/dataset_info.json", "w") as info_file:
        json.dump(dataset_info, info_file, indent=4)

    # Train ensemble
    results = EnsembleTrainer(random_seed=RANDOM_SEED, n_folds=NUM_FOLDS).train(df)

    # Headline test-set metrics
    banner("FINAL RESULTS WITH STATISTICAL RIGOR")
    print(results["metrics"].summary())

    # Cross-validation metrics: mean ± std, then one line per fold
    banner("FOLD-WISE METRICS")
    fold_records = results["fold_metrics"]
    fold_accs = [record["accuracy"] for record in fold_records]
    fold_f1s = [record["f1_macro"] for record in fold_records]

    print(f"Accuracy: {np.mean(fold_accs):.4f} ± {np.std(fold_accs):.4f}")
    print(f"F1-Macro: {np.mean(fold_f1s):.4f} ± {np.std(fold_f1s):.4f}")

    for fold_no, record in enumerate(fold_records, start=1):
        print(f"  Fold {fold_no}: Acc={record['accuracy']:.4f}, F1={record['f1_macro']:.4f}")

    # Stand-alone score of each base model
    banner("ABLATION STUDY SUMMARY")
    print("Base model contributions to ensemble:")
    for model_key, scores in results["ablation"].items():
        print(f"  {model_key.upper()}: Acc={scores['accuracy']:.4f}, F1={scores['f1_macro']:.4f}")

    # Static reminder of what is shared with, and different from, the DeBERTa pipeline
    banner("UNIFIED PREPROCESSING CONFIRMATION")
    confirmation_lines = (
        "✓ Same preprocessing as DeBERTa:",
        "  - EmailText value objects",
        "  - TextNormalizer with entity masking",
        "  - Two-pass deduplication",
        "  - GroupKFold cross-validation",
        "  - Class weight balancing",
        "\n✓ Same canonical dataset schema:",
        "  - Columns: text, headers, html, attachments, label, content_hash",
        "\n✓ Different feature extraction:",
        "  - Stacking: TF-IDF + Stylometry + SBERT (from text only)",
        "  - DeBERTa: Tokenized text + 24 engineered features (headers/html/attachments)",
        "\n✓ This ensures fair comparison while leveraging each model's strengths!",
    )
    for line in confirmation_lines:
        print(line)


# Run the pipeline only when this code executes as the main module (a script
# or a notebook cell), not when it is imported.
if __name__ == "__main__":
    main()


BUILDING CANONICAL DATASET (UNIFIED WITH DEBERTA)
✓ Loaded 854 AI emails
  ✓ Created canonical columns: text, headers, html, attachments


README.md:   0%|          | 0.00/176 [00:00<?, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.


train.jsonl:   0%|          | 0.00/101M [00:00<?, ?B/s]

test.jsonl:   0%|          | 0.00/6.27M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/31716 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]


Raw counts:
  AI Phishing: 854
  Enron Ham: 15553
  Manual Spam: 15667
  Nigerian Scam: 496

After cleaning & deduplication:
  AI Phishing: 847
  Enron Ham: 14547
  Manual Spam: 13288
  Nigerian Scam: 417
  ⚠ Enron Ham: Capped at 2000
  ⚠ Manual Spam: Capped at 2000

FINAL CANONICAL DATASET SCHEMA
Total samples: 5264

Canonical columns present:
  ✓ text
  ✓ headers
  ✓ html
  ✓ attachments
  ✓ content_hash
  ✓ label

⚠️ LOGGING: Columns after preprocessing (before model training):
  Columns: ['text', 'headers', 'html', 'attachments', 'content_hash', 'label']
  Shape: (5264, 6)

Class distribution:
  0 - AI Phishing: 847
  1 - Enron Ham: 2000
  2 - Manual Spam: 2000
  3 - Nigerian Scam: 417

Class weights (for training):
  AI Phishing: 1.554
  Enron Ham: 0.658
  Manual Spam: 0.658
  Nigerian Scam: 3.156


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]


TRAINING STACKING ENSEMBLE
Feature extraction strategy:
  ✓ Text → TF-IDF (for Naive Bayes)
  ✓ Text → Stylometry (enhanced linguistic features)
  ✓ Text → Light Stylometry (fast features for Random Forest)
  ✓ Text → SBERT embeddings (semantic features)
  ✗ Headers/HTML/Attachments → NOT used (different from DeBERTa)

Note: DeBERTa uses headers/html/attachments for 24 engineered features.
      Stacking focuses on textual + stylometric essence instead.

⚠️ LOGGING: Input to model training:
  Using column: 'text' (normalized email content)
  Shape: (5264,)
  Other canonical columns present but not used: headers, html, attachments

✓ Train: 4211, Test: 1053

Test set distribution:
  AI Phishing: 170
  Enron Ham: 400
  Manual Spam: 400
  Nigerian Scam: 83

FEATURE EXTRACTION FROM TEXT COLUMN

1. Enhanced Stylometry...


Stylometry: 100%|██████████| 4211/4211 [01:32<00:00, 45.32it/s]
Stylometry: 100%|██████████| 1053/1053 [00:21<00:00, 48.45it/s]



2. Light Stylometry...


Light: 100%|██████████| 4211/4211 [00:00<00:00, 6439.81it/s]
Light: 100%|██████████| 1053/1053 [00:00<00:00, 7629.93it/s]



3. SBERT Embeddings...


Batches:   0%|          | 0/132 [00:00<?, ?it/s]

Batches:   0%|          | 0/33 [00:00<?, ?it/s]


✓ Feature extraction complete:
  Stylometry: (4211, 19)
  Light Stylometry: (4211, 9)
  SBERT: (4211, 768)

CROSS-VALIDATION (5 FOLDS WITH CLASS WEIGHTS)

✓ Created 5 GroupKFold splits
  Unique groups: 4211

Fold 1/5
  Train: 3368, Val: 843
  Val Acc: 0.9798, F1: 0.9794

Fold 2/5
  Train: 3369, Val: 842
  Val Acc: 0.9691, F1: 0.9524

Fold 3/5
  Train: 3369, Val: 842
  Val Acc: 0.9798, F1: 0.9818

Fold 4/5
  Train: 3369, Val: 842
  Val Acc: 0.9786, F1: 0.9712

Fold 5/5
  Train: 3369, Val: 842
  Val Acc: 0.9715, F1: 0.9589

TRAINING META-LEARNER
Meta-learner input shape: (4211, 25)

✓ Test Set Performance:
  Accuracy: 0.9924
  F1-Macro: 0.9938
  F1-Weighted: 0.9924

Per-Class F1 Scores:
  AI Phishing: 1.0000
  Enron Ham: 0.9913
  Manual Spam: 0.9899
  Nigerian Scam: 0.9939

Confusion Matrix:
[[170   0   0   0]
 [  0 399   1   0]
 [  0   6 394   0]
 [  0   0   1  82]]

✓ Saved model artifacts to /content/paper_artifacts_unified_stacking/outputs

COMPUTING BOOTSTRAP CONFIDENCE INTERVALS


Bootstrap: 100%|██████████| 1000/1000 [00:01<00:00, 519.27it/s]


✓ Bootstrap complete (n=1000)
  Accuracy: 0.9923 ± 0.0027
  95% CI: [0.9867, 0.9972]
  F1-Macro: 0.9937 ± 0.0025
  95% CI: [0.9887, 0.9981]

ABLATION STUDY: Base Models Performance

NB:
  Accuracy: 0.9668
  F1-Macro: 0.9572
               precision    recall  f1-score   support

  AI Phishing     0.9942    1.0000    0.9971       170
    Enron Ham     0.9824    0.9750    0.9787       400
  Manual Spam     0.9742    0.9425    0.9581       400
Nigerian Scam     0.8265    0.9759    0.8950        83

     accuracy                         0.9668      1053
    macro avg     0.9443    0.9734    0.9572      1053
 weighted avg     0.9689    0.9668    0.9672      1053


RF:
  Accuracy: 0.8186
  F1-Macro: 0.8352
               precision    recall  f1-score   support

  AI Phishing     1.0000    1.0000    1.0000       170
    Enron Ham     0.7597    0.8300    0.7933       400
  Manual Spam     0.8212    0.7350    0.7757       400
Nigerian Scam     0.7500    0.7952    0.7719        83

     accuracy

In [None]:
import shutil
from google.colab import files

# Directory holding everything the training run produced
dir_path = '/content/paper_artifacts_unified_stacking'

# Zip the whole directory; the archive is named after the base name plus ".zip"
shutil.make_archive(
    base_name='paper_artifacts_unified_stacking',
    format='zip',
    root_dir=dir_path,
)

# Send the archive to the browser through Colab's download helper
files.download('paper_artifacts_unified_stacking.zip')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
"""
Generate Performance and Ablation Graphs from Saved Results
Auto-saves to figures directory
"""

import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import json
from sklearn.metrics import accuracy_score, f1_score

# Configuration: artifact locations (figures directory is created on demand)
BASE_DIR = "/content/paper_artifacts_unified_stacking"
OUT_DIR = BASE_DIR + "/outputs"
FIGURES_DIR = BASE_DIR + "/figures"
os.makedirs(FIGURES_DIR, exist_ok=True)

# Shared colour palette for all figures in this cell
COLORS = dict(
    stacking='#A23B72',
    warning='#F18F01',
    neutral='#6C757D',
)

# Print-quality matplotlib defaults (300 dpi, serif fonts)
plt.rcParams.update({
    'figure.dpi': 300,
    'savefig.dpi': 300,
    'font.family': 'serif',
    'font.size': 10,
    'axes.labelsize': 11,
    'axes.titlesize': 12,
    'legend.fontsize': 9,
})

# Class names in label-id order
TARGET_NAMES = ["AI Phishing", "Enron Ham", "Manual Spam", "Nigerian Scam"]

print("\n" + "="*70)
print("LOADING TRAINING RESULTS")
print("="*70)

# Test labels, predictions and probabilities saved by the training run
artifacts = joblib.load(f"{OUT_DIR}/final_model_artifacts.joblib")
y_test, y_pred, y_proba = (artifacts[key] for key in ('y_test', 'y_pred', 'y_proba'))

# Per-base-model scores written by the ablation study
with open(f"{OUT_DIR}/ablation_results.json", "r") as ablation_file:
    ablation = json.load(ablation_file)

# Dataset summary written before training
with open(f"{OUT_DIR}/dataset_info.json", "r") as info_file:
    dataset_info = json.load(info_file)

# Headline metrics are recomputed from the stored predictions
accuracy = accuracy_score(y_test, y_pred)
f1_macro = f1_score(y_test, y_pred, average='macro')

print("✓ Loaded results")
print(f"  Test accuracy: {accuracy:.4f}")
print(f"  F1-macro: {f1_macro:.4f}")
print(f"  Test set size: {len(y_test)}")

# Spread used for the error bars. Preferred source: per-fold test predictions
# saved by training. Fallback: a bootstrap over the stored test predictions,
# so the value always reflects the artifacts on disk rather than numbers
# copied from an old training log.
BOOTSTRAP_RESAMPLES = 1000   # same resample count as the trainer's bootstrap default
BOOTSTRAP_SEED = 42          # fixed so re-running the cell reproduces the error bars

try:
    all_fold_preds = np.load(f"{OUT_DIR}/all_fold_predictions.npy")
    # Calculate fold-wise metrics
    # (presumably shaped (n_folds, n_test, n_classes) - confirm against the writer)
    n_folds = all_fold_preds.shape[0]
    fold_accs = []
    fold_f1s = []
    for fold in range(n_folds):
        y_pred_fold = np.argmax(all_fold_preds[fold], axis=1)
        fold_accs.append(accuracy_score(y_test, y_pred_fold))
        fold_f1s.append(f1_score(y_test, y_pred_fold, average='macro'))
    accuracy_std = np.std(fold_accs)
    f1_std = np.std(fold_f1s)
    has_folds = True
    print(f"✓ Loaded fold predictions")
    print(f"  Fold std - Acc: ±{accuracy_std:.4f}, F1: ±{f1_std:.4f}")
except (OSError, ValueError, IndexError) as fold_err:
    # OSError covers a missing or unreadable file; ValueError/IndexError cover
    # an array whose layout does not match the test labels. Anything else
    # (including KeyboardInterrupt) is deliberately not swallowed.
    has_folds = False
    y_true_arr = np.asarray(y_test)
    y_pred_arr = np.asarray(y_pred)
    boot_rng = np.random.RandomState(BOOTSTRAP_SEED)
    boot_accs = []
    boot_f1s = []
    for _ in range(BOOTSTRAP_RESAMPLES):
        # Resample the test set with replacement
        boot_idx = boot_rng.choice(len(y_true_arr), len(y_true_arr), replace=True)
        boot_accs.append(accuracy_score(y_true_arr[boot_idx], y_pred_arr[boot_idx]))
        boot_f1s.append(f1_score(y_true_arr[boot_idx], y_pred_arr[boot_idx], average='macro'))
    accuracy_std = float(np.std(boot_accs))
    f1_std = float(np.std(boot_f1s))
    print(f"⚠ Fold predictions unavailable ({type(fold_err).__name__}); "
          f"using bootstrap std from test predictions "
          f"(Acc: ±{accuracy_std:.4f}, F1: ±{f1_std:.4f})")

print("\n" + "="*70)
print("GENERATING GRAPHS")
print("="*70)

# =============================================================================
# FIGURE 2: PERFORMANCE COMPARISON
# =============================================================================

print("\nGenerating Figure 2: Performance Comparison...")

fig, axes = plt.subplots(1, 2, figsize=(12, 5))
models = ['Stacking\nEnsemble']

# One entry per panel: (axes, score, spread, y-axis label, panel title)
panel_specs = [
    (axes[0], accuracy, accuracy_std, 'Accuracy', '(a) Test Set Accuracy'),
    (axes[1], f1_macro, f1_std, 'F1-Score (Macro)', '(b) Test Set F1-Macro Score'),
]

for panel_ax, score, spread, y_label, panel_title in panel_specs:
    # Single bar with a symmetric error bar of +/- spread
    panel_bars = panel_ax.bar(
        models, [score], yerr=[spread], capsize=8,
        color=COLORS['stacking'], edgecolor='black', linewidth=1.5, alpha=0.8,
        error_kw={'linewidth': 2, 'ecolor': 'black'},
    )

    panel_ax.set_ylabel(y_label, fontweight='bold')
    panel_ax.set_ylim([0.95, 1.0])
    panel_ax.set_title(panel_title, fontweight='bold', pad=10)
    panel_ax.grid(axis='y', alpha=0.3, linestyle='--')
    panel_ax.set_axisbelow(True)

    # Value label, anchored just above the top of the error bar
    for bar in panel_bars:
        label_x = bar.get_x() + bar.get_width()/2.
        label_y = bar.get_height() + spread + 0.002
        panel_ax.text(label_x, label_y,
                      f'{score:.4f}\n±{spread:.4f}',
                      ha='center', va='bottom', fontweight='bold', fontsize=10)

# Footnote with the dataset size (falls back to the test-set size if absent)
n_samples = dataset_info.get('total_samples', len(y_test))
footnote_box = dict(boxstyle='round', facecolor='lightyellow', edgecolor='orange', linewidth=1)
fig.text(0.5, 0.02, f'Trained on identical preprocessed dataset (n={n_samples:,})',
         ha='center', fontsize=9, style='italic', bbox=footnote_box)

# Leave the bottom 6% of the canvas free for the footnote
plt.tight_layout(rect=[0, 0.06, 1, 1])
fig.savefig(f'{FIGURES_DIR}/Figure_2_Performance_Comparison.png',
            dpi=300, bbox_inches='tight', facecolor='white')
plt.close(fig)
print("✓ Saved Figure_2_Performance_Comparison.png")

# =============================================================================
# FIGURE 6: ABLATION STUDY
# =============================================================================

print("\nGenerating Figure 6: Ablation Study...")

fig, ax = plt.subplots(figsize=(10, 6))

# Display names, in the same order as model_keys plus the ensemble last
models = [
    'Naive Bayes\n(TF-IDF)',
    'Random Forest\n(Light Stylo)',
    'XGBoost\n(TF-IDF + Stylo)',
    'Gradient Boosting\n(SBERT)',
    'Meta-Learner\n(Ensemble)'
]

# Base-model scores from the ablation file, followed by the ensemble's own scores
model_keys = ['nb', 'rf', 'xgb', 'sbt']
accuracy_list = [ablation[key]['accuracy'] for key in model_keys] + [accuracy]
f1_list = [ablation[key]['f1_macro'] for key in model_keys] + [f1_macro]

print("  Ablation metrics loaded:")
for display_name, acc_val, f1_val in zip(models, accuracy_list, f1_list):
    flat_name = display_name.replace('\n', ' ')
    print(f"    {flat_name}: Acc={acc_val:.4f}, F1={f1_val:.4f}")

# Grouped horizontal bars: accuracy above, F1 below each tick position
y_pos = np.arange(len(models))
bar_height = 0.35
series_specs = [
    (y_pos + bar_height/2, accuracy_list, 'Accuracy', COLORS['stacking']),
    (y_pos - bar_height/2, f1_list, 'F1-Macro', COLORS['warning']),
]
drawn_series = []
for bar_positions, values, series_label, series_color in series_specs:
    containers = ax.barh(bar_positions, values, bar_height,
                         label=series_label,
                         color=series_color, edgecolor='black',
                         linewidth=1.5, alpha=0.8)
    drawn_series.append((containers, values))

ax.set_xlabel('Score', fontweight='bold', fontsize=11)
ax.set_title('Ablation Study: Base Model Performance',
             fontweight='bold', fontsize=12, pad=15)
ax.set_yticks(y_pos)
ax.set_yticklabels(models)
ax.set_xlim([0.80, 1.0])
ax.legend(loc='lower right', frameon=True, shadow=True, fontsize=10)
ax.grid(axis='x', alpha=0.3, linestyle='--')
ax.set_axisbelow(True)

# Value label to the right of every bar
for containers, values in drawn_series:
    for bar, val in zip(containers, values):
        label_x = bar.get_width() + 0.003
        label_y = bar.get_y() + bar.get_height()/2.
        ax.text(label_x, label_y, f'{val:.4f}',
                ha='left', va='center', fontsize=9, fontweight='bold')

# Footnote below the axes
note_box = dict(boxstyle='round', facecolor='lightyellow',
                edgecolor='orange', linewidth=1)
fig.text(0.5, 0.02,
         'Meta-learner combines predictions from all base models',
         ha='center', fontsize=9, style='italic', bbox=note_box)

# Leave the bottom 6% of the canvas free for the footnote
plt.tight_layout(rect=[0, 0.06, 1, 1])
fig.savefig(f'{FIGURES_DIR}/Figure_6_Ablation_Study.png',
            dpi=300, bbox_inches='tight', facecolor='white')
plt.close(fig)
print("✓ Saved Figure_6_Ablation_Study.png")

# Closing summary: where the figures went and how to produce the rest
summary_rule = "="*70
summary_lines = (
    "\n" + summary_rule,
    "GRAPH GENERATION COMPLETE",
    summary_rule,
    f"\n✓ Figures saved to: {FIGURES_DIR}",
    "\nGenerated:",
    "  • Figure_2_Performance_Comparison.png",
    "  • Figure_6_Ablation_Study.png",
    "\nTo generate remaining figures, run:",
    "  python generate_graphs_from_results.py",
    summary_rule,
)
for summary_line in summary_lines:
    print(summary_line)


LOADING TRAINING RESULTS
✓ Loaded results
  Test accuracy: 0.9924
  F1-macro: 0.9938
  Test set size: 1053
⚠ Using bootstrap std from training log

GENERATING GRAPHS

Generating Figure 2: Performance Comparison...
✓ Saved Figure_2_Performance_Comparison.png

Generating Figure 6: Ablation Study...
  Ablation metrics loaded:
    Naive Bayes (TF-IDF): Acc=0.9668, F1=0.9572
    Random Forest (Light Stylo): Acc=0.8186, F1=0.8352
    XGBoost (TF-IDF + Stylo): Acc=0.9734, F1=0.9825
    Gradient Boosting (SBERT): Acc=0.9468, F1=0.9385
    Meta-Learner (Ensemble): Acc=0.9924, F1=0.9938
✓ Saved Figure_6_Ablation_Study.png

GRAPH GENERATION COMPLETE

✓ Figures saved to: /content/paper_artifacts_unified_stacking/figures

Generated:
  • Figure_2_Performance_Comparison.png
  • Figure_6_Ablation_Study.png

To generate remaining figures, run:
  python generate_graphs_from_results.py


In [None]:
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import json
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, f1_score

# =============================================================================
# CONFIGURATION
# =============================================================================

# Artifact locations; the figures directory is created on demand
BASE_DIR = "/content/paper_artifacts_unified_stacking"
OUT_DIR = BASE_DIR + "/outputs"
FIGURES_DIR = BASE_DIR + "/figures"

os.makedirs(FIGURES_DIR, exist_ok=True)

# Shared colour palette for every figure
COLORS = dict(
    stacking='#A23B72',
    deberta='#2E86AB',
    success='#06A77D',
    warning='#F18F01',
    neutral='#6C757D',
)

# Print-quality matplotlib defaults (300 dpi, serif fonts)
plt.rcParams.update({
    'figure.dpi': 300,
    'savefig.dpi': 300,
    'font.family': 'serif',
    'font.size': 10,
    'axes.labelsize': 11,
    'axes.titlesize': 12,
    'legend.fontsize': 9,
})

# Class names in label-id order
TARGET_NAMES = ["AI Phishing", "Enron Ham", "Manual Spam", "Nigerian Scam"]

# =============================================================================
# VERIFY AND LOAD RESULTS
# =============================================================================

def check_required_files():
    """Verify that the training outputs needed for plotting are on disk.

    Returns:
        True when every required file exists; otherwise prints the list of
        missing files with instructions and returns False.
    """
    required_files = {
        'model_artifacts': f"{OUT_DIR}/final_model_artifacts.joblib",
        'dataset_info': f"{OUT_DIR}/dataset_info.json",
        'ablation': f"{OUT_DIR}/ablation_results.json",
    }

    missing = [
        (name, path)
        for name, path in required_files.items()
        if not os.path.exists(path)
    ]

    # Guard clause: nothing missing means nothing to report
    if not missing:
        return True

    rule = "="*70
    print("\n" + rule)
    print("ERROR: MISSING REQUIRED FILES")
    print(rule)
    print("\nThe following files are required but not found:")
    for name, path in missing:
        print(f"  ✗ {name}: {path}")
    print("\nPlease run the training script first:")
    print("  python unified_stacking_ensemble.py")
    print("\nThen run this script to generate graphs.")
    print(rule)
    return False


def load_results():
    """Read the saved training outputs and derive the test-set metrics.

    Returns:
        Dict with ``y_test``, ``y_pred``, ``y_proba``, ``dataset_info``,
        ``ablation``, ``metrics`` and ``has_fold_data``; ``all_fold_predictions``
        is present only when the optional fold-prediction file exists.
    """
    rule = "="*70
    print("\n" + rule)
    print("LOADING TRAINING RESULTS")
    print(rule)

    results = {}

    # Test labels, predictions and probabilities from the joblib bundle
    print("\nLoading artifacts...")
    artifacts = joblib.load(f"{OUT_DIR}/final_model_artifacts.joblib")
    for key in ('y_test', 'y_pred', 'y_proba'):
        results[key] = artifacts[key]
    print(f"  ✓ Test set size: {len(results['y_test'])}")

    # Dataset summary
    with open(f"{OUT_DIR}/dataset_info.json", "r") as info_file:
        results['dataset_info'] = json.load(info_file)
    print(f"  ✓ Total samples: {results['dataset_info']['total_samples']}")

    # Per-base-model ablation scores
    with open(f"{OUT_DIR}/ablation_results.json", "r") as ablation_file:
        results['ablation'] = json.load(ablation_file)
    print(f"  ✓ Ablation models: {len(results['ablation'])}")

    # Metrics are recomputed from the stored predictions
    print("\nCalculating metrics from predictions...")
    metrics = calculate_metrics(results['y_test'], results['y_pred'])
    results['metrics'] = metrics
    print(f"  ✓ Accuracy: {metrics['accuracy']:.4f}")
    print(f"  ✓ F1-Macro: {metrics['f1_macro']:.4f}")

    # Fold predictions are optional; their absence only disables the CV graphs
    try:
        results['all_fold_predictions'] = np.load(f"{OUT_DIR}/all_fold_predictions.npy")
    except FileNotFoundError:
        print("  ⚠ Fold predictions not found - CV graphs will be skipped")
        results['has_fold_data'] = False
    else:
        print(f"  ✓ Fold predictions loaded: {results['all_fold_predictions'].shape[0]} folds")
        results['has_fold_data'] = True

    print("\n" + rule)
    return results


def calculate_metrics(y_test, y_pred):
    """Compute aggregate and per-class metrics for one set of predictions.

    Args:
        y_test: True labels.
        y_pred: Predicted labels, aligned with ``y_test``.

    Returns:
        Dict with ``accuracy``, ``f1_macro``, ``f1_weighted``, the per-class
        ``precision_per_class`` / ``recall_per_class`` / ``f1_per_class`` /
        ``support`` arrays, and ``confusion_matrix``.
    """
    from sklearn.metrics import confusion_matrix

    # Per-class arrays come back together from a single call
    precision, recall, f1_per_class, support = precision_recall_fscore_support(
        y_test, y_pred, average=None, zero_division=0
    )

    return {
        'accuracy': accuracy_score(y_test, y_pred),
        'f1_macro': f1_score(y_test, y_pred, average='macro'),
        'f1_weighted': f1_score(y_test, y_pred, average='weighted'),
        'precision_per_class': precision,
        'recall_per_class': recall,
        'f1_per_class': f1_per_class,
        'support': support,
        'confusion_matrix': confusion_matrix(y_test, y_pred),
    }


def calculate_fold_metrics(y_test, all_fold_predictions):
    """Score each fold's test-set probabilities against the true labels.

    Args:
        y_test: True test labels.
        all_fold_predictions: Array whose first axis indexes folds; each
            slice holds per-sample class scores that are argmax-ed over axis 1.

    Returns:
        List with one dict per fold: 1-based ``fold`` number, ``accuracy``
        and ``f1_macro`` (both plain floats).
    """
    def score_fold(fold_no, fold_probs):
        """Turn one fold's probabilities into hard labels and score them."""
        y_pred_fold = np.argmax(fold_probs, axis=1)
        return {
            'fold': fold_no,
            'accuracy': float(accuracy_score(y_test, y_pred_fold)),
            'f1_macro': float(f1_score(y_test, y_pred_fold, average='macro')),
        }

    return [
        score_fold(fold_idx + 1, all_fold_predictions[fold_idx])
        for fold_idx in range(all_fold_predictions.shape[0])
    ]


# =============================================================================
# GRAPH GENERATION (ONLY FROM ACTUAL DATA)
# =============================================================================

def _draw_score_panel(ax, model_label, score, score_std, ylabel, title):
    """Draw one single-bar panel of Figure 2.

    Args:
        ax: Matplotlib axes to draw on.
        model_label: Tick label shown under the bar.
        score: Bar height (the score being reported).
        score_std: Spread drawn as an error bar; when it is not positive no
            error bar is drawn and the annotation omits the "±" line.
        ylabel: Y-axis label.
        title: Panel title.
    """
    bar_kwargs = dict(color=COLORS['stacking'], edgecolor='black',
                      linewidth=1.5, alpha=0.8)
    show_std = score_std > 0
    if show_std:
        bar_kwargs.update(yerr=[score_std], capsize=8,
                          error_kw={'linewidth': 2, 'ecolor': 'black'})
    bars = ax.bar([model_label], [score], **bar_kwargs)

    # Zoom into the top of the scale, normally never below 0.9.  If the score
    # itself is at or below that floor the bar would lie entirely outside the
    # axis, so fall back to a window just under the score instead.
    lower = max(0.9, score - 0.05)
    if lower >= score:
        lower = max(0.0, score - 0.05)

    ax.set_ylabel(ylabel, fontweight='bold')
    ax.set_ylim([lower, 1.0])
    ax.set_title(title, fontweight='bold', pad=10)
    ax.grid(axis='y', alpha=0.3, linestyle='--')
    ax.set_axisbelow(True)

    for bar in bars:
        x_mid = bar.get_x() + bar.get_width() / 2.
        top = bar.get_height()
        if show_std:
            # Place the label above the error-bar cap rather than the bar top.
            ax.text(x_mid, top + score_std + 0.003,
                    f'{score:.3f}\n±{score_std:.3f}',
                    ha='center', va='bottom', fontweight='bold', fontsize=9)
        else:
            ax.text(x_mid, top + 0.003,
                    f'{score:.3f}',
                    ha='center', va='bottom', fontweight='bold', fontsize=9)


def generate_figure_2(results):
    """Figure 2: Performance comparison.

    Draws two single-bar panels (accuracy and macro F1) for the stacking
    ensemble and saves them as ``Figure_2_Performance_Comparison.png`` in
    ``FIGURES_DIR``.  When fold predictions are available, the standard
    deviation of the per-fold scores is shown as an error bar.

    Args:
        results: Dict providing ``metrics`` (with ``accuracy`` and
            ``f1_macro``), ``has_fold_data``, ``dataset_info`` (with
            ``total_samples``) and, when ``has_fold_data`` is true,
            ``y_test`` and ``all_fold_predictions``.
    """
    print("\nGenerating Figure 2: Performance Comparison...")

    fig, axes = plt.subplots(1, 2, figsize=(12, 5))

    metrics = results['metrics']

    # Calculate std from fold predictions if available
    if results['has_fold_data']:
        fold_metrics = calculate_fold_metrics(results['y_test'], results['all_fold_predictions'])
        accuracy_std = np.std([m['accuracy'] for m in fold_metrics])
        f1_std = np.std([m['f1_macro'] for m in fold_metrics])
        print(f"  Using actual CV std: ±{accuracy_std:.4f} (acc), ±{f1_std:.4f} (f1)")
    else:
        print("  ⚠ No fold data - skipping error bars")
        accuracy_std = 0
        f1_std = 0

    model_label = 'Stacking\nEnsemble'
    _draw_score_panel(axes[0], model_label, metrics['accuracy'], accuracy_std,
                      'Accuracy', '(a) Test Set Accuracy')
    _draw_score_panel(axes[1], model_label, metrics['f1_macro'], f1_std,
                      'F1-Score (Macro)', '(b) Test Set F1-Macro Score')

    n_samples = results['dataset_info']['total_samples']
    fig.text(0.5, 0.02, f'Trained on identical preprocessed dataset (n={n_samples:,})',
            ha='center', fontsize=9, style='italic',
            bbox=dict(boxstyle='round', facecolor='lightyellow', edgecolor='orange', linewidth=1))

    # Reserve the bottom strip of the figure for the footnote box.
    plt.tight_layout(rect=[0, 0.06, 1, 1])
    fig.savefig(f'{FIGURES_DIR}/Figure_2_Performance_Comparison.png', dpi=300, bbox_inches='tight')
    plt.close(fig)
    print("  ✓ Saved Figure_2_Performance_Comparison.png")


def generate_figure_3(results):
    """Figure 3: Per-class F1 scores.

    Draws one bar per email class and saves the chart as
    ``Figure_3_Per_Class_F1.png`` in ``FIGURES_DIR``.

    Args:
        results: Dict whose ``metrics['f1_per_class']`` holds one F1 value per
            class, in the same order as the four class labels used here.

    Raises:
        ValueError: If the number of F1 values differs from the number of
            class labels.
    """
    print("\nGenerating Figure 3: Per-Class F1 Scores...")

    f1_per_class = results['metrics']['f1_per_class']
    classes = ['AI\nPhishing', 'Enron\nHam', 'Manual\nSpam', 'Nigerian\nScam']

    # Fail with a clear message instead of letting matplotlib broadcast a
    # short array across all bars or raise an opaque shape error.
    if len(f1_per_class) != len(classes):
        raise ValueError(
            f"Expected {len(classes)} per-class F1 scores, got {len(f1_per_class)}"
        )

    fig, ax = plt.subplots(figsize=(10, 6))

    x = np.arange(len(classes))
    width = 0.6

    bars = ax.bar(x, f1_per_class, width,
                  color=COLORS['stacking'], edgecolor='black', linewidth=1.5, alpha=0.8)

    # Zoom into the top of the scale, normally never below 0.85.  If the
    # weakest class is at or below that floor its bar would be clipped away,
    # so fall back to a window just under the weakest score.
    lowest = min(f1_per_class)
    lower = max(0.85, lowest - 0.05)
    if lower >= lowest:
        lower = max(0.0, lowest - 0.05)

    ax.set_xlabel('Email Class', fontweight='bold', fontsize=11)
    ax.set_ylabel('F1-Score', fontweight='bold', fontsize=11)
    ax.set_title('Per-Class F1-Score Performance', fontweight='bold', fontsize=12, pad=15)
    ax.set_xticks(x)
    ax.set_xticklabels(classes)
    ax.set_ylim([lower, 1.0])
    ax.grid(axis='y', alpha=0.3, linestyle='--')
    ax.set_axisbelow(True)

    # Annotate each bar with its value just above the bar top.
    for bar, f1_val in zip(bars, f1_per_class):
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width()/2., height + 0.005,
               f'{f1_val:.3f}',
               ha='center', va='bottom', fontsize=9, fontweight='bold')

    plt.tight_layout()
    fig.savefig(f'{FIGURES_DIR}/Figure_3_Per_Class_F1.png', dpi=300, bbox_inches='tight')
    plt.close(fig)
    print("  ✓ Saved Figure_3_Per_Class_F1.png")


def generate_figure_4(results):
    """Figure 4: Confusion matrix.

    Renders ``results['metrics']['confusion_matrix']`` as an annotated
    seaborn heatmap and saves it as ``Figure_4_Confusion_Matrix.png`` in
    ``FIGURES_DIR``.
    """
    print("\nGenerating Figure 4: Confusion Matrix...")

    fig, ax = plt.subplots(figsize=(8, 7))

    matrix = results['metrics']['confusion_matrix']
    tick_labels = ['AI\nPhishing', 'Enron\nHam', 'Manual\nSpam', 'Nigerian\nScam']

    # fmt='d' prints each cell as an integer count.
    heatmap_options = {
        'annot': True,
        'fmt': 'd',
        'cmap': 'Purples',
        'xticklabels': tick_labels,
        'yticklabels': tick_labels,
        'cbar_kws': {'label': 'Count'},
        'linewidths': 0.5,
        'linecolor': 'gray',
    }
    sns.heatmap(matrix, ax=ax, **heatmap_options)

    ax.set_title('Confusion Matrix - Stacking Ensemble',
                 fontweight='bold', fontsize=12, pad=10)
    # Rows are the true labels, columns the predicted ones.
    ax.set_ylabel('True Label', fontweight='bold')
    ax.set_xlabel('Predicted Label', fontweight='bold')

    plt.tight_layout()
    output_path = f'{FIGURES_DIR}/Figure_4_Confusion_Matrix.png'
    fig.savefig(output_path, dpi=300, bbox_inches='tight')
    plt.close(fig)
    print("  ✓ Saved Figure_4_Confusion_Matrix.png")


def generate_figure_5(results):
    """Figure 5: Cross-validation performance (only if fold data exists).

    Plots per-fold accuracy and macro F1 as two line panels and saves them as
    ``Figure_5_Cross_Validation.png`` in ``FIGURES_DIR``.  The figure is
    skipped when ``results['has_fold_data']`` is false or the fold array
    contains no folds.

    Args:
        results: Dict providing ``has_fold_data`` and, when it is true,
            ``y_test`` and ``all_fold_predictions``.
    """
    if not results['has_fold_data']:
        print("\nSkipping Figure 5: No fold prediction data available")
        print("  To generate this figure, ensure all_fold_predictions.npy is saved during training")
        return

    print("\nGenerating Figure 5: Cross-Validation Performance...")

    fold_metrics = calculate_fold_metrics(results['y_test'], results['all_fold_predictions'])
    if not fold_metrics:
        # An empty fold array has nothing to plot and would break min()/max().
        print("  ⚠ Fold prediction array is empty - skipping Figure 5")
        return

    # Use the real fold count in the titles instead of assuming five folds.
    n_folds = len(fold_metrics)
    folds = np.arange(1, n_folds + 1)

    # (metric key, y-axis label, panel title) for the two panels.
    panels = (
        ('accuracy', 'Validation Accuracy',
         f'(a) {n_folds}-Fold Cross-Validation: Accuracy'),
        ('f1_macro', 'Validation F1-Macro',
         f'(b) {n_folds}-Fold Cross-Validation: F1-Macro'),
    )

    fig, axes = plt.subplots(1, 2, figsize=(12, 5))

    for ax, (metric_key, ylabel, title) in zip(axes, panels):
        values = [m[metric_key] for m in fold_metrics]

        ax.plot(folds, values, marker='o', linewidth=2.5, markersize=8,
                color=COLORS['stacking'], label='Stacking Ensemble')
        ax.fill_between(folds, values, alpha=0.2, color=COLORS['stacking'])
        ax.set_xlabel('Fold Number', fontweight='bold')
        ax.set_ylabel(ylabel, fontweight='bold')
        ax.set_title(title, fontweight='bold', pad=10)
        ax.set_xticks(folds)

        # Zoom around the observed range, normally never below 0.9.  If a fold
        # scores at or below that floor the fixed floor would hide it (or even
        # exceed the upper limit and invert the axis), so follow the data.
        lowest, highest = min(values), max(values)
        lower = max(0.9, lowest - 0.02)
        if lower >= lowest:
            lower = max(0.0, lowest - 0.02)
        ax.set_ylim([lower, min(1.0, highest + 0.02)])

        ax.legend(loc='lower right', frameon=True, shadow=True)
        ax.grid(True, alpha=0.3, linestyle='--')

    fig.text(0.5, 0.02, 'GroupKFold cross-validation (grouped by content hash) prevents data leakage',
            ha='center', fontsize=9, style='italic',
            bbox=dict(boxstyle='round', facecolor='lightyellow', edgecolor='orange', linewidth=1))

    # Reserve the bottom strip of the figure for the footnote box.
    plt.tight_layout(rect=[0, 0.06, 1, 1])
    fig.savefig(f'{FIGURES_DIR}/Figure_5_Cross_Validation.png', dpi=300, bbox_inches='tight')
    plt.close(fig)
    print("  ✓ Saved Figure_5_Cross_Validation.png")


def generate_figure_6(results):
    """Figure 6: Ablation study.

    Draws paired horizontal bars (accuracy and macro F1) for each base model
    and for the final ensemble, and saves the chart as
    ``Figure_6_Ablation_Study.png`` in ``FIGURES_DIR``.

    Args:
        results: Dict providing ``ablation`` (keyed by ``nb``, ``rf``, ``xgb``
            and ``sbt``, each with ``accuracy`` and ``f1_macro``) and
            ``metrics`` (``accuracy`` and ``f1_macro`` of the ensemble).
    """
    print("\nGenerating Figure 6: Ablation Study...")

    fig, ax = plt.subplots(figsize=(10, 6))

    ablation = results['ablation']

    models = ['Naive Bayes\n(TF-IDF)', 'Random Forest\n(Light Stylo)',
              'XGBoost\n(TF-IDF + Stylo)', 'Gradient Boosting\n(SBERT)',
              'Meta-Learner\n(Ensemble)']

    # Keys of the base models, in the same order as the first four labels.
    model_keys = ['nb', 'rf', 'xgb', 'sbt']
    accuracy = [ablation[k]['accuracy'] for k in model_keys]
    f1_macro = [ablation[k]['f1_macro'] for k in model_keys]

    # Add meta-learner (final ensemble) results
    accuracy.append(results['metrics']['accuracy'])
    f1_macro.append(results['metrics']['f1_macro'])

    y = np.arange(len(models))
    height = 0.35

    bars1 = ax.barh(y + height/2, accuracy, height, label='Accuracy',
                    color=COLORS['stacking'], edgecolor='black', linewidth=1.5, alpha=0.8)
    bars2 = ax.barh(y - height/2, f1_macro, height, label='F1-Macro',
                    color=COLORS['warning'], edgecolor='black', linewidth=1.5, alpha=0.8)

    # Zoom into the top of the scale, normally never below 0.85.  A weak base
    # model scoring at or below that floor would have its bar hidden and its
    # value label pushed off the plot, so follow the data in that case.
    lowest = min(min(accuracy), min(f1_macro))
    lower = max(0.85, lowest - 0.02)
    if lower >= lowest:
        lower = max(0.0, lowest - 0.02)

    ax.set_xlabel('Score', fontweight='bold', fontsize=11)
    ax.set_title('Ablation Study: Base Model Performance', fontweight='bold', fontsize=12, pad=15)
    ax.set_yticks(y)
    ax.set_yticklabels(models)
    ax.set_xlim([lower, 1.0])
    ax.legend(loc='lower right', frameon=True, shadow=True)
    ax.grid(axis='x', alpha=0.3, linestyle='--')
    ax.set_axisbelow(True)

    # Print each bar's value just past its right-hand end.
    for bars in [bars1, bars2]:
        for bar in bars:
            width = bar.get_width()
            ax.text(width + 0.002, bar.get_y() + bar.get_height()/2.,
                   f'{width:.3f}',
                   ha='left', va='center', fontsize=8, fontweight='bold')

    plt.tight_layout()
    fig.savefig(f'{FIGURES_DIR}/Figure_6_Ablation_Study.png', dpi=300, bbox_inches='tight')
    plt.close(fig)
    print("  ✓ Saved Figure_6_Ablation_Study.png")


def generate_figure_8(results):
    """Figure 8: Dataset composition.

    Draws a pie chart of the per-class sample counts and saves it as
    ``Figure_8_Dataset_Composition.png`` in ``FIGURES_DIR``.

    Args:
        results: Dict whose ``dataset_info['class_distribution']`` maps class
            ids 0-3 (as ints or numeric strings) to sample counts.  Classes
            absent from the mapping are treated as having zero samples.

    Raises:
        ValueError: If the four classes have no samples in total, since a pie
            chart cannot be drawn from all-zero counts.
    """
    print("\nGenerating Figure 8: Dataset Composition...")

    # Normalise every key to int; this handles string, int and mixed keys
    # alike and does not fail on an empty mapping.
    raw_dist = results['dataset_info']['class_distribution']
    class_dist = {int(label): count for label, count in raw_dist.items()}

    classes = ['AI Phishing', 'Enron Ham', 'Manual Spam', 'Nigerian Scam']
    counts = [class_dist.get(i, 0) for i in range(len(classes))]
    total = sum(counts)
    if total <= 0:
        raise ValueError(
            "class_distribution contains no samples for classes 0-3; "
            "cannot draw the dataset composition chart"
        )

    fig, ax = plt.subplots(figsize=(8, 6))

    # Pie chart
    colors_pie = [COLORS['warning'], COLORS['success'], COLORS['stacking'], COLORS['deberta']]
    explode = (0.05, 0.05, 0.05, 0.05)

    wedges, texts, autotexts = ax.pie(counts, labels=classes, autopct='%1.1f%%',
                                        colors=colors_pie, explode=explode,
                                        shadow=True, startangle=90,
                                        textprops={'fontweight': 'bold'})

    # Percentage labels sit on the coloured wedges, so render them in white.
    for autotext in autotexts:
        autotext.set_color('white')
        autotext.set_fontsize(10)

    ax.set_title('Final Dataset Distribution', fontweight='bold', fontsize=12, pad=20)

    # The multi-line counts block is anchored at its bottom edge and grows
    # upward, so the total box is placed above it; otherwise the two texts
    # overlap.
    count_text = "\n".join([f"{name}: {count:,}" for name, count in zip(classes, counts)])
    fig.text(0.5, 0.01, count_text, ha='center', va='bottom', fontsize=9, style='italic')
    fig.text(0.5, 0.15, f'Total Samples: {total:,}', ha='center', va='bottom', fontsize=11,
            bbox=dict(boxstyle='round', facecolor='lightyellow', edgecolor='orange', linewidth=1.5))

    # Reserve the bottom strip of the figure for the two text blocks.
    plt.tight_layout(rect=[0, 0.22, 1, 1])
    fig.savefig(f'{FIGURES_DIR}/Figure_8_Dataset_Composition.png', dpi=300, bbox_inches='tight')
    plt.close(fig)
    print("  ✓ Saved Figure_8_Dataset_Composition.png")


# =============================================================================
# MAIN EXECUTION
# =============================================================================

def main():
    """Run the whole figure pipeline: verify inputs, load results, draw figures.

    Exits the process with status 1 when ``check_required_files()`` reports a
    problem.  Otherwise every figure generator is called in turn and the PNG
    files found in ``FIGURES_DIR`` are listed.
    """
    rule = "=" * 70

    print("\n" + rule)
    print("SPRINGER-QUALITY GRAPH GENERATOR")
    print("Generates figures from ACTUAL training results ONLY")
    print(rule)

    # Abort early when the required input files are not in place.
    if not check_required_files():
        sys.exit(1)

    results = load_results()

    print("\n" + rule)
    print("GENERATING PUBLICATION FIGURES")
    print(rule)

    # Figure 5 skips itself when no fold predictions were loaded.
    figure_generators = (
        generate_figure_2,
        generate_figure_3,
        generate_figure_4,
        generate_figure_5,
        generate_figure_6,
        generate_figure_8,
    )
    for make_figure in figure_generators:
        make_figure(results)

    print("\n" + rule)
    print("GRAPH GENERATION COMPLETE")
    print(rule)
    print(f"\n✓ All figures saved to: {FIGURES_DIR}")
    print("\nGenerated figures:")
    png_names = sorted(
        entry for entry in os.listdir(FIGURES_DIR) if entry.endswith('.png')
    )
    for png_name in png_names:
        print(f"  • {png_name}")

    print("\n" + rule)
    print("READY FOR PUBLICATION!")
    print("All graphs generated from actual experimental results.")
    print("No dummy data or hardcoded values used.")
    print(rule)


# Run the figure pipeline when this code is executed directly; inside a
# notebook cell __name__ is "__main__", so main() runs when the cell runs.
if __name__ == "__main__":
    main()


SPRINGER-QUALITY GRAPH GENERATOR
Generates figures from ACTUAL training results ONLY

LOADING TRAINING RESULTS

Loading artifacts...
  ✓ Test set size: 1053
  ✓ Total samples: 5264
  ✓ Ablation models: 4

Calculating metrics from predictions...
  ✓ Accuracy: 0.9924
  ✓ F1-Macro: 0.9938
  ⚠ Fold predictions not found - CV graphs will be skipped


GENERATING PUBLICATION FIGURES

Generating Figure 2: Performance Comparison...
  ⚠ No fold data - skipping error bars
  ✓ Saved Figure_2_Performance_Comparison.png

Generating Figure 3: Per-Class F1 Scores...
  ✓ Saved Figure_3_Per_Class_F1.png

Generating Figure 4: Confusion Matrix...
  ✓ Saved Figure_4_Confusion_Matrix.png

Skipping Figure 5: No fold prediction data available
  To generate this figure, ensure all_fold_predictions.npy is saved during training

Generating Figure 6: Ablation Study...
  ✓ Saved Figure_6_Ablation_Study.png

Generating Figure 8: Dataset Composition...
  ✓ Saved Figure_8_Dataset_Composition.png

GRAPH GENERATION CO