# HN Success Predictor V3.2 - Clean Architecture

**Architecture:** RoBERTa + Label-Smoothed CrossEntropy → Isotonic Calibration

**Key Changes from V3.1 (based on statistical analysis):**

### Problem Diagnosis
1. **Data Contamination**: V3.1 mislabeled medium posts (30-50 pts → label=1), corrupting the decision boundary
2. **Focal Loss Overcorrection**: α=0.25 caused 3.5x more FN than FP (760 vs 219) - model too conservative
3. **Stacking Complexity**: LightGBM added noise; RoBERTa dominated at 84.4% importance anyway
4. **V1 Outperformed V3.1**: Simpler DistilBERT (ROC 0.77) beat complex stacking (ROC 0.72)

### V3.2 Design Principles
- **Clean Labels**: Strict binary labeling (≥100 pts = 1, else = 0) - no medium-post contamination
- **V1-style Data Distribution**: High (100+ → 1), Medium (20-99 → 0), Low (1-19 → 0) for proper boundary learning
- **Standard Loss**: Label-smoothed CrossEntropy with inverse-frequency class weights (statistically principled)
- **No Stacking**: Pure RoBERTa end-to-end training (Occam's razor)
- **Keep Isotonic Calibration**: V3.1's ECE=0.01 was genuinely excellent

### Expected Improvements
| Metric | V3.1 | V3.2 Target |
|--------|------|-------------|
| ROC AUC | 0.715 | **≥0.77** (match V1) |
| Precision | 0.589 | **≥0.62** |
| ECE | 0.013 | **≤0.05** |

In [None]:
!nvidia-smi

In [None]:
!pip install -q transformers datasets accelerate scikit-learn pandas tqdm lightgbm seaborn joblib

In [None]:
import requests
import time
import json
import re
import math
import os
from datetime import datetime, timedelta
from urllib.parse import urlparse
from collections import Counter, defaultdict

import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
import joblib

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    roc_auc_score, accuracy_score, precision_score, recall_score,
    f1_score, confusion_matrix, precision_recall_curve, roc_curve,
    average_precision_score
)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.isotonic import IsotonicRegression
from datasets import Dataset
import lightgbm as lgb

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(_device_name)
print(f"Using device: {device}")

## 2. Configuration

In [None]:
# =============================================================================
# V3.2 CONFIGURATION - Statistically Principled Settings
# =============================================================================

# --- Model and optimisation hyper-parameters ---
MODEL_NAME = "roberta-base"  # RoBERTa kept for a fair comparison; distilbert-base-uncased is an alternative
MAX_LENGTH = 128  # Token budget per title
BATCH_SIZE = 32
LEARNING_RATE = 2e-5  # Fine-tuning learning rate
WEIGHT_DECAY = 0.01  # L2 regularization strength
EPOCHS = 4  # Upper bound; early stopping may end training sooner
WARMUP_RATIO = 0.1  # Fraction of training used for linear warmup
SEED = 42

# --- Label smoothing: share of probability mass moved off the hard label ---
LABEL_SMOOTHING = 0.1

# --- Data collection: posts fetched per score category, and the hit cut-off ---
POSTS_PER_CATEGORY = 25000
HIT_THRESHOLD = 100

# --- Reproducibility: seed NumPy and PyTorch (CPU, plus every GPU if present) ---
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# --- Echo the configuration so it is recorded in the notebook output ---
_RULE = "=" * 60
for _line in (
    _RULE,
    "V3.2 Configuration",
    _RULE,
    f"  Model: {MODEL_NAME}",
    f"  Loss: CrossEntropy + Label Smoothing ({LABEL_SMOOTHING})",
    f"  Epochs: {EPOCHS} (with early stopping)",
    f"  Learning Rate: {LEARNING_RATE}",
    f"  Hit Threshold: ‚â•{HIT_THRESHOLD} points",
    f"  Posts per category: {POSTS_PER_CATEGORY:,}",
    _RULE,
):
    print(_line)

## 3. Data Fetching

In [None]:
def fetch_hn_posts(min_points, max_points, target_count, months_back=24):
    """Fetch HN stories within a score range from the Algolia search API.

    Walks backwards from the current time in consecutive 30-day windows and
    requests up to 10 pages (1000 hits each) per window, de-duplicating by
    ``objectID``, until ``target_count`` posts are collected or the windows
    run out.

    Args:
        min_points: Inclusive lower bound on the story score.
        max_points: Inclusive upper bound on the score, or ``None`` for no
            upper bound.
        target_count: Maximum number of posts to return.
        months_back: Number of 30-day windows to scan.

    Returns:
        A list of at most ``target_count`` dicts with the keys ``title``,
        ``url``, ``domain``, ``score``, ``submitter``, ``timestamp`` and
        ``num_comments``. Request or decoding failures are printed and end
        the current window early instead of raising.
    """
    base_url = "https://hn.algolia.com/api/v1/search_by_date"
    all_posts = []
    seen_ids = set()

    # Compare against None so an explicit upper bound of 0 is still applied.
    if max_points is not None:
        points_filter = f"points>={min_points},points<={max_points}"
        desc = f"{min_points}-{max_points} pts"
    else:
        points_filter = f"points>={min_points}"
        desc = f"{min_points}+ pts"

    end_date = datetime.now()
    pbar = tqdm(range(months_back), desc=f"Fetching {desc}")

    for month_offset in pbar:
        month_end = end_date - timedelta(days=30 * month_offset)
        month_start = month_end - timedelta(days=30)
        # Adjacent windows share one boundary second; seen_ids absorbs the overlap.
        time_filter = (
            f"created_at_i>={int(month_start.timestamp())},"
            f"created_at_i<={int(month_end.timestamp())}"
        )

        # Paginate within each window (up to 10 pages of 1000)
        for page in range(10):
            if len(all_posts) >= target_count:
                break

            params = {
                "tags": "story",
                "numericFilters": f"{points_filter},{time_filter}",
                "hitsPerPage": 1000,
                "page": page,
            }

            # Only the network call and JSON decoding are best-effort.
            try:
                resp = requests.get(base_url, params=params, timeout=30)
                resp.raise_for_status()
                page_hits = resp.json().get("hits", [])
            except (requests.RequestException, ValueError) as e:
                print(f"Error: {e}")
                break

            if not page_hits:
                break

            for hit in page_hits:
                post_id = hit.get("objectID")
                if not post_id or post_id in seen_ids:
                    continue
                seen_ids.add(post_id)

                # A key that is present with a JSON null comes back as None,
                # so `or` is used to normalise to the intended default.
                url = hit.get("url") or ""
                domain = ""
                if url:
                    try:
                        domain = urlparse(url).netloc
                    except ValueError:
                        # urlparse rejects some malformed hosts; keep domain empty.
                        domain = ""
                    # Strip only a leading "www.", not matches inside the host.
                    if domain.startswith("www."):
                        domain = domain[len("www."):]

                all_posts.append({
                    "title": hit.get("title") or "",
                    "url": url,
                    "domain": domain,
                    "score": hit.get("points") or 0,
                    "submitter": hit.get("author") or "",
                    "timestamp": hit.get("created_at") or "",
                    "num_comments": hit.get("num_comments") or 0,
                })

            pbar.set_postfix({"total": len(all_posts)})
            time.sleep(0.2)  # Pause between requests

        if len(all_posts) >= target_count:
            break

    # Breaking out of the loop early leaves the bar open unless closed explicitly.
    pbar.close()
    return all_posts[:target_count]

print("‚úì Data fetching function defined (time-based pagination)")

In [None]:
# =============================================================================
# DATA FETCHING - V1-style distribution (Critical Fix)
# =============================================================================
# 
# V3.1 BUG: Medium posts (30-50) were labeled as hits, contaminating the 
# positive class with posts that objectively did NOT reach 100 points.
#
# V3.2 FIX: Three clear categories, all labeled by strict threshold:
#   - HIGH:   100+ points  → label = 1 (true hits)
#   - MEDIUM: 20-99 points → label = 0 (near-misses, critical for boundary)
#   - LOW:    1-19 points  → label = 0 (clear misses)
#
# This ensures the model learns the TRUE decision boundary at 100 points.
# =============================================================================

print("Fetching HN posts (V1-style distribution)...")
print("-" * 60)

# Category 1: HITS (score >= HIT_THRESHOLD) - positives
print(f"\nüìà Fetching {POSTS_PER_CATEGORY:,} HIT posts (score ‚â• {HIT_THRESHOLD})...")
hits = fetch_hn_posts(
    min_points=HIT_THRESHOLD,
    max_points=None,
    target_count=POSTS_PER_CATEGORY,
    months_back=24
)
print(f"   ‚Üí Got {len(hits):,} posts (mean score: {np.mean([p['score'] for p in hits]):.0f})")

# Category 2: MEDIUM (score 20-99) - near-miss negatives
print(f"\nüìä Fetching {POSTS_PER_CATEGORY:,} MEDIUM posts (score 20-99)...")
medium = fetch_hn_posts(
    min_points=20,
    max_points=99,
    target_count=POSTS_PER_CATEGORY,
    months_back=24
)
print(f"   ‚Üí Got {len(medium):,} posts (mean score: {np.mean([p['score'] for p in medium]):.0f})")

# Category 3: LOW (score 1-19) - clear negatives
print(f"\nüìâ Fetching {POSTS_PER_CATEGORY:,} LOW posts (score 1-19)...")
low = fetch_hn_posts(
    min_points=1,
    max_points=19,
    target_count=POSTS_PER_CATEGORY,
    months_back=24
)
print(f"   ‚Üí Got {len(low):,} posts (mean score: {np.mean([p['score'] for p in low]):.0f})")

# Fail fast with a clear message: fetch_hn_posts swallows request errors and
# may return nothing, and a single-class dataset cannot be split, weighted or
# scored further down.
if not hits or not (medium or low):
    raise RuntimeError(
        f"Fetch returned {len(hits)} hits, {len(medium)} medium and {len(low)} low posts; "
        "both classes are required to continue."
    )

# =============================================================================
# STRICT LABELING - No contamination
# =============================================================================
for p in hits:
    p["label"] = 1  # Only posts fetched with score >= HIT_THRESHOLD get label=1

for p in medium:
    p["label"] = 0  # Medium posts are negatives

for p in low:
    p["label"] = 0  # Clear misses

# Combine datasets
all_posts = hits + medium + low
df = pd.DataFrame(all_posts)
df["timestamp"] = pd.to_datetime(df["timestamp"], errors="coerce")

# Data quality checks
n_hits = (df["label"] == 1).sum()
n_misses = (df["label"] == 0).sum()
hit_rate = n_hits / len(df)

print("\n" + "=" * 60)
print("DATA SUMMARY")
print("=" * 60)
print(f"  Total posts:     {len(df):,}")
print(f"  Hits (label=1):  {n_hits:,} ({hit_rate:.1%})")
print(f"  Misses (label=0): {n_misses:,} ({1-hit_rate:.1%})")
print(f"  Class ratio:     1:{n_misses/n_hits:.1f} (neg:pos)")
print("-" * 60)
print("  Score distribution by label:")
print(f"    Hits:   min={df[df['label']==1]['score'].min()}, max={df[df['label']==1]['score'].max()}, mean={df[df['label']==1]['score'].mean():.0f}")
print(f"    Misses: min={df[df['label']==0]['score'].min()}, max={df[df['label']==0]['score'].max()}, mean={df[df['label']==0]['score'].mean():.0f}")
print("=" * 60)

# Verify label integrity in both directions: no positive below the threshold
# and no negative at or above it.
contaminated = df[(df["label"] == 1) & (df["score"] < HIT_THRESHOLD)]
mislabeled_negatives = df[(df["label"] == 0) & (df["score"] >= HIT_THRESHOLD)]
if len(contaminated) > 0:
    print(f"‚ö†Ô∏è  WARNING: {len(contaminated)} posts labeled as hits but score < {HIT_THRESHOLD}")
if len(mislabeled_negatives) > 0:
    print(f"WARNING: {len(mislabeled_negatives)} posts labeled as misses but score >= {HIT_THRESHOLD}")
if len(contaminated) == 0 and len(mislabeled_negatives) == 0:
    print("‚úì Label integrity verified: No contamination detected")

## 4. Feature Engineering

In [None]:
# =============================================================================
# MINIMAL FEATURE ENGINEERING - Only Genuinely Useful Features
# =============================================================================
#
# V3.1 had 25 features but RoBERTa dominated at 84.4%. Most features added noise.
# V3.2: We keep only features that are:
#   1. Content-based (not identity-based like author/domain history)
#   2. Proven useful in V3.1's feature importance analysis
#   3. Not learnable from the title text alone (complementary to transformer)
#
# These features will be used for a simple calibration adjustment, not stacking.
# =============================================================================

class MinimalFeatureEngineer:
    """Rule-based feature extractor for HN posts.

    Produces post-type flags, URL-type flags, title-structure counts and,
    when a ``timestamp`` column is present, cyclical hour/day-of-week
    encodings. Nothing is learned from data, so ``fit`` is a no-op kept only
    for a fit/transform style interface.
    """

    def __init__(self):
        pass

    def fit(self, df):
        """Return ``self`` unchanged; every feature is rule-based."""
        return self

    def transform(self, df):
        """Build the feature frame for ``df``.

        Args:
            df: DataFrame with ``title`` and ``url`` columns and optionally a
                ``timestamp`` column. Missing values are tolerated.

        Returns:
            DataFrame indexed like ``df`` with one numeric column per
            feature; any remaining missing values are filled with 0.
        """
        features = pd.DataFrame(index=df.index)
        title = df["title"]
        url = df["url"]

        # ----- Post Type Indicators -----
        # na=False maps missing titles straight to False.
        features["is_show_hn"] = title.str.contains(r"^Show HN:", case=False, regex=True, na=False).astype(int)
        features["is_ask_hn"] = title.str.contains(r"^Ask HN:", case=False, regex=True, na=False).astype(int)

        # ----- URL Type -----
        features["has_url"] = (url.str.len() > 0).fillna(False).astype(int)
        # Literal match: as a regex the "." in "github.com" would match any character.
        features["is_github"] = url.str.contains("github.com", case=False, regex=False, na=False).astype(int)
        # Non-capturing group avoids pandas' match-group warning; "#" also
        # covers PDF links that end in a fragment such as "#page=2".
        features["is_pdf"] = url.str.contains(r"\.pdf(?:$|[?#])", case=False, regex=True, na=False).astype(int)

        # ----- Title Structure -----
        features["title_length"] = title.str.len().fillna(0)
        features["title_words"] = title.str.split().str.len().fillna(0)
        features["has_question"] = title.str.contains(r"\?", regex=True, na=False).astype(int)
        features["has_number"] = title.str.contains(r"\d", regex=True, na=False).astype(int)

        # ----- Time Features (cyclical encoding) -----
        if "timestamp" in df.columns:
            ts = pd.to_datetime(df["timestamp"], errors="coerce")
            hour = ts.dt.hour.fillna(12)  # Unparseable timestamps default to noon
            dow = ts.dt.dayofweek.fillna(2)  # ...and to Wednesday

            # Sin/cos pairs keep 23:00 next to 00:00 and Sunday next to Monday.
            features["hour_sin"] = np.sin(2 * np.pi * hour / 24)
            features["hour_cos"] = np.cos(2 * np.pi * hour / 24)
            features["dow_sin"] = np.sin(2 * np.pi * dow / 7)
            features["dow_cos"] = np.cos(2 * np.pi * dow / 7)

        return features.fillna(0)

# Initialize and fit (no-op but maintains interface).
# The train/val/test frames are not created until the split further down, so
# this cell only previews the feature set on the already-loaded `df` instead
# of referencing frames that do not exist yet.
feature_engineer = MinimalFeatureEngineer()
feature_engineer.fit(df)

# Features are rule-based, so a small sample is enough to list the columns.
feature_preview = feature_engineer.transform(df.head(100))

print(f"‚úì Extracted {feature_preview.shape[1]} minimal features:")
for col in feature_preview.columns:
    print(f"   ‚Ä¢ {col}")
print("\nNote: These features are for post-hoc analysis, not stacking.")

In [None]:
# Train/validation/test split: 70% train, then the remaining 30% is halved
# into validation and test. Both splits are stratified on the label.
train_df, temp_df = train_test_split(df, test_size=0.3, random_state=SEED, stratify=df["label"])
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=SEED, stratify=temp_df["label"])

train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

print(f"Train: {len(train_df):,} | Val: {len(val_df):,} | Test: {len(test_df):,}")

# Fit feature engineer on training data.
# The class defined in this notebook is MinimalFeatureEngineer; the previous
# `FeatureEngineer()` call referenced a name that is never defined.
feature_engineer = MinimalFeatureEngineer()
feature_engineer.fit(train_df)

# Transform all sets
train_features = feature_engineer.transform(train_df)
val_features = feature_engineer.transform(val_df)
test_features = feature_engineer.transform(test_df)

print(f"\n‚úì Engineered {train_features.shape[1]} features")
print(f"Features: {list(train_features.columns)}")

## 5. RoBERTa Training - Clean Loss Function

**Why Label-Smoothed CrossEntropy instead of Focal Loss?**

| Aspect | Focal Loss (V3.1) | Label-Smoothed CE (V3.2) |
|--------|-------------------|--------------------------|
| **Class imbalance** | α=0.25 overcorrected | Inverse-frequency weights (data-driven) |
| **Confidence calibration** | γ=2 focuses on hard examples | Label smoothing reduces overconfidence |
| **Gradient behavior** | Down-weights easy examples | Stable gradients throughout training |
| **Empirical result** | 760 FN vs 219 FP (too conservative) | Balanced errors expected |

**Loss function:**
$$\mathcal{L} = -\frac{1}{N}\sum_i w_{y_i} \left[(1-\epsilon)\,\log \hat{y}_{i,y_i} + \frac{\epsilon}{K} \sum_k \log \hat{y}_{i,k}\right]$$

Where:
- $w_{y_i}$ = inverse class frequency weight of sample $i$'s true class $y_i$, and $N$ = batch size
- $\epsilon$ = 0.1 (label smoothing factor)
- $K$ = 2 (number of classes)

In [None]:
# =============================================================================
# WEIGHTED CROSS-ENTROPY WITH LABEL SMOOTHING
# =============================================================================
#
# Statistical justification for class weights:
#   - Inverse frequency weighting: w_c = N / (K * n_c)
#   - This ensures each class contributes equally to the loss gradient
#   - Avoids the arbitrary choice of Focal Loss α parameter
#
# Label smoothing interpretation:
#   - Instead of hard targets [0, 1], use soft targets [ε/K, 1-ε+ε/K]
#   - Prevents overconfident predictions
#   - Acts as regularization, improving generalization
# =============================================================================

class WeightedCrossEntropyTrainer(Trainer):
    """Trainer whose loss is class-weighted cross-entropy on smoothed targets.

    For a binary classifier, each sample's one-hot target is softened to
    ``[smooth_neg, smooth_pos]`` (true class gets ``smooth_pos``), the
    cross-entropy against the model's log-softmax is computed per sample,
    multiplied by the weight of the sample's true class, and averaged over
    the batch.

    Args:
        class_weights: Sequence ``[w_neg, w_pos]``. ``None`` means uniform
            weights ``[1.0, 1.0]``.
        label_smoothing: Smoothing factor epsilon; the true class receives
            ``1 - eps + eps / 2`` and the other class ``eps / 2``.
    """

    def __init__(self, *args, class_weights=None, label_smoothing=0.1, **kwargs):
        super().__init__(*args, **kwargs)
        # The declared default (None) used to crash on the print below;
        # treat it as "no re-weighting".
        if class_weights is None:
            class_weights = [1.0, 1.0]
        # Plain Python floats so the tensor built in compute_loss does not
        # inherit float64 from NumPy scalars.
        self.class_weights = [float(w) for w in class_weights]
        self.label_smoothing = label_smoothing

        # Pre-compute smoothed target values for the two-class case.
        self.smooth_pos = 1.0 - label_smoothing + label_smoothing / 2
        self.smooth_neg = label_smoothing / 2

        print(f"  Class weights: neg={class_weights[0]:.3f}, pos={class_weights[1]:.3f}")
        print(f"  Label smoothing: {label_smoothing} ‚Üí soft targets [{self.smooth_neg:.2f}, {self.smooth_pos:.2f}]")

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the weighted, label-smoothed loss for one batch.

        Pops ``labels`` from ``inputs``, runs the model on the rest, and
        returns ``(loss, outputs)`` when ``return_outputs`` is true, else
        just ``loss``. ``num_items_in_batch`` is accepted for signature
        compatibility and not used.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits

        log_probs = F.log_softmax(logits, dim=-1)

        # Soft targets: smooth_neg everywhere, smooth_pos at the true class.
        smooth_targets = torch.full_like(log_probs, self.smooth_neg)
        smooth_targets.scatter_(1, labels.unsqueeze(1), self.smooth_pos)

        # Per-sample weight looked up by true class, in the same dtype as the
        # log-probabilities so the loss is not promoted to float64.
        weights = torch.tensor(self.class_weights, dtype=log_probs.dtype, device=logits.device)
        sample_weights = weights[labels]

        loss_per_sample = -(smooth_targets * log_probs).sum(dim=-1)
        loss = (loss_per_sample * sample_weights).mean()

        return (loss, outputs) if return_outputs else loss

# Calculate class weights from training data (inverse frequency).
# Counts are read from train_df directly: the `train_labels` array is only
# created in the tokenization step further down, so it is not available here
# on a top-to-bottom run.
_train_label_values = train_df["label"].to_numpy()
n_neg = int((_train_label_values == 0).sum())
n_pos = int((_train_label_values == 1).sum())
n_total = len(_train_label_values)
n_classes = 2

if n_neg == 0 or n_pos == 0:
    raise ValueError(
        f"Inverse-frequency weights need both classes in the training set "
        f"(got {n_neg} negatives, {n_pos} positives)."
    )

# Inverse frequency: w_c = N / (K * n_c)
weight_neg = n_total / (n_classes * n_neg)
weight_pos = n_total / (n_classes * n_pos)

# Rescale so the two weights sum to n_classes
weight_sum = weight_neg + weight_pos
CLASS_WEIGHTS = [weight_neg * n_classes / weight_sum, weight_pos * n_classes / weight_sum]

print("=" * 60)
print("CLASS WEIGHTING (Inverse Frequency)")
print("=" * 60)
print(f"  Training set: {n_neg:,} negatives, {n_pos:,} positives")
print(f"  Raw weights: neg={weight_neg:.3f}, pos={weight_pos:.3f}")
print(f"  Normalized:  neg={CLASS_WEIGHTS[0]:.3f}, pos={CLASS_WEIGHTS[1]:.3f}")
print("=" * 60)

In [None]:
# =============================================================================
# TOKENIZATION
# =============================================================================

print("Tokenizing data...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_data(texts, labels):
    """Tokenize a Series of titles into a ``datasets.Dataset`` with labels.

    Args:
        texts: pandas Series of titles. Missing values are tokenized as the
            empty string, because the tokenizer only accepts ``str`` inputs.
        labels: pandas Series (or array) of integer labels aligned with
            ``texts``.

    Returns:
        Dataset holding the tokenizer outputs, padded/truncated to
        ``MAX_LENGTH``, plus a ``labels`` column.
    """
    clean_texts = texts.fillna("").astype(str).tolist()
    tokens = tokenizer(
        clean_texts,
        truncation=True,
        padding="max_length",  # Fixed length so batches can be sliced and stacked directly
        max_length=MAX_LENGTH,
        return_tensors=None
    )
    tokens["labels"] = labels.tolist()
    return Dataset.from_dict(tokens)

train_dataset = tokenize_data(train_df["title"], train_df["label"])
val_dataset = tokenize_data(val_df["title"], val_df["label"])
test_dataset = tokenize_data(test_df["title"], test_df["label"])

# Store labels as numpy for later evaluation
train_labels = np.array(train_df["label"])
val_labels = np.array(val_df["label"])
test_labels = np.array(test_df["label"])

print(f"‚úì Tokenized datasets:")
print(f"   Train: {len(train_dataset):,} samples")
print(f"   Val:   {len(val_dataset):,} samples")
print(f"   Test:  {len(test_dataset):,} samples")

In [None]:
# =============================================================================
# ROBERTA TRAINING - End-to-End (No Stacking)
# =============================================================================
#
# V3.2 Philosophy: Let the transformer do what it does best.
# - No feature engineering in the loop
# - No ensemble averaging
# - Clean labels + proper loss = optimal learning signal
# =============================================================================

print("=" * 60)
print("TRAINING ROBERTA (V3.2 - Clean Architecture)")
print("=" * 60)

# Load pre-trained model with a fresh 2-way classification head
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
model.to(device)

# Training arguments
training_args = TrainingArguments(
    output_dir="./roberta_v32",

    # Training schedule
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE * 2,  # Larger batch for evaluation

    # Optimization
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    warmup_ratio=WARMUP_RATIO,
    lr_scheduler_type="linear",  # Linear decay after warmup

    # Evaluation & checkpointing
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="roc_auc",  # Select the checkpoint by ROC AUC, not loss
    greater_is_better=True,

    # Efficiency
    # Mixed precision only when a GPU is present: the notebook falls back to
    # CPU when CUDA is unavailable, where fp16 training is not supported.
    fp16=torch.cuda.is_available(),
    dataloader_num_workers=2,

    # Logging
    logging_steps=100,
    logging_first_step=True,
    report_to="none",

    # Reproducibility
    seed=SEED,
    data_seed=SEED,
)

def compute_metrics(eval_pred):
    """Turn raw eval logits and labels into the metrics dict used for model selection.

    Args:
        eval_pred: Pair ``(logits, labels)``; the positive-class probability
            is the softmax of the logits at column 1.

    Returns:
        Dict with ROC AUC, average precision, the best F1 along the
        precision-recall curve and its threshold, and accuracy / precision /
        recall at a fixed 0.5 cut-off.
    """
    raw_logits, y_true = eval_pred
    pos_prob = torch.softmax(torch.tensor(raw_logits), dim=-1)[:, 1].numpy()

    # Primary model-selection metric
    auc_value = roc_auc_score(y_true, pos_prob)

    # Sweep the precision-recall curve for the F1-maximising cut-off.
    prec, rec, cutoffs = precision_recall_curve(y_true, pos_prob)
    f1_curve = 2 * (prec * rec) / (prec + rec + 1e-8)
    top = np.argmax(f1_curve)
    top_f1 = f1_curve[top]
    # The curve has one more point than there are thresholds; fall back to 0.5
    # if the maximum lands on that last point.
    if top < len(cutoffs):
        top_cutoff = cutoffs[top]
    else:
        top_cutoff = 0.5

    # Hard predictions at the conventional 0.5 cut-off
    hard_preds = (pos_prob >= 0.5).astype(int)

    metrics = {}
    metrics["roc_auc"] = auc_value
    metrics["avg_precision"] = average_precision_score(y_true, pos_prob)
    metrics["best_f1"] = top_f1
    metrics["best_threshold"] = top_cutoff
    metrics["accuracy_05"] = accuracy_score(y_true, hard_preds)
    metrics["precision_05"] = precision_score(y_true, hard_preds, zero_division=0)
    metrics["recall_05"] = recall_score(y_true, hard_preds, zero_division=0)
    return metrics

# Build the trainer: weighted, label-smoothed loss plus early stopping after
# two evaluations without improvement.
early_stopping = EarlyStoppingCallback(early_stopping_patience=2)
trainer = WeightedCrossEntropyTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
    class_weights=CLASS_WEIGHTS,
    label_smoothing=LABEL_SMOOTHING,
)

# Run fine-tuning
print("\nStarting training...")
train_result = trainer.train()

# Summarise the run
banner = "=" * 60
print("\n" + banner)
print("TRAINING COMPLETE")
print(banner)
print(f"  Total steps: {train_result.global_step}")
print(f"  Training loss: {train_result.training_loss:.4f}")

In [None]:
# =============================================================================
# EXTRACT PREDICTIONS - Raw probabilities from RoBERTa
# =============================================================================

def get_predictions(dataset, batch_size=64, desc="Extracting predictions"):
    """Run the trained model over ``dataset`` and return positive-class probabilities.

    Args:
        dataset: Tokenized dataset containing a ``labels`` column, which is
            dropped before inference.
        batch_size: Number of rows fed to the model per forward pass.
        desc: Label shown on the progress bar.

    Returns:
        NumPy array with one probability (softmax column 1) per row.
    """
    model.eval()

    # Inference needs only the model inputs, served as torch tensors.
    unlabeled = dataset.remove_columns(["labels"])
    unlabeled.set_format("torch")

    collected = []
    batch_starts = range(0, len(unlabeled), batch_size)
    with torch.no_grad():
        for start in tqdm(batch_starts, desc=desc):
            chunk = unlabeled[start:start + batch_size]
            on_device = {name: tensor.to(device) for name, tensor in chunk.items()}
            logits = model(**on_device).logits
            positive = torch.softmax(logits, dim=-1)[:, 1]
            collected.extend(positive.cpu().numpy())

    return np.array(collected)

# Score every split with the fine-tuned model
print("Extracting predictions from trained model...")
train_probs_raw = get_predictions(train_dataset, desc="Train set")
val_probs_raw = get_predictions(val_dataset, desc="Val set")
test_probs_raw = get_predictions(test_dataset, desc="Test set")

# Report discrimination of the uncalibrated probabilities
raw_banner = "=" * 60
raw_train_auc = roc_auc_score(train_labels, train_probs_raw)
raw_val_auc = roc_auc_score(val_labels, val_probs_raw)
raw_test_auc = roc_auc_score(test_labels, test_probs_raw)
raw_test_ap = average_precision_score(test_labels, test_probs_raw)

print("\n" + raw_banner)
print("RAW MODEL PERFORMANCE (before calibration)")
print(raw_banner)
print(f"  Train ROC AUC: {raw_train_auc:.4f}")
print(f"  Val ROC AUC:   {raw_val_auc:.4f}")
print(f"  Test ROC AUC:  {raw_test_auc:.4f}")
print(f"  Test Avg Precision: {raw_test_ap:.4f}")
print(raw_banner)

## 6. Isotonic Calibration (Preserved from V3.1)

Isotonic regression provides non-parametric probability calibration:
- Maps raw model outputs to well-calibrated probabilities
- Preserves ranking (monotonic transformation)
- Achieved ECE = 0.01 in V3.1 (excellent)

Unlike temperature scaling (parametric), isotonic regression can correct non-linear miscalibration patterns.

In [None]:
# =============================================================================
# ISOTONIC CALIBRATION
# =============================================================================
#
# Why isotonic regression?
# - Non-parametric: No assumptions about the form of miscalibration
# - Monotonic: Preserves ranking order up to ties (ROC AUC can shift slightly where scores are merged)
# - Proven: ECE = 0.01 in V3.1
#
# We fit on validation set to avoid overfitting to test.
# =============================================================================

def calculate_ece(probs, labels, n_bins=10):
    """Calculate Expected Calibration Error over equal-width probability bins.

    ECE = Σ (bin_count / N) * |mean_label(bin) - mean_prob(bin)|

    Lower is better. The notebook treats ECE < 0.05 as well-calibrated.

    Args:
        probs: Predicted positive-class probabilities in [0, 1].
        labels: Binary ground-truth labels aligned with ``probs``.
        n_bins: Number of equal-width bins spanning [0, 1].

    Returns:
        The ECE as a float; 0.0 for empty input.
    """
    probs = np.asarray(probs, dtype=float)
    labels = np.asarray(labels, dtype=float)
    total = probs.size
    if total == 0:
        return 0.0

    bin_boundaries = np.linspace(0, 1, n_bins + 1)
    ece = 0.0

    for i in range(n_bins):
        lower, upper = bin_boundaries[i], bin_boundaries[i + 1]
        # Bins are (lower, upper], except the first, which is closed on the
        # left so predictions of exactly 0.0 (frequent after isotonic
        # calibration) are counted instead of silently dropped.
        if i == 0:
            in_bin = (probs >= lower) & (probs <= upper)
        else:
            in_bin = (probs > lower) & (probs <= upper)

        count = in_bin.sum()
        if count > 0:
            bin_accuracy = labels[in_bin].mean()
            bin_confidence = probs[in_bin].mean()
            ece += (count / total) * abs(bin_accuracy - bin_confidence)

    return float(ece)

print("Applying Isotonic Regression calibration...")

# Learn the raw-to-calibrated mapping from the validation split only;
# inputs outside the fitted range are clipped to its ends.
calibrator = IsotonicRegression(out_of_bounds="clip")
calibrator.fit(val_probs_raw, val_labels)

# Map the raw probabilities of every split through the fitted calibrator
train_probs = calibrator.predict(train_probs_raw)
val_probs = calibrator.predict(val_probs_raw)
test_probs = calibrator.predict(test_probs_raw)

# Compare test-set calibration error before and after
ece_before = calculate_ece(test_probs_raw, test_labels)
ece_after = calculate_ece(test_probs, test_labels)

calib_banner = "=" * 60
print("\n" + calib_banner)
print("CALIBRATION RESULTS")
print(calib_banner)
print(f"  ECE before calibration: {ece_before:.4f}")
print(f"  ECE after calibration:  {ece_after:.4f}")
if ece_before > 0:
    # Relative reduction; skipped when the starting ECE is zero
    print(f"  Improvement: {((ece_before - ece_after) / ece_before * 100):.1f}%")
print(f"  Well-calibrated: {'‚úì Yes' if ece_after < 0.05 else '‚úó No'} (threshold: 0.05)")
print(calib_banner)

## 7. Threshold Optimization & Evaluation

**Threshold Selection Strategies:**
1. **F1-Optimal**: Maximizes F1 score (harmonic mean of precision/recall)
2. **Balanced P/R**: Where precision ≈ recall
3. **High-Precision**: For use cases that can't tolerate false positives

The optimal threshold depends on your use case:
- RSS reader highlighting → probably want higher precision (fewer false alerts)
- Content discovery ‚Üí balanced or high recall (don't miss good content)

In [None]:
# =============================================================================
# THRESHOLD OPTIMIZATION
# =============================================================================

print("Analyzing classification thresholds...")

# Test-set precision-recall curve, kept under the original names for
# reporting (later cells may reference them - not visible from here).
precisions, recalls, thresholds = precision_recall_curve(test_labels, test_probs)
f1_scores = 2 * (precisions * recalls) / (precisions + recalls + 1e-8)
best_f1_idx = np.argmax(f1_scores)
balanced_idx = np.argmin(np.abs(precisions[:-1] - recalls[:-1]))

# Thresholds are SELECTED on the validation split and only EVALUATED on the
# test split below; tuning them on test labels would make the reported test
# precision/recall/F1 optimistically biased.
val_precisions, val_recalls, val_thresholds = precision_recall_curve(val_labels, val_probs)

# Strategy 1: F1-optimal threshold
val_f1_scores = 2 * (val_precisions * val_recalls) / (val_precisions + val_recalls + 1e-8)
val_best_f1_idx = np.argmax(val_f1_scores)
# The curve has one more point than thresholds; fall back to 0.5 on the last point.
f1_threshold = val_thresholds[val_best_f1_idx] if val_best_f1_idx < len(val_thresholds) else 0.5

# Strategy 2: Balanced P/R threshold (precision closest to recall)
val_balanced_diff = np.abs(val_precisions[:-1] - val_recalls[:-1])
balanced_threshold = val_thresholds[np.argmin(val_balanced_diff)]

# Strategy 3: High-precision thresholds
def find_precision_threshold(target_precision, min_recall=0.2):
    """Return the validation threshold whose precision is closest to the target.

    Only thresholds with recall >= ``min_recall`` are considered; 0.5 is
    returned when none qualifies.
    """
    valid = val_recalls[:-1] >= min_recall
    diffs = np.abs(val_precisions[:-1] - target_precision)
    diffs[~valid] = np.inf
    if np.all(np.isinf(diffs)):
        return 0.5
    idx = np.argmin(diffs)
    return val_thresholds[idx]

precision_60_threshold = find_precision_threshold(0.60)
precision_70_threshold = find_precision_threshold(0.70)
precision_80_threshold = find_precision_threshold(0.80, min_recall=0.15)

print("\n" + "=" * 60)
print("THRESHOLD ANALYSIS")
print("=" * 60)
print(f"\n{'Strategy':<25} {'Threshold':>10} {'Precision':>10} {'Recall':>10} {'F1':>10}")
print("-" * 65)

thresholds_to_eval = [
    ("F1-Optimal", f1_threshold),
    ("Balanced P/R", balanced_threshold),
    ("Precision ~60%", precision_60_threshold),
    ("Precision ~70%", precision_70_threshold),
    ("Precision ~80%", precision_80_threshold),
    ("Standard (0.5)", 0.5),
]

# Held-out evaluation: each validation-chosen threshold is scored on the test split.
for name, thresh in thresholds_to_eval:
    preds = (test_probs >= thresh).astype(int)
    p = precision_score(test_labels, preds, zero_division=0)
    r = recall_score(test_labels, preds, zero_division=0)
    f1 = f1_score(test_labels, preds, zero_division=0)
    print(f"{name:<25} {thresh:>10.3f} {p:>10.3f} {r:>10.3f} {f1:>10.3f}")

# Use F1-optimal as default (can be changed based on use case)
OPTIMAL_THRESHOLD = f1_threshold
print(f"\n‚úì Selected default threshold: {OPTIMAL_THRESHOLD:.3f} (F1-optimal)")
print("=" * 60)

In [None]:
# =============================================================================
# VISUALIZATIONS
# =============================================================================
# Draws a 2x2 evaluation figure from the calibrated test predictions (ROC
# curve, precision-recall curve, calibration plot, confusion matrix), saves it
# to v32_evaluation_plots.png and displays it.

fig, axes = plt.subplots(2, 2, figsize=(14, 12))

# 1. ROC Curve with version comparison
ax1 = axes[0, 0]
fpr, tpr, _ = roc_curve(test_labels, test_probs)
ax1.plot(fpr, tpr, "b-", linewidth=2, label=f"V3.2 (AUC={roc_auc:.3f})")
ax1.plot([0, 1], [0, 1], "k--", alpha=0.5, label="Random")
ax1.fill_between(fpr, tpr, alpha=0.2)

# Add reference points for previous versions.
# NOTE(review): these markers sit at hard-coded coordinates, not at measured
# (FPR, TPR) operating points of V1 / V3.1 - they mainly carry the legend text.
ax1.scatter([0.33], [0.77], color="green", s=100, marker="^", zorder=5, label="V1 (AUC=0.770)")
ax1.scatter([0.30], [0.72], color="orange", s=100, marker="s", zorder=5, label="V3.1 (AUC=0.715)")

ax1.set_xlabel("False Positive Rate", fontsize=12)
ax1.set_ylabel("True Positive Rate", fontsize=12)
ax1.set_title("ROC Curve - Version Comparison", fontsize=14)
ax1.legend(loc="lower right")
ax1.grid(True, alpha=0.3)

# 2. Precision-Recall Curve
ax2 = axes[0, 1]
# precision_recall_curve returns: precisions (n+1,), recalls (n+1,), thresholds (n,)
# Both arrays are sliced with [:-1] so every plotted point has a threshold.
ax2.plot(recalls[:-1], precisions[:-1], "g-", linewidth=2, label=f"V3.2 (AP={avg_precision:.3f})")
# Positive-class prevalence is the precision of a random classifier.
ax2.axhline(
    y=test_labels.mean(),
    color="r",
    linestyle="--",
    alpha=0.5,
    label=f"Random (AP={test_labels.mean():.3f})",
)
ax2.scatter(
    [recall],
    [precision],
    color="red",
    s=150,
    zorder=5,
    label=f"Operating point (P={precision:.2f}, R={recall:.2f})",
)
ax2.set_xlabel("Recall", fontsize=12)
ax2.set_ylabel("Precision", fontsize=12)
ax2.set_title("Precision-Recall Curve", fontsize=14)
ax2.legend(loc="upper right")
ax2.grid(True, alpha=0.3)
ax2.set_xlim([0, 1])
ax2.set_ylim([0, 1])

# 3. Calibration Plot
ax3 = axes[1, 0]
n_bins = 10
bin_means, bin_true, bin_counts = [], [], []
for i in range(n_bins):
    low, high = i / n_bins, (i + 1) / n_bins
    # Bins are half-open [low, high) except the last one, which is closed so
    # that probabilities of exactly 1.0 (which a calibrator can emit) are
    # counted instead of silently dropped from the plot.
    if i == n_bins - 1:
        mask = (test_probs >= low) & (test_probs <= high)
    else:
        mask = (test_probs >= low) & (test_probs < high)
    # Empty bins are skipped so they do not produce NaN points.
    if mask.sum() > 0:
        bin_means.append(test_probs[mask].mean())
        bin_true.append(test_labels[mask].mean())
        bin_counts.append(mask.sum())

ax3.plot([0, 1], [0, 1], "k--", label="Perfect calibration", linewidth=2)
# Marker size and colour both encode how many test samples fall in the bin.
scatter = ax3.scatter(
    bin_means,
    bin_true,
    s=[c / 10 for c in bin_counts],
    alpha=0.7,
    c=bin_counts,
    cmap="Blues",
    label="V3.2 Calibrated",
)
ax3.plot(bin_means, bin_true, "b-", alpha=0.5)
ax3.set_xlabel("Mean Predicted Probability", fontsize=12)
ax3.set_ylabel("Fraction of Positives", fontsize=12)
ax3.set_title(f"Calibration Plot (ECE={ece_after:.4f})", fontsize=14)
ax3.legend(loc="upper left")
ax3.grid(True, alpha=0.3)
plt.colorbar(scatter, ax=ax3, label="Bin count")

# 4. Confusion Matrix
# NOTE(review): the title reports OPTIMAL_THRESHOLD; confirm `test_preds` was
# produced with that same threshold.
ax4 = axes[1, 1]
cm = confusion_matrix(test_labels, test_preds)
sns.heatmap(
    cm,
    annot=True,
    fmt=",",
    cmap="Blues",
    ax=ax4,
    xticklabels=["Predicted Miss", "Predicted Hit"],
    yticklabels=["Actual Miss", "Actual Hit"],
    annot_kws={"size": 14},
)
ax4.set_title(f"Confusion Matrix (threshold={OPTIMAL_THRESHOLD:.2f})", fontsize=14)

plt.tight_layout()
plt.savefig("v32_evaluation_plots.png", dpi=150, bbox_inches="tight")
plt.show()
print("\n‚úì Plots saved to v32_evaluation_plots.png")

In [None]:
# Save all model components
# Writes the model, tokenizer, calibrator, feature engineer and a JSON config
# into ./hn_model_v32, then zips that directory for download.
print("Saving model components...")
os.makedirs("./hn_model_v32", exist_ok=True)

# Save RoBERTa model
# Model and tokenizer go to the same sub-directory.
model.save_pretrained("./hn_model_v32/roberta")
tokenizer.save_pretrained("./hn_model_v32/roberta")
print("‚úì RoBERTa model saved")

# Save calibrator
joblib.dump(calibrator, "./hn_model_v32/isotonic_calibrator.joblib")
print("‚úì Isotonic calibrator saved")

# Save feature engineer
# NOTE(review): `feature_engineer` is not defined in this cell, and the config
# description below says TF-IDF / LightGBM stacking were removed - confirm the
# object still exists in this version of the notebook.
joblib.dump(feature_engineer, "./hn_model_v32/feature_engineer.joblib")
print("‚úì Feature engineer saved")

# Save configuration
# NOTE(review): precision / recall / f1 and the tn / fp / fn / tp counts are
# computed in earlier cells - confirm they were all evaluated at
# OPTIMAL_THRESHOLD, since they are stored next to "optimal_threshold".
# Values are passed through float() / int() before being serialized to JSON.
config = {
    "version": "v3.2",
    "architecture": "Pure RoBERTa + Label-Smoothed CE + Isotonic Calibration",
    "description": "Clean architecture - removed TF-IDF and LightGBM stacking",
    "optimal_threshold": float(OPTIMAL_THRESHOLD),
    "label_smoothing": LABEL_SMOOTHING,
    "metrics": {
        "roc_auc": float(roc_auc),
        "average_precision": float(avg_precision),
        "precision": float(precision),
        "recall": float(recall),
        "f1": float(f1),
        "ece": float(ece_after)
    },
    "confusion_matrix": {"tn": int(tn), "fp": int(fp), "fn": int(fn), "tp": int(tp)},
    "training_config": {
        "model_name": MODEL_NAME,
        "epochs": EPOCHS,
        "batch_size": BATCH_SIZE,
        "learning_rate": LEARNING_RATE,
        "hit_threshold": HIT_THRESHOLD
    }
}

with open("./hn_model_v32/config.json", "w") as f:
    json.dump(config, f, indent=2)
print("‚úì Configuration saved")

# Create zip for download
# Notebook shell magic: runs `zip` from inside the model directory so archive
# paths are relative to it.
!cd hn_model_v32 && zip -r ../hn_model_v32.zip .
print("\n‚úì All components saved to hn_model_v32.zip")

# List files
!ls -la hn_model_v32/

In [None]:
# Download the model zip
# Depends on the `google.colab` package, so this cell presumably only works
# inside a Colab runtime - skip it when running elsewhere.
from google.colab import files
files.download("hn_model_v32.zip")