# Mini AI Pipeline: AG News Headline Classification

This notebook follows the small-pipeline plan: it compares a keyword-rule baseline against MiniLM sentence embeddings fed to a linear classifier. It runs end-to-end in under 10 minutes on CPU.

## Setup
Install the lightweight dependencies if needed: uncomment the cell below and run it. Leave it commented out in managed environments that already include them.

In [None]:
# !pip install -q datasets scikit-learn sentence-transformers torch pandas numpy

In [None]:
import random
import re
from collections import Counter

import numpy as np
import pandas as pd
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report

## Load and subsample data
Using AG News, keep a tiny subset for speed: 2k train, 500 test.

In [None]:
SEED = 42
N_TRAIN, N_TEST = 2000, 500  # subsample sizes: small enough to keep the notebook fast

# Seed the stdlib and NumPy global RNGs; the pandas sampling below is
# additionally pinned through random_state.
np.random.seed(SEED)
random.seed(SEED)

dataset = load_dataset('ag_news')
label_names = dataset['train'].features['label'].names

# Draw a fixed-size random sample from each split and renumber the rows from 0.
train_df, test_df = (
    dataset[split].to_pandas().sample(n=size, random_state=SEED).reset_index(drop=True)
    for split, size in (('train', N_TRAIN), ('test', N_TEST))
)

def preprocess(text: str) -> str:
    """Normalise a raw text: lower-case it, then trim surrounding whitespace."""
    lowered = text.lower()
    return lowered.strip()

# Replace the raw text column with its normalised form in both splits.
train_df = train_df.assign(text=train_df['text'].map(preprocess))
test_df = test_df.assign(text=test_df['text'].map(preprocess))
train_df.head()

## Baseline: keyword rules
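Simple keyword counts per class. Texts with no keyword hit fall back to the majority training label; when several classes tie on a non-zero count, the lexicographically largest class name wins.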

In [None]:
# Hand-picked cue words per class; the keys must match the dataset's label names.
keywords = {
    'World': {
        'election', 'government', 'iraq', 'minister', 'peace', 'president', 'war',
    },
    'Sports': {
        'coach', 'cup', 'game', 'season', 'team', 'victory', 'vs', 'win', 'wins',
    },
    'Business': {
        'company', 'deal', 'dollar', 'market', 'profit', 'shares', 'stocks', 'trade',
    },
    'Sci/Tech': {
        'chip', 'data', 'internet', 'phone', 'research', 'science', 'software', 'technology',
    },
}

# Two-way lookup between integer label ids and their display names.
label_to_name = {idx: name for idx, name in enumerate(label_names)}
name_to_label = {name: idx for idx, name in label_to_name.items()}

# Most frequent label in the training subsample; used as the no-evidence fallback.
majority_label = train_df['label'].mode().iloc[0]

def baseline_predict(text: str) -> int:
    """Predict a label id for ``text`` by counting per-class keyword hits.

    The text is lower-cased and split into runs of letters/digits, so a
    keyword still matches when punctuation is attached to it ("market,",
    "win.", "vs."). Each token found in a class vocabulary in ``keywords``
    adds one to that class's score.

    Args:
        text: The text to classify.

    Returns:
        The integer label id of the highest-scoring class. If several classes
        share the top non-zero score, the lexicographically largest class name
        wins. If no keyword matches at all, ``majority_label`` is returned.
    """
    # Whitespace splitting alone would leave "market," or "(reuters)" intact
    # and silently miss the keyword, so tokenise on alphanumeric runs instead.
    tokens = re.findall(r"[a-z0-9]+", text.lower())
    scores = {
        name: sum(token in vocab for token in tokens)
        for name, vocab in keywords.items()
    }
    # Sort key is (count, name): the name only matters to make ties deterministic.
    best_name = max(scores.items(), key=lambda x: (x[1], x[0]))[0]
    if scores[best_name] == 0:
        # No evidence for any class: fall back to the majority training label.
        # int() keeps the return type uniform with the keyword branch.
        return int(majority_label)
    return name_to_label[best_name]

# Score the keyword baseline on the held-out test subsample.
baseline_preds = list(map(baseline_predict, test_df['text']))
baseline_acc = accuracy_score(y_true=test_df['label'], y_pred=baseline_preds)
baseline_f1 = f1_score(y_true=test_df['label'], y_pred=baseline_preds, average='macro')
print(f'Baseline accuracy: {baseline_acc:.3f}, macro-F1: {baseline_f1:.3f}')

## AI pipeline: MiniLM embeddings + logistic regression
Embed headlines, then train a linear classifier.

In [None]:
encoder = SentenceTransformer('all-MiniLM-L6-v2')


def embed_texts(texts):
    """Encode a sequence of strings with ``encoder`` and return a NumPy array.

    No ``device`` argument is passed: ``encode`` then runs on the device the
    encoder is already on, which is what the previous
    ``'cuda' if encoder.device.type == 'cuda' else None`` expression resolved
    to in every case.
    """
    return encoder.encode(
        list(texts),
        batch_size=64,
        show_progress_bar=True,
        convert_to_numpy=True,
    )


train_embeddings = embed_texts(train_df['text'].tolist())
test_embeddings = embed_texts(test_df['text'].tolist())

# `multi_class` is deprecated in scikit-learn 1.5+; with the default lbfgs
# solver a multiclass problem is already fitted as multinomial, so dropping it
# keeps the same model. `n_jobs` only parallelises one-vs-rest fits and had no
# effect here, so it is dropped as well.
clf = LogisticRegression(max_iter=1000, C=4.0)
clf.fit(train_embeddings, train_df['label'])

preds = clf.predict(test_embeddings)
acc = accuracy_score(test_df['label'], preds)
macro_f1 = f1_score(test_df['label'], preds, average='macro')
print(f'Pipeline accuracy: {acc:.3f}, macro-F1: {macro_f1:.3f}')

## Qualitative differences
Collect examples where baseline and pipeline disagree.

In [None]:
# One record per test example on which the two systems predict different labels.
diffs = [
    {
        'text': sample_text,
        'true': label_to_name[gold],
        'baseline': label_to_name[rule_pred],
        'pipeline': label_to_name[clf_pred],
    }
    for sample_text, gold, rule_pred, clf_pred in zip(
        test_df['text'], test_df['label'], baseline_preds, preds
    )
    if rule_pred != clf_pred
]

pd.DataFrame(diffs).head(10)

## Classification report (optional detail)

In [None]:
# Per-class precision / recall / F1 of the embedding pipeline on the test subsample.
print(
    classification_report(
        y_true=test_df['label'],
        y_pred=preds,
        target_names=label_names,
    )
)