<a href="https://colab.research.google.com/github/olcaykursun/ML/blob/main/Fall2025/Week14/20news_practice.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Practice Notebook: 20-Newsgroups Classification
This notebook contains Model A (Embeddings - similar to Week 13 notebook) and Model B (TF-IDF). Currently, they have pretty much the same accuracy. Which approach can you improve easier and how?

In [2]:
# ==============================
# Model A: Learned Word Embeddings
# ==============================

import nltk
import numpy as np
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import Tokenizer

from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split

from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
import torch.optim as optim

# -------------------------
# 0. Setup
# -------------------------
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))

# -------------------------
# 1. Load dataset
# -------------------------
data = fetch_20newsgroups(subset="all", remove=("headers","footers","quotes"))
texts = data.data
labels = data.target
num_classes = len(data.target_names)

# -------------------------
# 2. Tokenizer (Keras)
# -------------------------
max_vocab = 20000
tok = Tokenizer(num_words=max_vocab, oov_token="<UNK>")
tok.fit_on_texts(texts)
seqs = tok.texts_to_sequences(texts)
index_word = tok.index_word   # ID → word

# -------------------------
# 3. Remove stopwords
# -------------------------
def remove_sw(seq):
    return [t for t in seq if index_word.get(t, "") not in stop_words]

clean = [remove_sw(s) for s in seqs]

# -------------------------
# 4. Trim + pad to first 250 tokens
# -------------------------
max_len = 250
trimmed = [s[:max_len] for s in clean]

X = np.zeros((len(trimmed), max_len), dtype=np.int64)
for i, seq in enumerate(trimmed):
    X[i, :len(seq)] = seq

y = np.array(labels)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# -------------------------
# 5. Dataset class
# -------------------------
class NewsDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.long)
        self.y = torch.tensor(y, dtype=torch.long)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, i):
        return self.X[i], self.y[i]

train_ds = NewsDataset(X_train, y_train)
test_ds  = NewsDataset(X_test,  y_test)

train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)
test_loader  = DataLoader(test_ds, batch_size=64)

# -------------------------
# 6. Model: Embed → AvgPool → Classifier
# -------------------------
class AvgDocClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_classes):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.fc = nn.Linear(embed_dim, num_classes)

    def forward(self, x):
        emb = self.embed(x)        # (B, 250, dim)
        avg = emb.mean(dim=1)      # average pooling
        return self.fc(avg)

vocab_size = min(max_vocab, len(tok.word_index) + 1)
embed_dim = 64

device = "cuda" if torch.cuda.is_available() else "cpu"
modelA = AvgDocClassifier(vocab_size, embed_dim, num_classes).to(device)

optimizer = optim.Adam(modelA.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

# -------------------------
# 7. Train 10 epochs
# -------------------------
for epoch in range(10):
    correct = 0
    total = 0
    modelA.train()

    for Xb, yb in train_loader:
        Xb, yb = Xb.to(device), yb.to(device)
        optimizer.zero_grad()

        logits = modelA(Xb)
        loss = criterion(logits, yb)

        loss.backward()
        optimizer.step()

        preds = logits.argmax(dim=1)
        correct += (preds == yb).sum().item()
        total += len(yb)

    print(f"Epoch {epoch+1}, Train Acc: {correct/total:.3f}")

# -------------------------
# 8. Evaluate
# -------------------------
modelA.eval()
correct, total = 0, 0

with torch.no_grad():
    for Xb, yb in test_loader:
        Xb, yb = Xb.to(device), yb.to(device)
        preds = modelA(Xb).argmax(dim=1)
        correct += (preds == yb).sum().item()
        total += len(yb)

print("\nEmbedding Model Test Accuracy:", correct/total)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Epoch 1, Train Acc: 0.094
Epoch 2, Train Acc: 0.197
Epoch 3, Train Acc: 0.300
Epoch 4, Train Acc: 0.411
Epoch 5, Train Acc: 0.487
Epoch 6, Train Acc: 0.555
Epoch 7, Train Acc: 0.605
Epoch 8, Train Acc: 0.637
Epoch 9, Train Acc: 0.667
Epoch 10, Train Acc: 0.689

Embedding Model Test Accuracy: 0.6259285461620092


In [11]:
seqs = tok.texts_to_sequences(["this is a very rare wordadad"])
print(seqs)
print(index_word[16])
print(index_word[1])

[[16, 10, 5, 112, 3541, 1]]
this
<UNK>


In [1]:
# ============================
# Model B: TF-IDF + MLP
# ============================

import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

# ----------------------
# 1. Load dataset
# ----------------------
data = fetch_20newsgroups(subset="all", remove=("headers","footers","quotes"))
texts = data.data
labels = data.target
num_classes = len(data.target_names)

# ----------------------
# 2. TF-IDF
# ----------------------
vec = TfidfVectorizer(max_features=2000, stop_words="english")
X = vec.fit_transform(texts).toarray()

y = np.array(labels)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

X_train = torch.tensor(X_train, dtype=torch.float32)
X_test  = torch.tensor(X_test,  dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.long)
y_test  = torch.tensor(y_test, dtype=torch.long)

train_loader = DataLoader(TensorDataset(X_train, y_train), batch_size=64, shuffle=True)
test_loader  = DataLoader(TensorDataset(X_test, y_test), batch_size=64)

# ----------------------
# 3. Tiny MLP
# ----------------------
class TfidfMLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(2000, 128),
            nn.ReLU(),
            nn.Linear(128, num_classes)
        )
    def forward(self, x):
        return self.net(x)

modelB = TfidfMLP()
optimizer = torch.optim.Adam(modelB.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

# ----------------------
# 4. Train 5 epochs
# ----------------------
for epoch in range(5):
    correct, total = 0, 0
    for Xb, yb in train_loader:
        optimizer.zero_grad()
        logits = modelB(Xb)
        loss = criterion(logits, yb)
        loss.backward()
        optimizer.step()

        preds = logits.argmax(dim=1)
        correct += (preds == yb).sum().item()
        total += len(yb)

    print(f"Epoch {epoch+1}, Train Acc: {correct/total:.3f}")

# ----------------------
# 5. Evaluate
# ----------------------
modelB.eval()
correct, total = 0, 0

with torch.no_grad():
    for Xb, yb in test_loader:
        preds = modelB(Xb).argmax(dim=1)
        correct += (preds == yb).sum().item()
        total += len(yb)

print("\nTF-IDF Model Test Accuracy:", correct/total)


Epoch 1, Train Acc: 0.461
Epoch 2, Train Acc: 0.658
Epoch 3, Train Acc: 0.707
Epoch 4, Train Acc: 0.739
Epoch 5, Train Acc: 0.769

TF-IDF Model Test Accuracy: 0.6204244031830238


We can improve the embedding model much more easily, because TF-IDF is a fixed representation with limited expressiveness (it ignores context as it works as a bag-of-words, and therefore also ignores word order), whereas the embeddings approach can be upgraded with pretrained word embeddings and using CNNs, LSTMs, and Transformers to capture richer semantic and contextual information.