In [28]:
import re
from collections import Counter
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, f1_score, precision_score
from tqdm import tqdm
import pandas as pd

In [29]:
from pipeline import load_kaggle, load_politifact, load_gossipcop, clean_dataset
from sklearn.model_selection import train_test_split

df = load_kaggle()
df = clean_dataset(df)
df["text"] = (df["title"] + " " + df["text"]).str.strip()
df = df[["text", "label"]]
df_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df["label"]
)

df_train.head()

print("\n" + "="*80)
print("LOADING ADDITIONAL DATASETS FOR GENERALIZATION TESTING")
print("="*80 + "\n")

# Load PolitiFact
df_politifact = load_politifact()
if df_politifact is not None:
    df_politifact = clean_dataset(df_politifact)
    df_politifact["text"] = (df_politifact["title"] + " " + df_politifact["text"]).str.strip()
    df_politifact = df_politifact[["text", "label"]]
    print(f"PolitiFact loaded: {len(df_politifact)} articles")
else:
    print("⚠️  Skipping PolitiFact (not available)")

# Load GossipCop
df_gossipcop = load_gossipcop()
if df_gossipcop is not None:
    df_gossipcop = clean_dataset(df_gossipcop)
    df_gossipcop["text"] = (df_gossipcop["title"] + " " + df_gossipcop["text"]).str.strip()
    df_gossipcop = df_gossipcop[["text", "label"]]
    print(f"GossipCop loaded: {len(df_gossipcop)} articles")
else:
    print("⚠️  Skipping GossipCop (not available)")

# Show dataset sizes
print(f"\nDataset Summary:")
print(f"   Kaggle train: {len(df_train)} articles")
print(f"   Kaggle test: {len(df_test)} articles")
if df_politifact is not None:
    print(f"   PolitiFact: {len(df_politifact)} articles")
if df_gossipcop is not None:
    print(f"   GossipCop: {len(df_gossipcop)} articles")



LOADING ADDITIONAL DATASETS FOR GENERALIZATION TESTING

Loading PolitiFact from: data_files/processed/politifact_combined.csv
PolitiFact loaded: 624 articles
Loading GossipCop from: data_files/processed/gossipcop_combined.csv
GossipCop loaded: 14549 articles

Dataset Summary:
   Kaggle train: 30915 articles
   Kaggle test: 7729 articles
   PolitiFact: 624 articles
   GossipCop: 14549 articles


In [30]:
def simple_tokenize(text: str):
    # keep it simple and robust
    text = text.lower()
    text = re.sub(r"[^a-z0-9\s]", " ", text)
    return text.split()

# ---------- vocab builder ----------
PAD, UNK = "<pad>", "<unk>"

def build_vocab(texts, max_vocab=80000, min_freq=2):
    counter = Counter()
    for t in texts:
        counter.update(simple_tokenize(t))

    vocab = {PAD: 0, UNK: 1}
    for tok, freq in counter.most_common():
        if freq < min_freq:
            break
        if tok not in vocab:
            vocab[tok] = len(vocab)
        if len(vocab) >= max_vocab:
            break
    return vocab

def encode(text, vocab, max_len=512):
    toks = simple_tokenize(text)
    ids = [vocab.get(tok, vocab[UNK]) for tok in toks[:max_len]]
    # pad
    if len(ids) < max_len:
        ids += [vocab[PAD]] * (max_len - len(ids))
    return ids

In [31]:
class TextDataset(Dataset):
    def __init__(self, df, vocab, max_len=512):
        self.texts = df["text"].tolist()
        self.labels = df["label"].astype(int).tolist()
        self.vocab = vocab
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        x = torch.tensor(encode(self.texts[idx], self.vocab, self.max_len), dtype=torch.long)
        y = torch.tensor(self.labels[idx], dtype=torch.long)
        return x, y

In [32]:
class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, emb_dim=128, hidden_dim=128, num_layers=1, bidirectional=True, dropout=0.3):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=0)

        self.lstm = nn.LSTM(
            input_size=emb_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=bidirectional,
            dropout=dropout if num_layers > 1 else 0.0,
        )

        out_dim = hidden_dim * (2 if bidirectional else 1)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(out_dim, 2)

    def forward(self, x):
        mask = (x != 0).float()          # (B, T)
        emb = self.embedding(x)          # (B, T, E)
        out, _ = self.lstm(emb)          # (B, T, 2H)

        mask = mask.unsqueeze(-1)        # (B, T, 1)
        out = out * mask
        pooled = out.sum(dim=1) / mask.sum(dim=1).clamp(min=1.0)  # (B, 2H)

        pooled = self.dropout(pooled)
        return self.fc(pooled)

In [33]:
def train_one_epoch(model, loader, optimizer, criterion, device):
    model.train()
    losses = []
    all_preds, all_labels = [], []

    progress = tqdm(loader, desc="Training", leave=False)

    for x, y in progress:
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()

        logits = model(x)
        loss = criterion(logits, y)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        losses.append(loss.item())
        preds = torch.argmax(logits, dim=1)
        all_preds.extend(preds.detach().cpu().numpy())
        all_labels.extend(y.detach().cpu().numpy())

        progress.set_postfix(loss=f"{loss.item():.4f}")

    return (
        np.mean(losses),
        accuracy_score(all_labels, all_preds),
        f1_score(all_labels, all_preds, average="macro"),
        precision_score(all_labels, all_preds, average="macro", zero_division=0),
    )

@torch.no_grad()
def eval_model(model, loader, criterion, device, desc="Evaluating"):
    model.eval()
    losses = []
    all_preds, all_labels = [], []

    for x, y in tqdm(loader, desc=desc):
        x, y = x.to(device), y.to(device)

        logits = model(x)
        loss = criterion(logits, y)

        losses.append(loss.item())
        preds = torch.argmax(logits, dim=1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(y.cpu().numpy())

    return (
        np.mean(losses),
        accuracy_score(all_labels, all_preds),
        f1_score(all_labels, all_preds, average="macro"),
        precision_score(all_labels, all_preds, average="macro", zero_division=0),
    )

In [41]:
#` ---------- training loop ----------
epochs = 3

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

vocab = build_vocab(df_train["text"])
train_ds = TextDataset(df_train, vocab)
train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)

model = LSTMClassifier(vocab_size=len(vocab)).to(device)
counts = df_train["label"].value_counts().sort_index()
weights = torch.tensor([1.0/counts[0], 1.0/counts[1]], dtype=torch.float, device=device)
weights = weights / weights.sum() * 2
criterion = nn.CrossEntropyLoss(weight=weights)
optimizer = torch.optim.AdamW(model.parameters(), lr=7e-4, weight_decay=1e-2)

for epoch in range(epochs):
    train_loss, train_acc, train_f1, train_prec = train_one_epoch(
        model, train_loader, optimizer, criterion, device
    )
    print(f"Epoch {epoch+1}: loss={train_loss:.4f}, f1={train_f1:.4f}, accuracy={train_acc:.4f}, precision={train_prec:.4f}")

                                                                        

Epoch 1: loss=0.0749, f1=0.9706, accuracy=0.9709, precision=0.9700


                                                                        

Epoch 2: loss=0.0090, f1=0.9973, accuracy=0.9973, precision=0.9973


                                                                        

Epoch 3: loss=0.0064, f1=0.9979, accuracy=0.9979, precision=0.9979




In [42]:
import json

torch.save(model.state_dict(), "lstm_model.pt")

with open("vocab.json", "w") as f:
    json.dump(vocab, f)

In [43]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load vocab
with open("vocab.json", "r") as f:
    vocab = json.load(f)

# Load model
model = LSTMClassifier(vocab_size=len(vocab)).to(device)
model.load_state_dict(torch.load("lstm_model.pt", map_location=device))
model.eval()

criterion = nn.CrossEntropyLoss()

# Evaluate on Kaggle test set

kaggle_test_ds = TextDataset(df_test, vocab)
kaggle_test_loader = DataLoader(
    kaggle_test_ds,
    batch_size=32,
    shuffle=False
)

k_loss, k_acc, k_f1, k_prec = eval_model(
    model,
    kaggle_test_loader,
    criterion,
    device
)

print(f"Kaggle Test Results | Loss: {k_loss:.4f} | Acc: {k_acc:.4f} | F1: {k_f1:.4f} | Prec: {k_prec:.4f}")

# Evaluate on PolitiFact
politifact_ds = TextDataset(df_politifact, vocab)
politifact_loader = DataLoader(
    politifact_ds,
    batch_size=32,
    shuffle=False
)

p_loss, p_acc, p_f1, p_prec = eval_model(
    model,
    politifact_loader,
    criterion,
    device
)
print(f"PolitiFact Results  | Loss: {p_loss:.4f} | Acc: {p_acc:.4f} | F1: {p_f1:.4f} | Prec: {p_prec:.4f}")

# Evaluate on GossipCop
gossipcop_ds = TextDataset(df_gossipcop, vocab)
gossipcop_loader = DataLoader(
    gossipcop_ds,
    batch_size=32,
    shuffle=False
)

g_loss, g_acc, g_f1, g_prec = eval_model(
    model,
    gossipcop_loader,
    criterion,
    device
)
print(f"GossipCop Results   | Loss: {g_loss:.4f} | Acc: {g_acc:.4f} | F1: {g_f1:.4f} | Prec: {g_prec:.4f}")

Evaluating: 100%|██████████| 242/242 [00:27<00:00,  8.82it/s]


Kaggle Test Results | Loss: 0.0286 | Acc: 0.9902 | F1: 0.9901 | Prec: 0.9894


Evaluating: 100%|██████████| 20/20 [00:02<00:00,  9.45it/s]


PolitiFact Results  | Loss: 4.5474 | Acc: 0.5128 | F1: 0.3612 | Prec: 0.5882


Evaluating: 100%|██████████| 455/455 [00:52<00:00,  8.63it/s]

GossipCop Results   | Loss: 9.5921 | Acc: 0.2346 | F1: 0.1921 | Prec: 0.4709





## Mixed Dataset Tuning

In [None]:
kaggle_sample = df_train.sample(n=20000, random_state=42)
politifact_sample = df_politifact.sample(frac=0.5, random_state=42)
gossipcop_sample = df_gossipcop.sample(frac=0.5, random_state=42)

# Build mixed dataframe like you did
df_mixed = pd.concat([kaggle_sample, politifact_sample, gossipcop_sample], ignore_index=True)
df_mixed = df_mixed.sample(frac=1, random_state=42).reset_index(drop=True)

# Split mixed into train/val
df_mixed_train, df_mixed_val = train_test_split(
    df_mixed, test_size=0.1, random_state=42, stratify=df_mixed["label"]
)

# Build vocab from mixed training text (important)
vocab = build_vocab(df_mixed_train["text"], max_vocab=80000, min_freq=2)

train_loader = DataLoader(TextDataset(df_mixed_train, vocab, max_len=512), batch_size=32, shuffle=True)
val_loader   = DataLoader(TextDataset(df_mixed_val, vocab, max_len=512), batch_size=32, shuffle=False)

model = LSTMClassifier(vocab_size=len(vocab)).to(device)

# (Recommended) class weights
counts = df_mixed_train["label"].value_counts().sort_index()
weights = torch.tensor([1.0/counts[0], 1.0/counts[1]], dtype=torch.float, device=device)
weights = weights / weights.sum() * 2
criterion = nn.CrossEntropyLoss(weight=weights)

optimizer = torch.optim.AdamW(model.parameters(), lr=2e-3)

for ep in range(3):
    tr = train_one_epoch(model, train_loader, optimizer, criterion, device)
    va = eval_model(model, val_loader, criterion, device)
    print(f"Epoch {ep+1} | train f1={tr[2]:.4f} | val f1={va[2]:.4f}")

Evaluating: 100%|██████████| 87/87 [00:06<00:00, 12.49it/s]             


Epoch 1 | train f1=0.8568 | val f1=0.9079


Evaluating: 100%|██████████| 87/87 [00:06<00:00, 12.53it/s]             


Epoch 2 | train f1=0.9367 | val f1=0.9311


Evaluating: 100%|██████████| 87/87 [00:07<00:00, 12.30it/s]             

Epoch 3 | train f1=0.9662 | val f1=0.9244





In [14]:
df_kaggle_holdout = df_test.copy()

# PolitiFact: hold out the rows NOT included in politifact_sample
politifact_holdout = df_politifact.drop(index=politifact_sample.index).reset_index(drop=True)

# GossipCop: hold out the rows NOT included in gossipcop_sample
gossipcop_holdout = df_gossipcop.drop(index=gossipcop_sample.index).reset_index(drop=True)

print("Holdout sizes:")
print("  Kaggle holdout:", len(df_kaggle_holdout))
print("  PolitiFact holdout:", len(politifact_holdout))
print("  GossipCop holdout:", len(gossipcop_holdout))

Holdout sizes:
  Kaggle holdout: 7729
  PolitiFact holdout: 312
  GossipCop holdout: 7275


In [15]:
batch_size = 32
max_len = 512

kaggle_holdout_loader = DataLoader(
    TextDataset(df_kaggle_holdout, vocab, max_len=max_len),
    batch_size=batch_size,
    shuffle=False
)

politifact_holdout_loader = DataLoader(
    TextDataset(politifact_holdout, vocab, max_len=max_len),
    batch_size=batch_size,
    shuffle=False
)

gossipcop_holdout_loader = DataLoader(
    TextDataset(gossipcop_holdout, vocab, max_len=max_len),
    batch_size=batch_size,
    shuffle=False
)

In [16]:
k_loss, k_acc, k_f1, k_prec = eval_model(model, kaggle_holdout_loader, criterion, device)
p_loss, p_acc, p_f1, p_prec = eval_model(model, politifact_holdout_loader, criterion, device)
g_loss, g_acc, g_f1, g_prec = eval_model(model, gossipcop_holdout_loader, criterion, device)

print(f"Kaggle Holdout     | Loss: {k_loss:.4f} | Acc: {k_acc:.4f} | F1: {k_f1:.4f} | Prec: {k_prec:.4f}")
print(f"PolitiFact Holdout | Loss: {p_loss:.4f} | Acc: {p_acc:.4f} | F1: {p_f1:.4f} | Prec: {p_prec:.4f}")
print(f"GossipCop Holdout  | Loss: {g_loss:.4f} | Acc: {g_acc:.4f} | F1: {g_f1:.4f} | Prec: {g_prec:.4f}")

Evaluating: 100%|██████████| 242/242 [00:28<00:00,  8.61it/s]
Evaluating: 100%|██████████| 10/10 [00:01<00:00,  8.39it/s]
Evaluating: 100%|██████████| 228/228 [00:27<00:00,  8.20it/s]

Kaggle Holdout     | Loss: 0.0386 | Acc: 0.9887 | F1: 0.9886 | Prec: 0.9897
PolitiFact Holdout | Loss: 1.0348 | Acc: 0.6154 | F1: 0.6044 | Prec: 0.6522
GossipCop Holdout  | Loss: 0.6264 | Acc: 0.7737 | F1: 0.6922 | Prec: 0.6887



