## *Ablation Study*: Early Stopping

This notebook adds an early-stopping mechanism to the baseline model we selected.

By - Darshan Chudiwal and Yash Priyadarshi

In [1]:
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os 

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import torchmetrics

import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger

from sklearn.model_selection import train_test_split
from tqdm import tqdm

from gensim.models import KeyedVectors


In [2]:
# Location of the training spreadsheet inside the Kaggle input directory;
# it is passed to load_and_preprocess_data() further down in this cell.
TRAIN_PATH = '/kaggle/input/bias-of-us-news-media-houses/Train.xlsx'

def clean_text(text: str) -> str:
    """Normalise a raw text snippet before tokenisation.

    Steps, in order:
      1. drop URLs (``http`` followed by any run of non-whitespace),
      2. drop the retweet marker ``RT`` when it starts a word and is followed by a space,
      3. replace every character other than ASCII letters, digits, whitespace,
         apostrophe, period and comma with a space,
      4. collapse whitespace runs (tabs and newlines included) into single spaces
         and trim both ends.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string; empty when nothing survives the filters.
    """
    text = re.sub(r"http\S+", " ", text)  # remove URLs
    # The leading \b keeps words that merely end in "RT" (e.g. "START ") intact.
    text = re.sub(r"\bRT ", " ", text)
    text = re.sub(r"[^a-zA-Z\'\.\,\d\s]", " ", text)  # remove unwanted characters
    # \s+ already matches tabs and newlines, so no separate passes are needed for them.
    text = re.sub(r"\s+", " ", text)
    return text.strip()

def load_and_preprocess_data(path: str) -> pd.DataFrame:
    """Read the Excel dataset and normalise its two text columns.

    For ``content_original`` and ``title`` the function keeps only the text before
    the first ``' \\n\\n---\\n\\n'`` delimiter, turns dashes and newlines into spaces,
    strips every character that is neither a word character nor whitespace,
    lower-cases the result and finally runs it through ``clean_text``.

    Args:
        path: Path of the ``.xlsx`` file (read with the ``openpyxl`` engine).

    Returns:
        The loaded frame with both text columns cleaned; other columns are untouched.
    """
    df = pd.read_excel(path, engine='openpyxl')
    for col in ['content_original', 'title']:
        # Missing or non-string cells would turn into NaN through the .str accessor
        # and then crash clean_text, so coerce everything to str up front.
        text = df[col].fillna('').astype(str)
        # Use only the first section if split by the delimiter
        text = text.str.split(' \n\n---\n\n').str[0]
        # Replace dashes and punctuation, then lower the text
        text = (text
                .str.replace('-', ' ', regex=False)
                .str.replace(r'[^\w\s]', '', regex=True)
                .str.replace('\n', ' ', regex=False)
                .str.lower())
        df[col] = text.apply(clean_text)
    return df

# Load the training spreadsheet, clean its text columns, and preview the first two rows.
df_train = load_and_preprocess_data(TRAIN_PATH)
df_train.head(2)


Unnamed: 0.1,Unnamed: 0,topic,source,bias,url,title,date,authors,content,content_original,source_url,bias_text,ID
0,0,immigration,National Review,2,https://www.nationalreview.com/2018/12/governm...,shutdown theater again,2018-12-12,"Kevin D. Williamson, Kyle Smith, Andrew C. Mcc...",President Trump and Senate Minority Leader Chu...,president trump and senate minority leader chu...,www.nationalreview.com,right,zl7kc7EmAyIdUMIo
1,1,culture,Yahoo! The 360,1,https://news.yahoo.com/can-the-developing-worl...,can the developing world endure the coronavirus,2020-06-30,Mike Bebernes,“ The 360 ” shows you diverse perspectives on ...,the 360 shows you diverse perspectives on the ...,www.news.yahoo.com,center,xpbjYTJYPdlw6HmJ


In [3]:
word2vec_model_path = '/kaggle/input/googlenewsvectorsnegative300/GoogleNews-vectors-negative300.bin'
cached_kv_path = 'word2vec_model.kv'

# The first run parses the word2vec binary and saves a gensim-native copy;
# later runs load that copy memory-mapped instead of re-parsing the binary.
if os.path.exists(cached_kv_path):
    print("Loading cached Word2Vec model...")
    w2v_model = KeyedVectors.load(cached_kv_path, mmap='r')
else:
    print("Loading Word2Vec model from binary file (this may take several minutes)...")
    w2v_model = KeyedVectors.load_word2vec_format(word2vec_model_path, binary=True)
    print("Saving cached version...")
    w2v_model.save(cached_kv_path)

# Build vocabulary mapping and embedding matrix.
# Reserve index 0 for unknown words.
UNK_TOKEN = "<unk>"
UNK_INDEX = 0
# Create vocabulary mapping, shifting indices by 1 so that index 0 is reserved for UNK.
word2vec_vocab = {word: idx + 1 for word, idx in w2v_model.key_to_index.items()}
vocab_size = len(word2vec_vocab) + 1
embedding_dim = w2v_model.vector_size  # typically 300

# Build the embedding matrix (row 0 will remain zeros for UNK).
embedding_matrix = torch.zeros(vocab_size, embedding_dim)
# Row i of `w2v_model.vectors` belongs to the word whose key_to_index is i, i.e. to
# vocabulary index i + 1, so one bulk copy fills rows 1..N. Writing through the
# tensor's NumPy view avoids creating one temporary tensor per word and works with
# the read-only memory-mapped array returned by the cached load.
embedding_matrix.numpy()[1:] = w2v_model.vectors

print(f"Word2Vec embeddings loaded: vocab size = {vocab_size}, embedding dim = {embedding_dim}")


Loading Word2Vec model from binary file (this may take several minutes)...
Saving cached version...
Word2Vec embeddings loaded: vocab size = 3000001, embedding dim = 300


In [4]:
def basic_tokenizer(text: str) -> list:
    """Break ``text`` into tokens at every run of whitespace.

    Leading and trailing whitespace yield no empty tokens; an empty or
    whitespace-only string gives an empty list.
    """
    tokens = text.split()
    return tokens

def tokenize_and_pad(text: str, max_len: int) -> torch.Tensor:
    """Convert ``text`` into a fixed-length tensor of vocabulary indices.

    The text is split on whitespace, each token is looked up in ``word2vec_vocab``
    (tokens not found map to ``UNK_INDEX``), and the sequence is truncated or
    right-padded with 0 so that it holds exactly ``max_len`` entries.

    Args:
        text: Already-cleaned text to encode.
        max_len: Length of the returned sequence.

    Returns:
        A 1-D ``torch.long`` tensor with ``max_len`` indices.
    """
    # Only the first max_len tokens survive truncation, so skip the lookups for the rest.
    tokens = basic_tokenizer(text)[:max_len]
    indices = [word2vec_vocab.get(token, UNK_INDEX) for token in tokens]
    # Padding uses index 0, the same all-zero embedding row that unknown words map to.
    indices.extend([0] * max(0, max_len - len(indices)))
    # Explicit dtype: an empty list would otherwise produce a float tensor,
    # which an embedding lookup cannot consume.
    return torch.tensor(indices, dtype=torch.long)


In [5]:
class BiasDataset(Dataset):
    """Map-style dataset yielding tokenised body/title pairs with their bias label.

    Each item is a dict with keys ``"body"``, ``"title"`` (index tensors produced by
    ``tokenize_and_pad``) and ``"labels"`` (a ``torch.long`` scalar taken from the
    ``bias`` column).
    """

    def __init__(self, data: pd.DataFrame, max_sentence_length: int):
        """Store a positionally re-indexed copy of ``data`` and the sequence length."""
        self.max_sentence_length = max_sentence_length
        # Drop the original index so rows are addressable by 0..len-1.
        self.data = data.reset_index(drop=True)

    def __len__(self) -> int:
        """Number of rows in the underlying frame."""
        return self.data.shape[0]

    def __getitem__(self, index: int) -> dict:
        """Tokenise the row at position ``index`` and return it as a sample dict."""
        record = self.data.iloc[index]
        limit = self.max_sentence_length
        sample = {
            "body": tokenize_and_pad(record['content_original'], limit),
            "title": tokenize_and_pad(record['title'], limit),
            # Assuming 'bias' is an integer label
            "labels": torch.tensor(record['bias'], dtype=torch.long),
        }
        return sample

class BiasDataModule(pl.LightningDataModule):
    """Lightning data module wrapping the train/validation frames in ``BiasDataset``s.

    Args:
        train_df: Rows used for training.
        val_df: Rows used for validation.
        max_sentence_length: Number of token indices kept per text field.
        batch_size: Batch size for both loaders.
        num_workers: Worker processes per ``DataLoader``. Defaults to 4, the value
            that used to be hard-coded; pass 0 to load in the main process.
    """

    def __init__(self, train_df: pd.DataFrame, val_df: pd.DataFrame, max_sentence_length: int,
                 batch_size: int, num_workers: int = 4):
        super().__init__()
        self.train_df = train_df
        self.val_df = val_df
        self.max_sentence_length = max_sentence_length
        self.batch_size = batch_size
        self.num_workers = num_workers

    def setup(self, stage=None):
        """Create the train and validation datasets (``stage`` is ignored)."""
        self.train_dataset = BiasDataset(self.train_df, self.max_sentence_length)
        self.val_dataset = BiasDataset(self.val_df, self.max_sentence_length)

    def train_dataloader(self):
        """Shuffled training loader; the last incomplete batch is dropped."""
        return DataLoader(self.train_dataset, batch_size=self.batch_size,
                          shuffle=True, num_workers=self.num_workers, drop_last=True)

    def val_dataloader(self):
        """Unshuffled validation loader that keeps every sample."""
        return DataLoader(self.val_dataset, batch_size=self.batch_size,
                          shuffle=False, num_workers=self.num_workers, drop_last=False)

# Batch size and the number of token indices kept per text field.
BATCH_SIZE = 1024
MAX_LEN = 120  # Adjust based on your text lengths

# Hold out 20% of the rows for validation; the fixed random_state keeps the split stable.
train_df, val_df = train_test_split(df_train, test_size=0.2, random_state=42)
print("Train shape:", train_df.shape, "Val shape:", val_df.shape)

# Wrap both splits in the Lightning data module and build its datasets right away.
data_module = BiasDataModule(
    train_df=train_df,
    val_df=val_df,
    max_sentence_length=MAX_LEN,
    batch_size=BATCH_SIZE,
)
data_module.setup()


Train shape: (21272, 13) Val shape: (5318, 13)


In [6]:
class LitBiasModel(pl.LightningModule):
    """BiLSTM classifier over frozen pretrained word embeddings.

    Title and body token indices go through the same embedding table and the same
    bidirectional LSTM. For each input, the final hidden states of all
    layers/directions are averaged; the title and body vectors are then averaged
    and fed to a small feed-forward head that produces the class logits.

    Per-epoch accuracy and mean loss are appended to ``train_history`` and
    ``val_history`` so they can be plotted after training.
    """

    def __init__(self, embedding_matrix: torch.Tensor, num_classes: int,
                 embed_dim: int, hidd_dim: int, num_layers: int = 2, lr: float = 1e-3):
        """
        Args:
            embedding_matrix: Pretrained embedding weights, one row per vocabulary
                index. Excluded from the saved hyperparameters.
            num_classes: Number of output classes.
            embed_dim: LSTM input size; must match the embedding vector width.
            hidd_dim: Hidden size of each LSTM direction.
            num_layers: Number of stacked LSTM layers.
            lr: Learning rate passed to AdamW.
        """
        super().__init__()
        self.save_hyperparameters(ignore=["embedding_matrix"])
        # Frozen embedding layer using pretrained Word2Vec embeddings
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=True)

        # Shared bidirectional LSTM layer
        self.bilstm = nn.LSTM(
            input_size=embed_dim,
            hidden_size=hidd_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=0.4,
            bidirectional=True
        )

        # Classification head (feedforward network). Its input width is hidd_dim
        # because forward() averages the hidden states instead of concatenating them.
        self.classifier = nn.Sequential(
            nn.Dropout(0.4),
            nn.ReLU(),
            nn.Linear(hidd_dim, 64),
            nn.BatchNorm1d(64),
            nn.Dropout(0.4),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.BatchNorm1d(32),
            nn.Dropout(0.4),
            nn.ReLU(),
            nn.Linear(32, num_classes)
        )

        self.loss_fn = nn.CrossEntropyLoss()
        self.train_acc = torchmetrics.Accuracy(num_classes=num_classes, task="multiclass")
        self.val_acc = torchmetrics.Accuracy(num_classes=num_classes, task="multiclass")
        # Per-batch losses of the current epoch; emptied in the epoch-end hooks.
        self._train_losses = []
        self._val_losses = []
        # Dictionaries to store history for later plotting:
        self.train_history = {"acc": [], "loss": []}
        self.val_history = {"acc": [], "loss": []}

    def forward(self, title: torch.Tensor, body: torch.Tensor) -> torch.Tensor:
        """Return class logits for a batch of title/body index tensors."""
        # Obtain embeddings
        title_embed = self.embedding(title)   # (batch, seq_len, embed_dim)
        body_embed = self.embedding(body)

        # Shared bidirectional LSTM for title and body separately:
        _, (title_hidden, _) = self.bilstm(title_embed)  # title_hidden: (num_layers*2, batch, hidd_dim)
        _, (body_hidden, _) = self.bilstm(body_embed)      # body_hidden: (num_layers*2, batch, hidd_dim)

        # Aggregate hidden states by mean pooling over the layers dimension:
        # Permute to (batch, num_layers*2, hidd_dim)
        title_hidden = title_hidden.permute(1, 0, 2)
        body_hidden = body_hidden.permute(1, 0, 2)

        # Mean pooling over the layer dimension
        title_repr = title_hidden.mean(dim=1)  # (batch, hidd_dim)
        body_repr = body_hidden.mean(dim=1)    # (batch, hidd_dim)

        # Fuse the representations (here we simply average them)
        aggregated = (title_repr + body_repr) / 2  # (batch, hidd_dim)

        logits = self.classifier(aggregated)
        return logits

    def training_step(self, batch, batch_idx):
        """Compute and log the batch loss, update train accuracy, return the loss."""
        logits = self.forward(batch["title"], batch["body"])
        loss = self.loss_fn(logits, batch["labels"])
        # Store a detached copy: keeping the live loss would hold every batch's
        # autograd graph in memory until the end of the epoch.
        self._train_losses.append(loss.detach())
        self.train_acc.update(logits, batch["labels"])
        self.log("train_loss", loss, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        """Compute and log the batch loss and update validation accuracy."""
        logits = self.forward(batch["title"], batch["body"])
        loss = self.loss_fn(logits, batch["labels"])
        self._val_losses.append(loss.detach())
        self.val_acc.update(logits, batch["labels"])
        self.log("val_loss", loss, prog_bar=True)
        return loss

    def on_train_epoch_end(self):
        """Record, log and print the epoch's mean train loss and accuracy, then reset."""
        avg_loss = torch.stack(self._train_losses).mean() if self._train_losses else torch.tensor(0.0, device=self.device)
        train_acc = self.train_acc.compute()
        self.train_history["loss"].append(avg_loss.item())
        self.train_history["acc"].append(train_acc.item())
        self.log_dict({"Epoch_Train_Loss": avg_loss, "Epoch_Train_Acc": train_acc}, prog_bar=True)
        print(f"Train - Acc: {train_acc:.4f}, Loss: {avg_loss:.4f}")
        self.train_acc.reset()
        self._train_losses.clear()

    def on_validation_epoch_end(self):
        """Record, log and print the epoch's mean validation loss and accuracy, then reset.

        NOTE: this hook also fires after the trainer's pre-training sanity-check
        pass, so ``val_history`` ends up with one more entry than ``train_history``.
        """
        avg_loss = torch.stack(self._val_losses).mean() if self._val_losses else torch.tensor(0.0, device=self.device)
        val_acc = self.val_acc.compute()
        self.val_history["loss"].append(avg_loss.item())
        self.val_history["acc"].append(val_acc.item())
        self.log_dict({"Epoch_Val_Loss": avg_loss, "Epoch_Val_Acc": val_acc}, prog_bar=True)
        print(f"Val - Acc: {val_acc:.4f}, Loss: {avg_loss:.4f}")
        self.val_acc.reset()
        self._val_losses.clear()

    def configure_optimizers(self):
        """Use AdamW with the learning rate stored in the hyperparameters."""
        return torch.optim.AdamW(self.parameters(), lr=self.hparams.lr)

# Seed every RNG before the model is built so that the weight initialisation is
# reproducible; seeding only in the training cell happens after the layers exist.
pl.seed_everything(42)

# Instantiate the BiLSTM classifier (this variant has no multi-head attention block).
model = LitBiasModel(
    embedding_matrix=embedding_matrix,
    num_classes=3,
    # Take the width from the loaded vectors instead of hard-coding 300.
    embed_dim=embedding_dim,
    hidd_dim=200,
    num_layers=2,
    lr=1e-3
)
print(model)


LitBiasModel(
  (embedding): Embedding(3000001, 300)
  (bilstm): LSTM(300, 200, num_layers=2, batch_first=True, dropout=0.4, bidirectional=True)
  (classifier): Sequential(
    (0): Dropout(p=0.4, inplace=False)
    (1): ReLU()
    (2): Linear(in_features=200, out_features=64, bias=True)
    (3): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (4): Dropout(p=0.4, inplace=False)
    (5): ReLU()
    (6): Linear(in_features=64, out_features=32, bias=True)
    (7): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (8): Dropout(p=0.4, inplace=False)
    (9): ReLU()
    (10): Linear(in_features=32, out_features=3, bias=True)
  )
  (loss_fn): CrossEntropyLoss()
  (train_acc): MulticlassAccuracy()
  (val_acc): MulticlassAccuracy()
)


In [10]:
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint

pl.seed_everything(42)
logger = TensorBoardLogger("tb_logs", name="lit_bias_model")

# Stop training once the validation loss has failed to improve for 5 consecutive
# validation epochs.
early_stop_callback = EarlyStopping(
    monitor="val_loss",
    patience=5,
    mode="min",
    verbose=True
)

# Keep only the checkpoint with the highest epoch-level validation accuracy
# (the "Epoch_Val_Acc" key logged in on_validation_epoch_end).
checkpoint_callback = ModelCheckpoint(
    monitor="Epoch_Val_Acc",
    mode="max",
    save_top_k=1,
    dirpath="checkpoints/", 
    filename="best_model-{epoch:02d}-{Epoch_Val_Acc:.4f}"
)

trainer = pl.Trainer(
    # "auto" still selects the GPU when one is present but lets the cell run on a
    # CPU-only kernel instead of failing at Trainer construction.
    accelerator="auto",
    devices=1,
    max_epochs=100,  # upper bound; early stopping normally ends the run sooner
    logger=logger,
    log_every_n_steps=10,
    callbacks=[early_stop_callback, checkpoint_callback]
)

trainer.fit(model, data_module)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Val - Acc: 0.4482, Loss: 1.0700


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Val - Acc: 0.6010, Loss: 0.9790
Train - Acc: 0.6051, Loss: 0.8752


Validation: |          | 0/? [00:00<?, ?it/s]

Val - Acc: 0.6427, Loss: 0.8195
Train - Acc: 0.6364, Loss: 0.8355


Validation: |          | 0/? [00:00<?, ?it/s]

Val - Acc: 0.6435, Loss: 0.7997
Train - Acc: 0.6640, Loss: 0.7777


Validation: |          | 0/? [00:00<?, ?it/s]

Val - Acc: 0.6420, Loss: 0.8236
Train - Acc: 0.6994, Loss: 0.7285


Validation: |          | 0/? [00:00<?, ?it/s]

Val - Acc: 0.7225, Loss: 0.6661
Train - Acc: 0.7149, Loss: 0.6966


Validation: |          | 0/? [00:00<?, ?it/s]

Val - Acc: 0.7025, Loss: 0.7615
Train - Acc: 0.7362, Loss: 0.6607


Validation: |          | 0/? [00:00<?, ?it/s]

Val - Acc: 0.7409, Loss: 0.6363
Train - Acc: 0.7566, Loss: 0.6154


Validation: |          | 0/? [00:00<?, ?it/s]

Val - Acc: 0.7132, Loss: 0.6465
Train - Acc: 0.7709, Loss: 0.5865


Validation: |          | 0/? [00:00<?, ?it/s]

Val - Acc: 0.7452, Loss: 0.6137
Train - Acc: 0.7852, Loss: 0.5597


Validation: |          | 0/? [00:00<?, ?it/s]

Val - Acc: 0.7422, Loss: 0.6226
Train - Acc: 0.7901, Loss: 0.5523


Validation: |          | 0/? [00:00<?, ?it/s]

Val - Acc: 0.7240, Loss: 0.6716
Train - Acc: 0.7726, Loss: 0.5861


Validation: |          | 0/? [00:00<?, ?it/s]

Val - Acc: 0.7548, Loss: 0.6120
Train - Acc: 0.8009, Loss: 0.5247


Validation: |          | 0/? [00:00<?, ?it/s]

Val - Acc: 0.7356, Loss: 0.6219
Train - Acc: 0.7972, Loss: 0.5263


Validation: |          | 0/? [00:00<?, ?it/s]

Val - Acc: 0.7475, Loss: 0.6249
Train - Acc: 0.8087, Loss: 0.4959


Validation: |          | 0/? [00:00<?, ?it/s]

Val - Acc: 0.7471, Loss: 0.5975
Train - Acc: 0.8241, Loss: 0.4705


Validation: |          | 0/? [00:00<?, ?it/s]

Val - Acc: 0.7614, Loss: 0.5893
Train - Acc: 0.8316, Loss: 0.4542


Validation: |          | 0/? [00:00<?, ?it/s]

Val - Acc: 0.7334, Loss: 0.6412
Train - Acc: 0.8435, Loss: 0.4277


Validation: |          | 0/? [00:00<?, ?it/s]

Val - Acc: 0.7706, Loss: 0.6010
Train - Acc: 0.8456, Loss: 0.4200


Validation: |          | 0/? [00:00<?, ?it/s]

Val - Acc: 0.7653, Loss: 0.6763
Train - Acc: 0.8601, Loss: 0.3937


Validation: |          | 0/? [00:00<?, ?it/s]

Val - Acc: 0.7601, Loss: 0.6933
Train - Acc: 0.8686, Loss: 0.3735


Validation: |          | 0/? [00:00<?, ?it/s]

Val - Acc: 0.7740, Loss: 0.6458
Train - Acc: 0.8776, Loss: 0.3486
