#MODEL ARCHITECTURE FROM
https://arxiv.org/pdf/2406.00367

Changes: Changed from BiLSTM classification to CNN

In [None]:
#download dataset to local/virtual environment and set up python libraries

# !gdown "https://drive.google.com/uc?id=1YeV-FnAWkPQkpTPgf-ShXhb3SbTb65RP"
# !gdown 'https://drive.google.com/uc?export=download&id=1w9h_rF7-ziI-rqiXTi0SnWEV9nkpDQlh' -O config.json
!pip install torch transformers pandas numpy scikit-learn nltk



# Mount Google Drive and load the dataset

In [None]:

import pandas as pd
import numpy as np
import re
import nltk
from sklearn.model_selection import train_test_split

# Download necessary NLTK data
nltk.download('stopwords')
nltk.download('wordnet')

#for nonlocal runtime
# from google.colab import drive
# drive.mount('/content/drive')
# import pandas as pd
# df = pd.read_csv("/content/drive/My Drive/Kaggle Club/SARCASM PROJECT '25/cleaned_reddit_comments.csv").fillna(' ')

# Load the CSV dataset (adjust the file path as needed)
df = pd.read_csv('cleaned_NewsHeadlines_comments.csv', usecols=['is_sarcastic', 'headline']).fillna('')
df = pd.read_json("config.json", lines = True).fillna('')

print("Dataset size:", len(df))
print(df['is_sarcastic'].value_counts())
df.head()

Dataset size: 28619
is_sarcastic
0    14985
1    13634
Name: count, dtype: int64


[nltk_data] Downloading package stopwords to /home/user96/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/user96/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,is_sarcastic,headline,article_link
0,1,thirtysomething scientists unveil doomsday clo...,https://www.theonion.com/thirtysomething-scien...
1,0,dem rep. totally nails why congress is falling...,https://www.huffingtonpost.com/entry/donna-edw...
2,0,eat your veggies: 9 deliciously different recipes,https://www.huffingtonpost.com/entry/eat-your-...
3,1,inclement weather prevents liar from getting t...,https://local.theonion.com/inclement-weather-p...
4,1,mother comes pretty close to using word 'strea...,https://www.theonion.com/mother-comes-pretty-c...


In [None]:
# First, split into (train+validation) vs. test (90% vs. 10%)
train_val_df, test_df = train_test_split(
    df,
    test_size=0.10,
    random_state=42,
    stratify=df['is_sarcastic']
)

# Now split the train_val_df into training and validation sets (90% of 90% becomes 81%, and 10% of 90% becomes 9%)
train_df, val_df = train_test_split(
    train_val_df,
    test_size=0.10,
    random_state=42,
    stratify=train_val_df['is_sarcastic']
)

print("Train size:", len(train_df))
print("Validation size:", len(val_df))
print("Test size:", len(test_df))

Train size: 23181
Validation size: 2576
Test size: 2862


#Model Architecture


1. RoBERTa Encoder:
* Load a pretrained RoBERTa (roberta-base) model and its tokenizer from the HuggingFace library.
* Use RoBERTa to obtain the embedding matrix for the input text.
2. Dropout Layer:
* Applied to the embeddings to prevent overfitting. In the paper, a dropout rate of 0.1 is used.
3. ~~BiLSTM~~ CNN Layer:
* Processes the embeddings bidirectionally to capture long-range dependencies.
* The paper experiments with different hidden unit sizes (e.g., 128, 256, 512). For the CNN, the effective hidden size is doubled due to its forward and backward processing.
4. Flatten and Dense Layers:
* The output of the CNN is flattened.
* One or two fully connected layers are used to capture the relationship between the features and the final sentiment classes.
5. Classification (Softmax) Layer:
* A Softmax function is applied to output the probability distribution over sentiment classes (e.g., positive, negative, neutral).

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import RobertaModel, RobertaTokenizer, get_linear_schedule_with_warmup

class RoBERTaCNN(nn.Module):
    def __init__(self, roberta, num_filters, filter_sizes, dropout=0.1):
        super().__init__()
        self.roberta = roberta
        hidden_size = roberta.config.hidden_size

        # Convs for each kernel size + BatchNorm
        self.convs = nn.ModuleList([
            nn.Sequential(
                nn.Conv1d(hidden_size, num_filters, fs),
                nn.BatchNorm1d(num_filters),
                nn.ReLU()
            )
            for fs in filter_sizes
        ])

        self.dropout = nn.Dropout(dropout)
        # single logit
        self.fc = nn.Linear(num_filters * len(filter_sizes), 1)

    def forward(self, input_ids, attention_mask):
        # RoBERTa embeddings
        seq_out = self.roberta(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state
        x = seq_out.transpose(1, 2)   # (B, hidden, L)

        # conv → pool
        feats = []
        for conv in self.convs:
            c = conv(x)                # (B, F, L_out)
            p = F.max_pool1d(c, c.size(2)).squeeze(2)
            feats.append(p)
        x = torch.cat(feats, dim=1)    # (B, F * #sizes)

        x = self.dropout(x)
        logits = self.fc(x).squeeze(-1) # (B,)
        return logits

    def tokenize(self, texts, max_length=128):
        enc = self.tokenizer(texts,
                             padding=True,
                             truncation=True,
                             max_length=max_length,
                             return_tensors='pt')
        return enc.input_ids, enc.attention_mask

# example instantiation
if __name__ == "__main__":
    model_name = "roberta-base"
    roberta = RobertaModel.from_pretrained(model_name)
    tokenizer = RobertaTokenizer.from_pretrained(model_name)

    model = RoBERTaCNN(roberta, num_filters=100, filter_sizes=[2,3,4], dropout=0.1)
    model.tokenizer = tokenizer  # attach tokenizer for convenience

    texts = ["I loved this movie", "Totally awful experience"]
    ids, mask = model.tokenize(texts)
    logits = model(ids, mask)
    probs = torch.sigmoid(logits)
    print("Logits:", logits)
    print("Probabilities:", probs)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Logits: tensor([ 0.1258, -0.9527], grad_fn=<SqueezeBackward1>)
Probabilities: tensor([0.5314, 0.2783], grad_fn=<SigmoidBackward0>)


# Training + Hyperparameters

* **Learning Rates (l)**: Experiment with l ∈ {0.0001, 0.00001, 0.000001}. In the best-case experimental settings (as per the paper), use 0.00001.
* **Hidden Units (h)**: For the RNN layer, experiment with 128, 256, or 512 units. ~~Note that for BiLSTM, the effective output dimension is 2×h.~~
* **Dropout Rate**: 0.1
* **Epochs**: ~~5 (as stated in the paper)~~ used early stopping w/ patience level of 5
* **Optimizer**: AdamW is recommended.

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim

# Hyperparameters
epochs = 20

class NewsHeadlinesDataset(Dataset):
    def __init__(self, df):
        # Store the preprocessed texts and labels
        self.labels = df['is_sarcastic'].values
        self.comments = df['headline'].values

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # Return the comment and label for a given index
        comment = str(self.comments[idx])
        label = self.labels[idx]
        return comment, label

# Create dataset objects
train_dataset = NewsHeadlinesDataset(train_df)
val_dataset = NewsHeadlinesDataset(val_df)
test_dataset = NewsHeadlinesDataset(test_df)

# Create DataLoaders for batching (adjust batch_size as needed)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [None]:
# ===== Training Loop (binary) =====
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Optimizer + scheduler
optimizer = optim.AdamW(model.parameters(), lr=1e-6, weight_decay=0.01)
total_steps = len(train_loader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=int(0.1*total_steps),
                                            num_training_steps=total_steps)

criterion = nn.BCEWithLogitsLoss()

best_val = float('inf')
patience = 5
stalled = 0

for epoch in range(1, epochs+1):
    model.train()
    train_loss = 0
    for texts, labels in train_loader:
        ids, mask = model.tokenize(list(texts))
        ids, mask = ids.to(device), mask.to(device)
        labels = torch.tensor(labels, dtype=torch.float, device=device)

        optimizer.zero_grad()
        logits = model(ids, mask)
        loss = criterion(logits, labels)
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

        train_loss += loss.item()

    avg_train = train_loss / len(train_loader)

    # validation
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for texts, labels in val_loader:
            ids, mask = model.tokenize(list(texts))
            ids, mask = ids.to(device), mask.to(device)
            labels = torch.tensor(labels, dtype=torch.float, device=device)

            logits = model(ids, mask)
            val_loss += criterion(logits, labels).item()

    avg_val = val_loss / len(val_loader)
    print(f"Epoch {epoch} — Train: {avg_train:.4f} — Val: {avg_val:.4f}")

    if avg_val < best_val:
        best_val = avg_val
        stalled = 0
    else:
        stalled += 1
        if stalled >= patience:
            print(f"Early stop at epoch {epoch}")
            break

  labels = torch.tensor(labels, dtype=torch.float, device=device)
  labels = torch.tensor(labels, dtype=torch.float, device=device)


Epoch 1 — Train: 0.7407 — Val: 0.6501
Epoch 2 — Train: 0.5302 — Val: 0.3672
Epoch 3 — Train: 0.3169 — Val: 0.2722
Epoch 4 — Train: 0.2538 — Val: 0.2450
Epoch 5 — Train: 0.2215 — Val: 0.2186
Epoch 6 — Train: 0.1983 — Val: 0.2098
Epoch 7 — Train: 0.1875 — Val: 0.2233
Epoch 8 — Train: 0.1711 — Val: 0.1894
Epoch 9 — Train: 0.1655 — Val: 0.2014
Epoch 10 — Train: 0.1518 — Val: 0.1855
Epoch 11 — Train: 0.1418 — Val: 0.1844
Epoch 12 — Train: 0.1355 — Val: 0.2220
Epoch 13 — Train: 0.1314 — Val: 0.2060
Epoch 14 — Train: 0.1270 — Val: 0.2241
Epoch 15 — Train: 0.1193 — Val: 0.2015
Epoch 16 — Train: 0.1143 — Val: 0.1997
Early stop at epoch 16


#Evaluation Metrics

In [None]:
# ===== Evaluation =====
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

model.eval()
all_preds, all_labels = [], []

with torch.no_grad():
    for texts, labels in test_loader:
        ids, mask = model.tokenize(list(texts))
        ids, mask = ids.to(device), mask.to(device)
        logits = model(ids, mask)
        probs = torch.sigmoid(logits)
        preds = (probs > 0.5).long().cpu().numpy()

        all_preds.extend(preds)
        all_labels.extend(labels)

acc = accuracy_score(all_labels, all_preds)
p, r, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='weighted')
print("Test Accuracy:", acc)
print("Precision:", p, "Recall:", r, "F1:", f1)

Test Accuracy: 0.9322152341020266
Precision: 0.9329418889862621 Recall: 0.9322152341020266 F1: 0.9321087882279159
