# MODEL ARCHITECTURE FROM
https://arxiv.org/pdf/2406.00367

In [None]:
#download dataset to local/virtual environment and set up python libraries

# !gdown "https://drive.google.com/uc?id=1YeV-FnAWkPQkpTPgf-ShXhb3SbTb65RP"
!pip install torch transformers pandas numpy scikit-learn nltk



# Load the dataset (mounting Google Drive is only needed on a Colab runtime)

In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from sklearn.model_selection import train_test_split

# Fetch the NLTK corpora (stopword list and WordNet) one after the other
for nltk_resource in ('stopwords', 'wordnet'):
    nltk.download(nltk_resource)

# Alternative for a hosted (Colab) runtime: mount Drive and read the CSV from there
# from google.colab import drive
# drive.mount('/content/drive')
# import pandas as pd
# df = pd.read_csv("/content/drive/My Drive/Kaggle Club/SARCASM PROJECT '25/cleaned_reddit_comments.csv").fillna(' ')

# Read only the label and text columns from the local CSV (adjust the path as needed),
# then turn any missing values into empty strings
df = pd.read_csv(
    'cleaned_NewsHeadlines_comments.csv',
    usecols=['is_sarcastic', 'headline'],
)
df = df.fillna('')

# Sanity check: row count and class balance, followed by a preview of the first rows
print("Dataset size:", len(df))
print(df['is_sarcastic'].value_counts())
df.head()

Dataset size: 28619
is_sarcastic
0    14985
1    13634
Name: count, dtype: int64


[nltk_data] Downloading package stopwords to /home/user96/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/user96/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,is_sarcastic,headline
0,1,thirtysomethe scientist unveil doomsday clock ...
1,0,dem rep totally nail why congress be fall shor...
2,0,eat your veggie 9 deliciously different recipe
3,1,inclement weather prevent liar get to work
4,1,mother come pretty close to use word streaming...


In [None]:
# from nltk.corpus import stopwords
# from nltk.stem import WordNetLemmatizer

# stop_words = set(stopwords.words('english'))
# lemmatizer = WordNetLemmatizer()

# def preprocess_text(text):
#     # Lowercase the text
#     text = text.lower()
#     # Remove URLs
#     text = re.sub(r'http\S+|www\S+|https\S+', '', text)
#     # Remove non-alphanumeric characters; keep letters and spaces
#     text = re.sub(r'[^a-z\s]', '', text)
#     # Tokenize text
#     tokens = text.split()
#     # Remove stopwords
#     tokens = [word for word in tokens if word not in stop_words]
#     # Lemmatize tokens
#     tokens = [lemmatizer.lemmatize(word) for word in tokens]
#     return ' '.join(tokens)

# # Apply preprocessing to the headline column (the loaded frame has no 'comment' column)
# df['headline'] = df['headline'].apply(preprocess_text)
# df.head()

In [None]:
# Both splits hold out the same fraction and share one seed so they are reproducible
holdout_fraction = 0.10
split_seed = 42

# Stage 1: hold out 10% of the full dataset as the test set (90% remains for train+validation).
# Stratifying on the label keeps the class ratio the same in both parts.
train_val_df, test_df = train_test_split(
    df,
    test_size=holdout_fraction,
    stratify=df['is_sarcastic'],
    random_state=split_seed,
)

# Stage 2: split the remaining 90% again, 90/10, which yields
# 81% of the data for training and 9% for validation.
train_df, val_df = train_test_split(
    train_val_df,
    test_size=holdout_fraction,
    stratify=train_val_df['is_sarcastic'],
    random_state=split_seed,
)

# Report the size of each split
print("Train size:", len(train_df))
print("Validation size:", len(val_df))
print("Test size:", len(test_df))

Train size: 23181
Validation size: 2576
Test size: 2862


# Model Architecture


1. RoBERTa Encoder:
* Load a pretrained RoBERTa (roberta-base) model and its tokenizer from the HuggingFace library.
* Use RoBERTa to obtain the embedding matrix for the input text.
2. Dropout Layer:
* Applied to the embeddings to prevent overfitting. In the paper, a dropout rate of 0.1 is used.
3. BiLSTM Layer:
* Processes the embeddings bidirectionally to capture long-range dependencies.
* The paper experiments with different hidden unit sizes (e.g., 128, 256, 512). For the BiLSTM, the effective hidden size is doubled due to its forward and backward processing.
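4. Flatten and Dense Layers:
* The output of the BiLSTM is flattened (the implementation below mean-pools the BiLSTM outputs over the sequence dimension instead).
* One or two fully connected layers are used to capture the relationship between the features and the final classes (the implementation below uses a single linear layer).
5. Classification (Softmax) Layer:
* A Softmax function is applied to output the probability distribution over the classes. The paper describes sentiment classes (e.g., positive, negative, neutral); in this notebook the two classes are sarcastic and not sarcastic.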

In [None]:
import torch
import torch.nn as nn
from transformers import RobertaModel, RobertaTokenizer

class RoBERTa_BiLSTM(nn.Module):
    """RoBERTa encoder followed by a BiLSTM and a linear classification head.

    The pretrained ``roberta-base`` encoder produces one 768-dimensional vector
    per token. Dropout is applied to these vectors, a single-layer bidirectional
    LSTM processes them, the LSTM outputs are averaged over the real
    (non-padding) tokens, and a linear layer maps the pooled vector to class logits.

    Args:
        hidden_size: Hidden units per LSTM direction; the BiLSTM output width is
            ``2 * hidden_size``.
        num_classes: Number of output classes (size of the logits vector).
        dropout_rate: Dropout probability applied to the RoBERTa token embeddings.
    """

    def __init__(self, hidden_size=256, num_classes=2, dropout_rate=0.1):
        super().__init__()
        # Load pretrained RoBERTa model and its matching tokenizer
        self.roberta = RobertaModel.from_pretrained('roberta-base')
        self.tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

        # BiLSTM: input size equals RoBERTa's hidden size (768)
        self.bilstm = nn.LSTM(input_size=768, hidden_size=hidden_size,
                              num_layers=1, batch_first=True,
                              bidirectional=True)

        # Dropout applied to the encoder output before the BiLSTM
        self.dropout = nn.Dropout(dropout_rate)
        # After the BiLSTM, the feature size is hidden_size * 2 (forward + backward)
        self.fc = nn.Linear(hidden_size * 2, num_classes)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, input_ids, attention_mask):
        """Classify a batch of tokenized texts.

        Args:
            input_ids: Token ids, shape ``(batch_size, seq_len)``.
            attention_mask: Same shape as ``input_ids``; 1 for real tokens and
                0 for padding.

        Returns:
            Tuple ``(logits, probabilities)``, each of shape
            ``(batch_size, num_classes)``; ``probabilities`` is the softmax of
            ``logits``.
        """
        # Obtain contextual token embeddings from RoBERTa (last hidden state)
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = outputs.last_hidden_state  # shape: (batch_size, seq_len, 768)

        # Apply dropout
        x = self.dropout(sequence_output)
        # Process with BiLSTM layer
        lstm_out, _ = self.bilstm(x)  # shape: (batch_size, seq_len, hidden_size*2)

        # Mean-pool over the sequence, counting only real tokens. A plain mean would
        # also average the outputs at padded positions, making a text's prediction
        # depend on how much padding its batch happens to need.
        mask = attention_mask.unsqueeze(-1).to(lstm_out.dtype)  # (batch_size, seq_len, 1)
        # Guard against division by zero for a fully masked row
        token_counts = mask.sum(dim=1).clamp(min=1.0)  # (batch_size, 1)
        pooled_output = (lstm_out * mask).sum(dim=1) / token_counts  # (batch_size, hidden_size*2)

        # Final classification layer
        logits = self.fc(pooled_output)
        probabilities = self.softmax(logits)
        return logits, probabilities

    def tokenize_texts(self, texts, max_length=128):
        """Tokenize a list of strings with the RoBERTa tokenizer.

        Texts are padded to the longest item in the batch and truncated to
        ``max_length`` tokens.

        Args:
            texts: List of input strings.
            max_length: Maximum number of tokens kept per text.

        Returns:
            Tuple ``(input_ids, attention_mask)`` of PyTorch tensors.
        """
        encoding = self.tokenizer(texts, padding=True, truncation=True, max_length=max_length, return_tensors='pt')
        return encoding['input_ids'], encoding['attention_mask']

# Smoke test: instantiate the network once and run two sample sentences through it.
# The `model` object created here is a module-level name, so it stays available
# to the cells that follow.
if __name__ == "__main__":
    model = RoBERTa_BiLSTM(hidden_size=256, num_classes=2, dropout_rate=0.1)

    # Two short sample inputs
    texts = [
        "this movie was great",
        "this was a bad experience",
    ]
    input_ids, attention_mask = model.tokenize_texts(texts)

    # Forward pass returns raw logits and their softmax probabilities
    logits, probabilities = model(input_ids=input_ids, attention_mask=attention_mask)

    print("Logits:", logits)
    print("Probabilities:", probabilities)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Logits: tensor([[-0.0531,  0.0143],
        [-0.0439,  0.0206]], grad_fn=<AddmmBackward0>)
Probabilities: tensor([[0.4832, 0.5168],
        [0.4839, 0.5161]], grad_fn=<SoftmaxBackward0>)


# Training + Hyperparameters

* **Learning Rates (l)**: Experiment with l ∈ {0.0001, 0.00001, 0.000001}. In the best-case experimental settings (as per the paper), use 0.00001.
* **Hidden Units (h)**: For the RNN layer, experiment with 128, 256, or 512 units. Note that for BiLSTM, the effective output dimension is 2×h.
* **Dropout Rate**: 0.1
* **Epochs**: 5 (as stated in the paper)
* **Optimizer**: AdamW is recommended.
* **Note**: The training cell below departs from these settings: it uses a learning rate of 0.00005 and runs for up to 20 epochs with early stopping (patience of 5 epochs).

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

class NewsHeadlinesDataset(Dataset):
    """Expose a headlines DataFrame as ``(text, label)`` pairs.

    The DataFrame must provide a ``headline`` column (the text) and an
    ``is_sarcastic`` column (the label). Texts are returned untokenized.
    """

    def __init__(self, df):
        # Extract both columns as arrays once so item lookup is positional indexing
        self.comments = df['headline'].values
        self.labels = df['is_sarcastic'].values

    def __len__(self):
        # One sample per label
        return len(self.labels)

    def __getitem__(self, idx):
        # The text is coerced to str so non-string cells still yield a string
        return str(self.comments[idx]), self.labels[idx]

# Wrap each split in a dataset object
train_dataset, val_dataset, test_dataset = (
    NewsHeadlinesDataset(split_frame) for split_frame in (train_df, val_df, test_df)
)

# Batch the datasets (adjust the batch size as needed); only the training data is shuffled
loader_batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=loader_batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=loader_batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=loader_batch_size, shuffle=False)

In [None]:
import torch.optim as optim

def _batch_loss(net, loss_fn, batch_texts, batch_labels, target_device):
    """Tokenize one batch, run it through ``net`` and return the classification loss."""
    input_ids, attention_mask = net.tokenize_texts(list(batch_texts))
    input_ids = input_ids.to(target_device)
    attention_mask = attention_mask.to(target_device)
    # The DataLoader already yields the labels as a tensor; as_tensor reuses it,
    # whereas torch.tensor(tensor) copy-constructs and emits a UserWarning.
    targets = torch.as_tensor(batch_labels, dtype=torch.long).to(target_device)

    logits, _ = net(input_ids, attention_mask)
    return loss_fn(logits, targets)


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = optim.AdamW(model.parameters(), lr=5e-5)  # or try lr in {1e-4, 1e-6}
# The model emits one logit per class and the labels are integer class ids, so the
# matching loss is CrossEntropyLoss. BCEWithLogitsLoss needs float targets with the
# same shape as the logits and raises a size-mismatch error on this input.
criterion = nn.CrossEntropyLoss()

epochs = 20
patience = 5  # Number of epochs to wait for improvement before stopping
best_val_loss = float('inf')
trigger_times = 0
best_state = None  # CPU copy of the weights from the best validation epoch

for epoch in range(epochs):
    # ---- Training pass ----
    model.train()
    epoch_loss = 0
    for comments, labels in train_loader:
        optimizer.zero_grad()
        loss = _batch_loss(model, criterion, comments, labels, device)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

    avg_train_loss = epoch_loss / len(train_loader)

    # ---- Validation pass (no gradient tracking, dropout disabled) ----
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for comments, labels in val_loader:
            loss = _batch_loss(model, criterion, comments, labels, device)
            val_loss += loss.item()
    avg_val_loss = val_loss / len(val_loader)

    print(f"Epoch {epoch+1}/{epochs} - Train Loss: {avg_train_loss:.4f} - Val Loss: {avg_val_loss:.4f}")

    # Check for early stopping
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        trigger_times = 0
        # Snapshot the weights on the CPU so the best epoch can be restored later
        # without holding a second copy of the model in accelerator memory.
        best_state = {
            name: tensor.detach().cpu().clone()
            for name, tensor in model.state_dict().items()
        }
    else:
        trigger_times += 1
        if trigger_times >= patience:
            print(f"Early stopping activated. No improvement in validation loss for {patience} consecutive epochs.")
            break

# Without this, the model would keep the weights of the last epoch, which is
# `patience` epochs past the best one whenever early stopping fires.
if best_state is not None:
    model.load_state_dict(best_state)
    print(f"Restored model weights from the best validation epoch (Val Loss: {best_val_loss:.4f}).")

  labels = torch.tensor(labels).to(device)
  labels = torch.tensor(labels).to(device)


Epoch 1/20 - Train Loss: 0.0879 - Val Loss: 0.4122
Epoch 2/20 - Train Loss: 0.0793 - Val Loss: 0.3409
Epoch 3/20 - Train Loss: 0.0647 - Val Loss: 0.3529
Epoch 4/20 - Train Loss: 0.0565 - Val Loss: 0.3649
Epoch 5/20 - Train Loss: 0.0523 - Val Loss: 0.4347
Epoch 6/20 - Train Loss: 0.0467 - Val Loss: 0.3663
Epoch 7/20 - Train Loss: 0.0407 - Val Loss: 0.3330
Epoch 8/20 - Train Loss: 0.0354 - Val Loss: 0.3559
Epoch 9/20 - Train Loss: 0.0351 - Val Loss: 0.4151
Epoch 10/20 - Train Loss: 0.0344 - Val Loss: 0.4362
Epoch 11/20 - Train Loss: 0.0330 - Val Loss: 0.4406
Epoch 12/20 - Train Loss: 0.0304 - Val Loss: 0.4021
Early stopping activated. No improvement in validation loss for 5 consecutive epochs.


# Evaluation Metrics

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Evaluate the trained model on the held-out test set
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for comments, labels in test_loader:
        input_ids, attention_mask = model.tokenize_texts(list(comments))
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)

        logits, _ = model(input_ids, attention_mask)
        # Predicted class = index of the largest logit
        preds = torch.argmax(logits, dim=1)
        all_preds.extend(preds.cpu().numpy())
        # The labels are only needed for the CPU-side metrics, so they are not moved
        # to the device. as_tensor reuses the tensor produced by the DataLoader,
        # whereas torch.tensor(tensor) copy-constructs and emits a UserWarning.
        all_labels.extend(torch.as_tensor(labels).numpy())

accuracy = accuracy_score(all_labels, all_preds)
# 'weighted' averages the per-class scores weighted by each class's support
precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='weighted')

print("Test Accuracy:", accuracy)
print("Weighted Precision:", precision)
print("Weighted Recall:", recall)
print("Weighted F1-Score:", f1)

  labels = torch.tensor(labels).to(device)


Test Accuracy: 0.8983228511530398
Weighted Precision: 0.898512501241357
Weighted Recall: 0.8983228511530398
Weighted F1-Score: 0.898232422618565
