# Sarcasm Detection using RCNN-RoBERTa


## Model Configuration
- **Architecture Parameters:**
  - RoBERTa base model
  - LSTM hidden size: 64
  - Dropout: 0.1
  - Number of classes: 2

- **Training Parameters:**
  - Batch size: 16
  - Learning rate: 2e-5
  - Weight decay: 1e-5
  - Number of epochs: 5
  - Optimizer: AdamW

## Hardware Requirements
The code selects the computing device automatically, checking for availability in this order:
- Apple M1/M2 (MPS)
- NVIDIA GPU (CUDA)
- CPU (fallback)


## Setup and Imports

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaModel
from sklearn.metrics import classification_report, confusion_matrix
import logging
from tqdm import tqdm

# Set up logging

In [5]:
# Set up logging
# Configure the root logger at INFO level, then create the module-level
# logger that the training code below writes its progress messages to.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

## Dataset Class
Custom dataset class for handling headline data and tokenization.

In [8]:
class HeadlineDataset(Dataset):
    """Dataset that tokenizes one headline per item.

    Every item is a dict holding ``input_ids`` and ``attention_mask``
    (the tokenizer output flattened to 1-D) plus a ``label`` long tensor
    that is 1 when the raw label equals ``'sarcastic'`` and 0 otherwise.
    """

    def __init__(self, headlines, labels, tokenizer, max_length=128):
        self.headlines = headlines
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.headlines)

    def __getitem__(self, idx):
        text = str(self.headlines[idx])
        target = 1 if self.labels[idx] == 'sarcastic' else 0

        # Pad/truncate every headline to the same fixed length so that the
        # resulting tensors all share one shape.
        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer.encode_plus(text, **tokenizer_options)

        sample = {}
        for key in ('input_ids', 'attention_mask'):
            sample[key] = encoded[key].flatten()
        sample['label'] = torch.tensor(target, dtype=torch.long)
        return sample

## Model Architecture
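Implementation of the RCNN-RoBERTa model: RoBERTa token representations are passed through a bidirectional LSTM, concatenated with the LSTM output, projected through a tanh layer, and max-pooled over the sequence (the CNN-like step) before classification.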


In [11]:
class RCNNRoBERTa(nn.Module):
    """RoBERTa encoder with a BiLSTM, tanh projection and max-pooling head.

    Args:
        n_classes: Size of the output logit vector.
        dropout: Dropout probability applied to the pooled features.
        lstm_hidden_size: Hidden size of each LSTM direction.
    """

    def __init__(self, n_classes=2, dropout=0.1, lstm_hidden_size=64):
        super().__init__()

        self.roberta = RobertaModel.from_pretrained('roberta-base')
        self.hidden_size = self.roberta.config.hidden_size
        self.lstm_hidden_size = lstm_hidden_size

        self.lstm = nn.LSTM(
            input_size=self.hidden_size,
            hidden_size=self.lstm_hidden_size,
            bidirectional=True,
            batch_first=True,
        )

        # The projection sees each token's encoder state concatenated with
        # both LSTM directions, hence the 2 * lstm_hidden_size term.
        projection_in = self.hidden_size + 2 * self.lstm_hidden_size
        self.W = nn.Linear(projection_in, self.hidden_size)

        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(self.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        """Return classification logits for a batch of token id sequences."""
        token_states = self.roberta(
            input_ids=input_ids,
            attention_mask=attention_mask,
        ).last_hidden_state

        recurrent_states, _ = self.lstm(token_states)
        features = torch.cat((token_states, recurrent_states), dim=2)
        projected = torch.tanh(self.W(features))

        # Max over dim 1 of the projected features. attention_mask is only
        # passed to the encoder; it is not applied at this pooling step.
        pooled, _ = torch.max(projected, dim=1)
        return self.classifier(self.dropout(pooled))



## Training Functions

### Training Epoch

In [15]:
def train_epoch(model, dataloader, criterion, optimizer, device):
    """Run one training pass over ``dataloader``.

    Returns:
        A ``(mean_loss, accuracy)`` tuple: the sum of the per-batch losses
        divided by the number of batches, and the fraction of samples whose
        highest-scoring class matched the label.
    """
    model.train()
    running_loss = 0
    n_correct = 0
    n_seen = 0

    batches = tqdm(dataloader, desc='Training')
    for batch in batches:
        input_ids = batch['input_ids'].to(device, non_blocking=True)
        attention_mask = batch['attention_mask'].to(device, non_blocking=True)
        labels = batch['label'].to(device, non_blocking=True)

        # Standard step: clear stale gradients, forward, backward, update.
        optimizer.zero_grad()
        logits = model(input_ids, attention_mask)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        running_loss += batch_loss

        predicted = torch.max(logits, 1).indices
        n_correct += (predicted == labels).sum().item()
        n_seen += labels.size(0)

        batches.set_postfix({'loss': f'{batch_loss:.4f}'})

    mean_loss = running_loss / len(dataloader)
    accuracy = n_correct / n_seen
    return mean_loss, accuracy



   

### Evaluation

In [18]:
def evaluate(model, dataloader, criterion, device):
    """Score ``model`` on ``dataloader`` without computing gradients.

    Returns:
        A ``(mean_loss, report, matrix)`` tuple: the sum of per-batch losses
        divided by the number of batches, the ``classification_report``
        output, and the ``confusion_matrix`` output for the collected
        labels and predictions.
    """
    model.eval()
    loss_sum = 0
    predictions = []
    targets = []

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device, non_blocking=True)
            attention_mask = batch['attention_mask'].to(device, non_blocking=True)
            labels = batch['label'].to(device, non_blocking=True)

            logits = model(input_ids, attention_mask)
            loss_sum += criterion(logits, labels).item()

            predicted = torch.max(logits, 1).indices

            # Move results to host memory so the sklearn metrics can use them.
            predictions.extend(predicted.cpu().numpy())
            targets.extend(labels.cpu().numpy())

    mean_loss = loss_sum / len(dataloader)
    report = classification_report(targets, predictions)
    matrix = confusion_matrix(targets, predictions)
    return mean_loss, report, matrix




### Complete Training Pipeline

In [21]:
def _select_device():
    """Return the preferred torch device: MPS first, then CUDA, then CPU."""
    if torch.backends.mps.is_available():
        return torch.device('mps')
    if torch.cuda.is_available():
        return torch.device('cuda')
    return torch.device('cpu')


def train_model(train_df, val_df, *, batch_size=16, learning_rate=2e-5,
                weight_decay=1e-5, num_epochs=5,
                checkpoint_path='best_sarcasm_model.pt', restore_best=True):
    """Fine-tune an ``RCNNRoBERTa`` model and return it with its tokenizer.

    Args:
        train_df: DataFrame with ``headline`` and ``label`` columns used for
            training.
        val_df: DataFrame with the same columns used for validation.
        batch_size: Batch size for both data loaders.
        learning_rate: AdamW learning rate.
        weight_decay: AdamW weight decay.
        num_epochs: Number of passes over the training data.
        checkpoint_path: File the best-validation-loss weights are saved to.
        restore_best: When true (the default), reload the weights saved at
            ``checkpoint_path`` before returning, so the returned model is the
            one with the lowest validation loss rather than whatever the last
            epoch produced.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    torch.manual_seed(42)
    np.random.seed(42)

    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    train_dataset = HeadlineDataset(train_df['headline'].values, train_df['label'].values, tokenizer)
    val_dataset = HeadlineDataset(val_df['headline'].values, val_df['label'].values, tokenizer)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=0)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, num_workers=0)

    device = _select_device()
    logger.info("Using device: %s", device)

    model = RCNNRoBERTa().to(device)
    optimizer = torch.optim.AdamW(
        model.parameters(), lr=learning_rate, weight_decay=weight_decay
    )
    criterion = nn.CrossEntropyLoss()

    best_val_loss = float('inf')
    # Tracks whether this run wrote a checkpoint, so a stale file from an
    # earlier run is never loaded by mistake.
    checkpoint_saved = False

    for epoch in range(num_epochs):
        logger.info('\nEpoch %d/%d', epoch + 1, num_epochs)
        train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer, device)
        logger.info('Training Loss: %.4f, Accuracy: %.4f', train_loss, train_acc)

        val_loss, val_report, val_conf_matrix = evaluate(model, val_loader, criterion, device)
        logger.info('Validation Loss: %.4f', val_loss)
        logger.info('\nClassification Report:\n%s', val_report)
        logger.info('\nConfusion Matrix:\n%s', val_conf_matrix)

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), checkpoint_path)
            checkpoint_saved = True
            logger.info('Saved best model checkpoint')

    # Later epochs may have a worse validation loss than the saved one; hand
    # back the best weights instead of the final-epoch weights.
    if restore_best and checkpoint_saved:
        model.load_state_dict(torch.load(checkpoint_path, map_location=device))
        logger.info('Restored best model checkpoint')

    return model, tokenizer


if __name__ == "__main__":
    # Load the headline data; the code below reads its 'label' column here
    # and its 'headline' column inside train_model.
    df = pd.read_csv('Headlines.csv')

    # Hold out 20% for validation, stratified on the label column, with a
    # fixed seed so the split is repeatable.
    train_df, val_df = train_test_split(
        df,
        test_size=0.2,
        random_state=42,
        stratify=df['label'],
    )

    model, tokenizer = train_model(train_df, val_df)
    

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

INFO:__main__:Using device: mps


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
INFO:__main__:
Epoch 1/5
Training: 100%|██████████| 1336/1336 [12:18<00:00,  1.81it/s, loss=0.0392]
INFO:__main__:Training Loss: 0.2583, Accuracy: 0.8897
INFO:__main__:Validation Loss: 0.1575
INFO:__main__:
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.95      0.94      2997
           1       0.94      0.92      0.93      2345

    accuracy                           0.94      5342
   macro avg       0.94      0.93      0.94      5342
weighted avg       0.94      0.94      0.94      5342

INFO:__main__:
Confusion Matrix:
[[2849  148]
 [ 191 2154]]
INFO:__main__:Saved best model checkpoint
INFO:__main__:
Epoch 2/5
Training: 100%|██████████

## Prediction Function

In [24]:
def predict_headline(model, headline, tokenizer, device, max_length=128):
    """Classify a single headline as sarcastic or not sarcastic.

    Args:
        model: Callable taking ``(input_ids, attention_mask)`` and returning
            logits with one row per input; it is switched to eval mode here.
        headline: Text to classify.
        tokenizer: Object exposing ``encode_plus`` that returns
            ``input_ids`` and ``attention_mask`` tensors.
        device: Device (or device string) the input tensors are moved to.
        max_length: Length the tokenizer pads/truncates to. Defaults to 128,
            the same default ``HeadlineDataset`` uses; pass the value the
            model was trained with if it differs.

    Returns:
        A dict with ``prediction`` (``'sarcastic'`` when class 1 has the
        highest probability, otherwise ``'not sarcastic'``) and
        ``confidence`` (the softmax probability of the chosen class).
    """
    model.eval()
    encoding = tokenizer.encode_plus(
        headline,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

    input_ids = encoding['input_ids'].to(device, non_blocking=True)
    attention_mask = encoding['attention_mask'].to(device, non_blocking=True)

    with torch.no_grad():
        logits = model(input_ids, attention_mask)
        probabilities = torch.softmax(logits, dim=1)
        predicted_class = torch.argmax(probabilities, dim=1).item()

    return {
        'prediction': 'sarcastic' if predicted_class == 1 else 'not sarcastic',
        'confidence': probabilities[0][predicted_class].item()
    }

In [26]:
 # Test prediction 1
test_headline = "mr. falafel' owner does not actually like being addressed as mr. falafel"

# Pick the device with the same priority as training: MPS, then CUDA, then CPU.
if torch.backends.mps.is_available():
    inference_device = 'mps'
elif torch.cuda.is_available():
    inference_device = 'cuda'
else:
    inference_device = 'cpu'

result = predict_headline(model, test_headline, tokenizer, inference_device)
print(f"\nTest prediction for: '{test_headline}'")
print(f"Prediction: {result['prediction']}")
print(f"Confidence: {result['confidence']:.2f}")


Test prediction for: 'mr. falafel' owner does not actually like being addressed as mr. falafel'
Prediction: sarcastic
Confidence: 1.00


In [28]:
 # Test prediction 2
test_headline = "scalia's utter moral failure exposed"

# Pick the device with the same priority as training: MPS, then CUDA, then CPU.
if torch.backends.mps.is_available():
    inference_device = 'mps'
elif torch.cuda.is_available():
    inference_device = 'cuda'
else:
    inference_device = 'cpu'

result = predict_headline(model, test_headline, tokenizer, inference_device)
print(f"\nTest prediction for: '{test_headline}'")
print(f"Prediction: {result['prediction']}")
print(f"Confidence: {result['confidence']:.2f}")


Test prediction for: 'scalia's utter moral failure exposed'
Prediction: not sarcastic
Confidence: 0.98
