## BERT-DS2

### Managing imports

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import joblib
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from torch.utils.data import TensorDataset, DataLoader
from torch.optim import AdamW
from tqdm import tqdm



## Without context

#### Loading dataset 2

In [None]:
# Load just the three columns we need from the first 4400 rows of the CSV.
cols_to_use = ['label', 'comment', 'parent_comment']

# Raw column name -> name used throughout the rest of the notebook.
column_aliases = {
    'comment': 'response',
    'parent_comment': 'context',
}

df = pd.read_csv(
    '../Dataset/train-balanced-sarcasm.csv',
    usecols=cols_to_use,
    nrows=4400,
).rename(columns=column_aliases)

# Last expression of the cell: shows the frame in the notebook output.
df

#### Dataset 2 preparation


In [None]:
# "Without context" setup: the model input is the response text alone.
# A missing response is NaN in pandas and would later be turned into the
# literal text "nan" by astype(str); use an empty string instead so no
# spurious token is fed to the model.
df['input_text'] = df['response'].fillna('').astype(str)

# Check the result
print(df[['input_text', 'label']].head())
df

In [None]:
# Stratified 80/20 split: stratify=df['label'] keeps the label proportions
# equal in both partitions, and random_state=42 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['input_text'], df['label'],
    test_size=0.2, random_state=42, stratify=df['label']
)

# Re-assemble each partition into a two-column frame (text + label).
train_df = pd.DataFrame({'input_text': X_train, 'label': y_train})
test_df = pd.DataFrame({'input_text': X_test, 'label': y_test})

# Nothing is written to disk in this cell, so report what actually happened
# (the splits exist in memory only) together with their sizes.
print(f"Train and test splits created: {len(train_df)} train / {len(test_df)} test rows")

### Transformer-based models: RoBERTa

#### Training


In [None]:
# Tokenizer matching the "roberta-base" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

# Flatten each split into plain Python lists: texts (forced to str) and labels.
X_train_roberta, y_train_roberta = (
    train_df['input_text'].astype(str).tolist(),
    train_df['label'].tolist(),
)
X_test_roberta, y_test_roberta = (
    test_df['input_text'].astype(str).tolist(),
    test_df['label'].tolist(),
)

In [None]:
# Both splits are encoded with identical settings: every sequence is padded
# or truncated to max_length=110 and the result is returned as PyTorch tensors.
encode_kwargs = {
    'padding': 'max_length',
    'truncation': True,
    'max_length': 110,
    'return_tensors': 'pt',
}

X_train_enc_roberta = tokenizer(X_train_roberta, **encode_kwargs)
X_test_enc_roberta = tokenizer(X_test_roberta, **encode_kwargs)


In [None]:
# Bundle (input_ids, attention_mask, label) per example so a DataLoader can
# batch them together. The encodings are already tensors; only the label
# lists need converting.
tensor_keys = ('input_ids', 'attention_mask')

train_dataset_roberta = TensorDataset(
    *(X_train_enc_roberta[key] for key in tensor_keys),
    torch.tensor(y_train_roberta),
)

test_dataset_roberta = TensorDataset(
    *(X_test_enc_roberta[key] for key in tensor_keys),
    torch.tensor(y_test_roberta),
)


In [None]:
# Mini-batch size shared by the training and evaluation loaders.
batch_size = 32

# Training batches are reshuffled; evaluation iterates in dataset order.
train_loader = DataLoader(
    dataset=train_dataset_roberta,
    batch_size=batch_size,
    shuffle=True,
)
test_loader = DataLoader(
    dataset=test_dataset_roberta,
    batch_size=batch_size,
    shuffle=False,
)

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Load "roberta-base" for sequence classification with two output labels
# and place it on the selected device.
model = AutoModelForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=2,
).to(device)

# AdamW over all model parameters with learning rate 2e-5.
optimizer = AdamW(params=model.parameters(), lr=2e-5)

#### Training loop

In [None]:
# Number of full passes over the training loader.
epochs = 3

for epoch in range(epochs):
    model.train()  # switch the model to training mode
    batch_losses = []
    for batch in tqdm(train_loader):
        # Each batch is (input_ids, attention_mask, labels); move all to the device.
        b_input_ids, b_attn_mask, b_labels = (tensor.to(device) for tensor in batch)

        # Clear gradients left over from the previous step before the new pass.
        optimizer.zero_grad()

        # Passing labels makes the model output carry the loss.
        outputs = model(
            input_ids=b_input_ids,
            attention_mask=b_attn_mask,
            labels=b_labels,
        )
        loss = outputs.loss

        loss.backward()
        optimizer.step()
        batch_losses.append(loss.item())

    # Report the mean per-batch loss for this epoch.
    total_loss = sum(batch_losses)
    print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_loader):.4f}")


#### Evaluation

In [None]:
# Display names for the two classes, shared by the text report and the plot.
# NOTE(review): presumably label 0 = not sarcastic, 1 = sarcastic - confirm
# against the dataset.
labels = ['Not Sarcastic', 'Sarcastic']

model.eval()  # switch the model to evaluation mode
preds_roberta, truths_roberta = [], []
with torch.no_grad():  # no gradients are needed for inference
    for batch in test_loader:
        b_input_ids, b_attn_mask, b_labels = (tensor.to(device) for tensor in batch)
        outputs = model(input_ids=b_input_ids, attention_mask=b_attn_mask)
        logits = outputs.logits
        # Predicted class = index of the largest logit in each row.
        preds_roberta.extend(logits.argmax(dim=1).cpu().numpy())
        truths_roberta.extend(b_labels.cpu().numpy())

print(classification_report(truths_roberta, preds_roberta, target_names=labels))

cm_roberta = confusion_matrix(truths_roberta, preds_roberta)

# Heatmap of the confusion matrix: rows are true labels, columns are predictions.
plt.figure(figsize=(6, 5))
ax = sns.heatmap(
    cm_roberta,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=labels,
    yticklabels=labels,
)
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
ax.set_title('Confusion Matrix - RoBERTa Sarcasm Detection')
plt.tight_layout()
plt.show()


## With context


#### Dataset 2 preparation


In [None]:
# "With context" setup: the model input is the response followed by its
# parent comment.
#
# roberta-base has no '[SEP]' token (that is BERT's separator), so the literal
# would be split into ordinary sub-words. RoBERTa separates the two segments
# of a pair with '</s></s>', which the tokenizer maps to its special token.
ROBERTA_PAIR_SEP = '</s></s>'

# In pandas NaN + str is NaN, so one missing context would erase the response
# as well (and later become the literal text "nan"). Fill gaps with '' first.
response_text = df['response'].fillna('').astype(str)
context_text = df['context'].fillna('').astype(str)

# A space after the colon keeps the "Context :" marker from being glued to
# the first word of the context.
df['input_text'] = response_text + ROBERTA_PAIR_SEP + 'Context : ' + context_text

# Check the result
print(df[['input_text', 'label']].head())

In [None]:
# Stratified 80/20 split: stratify=df['label'] keeps the label proportions
# equal in both partitions, and random_state=42 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['input_text'], df['label'],
    test_size=0.2, random_state=42, stratify=df['label']
)

# Re-assemble each partition into a two-column frame (text + label).
train_df = pd.DataFrame({'input_text': X_train, 'label': y_train})
test_df = pd.DataFrame({'input_text': X_test, 'label': y_test})

# Nothing is written to disk in this cell, so report what actually happened
# (the splits exist in memory only) together with their sizes.
print(f"Train and test splits created: {len(train_df)} train / {len(test_df)} test rows")

### Transformer-based models: RoBERTa

#### Training


In [None]:
# Tokenizer matching the "roberta-base" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

# Flatten each split into plain Python lists: texts (forced to str) and labels.
X_train_roberta, y_train_roberta = (
    train_df['input_text'].astype(str).tolist(),
    train_df['label'].tolist(),
)
X_test_roberta, y_test_roberta = (
    test_df['input_text'].astype(str).tolist(),
    test_df['label'].tolist(),
)

In [None]:
# Both splits are encoded with identical settings: every sequence is padded
# or truncated to max_length=110 and the result is returned as PyTorch tensors.
encode_kwargs = {
    'padding': 'max_length',
    'truncation': True,
    'max_length': 110,
    'return_tensors': 'pt',
}

X_train_enc_roberta = tokenizer(X_train_roberta, **encode_kwargs)
X_test_enc_roberta = tokenizer(X_test_roberta, **encode_kwargs)


In [None]:
# Convert to torch tensors
train_dataset_roberta = TensorDataset(
    X_train_enc_roberta['input_ids'],
    X_train_enc_roberta['attention_mask'],
    torch.tensor(y_train_roberta)
)

test_dataset_roberta = TensorDataset(
    X_test_enc_roberta['input_ids'],
    X_test_enc_roberta['attention_mask'],
    torch.tensor(y_test_roberta)
)


In [None]:
# Mini-batch size shared by the training and evaluation loaders.
batch_size = 32

# Training batches are reshuffled; evaluation iterates in dataset order.
train_loader = DataLoader(
    dataset=train_dataset_roberta,
    batch_size=batch_size,
    shuffle=True,
)
test_loader = DataLoader(
    dataset=test_dataset_roberta,
    batch_size=batch_size,
    shuffle=False,
)

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Load a new "roberta-base" sequence classifier with two output labels for
# the with-context run and place it on the selected device.
model = AutoModelForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=2,
).to(device)

# AdamW over all model parameters with learning rate 2e-5.
optimizer = AdamW(params=model.parameters(), lr=2e-5)

#### Training loop

In [None]:
# Number of full passes over the training loader.
epochs = 3

for epoch in range(epochs):
    model.train()  # switch the model to training mode
    batch_losses = []
    for batch in tqdm(train_loader):
        # Each batch is (input_ids, attention_mask, labels); move all to the device.
        b_input_ids, b_attn_mask, b_labels = (tensor.to(device) for tensor in batch)

        # Clear gradients left over from the previous step before the new pass.
        optimizer.zero_grad()

        # Passing labels makes the model output carry the loss.
        outputs = model(
            input_ids=b_input_ids,
            attention_mask=b_attn_mask,
            labels=b_labels,
        )
        loss = outputs.loss

        loss.backward()
        optimizer.step()
        batch_losses.append(loss.item())

    # Report the mean per-batch loss for this epoch.
    total_loss = sum(batch_losses)
    print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_loader):.4f}")


#### Evaluation

In [None]:
# Display names for the two classes, shared by the text report and the plot.
# NOTE(review): presumably label 0 = not sarcastic, 1 = sarcastic - confirm
# against the dataset.
labels = ['Not Sarcastic', 'Sarcastic']

model.eval()  # switch the model to evaluation mode
preds_roberta, truths_roberta = [], []
with torch.no_grad():  # no gradients are needed for inference
    for batch in test_loader:
        b_input_ids, b_attn_mask, b_labels = (tensor.to(device) for tensor in batch)
        outputs = model(input_ids=b_input_ids, attention_mask=b_attn_mask)
        logits = outputs.logits
        # Predicted class = index of the largest logit in each row.
        preds_roberta.extend(logits.argmax(dim=1).cpu().numpy())
        truths_roberta.extend(b_labels.cpu().numpy())

print(classification_report(truths_roberta, preds_roberta, target_names=labels))

cm_roberta = confusion_matrix(truths_roberta, preds_roberta)

# Heatmap of the confusion matrix: rows are true labels, columns are predictions.
plt.figure(figsize=(6, 5))
ax = sns.heatmap(
    cm_roberta,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=labels,
    yticklabels=labels,
)
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
ax.set_title('Confusion Matrix - RoBERTa Sarcasm Detection')
plt.tight_layout()
plt.show()
