## BERT-DS2

### Managing imports

In [9]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import joblib
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from torch.utils.data import TensorDataset, DataLoader
from torch.optim import AdamW
from tqdm import tqdm



## Without context

#### Loading dataset 2

In [None]:
# Read only the three columns we need from the first 4400 rows of the CSV,
# then rename the text columns to the names used by the cells that follow.
cols_to_use = ['label', 'comment', 'parent_comment']
column_aliases = {'comment': 'response', 'parent_comment': 'context'}

df = (
    pd.read_csv('../Dataset/train-balanced-sarcasm.csv', usecols=cols_to_use, nrows=4400)
    .rename(columns=column_aliases)
)

df

  df1['label'] = df1['label'].replace({'SARCASM': 1, 'NOT_SARCASM': 0}).astype(int)


#### Dataset 2 preparation


In [None]:
# Without context the model input is just the response text.
# Missing responses are replaced by an empty string: left as NaN they would be
# turned into the literal text "nan" by the later `.astype(str)` cast.
df['input_text'] = df['response'].fillna('')

# Check the result
print(df[['input_text', 'label']].head())
df

                                          input_text  label
0  Yeah I mean there's only one gender anyways, w...      1
1  Sounds like you don't like science, you theist...      1
2  Ofc play them in try mode, Blizzard were so ge...      1
3  I don't understand, Reddit told me that Hillar...      1
4  yeh, they're the reigning triple premiers, why...      1


Unnamed: 0,label,response,context,input_text
0,1,"Yeah I mean there's only one gender anyways, w...",LPT: If you're worried about hurting someone's...,"Yeah I mean there's only one gender anyways, w..."
1,1,"Sounds like you don't like science, you theist...",Promotional images for some guy's Facebook page,"Sounds like you don't like science, you theist..."
2,1,"Ofc play them in try mode, Blizzard were so ge...",My friends won't play Dota2; I won't play LoL;...,"Ofc play them in try mode, Blizzard were so ge..."
3,1,"I don't understand, Reddit told me that Hillar...",Poll: Convention boosts Clinton to 11-point le...,"I don't understand, Reddit told me that Hillar..."
4,1,"yeh, they're the reigning triple premiers, why...",Wayne Ludbey: Jordan Lewis has the ultimate co...,"yeh, they're the reigning triple premiers, why..."
...,...,...,...,...
4395,0,well you could've been adulting if you hadn't ...,8-9ft man found in ancient indian burial mound,well you could've been adulting if you hadn't ...
4396,0,Also they'll have to join the euro,Second Scottish independence referendum 'on th...,Also they'll have to join the euro
4397,0,plot: AI assists a cyborg in freelance investi...,Pinoy Cyborg by James Simmons,plot: AI assists a cyborg in freelance investi...
4398,0,Some airlines proposed this but too much publi...,The logic here is flawless!,Some airlines proposed this but too much publi...


In [None]:
# Stratified 80/20 split: `stratify` keeps the label balance identical in both
# partitions and `random_state` pins the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['input_text'], df['label'],
    test_size=0.2, random_state=42, stratify=df['label']
)

# Re-pair each text with its label; both frames keep the original row index.
train_df = pd.DataFrame({'input_text': X_train, 'label': y_train})
test_df = pd.DataFrame({'input_text': X_test, 'label': y_test})

# Nothing is written to disk in this cell, so report what was actually built
# instead of claiming the splits were "saved".
print(f"Train and test splits created: {len(train_df)} train / {len(test_df)} test rows")

Train and test splits saved!


### Transformer-based models: RoBERTa

#### Training


In [None]:
# Turn each split into plain Python lists: texts (cast to str) and labels.
X_train_roberta, y_train_roberta = (
    train_df['input_text'].astype(str).tolist(),
    train_df['label'].tolist(),
)
X_test_roberta, y_test_roberta = (
    test_df['input_text'].astype(str).tolist(),
    test_df['label'].tolist(),
)

# Tokenizer matching the roberta-base checkpoint that is fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

In [14]:
# Both splits are encoded with the same settings: every sequence is padded or
# truncated to 110 tokens and the result is returned as PyTorch tensors.
encode_kwargs = dict(
    padding='max_length',
    truncation=True,
    max_length=110,
    return_tensors='pt',
)

X_train_enc_roberta = tokenizer(X_train_roberta, **encode_kwargs)
X_test_enc_roberta = tokenizer(X_test_roberta, **encode_kwargs)


In [15]:
# Convert to torch tensors
train_dataset_roberta = TensorDataset(
    X_train_enc_roberta['input_ids'],
    X_train_enc_roberta['attention_mask'],
    torch.tensor(y_train_roberta)
)

test_dataset_roberta = TensorDataset(
    X_test_enc_roberta['input_ids'],
    X_test_enc_roberta['attention_mask'],
    torch.tensor(y_test_roberta)
)


In [None]:
batch_size = 32

# Only the training batches are shuffled; the test loader keeps dataset order.
train_loader = DataLoader(
    train_dataset_roberta,
    batch_size=batch_size,
    shuffle=True,
)
test_loader = DataLoader(
    test_dataset_roberta,
    batch_size=batch_size,
)

In [17]:
# Seed PyTorch's global RNG before the model is built. The sequence-classification
# head added on top of roberta-base is randomly initialised, and a DataLoader
# created with shuffle=True and no explicit generator draws its shuffle order
# from the same global RNG, so without a seed every run trains a different model.
# 42 matches the random_state used for the train/test split.
torch.manual_seed(42)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Binary classifier (num_labels=2) fine-tuned end to end with AdamW.
model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=2)
model = model.to(device)
optimizer = AdamW(model.parameters(), lr=2e-5)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


#### Training loop

In [18]:
epochs = 3

# Fine-tune for a fixed number of epochs, reporting the mean batch loss of each.
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader):
        # Each batch is (input_ids, attention_mask, labels); move all to the device.
        b_input_ids, b_attn_mask, b_labels = (t.to(device) for t in batch)

        # Clear gradients from the previous step before the new backward pass.
        optimizer.zero_grad()

        # Passing `labels` makes the model return the loss with its outputs.
        outputs = model(
            input_ids=b_input_ids,
            attention_mask=b_attn_mask,
            labels=b_labels,
        )
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_loader):.4f}")


  return forward_call(*args, **kwargs)
  7%|▋         | 15/220 [04:03<55:27, 16.23s/it] 


KeyboardInterrupt: 

#### Evaluation

In [None]:
# Class names in label order (0, 1), shared by the report and the heatmap.
labels = ['Not Sarcastic', 'Sarcastic']

# Collect predictions and ground truth over the test set without tracking gradients.
model.eval()
preds_roberta, truths_roberta = [], []
with torch.no_grad():
    for batch in test_loader:
        b_input_ids, b_attn_mask, b_labels = (t.to(device) for t in batch)
        outputs = model(input_ids=b_input_ids, attention_mask=b_attn_mask)
        logits = outputs.logits
        # The predicted class is the index of the largest logit in each row.
        preds_roberta.extend(logits.argmax(dim=1).cpu().numpy())
        truths_roberta.extend(b_labels.cpu().numpy())

print(classification_report(truths_roberta, preds_roberta, target_names=labels))

cm_roberta = confusion_matrix(truths_roberta, preds_roberta)

# Confusion matrix heatmap: rows are true labels, columns are predictions.
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(cm_roberta, annot=True, fmt='d', cmap='Blues',
            xticklabels=labels, yticklabels=labels, ax=ax)
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
ax.set_title('Confusion Matrix - RoBERTa Sarcasm Detection')
fig.tight_layout()
plt.show()


## With context


#### Dataset 2 preparation


In [None]:
# With context, the model input is the response followed by its parent comment.
# Missing values are filled first: concatenating a string with NaN yields NaN,
# so one missing field would otherwise wipe out the whole input (and the later
# `.astype(str)` cast would turn it into the literal text "nan").
response_text = df['response'].fillna('')
context_text = df['context'].fillna('')

# NOTE(review): '[SEP]' is BERT's separator token; RoBERTa's tokenizer uses
# '</s>', so this marker is presumably encoded as ordinary text. Consider
# passing (response, context) pairs to the tokenizer instead - TODO confirm.
df['input_text'] = response_text + ' [SEP] ' + 'Context :' + context_text

# Check the result
print(df[['input_text', 'label']].head())

In [None]:
# Split
X_train, X_test, y_train, y_test = train_test_split(
    df['input_text'], df['label'],
    test_size=0.2, random_state=42, stratify=df['label']
)

train_df = pd.DataFrame({'input_text': X_train, 'label': y_train})
test_df = pd.DataFrame({'input_text': X_test, 'label': y_test})

print("Train and test splits saved!")

### Transformer-based models: RoBERTa

#### Training


In [None]:
# Turn each split into plain Python lists: texts (cast to str) and labels.
X_train_roberta, y_train_roberta = (
    train_df['input_text'].astype(str).tolist(),
    train_df['label'].tolist(),
)
X_test_roberta, y_test_roberta = (
    test_df['input_text'].astype(str).tolist(),
    test_df['label'].tolist(),
)

# Tokenizer matching the roberta-base checkpoint that is fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

In [None]:
# Both splits are encoded with the same settings: every sequence is padded or
# truncated to 110 tokens and the result is returned as PyTorch tensors.
encode_kwargs = dict(
    padding='max_length',
    truncation=True,
    max_length=110,
    return_tensors='pt',
)

X_train_enc_roberta = tokenizer(X_train_roberta, **encode_kwargs)
X_test_enc_roberta = tokenizer(X_test_roberta, **encode_kwargs)


In [None]:
# Convert to torch tensors
train_dataset_roberta = TensorDataset(
    X_train_enc_roberta['input_ids'],
    X_train_enc_roberta['attention_mask'],
    torch.tensor(y_train_roberta)
)

test_dataset_roberta = TensorDataset(
    X_test_enc_roberta['input_ids'],
    X_test_enc_roberta['attention_mask'],
    torch.tensor(y_test_roberta)
)


In [None]:
batch_size = 32

# Only the training batches are shuffled; the test loader keeps dataset order.
train_loader = DataLoader(
    train_dataset_roberta,
    batch_size=batch_size,
    shuffle=True,
)
test_loader = DataLoader(
    test_dataset_roberta,
    batch_size=batch_size,
)

In [None]:
# Seed PyTorch's global RNG before the model is built. The sequence-classification
# head added on top of roberta-base is randomly initialised, and a DataLoader
# created with shuffle=True and no explicit generator draws its shuffle order
# from the same global RNG, so without a seed every run trains a different model.
# 42 matches the random_state used for the train/test split.
torch.manual_seed(42)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Fresh binary classifier (num_labels=2) so this experiment starts from the
# pretrained checkpoint rather than from previously fine-tuned weights.
model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=2)
model = model.to(device)
optimizer = AdamW(model.parameters(), lr=2e-5)

#### Training loop

In [None]:
epochs = 3

# Fine-tune for a fixed number of epochs, reporting the mean batch loss of each.
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader):
        # Each batch is (input_ids, attention_mask, labels); move all to the device.
        b_input_ids, b_attn_mask, b_labels = (t.to(device) for t in batch)

        # Clear gradients from the previous step before the new backward pass.
        optimizer.zero_grad()

        # Passing `labels` makes the model return the loss with its outputs.
        outputs = model(
            input_ids=b_input_ids,
            attention_mask=b_attn_mask,
            labels=b_labels,
        )
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_loader):.4f}")


#### Evaluation

In [None]:
# Class names in label order (0, 1), shared by the report and the heatmap.
labels = ['Not Sarcastic', 'Sarcastic']

# Collect predictions and ground truth over the test set without tracking gradients.
model.eval()
preds_roberta, truths_roberta = [], []
with torch.no_grad():
    for batch in test_loader:
        b_input_ids, b_attn_mask, b_labels = (t.to(device) for t in batch)
        outputs = model(input_ids=b_input_ids, attention_mask=b_attn_mask)
        logits = outputs.logits
        # The predicted class is the index of the largest logit in each row.
        preds_roberta.extend(logits.argmax(dim=1).cpu().numpy())
        truths_roberta.extend(b_labels.cpu().numpy())

print(classification_report(truths_roberta, preds_roberta, target_names=labels))

cm_roberta = confusion_matrix(truths_roberta, preds_roberta)

# Confusion matrix heatmap: rows are true labels, columns are predictions.
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(cm_roberta, annot=True, fmt='d', cmap='Blues',
            xticklabels=labels, yticklabels=labels, ax=ax)
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
ax.set_title('Confusion Matrix - RoBERTa Sarcasm Detection')
fig.tight_layout()
plt.show()
