### Managing imports

In [None]:
!pip install tensorflow transformers torch datasets hf_xet

In [None]:
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from torch.optim import AdamW
from torch.utils.data import TensorDataset, DataLoader
from tqdm import tqdm
from transformers import AutoModelForSequenceClassification, AutoTokenizer



#### Loading dataset 1

In [None]:
# Load the JSONL file; lines=True makes pandas parse one JSON object per line.
df1 = pd.read_json('../Dataset/sarcasm_detection_shared_task_reddit_training.jsonl', lines=True)

# Map label: SARCASM -> 1, NOT_SARCASM -> 0.
# A dict lookup via .map() keeps the semantics of .replace() (values that are
# not in the mapping pass through unchanged) without triggering pandas'
# deprecated silent-downcasting warning. astype(int) still fails loudly if an
# unexpected, non-numeric label is present.
label_codes = {'SARCASM': 1, 'NOT_SARCASM': 0}
df1['label'] = df1['label'].map(lambda value: label_codes.get(value, value)).astype(int)

# Convert context list to string (join with ' ||| '); non-list values become ''.
df1['context'] = df1['context'].apply(lambda x: ' ||| '.join(x) if isinstance(x, list) else '')

# Save to CSV
df1.to_csv('../Dataset/sarcasm_detection_training_data.csv', index=False, columns=['label', 'response', 'context'])

# Show a short preview instead of rendering the whole frame.
df1.head()


#### Dataset 1 preparation


In [None]:
# Build the model input: conversation context, a separator marker, then the response.
df1['input_text'] = df1['context'] + ' [SEP] ' + df1['response']

# Create the train/test frames that the model sections below read from.
# The split is seeded so it is reproducible and stratified so both parts keep
# the same label balance. .copy() gives independent frames, so assigning
# columns on them later does not raise SettingWithCopyWarning.
# NOTE(review): the 80/20 ratio is an assumption - adjust if another split was intended.
train_df1, test_df1 = train_test_split(
    df1,
    test_size=0.2,
    random_state=42,
    stratify=df1['label'],
)
train_df1 = train_df1.copy()
test_df1 = test_df1.copy()

# Check the result
df1[['input_text', 'label']].head()

### Transformer-based models: RoBERTa

#### Training


In [None]:
# Plain Python lists of input texts (forced to str) for the train and test frames.
X_train_roberta, X_test_roberta = (
    frame['input_text'].astype(str).tolist() for frame in (train_df1, test_df1)
)
# Matching label lists, in the same row order as the texts above.
y_train_roberta, y_test_roberta = (
    frame['label'].tolist() for frame in (train_df1, test_df1)
)

In [None]:
# Load the tokenizer that matches the roberta-base checkpoint used by the
# classifier. Without this the cell depends on a `tokenizer` left over from
# another cell; the only other one visible in this notebook is the ByT5
# tokenizer, whose ids do not correspond to RoBERTa's vocabulary.
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

# Identical encoding settings for both splits: pad/truncate every example to
# 64 tokens and return PyTorch tensors.
roberta_encode_kwargs = dict(
    padding='max_length',
    truncation=True,
    max_length=64,
    return_tensors='pt',
)

X_train_enc_roberta = tokenizer(X_train_roberta, **roberta_encode_kwargs)
X_test_enc_roberta = tokenizer(X_test_roberta, **roberta_encode_kwargs)


In [None]:
# Convert to torch tensors
train_dataset_roberta = TensorDataset(
    X_train_enc_roberta['input_ids'],
    X_train_enc_roberta['attention_mask'],
    torch.tensor(y_train_roberta)
)

test_dataset_roberta = TensorDataset(
    X_test_enc_roberta['input_ids'],
    X_test_enc_roberta['attention_mask'],
    torch.tensor(y_test_roberta)
)


In [None]:
batch_size = 16

# Only the training loader is asked to shuffle; the test loader is created without it.
train_loader = DataLoader(dataset=train_dataset_roberta, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(dataset=test_dataset_roberta, batch_size=batch_size)

In [None]:
# Train on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Seed PyTorch's global RNG so that any random weight initialisation and the
# shuffled batch order of the training loader are repeatable across runs.
torch.manual_seed(42)

# Two output labels: not sarcastic (0) and sarcastic (1).
model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=2)
model = model.to(device)
optimizer = AdamW(model.parameters(), lr=2e-5)

#### Training loop

In [None]:
epochs = 3

for epoch in range(epochs):
    # Put the model in training mode for this epoch.
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader):
        # Each batch is (input_ids, attention_mask, labels); move all three to the device.
        b_input_ids, b_attn_mask, b_labels = (tensor.to(device) for tensor in batch)
        # Clear gradients left over from the previous step before computing new ones.
        optimizer.zero_grad()
        # Passing labels makes the model return the loss alongside its outputs.
        outputs = model(input_ids=b_input_ids, attention_mask=b_attn_mask, labels=b_labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Report the mean per-batch loss for the epoch.
    mean_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}, Loss: {mean_loss:.4f}")


#### Evaluation

In [None]:
# Collect predictions and ground-truth labels over the whole test set.
model.eval()
preds, truths = [], []
with torch.no_grad():  # no gradients are needed for inference
    for batch in test_loader:
        b_input_ids, b_attn_mask, b_labels = [x.to(device) for x in batch]
        outputs = model(input_ids=b_input_ids, attention_mask=b_attn_mask)
        logits = outputs.logits
        # The predicted class is the index of the largest logit.
        preds.extend(torch.argmax(logits, dim=1).cpu().numpy())
        truths.extend(b_labels.cpu().numpy())

# Display names for class 0 and class 1, shared by the report and the heatmap.
labels = ['Not Sarcastic', 'Sarcastic']
print(classification_report(truths, preds, target_names=labels))

# Build the confusion matrix that the heatmap plots. Fixing labels=[0, 1]
# guarantees a 2x2 matrix that lines up with the two tick labels even if one
# class never occurs in the predictions.
cm = confusion_matrix(truths, preds, labels=[0, 1])

fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=labels, yticklabels=labels, ax=ax)
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
ax.set_title('Confusion Matrix - RoBERTa Sarcasm Detection')
fig.tight_layout()
plt.show()


### Token-free models: ByT5

#### Training

In [None]:
# Replace missing input texts with empty strings and force str dtype, in place,
# on both the train and the test frame.
for frame in (train_df1, test_df1):
    frame['input_text'] = frame['input_text'].fillna("").astype(str)

# Plain Python lists of texts and labels for each split.
X_train_byt5, y_train_byt5 = train_df1['input_text'].tolist(), train_df1['label'].tolist()
X_test_byt5, y_test_byt5 = test_df1['input_text'].tolist(), test_df1['label'].tolist()

In [None]:
tokenizer = AutoTokenizer.from_pretrained("google/byt5-small")

def tokenize(texts, max_length=128):
    """Encode texts with the ByT5 tokenizer into fixed-length PyTorch tensors.

    Args:
        texts: The texts to encode (lists of strings in this notebook).
        max_length: Length that every sequence is padded or truncated to.
            Defaults to 128; adjust it to suit the dataset.

    Returns:
        The tokenizer output; the cells below read its 'input_ids' and
        'attention_mask' entries.
    """
    return tokenizer(
        texts,
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors='pt'
    )

X_train_enc_byt5 = tokenize(X_train_byt5)
X_test_enc_byt5 = tokenize(X_test_byt5)

In [None]:
# Bundle token ids, attention masks and labels into index-aligned datasets.
train_dataset_byt5 = TensorDataset(
    *(X_train_enc_byt5[field] for field in ('input_ids', 'attention_mask')),
    torch.tensor(y_train_byt5),
)

test_dataset_byt5 = TensorDataset(
    *(X_test_enc_byt5[field] for field in ('input_ids', 'attention_mask')),
    torch.tensor(y_test_byt5),
)

In [None]:
batch_size = 16

# Only the training loader is asked to shuffle; the test loader is created without it.
train_loader = DataLoader(dataset=train_dataset_byt5, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(dataset=test_dataset_byt5, batch_size=batch_size)

In [None]:
# Train on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Seed PyTorch's global RNG so that any random weight initialisation and the
# shuffled batch order of the training loader are repeatable across runs.
torch.manual_seed(42)

# Two output labels: not sarcastic (0) and sarcastic (1).
model = AutoModelForSequenceClassification.from_pretrained("google/byt5-small", num_labels=2)
model = model.to(device)

In [None]:
# AdamW over all parameters of the current model, learning rate 2e-5.
optimizer = AdamW(params=model.parameters(), lr=2e-5)

#### Training loop


In [None]:
epochs = 3

for epoch in range(epochs):
    # Put the model in training mode for this epoch.
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader):
        # Each batch is (input_ids, attention_mask, labels); move all three to the device.
        b_input_ids, b_attn_mask, b_labels = (tensor.to(device) for tensor in batch)
        # Clear gradients left over from the previous step before computing new ones.
        optimizer.zero_grad()
        # Passing labels makes the model return the loss alongside its outputs.
        outputs = model(input_ids=b_input_ids, attention_mask=b_attn_mask, labels=b_labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Report the mean per-batch loss for the epoch.
    mean_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1} Loss: {mean_loss:.4f}")

#### Evaluation

In [None]:
# Switch to evaluation mode and gather predictions for the whole test set.
model.eval()
preds, truths = [], []
with torch.no_grad():  # no gradients are needed for inference
    for batch in test_loader:
        b_input_ids, b_attn_mask, b_labels = (tensor.to(device) for tensor in batch)
        outputs = model(attention_mask=b_attn_mask, input_ids=b_input_ids)
        logits = outputs.logits
        # The predicted class is the index of the largest logit in each row.
        preds += list(logits.argmax(dim=1).cpu().numpy())
        truths += list(b_labels.cpu().numpy())

report = classification_report(truths, preds, target_names=['Not Sarcastic', 'Sarcastic'])
print(report)