## canine-DS2

### Managing imports

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import joblib
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from torch.utils.data import TensorDataset, DataLoader
from torch.optim import AdamW
from tqdm import tqdm



## Without context

#### Loading dataset 2

In [None]:
# Read just the first 4400 rows of the three columns the notebook needs, then
# rename them to the response/context naming used by the later cells.
# NOTE(review): the next cell reassigns `df` from a JSONL file, so this frame
# appears to be unused afterwards -- confirm which dataset is intended.
cols_to_use = ['label', 'comment', 'parent_comment']
column_renames = {'comment': 'response', 'parent_comment': 'context'}

df = pd.read_csv(
    '../Dataset/train-balanced-sarcasm.csv',
    usecols=cols_to_use,
    nrows=4400,
).rename(columns=column_renames)

# Display the loaded frame as the cell output
df



#### Preparing dataset 2

In [None]:
# Load the JSONL training file; lines=True parses one JSON object per line.
# NOTE(review): this path is relative to the working directory while the CSV
# cell reads from '../Dataset/' -- confirm which location is correct.
df = pd.read_json('Dataset/sarcasm_detection_shared_task_reddit_training.jsonl', lines=True)

# Encode labels as integers: SARCASM -> 1, NOT_SARCASM -> 0.
# Series.map is used instead of Series.replace because replacing strings with
# ints relies on implicit downcasting, which newer pandas versions deprecate.
# Any unexpected label becomes NaN, so astype(int) fails loudly instead of
# letting a bad value through.
df['label'] = df['label'].map({'SARCASM': 1, 'NOT_SARCASM': 0}).astype(int)

# Flatten each context list into one string joined with ' ||| ';
# anything that is not a list becomes an empty string.
df['context'] = df['context'].apply(lambda x: ' ||| '.join(x) if isinstance(x, list) else '')




In [None]:
# Without context: the model input is the response text on its own.
df['input_text'] = df.loc[:, 'response']

# Preview the first rows of the input/label columns
preview_columns = ['input_text', 'label']
print(df[preview_columns].head())

In [None]:
# Stratified 80/20 train/test split with a fixed seed
split_parts = train_test_split(
    df['input_text'],
    df['label'],
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)
X_train, X_test, y_train, y_test = split_parts

# Re-assemble each split into a two-column frame (text + label)
train_df1 = pd.DataFrame({'input_text': X_train, 'label': y_train})
test_df1 = pd.DataFrame({'input_text': X_test, 'label': y_test})


### Token-free models: CANINE

#### Training

In [None]:
# Replace missing inputs with "" and cast to str so only strings reach the tokenizer
for frame in (train_df1, test_df1):
    frame['input_text'] = frame['input_text'].fillna("").astype(str)

# Plain Python lists of texts and labels for each split
X_train_canine, y_train_canine = train_df1['input_text'].tolist(), train_df1['label'].tolist()
X_test_canine, y_test_canine = test_df1['input_text'].tolist(), test_df1['label'].tolist()

In [None]:
tokenizer = AutoTokenizer.from_pretrained("google/canine-s")

def tokenize(texts, max_length=115):
    """Encode a list of strings into fixed-length PyTorch tensors.

    Args:
        texts: List of input strings to encode.
        max_length: Length every sequence is padded or truncated to.
            Defaults to 115, the value that used to be hard-coded; adjust
            it for your dataset.

    Returns:
        The tokenizer's encoding with PyTorch tensors; the following cells
        read its 'input_ids' and 'attention_mask' entries.
    """
    return tokenizer(
        texts,
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors='pt'
    )

X_train_enc_canine = tokenize(X_train_canine)
X_test_enc_canine = tokenize(X_test_canine)

In [None]:
def _to_tensor_dataset(encodings, labels):
    # Bundle input ids, attention masks and labels into one TensorDataset
    return TensorDataset(
        encodings['input_ids'],
        encodings['attention_mask'],
        torch.tensor(labels),
    )

train_dataset_canine = _to_tensor_dataset(X_train_enc_canine, y_train_canine)
test_dataset_canine = _to_tensor_dataset(X_test_enc_canine, y_test_canine)

In [None]:
batch_size = 16

# Only the training batches are shuffled; evaluation keeps the dataset order.
train_loader = DataLoader(dataset=train_dataset_canine, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset_canine, batch_size=batch_size, shuffle=False)

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Load "google/canine-s" with a 2-label classification head and move it to the device
model = AutoModelForSequenceClassification.from_pretrained(
    "google/canine-s",
    num_labels=2,
).to(device)

In [None]:
# AdamW over every model parameter with a learning rate of 2e-5
optimizer = AdamW(params=model.parameters(), lr=2e-5)

#### Training loop


In [None]:
epochs = 3

# Fine-tune for a fixed number of epochs and report the mean batch loss of each
for epoch in range(epochs):
    model.train()
    running_loss = 0
    for batch in tqdm(train_loader):
        batch_ids, batch_mask, batch_labels = (tensor.to(device) for tensor in batch)
        # Clear gradients left over from the previous step before backpropagating
        optimizer.zero_grad()
        # Passing labels makes the model return the classification loss
        loss = model(
            input_ids=batch_ids,
            attention_mask=batch_mask,
            labels=batch_labels,
        ).loss
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    mean_loss = running_loss / len(train_loader)
    print(f"Epoch {epoch+1} Loss: {mean_loss:.4f}")

#### Evaluation

In [None]:
# Put the model in evaluation mode and collect predictions over the test set
model.eval()
preds_canine, truths_canine = [], []
with torch.no_grad():  # no gradients are needed for inference
    for batch in test_loader:
        b_input_ids, b_attn_mask, b_labels = [x.to(device) for x in batch]
        outputs = model(input_ids=b_input_ids, attention_mask=b_attn_mask)
        logits = outputs.logits
        # The predicted class is the index of the largest logit
        preds_canine.extend(torch.argmax(logits, dim=1).cpu().tolist())
        truths_canine.extend(b_labels.cpu().tolist())

# Class names in label order (0 -> Not Sarcastic, 1 -> Sarcastic),
# shared by the text report and the plot
labels = ['Not Sarcastic', 'Sarcastic']

print(classification_report(truths_canine, preds_canine, target_names=labels))

cm_canine = confusion_matrix(truths_canine, preds_canine)

plt.figure(figsize=(6, 5))
sns.heatmap(cm_canine, annot=True, fmt='d', cmap='Blues',
            xticklabels=labels, yticklabels=labels)
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
# The title previously said "ByT5 ... Dataset 1"; this notebook evaluates
# CANINE on dataset 2, here without context.
plt.title('Confusion Matrix - CANINE Sarcasm Detection Dataset 2 (Without Context)')
plt.tight_layout()
plt.show()

## With context

#### Preparing dataset 2

In [None]:
# With context: append the context string to the response after a [SEP] marker.
context_prefix = ' [SEP] ' + 'Context:'
df['input_text'] = df['response'] + context_prefix + df['context']

# Preview the first rows of the input/label columns
preview_columns = ['input_text', 'label']
print(df[preview_columns].head())

In [None]:
# Stratified 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    df['input_text'], df['label'],
    test_size=0.2, random_state=42, stratify=df['label']
)

train_df = pd.DataFrame({'input_text': X_train, 'label': y_train})
test_df = pd.DataFrame({'input_text': X_test, 'label': y_test})

# Nothing is written to disk here, so the message says "created" rather than "saved".
print("Train and test splits created!")

#### Training

In [None]:
# Replace missing inputs with "" and cast to str so only strings reach the tokenizer
train_df['input_text'] = train_df['input_text'].fillna("").astype(str)
test_df['input_text'] = test_df['input_text'].fillna("").astype(str)

# These lists must use the *_canine names: the tokenization and TensorDataset
# cells that follow read X_train_canine / y_train_canine / X_test_canine /
# y_test_canine. They were previously stored under *_byt5 names that nothing
# read, so the "with context" run silently reused the without-context inputs.
X_train_canine = train_df['input_text'].tolist()
y_train_canine = train_df['label'].tolist()
X_test_canine = test_df['input_text'].tolist()
y_test_canine = test_df['label'].tolist()

In [None]:
tokenizer = AutoTokenizer.from_pretrained("google/canine-s")

def tokenize(texts, max_length=115):
    """Encode a list of strings into fixed-length PyTorch tensors.

    Args:
        texts: List of input strings to encode.
        max_length: Length every sequence is padded or truncated to.
            Defaults to 115, the value that used to be hard-coded; adjust
            it for your dataset.

    Returns:
        The tokenizer's encoding with PyTorch tensors; the following cells
        read its 'input_ids' and 'attention_mask' entries.
    """
    return tokenizer(
        texts,
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors='pt'
    )

# NOTE(review): CANINE presumably counts max_length in characters, so 115 may
# cut off most of the context text in this section -- confirm and consider
# passing a larger max_length here.
X_train_enc_canine = tokenize(X_train_canine)
X_test_enc_canine = tokenize(X_test_canine)

In [None]:
# Encoding entries that become tensors of each dataset, in order
tensor_keys = ('input_ids', 'attention_mask')

# Each dataset item is (input_ids, attention_mask, label)
train_dataset_canine = TensorDataset(
    *(X_train_enc_canine[key] for key in tensor_keys),
    torch.tensor(y_train_canine),
)

test_dataset_canine = TensorDataset(
    *(X_test_enc_canine[key] for key in tensor_keys),
    torch.tensor(y_test_canine),
)

In [None]:
batch_size = 16

# Training batches are reshuffled each epoch; test batches keep dataset order.
train_loader = DataLoader(
    dataset=train_dataset_canine,
    batch_size=batch_size,
    shuffle=True,
)
test_loader = DataLoader(
    dataset=test_dataset_canine,
    batch_size=batch_size,
)

In [None]:
# Pick the compute device: CUDA if present, CPU otherwise
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)

# Load "google/canine-s" again with a 2-label head, so this run does not
# continue from the previously trained `model` object
model = AutoModelForSequenceClassification.from_pretrained(
    "google/canine-s",
    num_labels=2,
).to(device)

In [None]:
# New AdamW optimizer bound to the freshly loaded model's parameters (lr = 2e-5)
optimizer = AdamW(params=model.parameters(), lr=2e-5)

#### Training loop


In [None]:
epochs = 3

# Run the fine-tuning epochs, printing the average batch loss after each one
for epoch in range(epochs):
    model.train()
    running_loss = 0
    for batch in tqdm(train_loader):
        batch_ids, batch_mask, batch_labels = (tensor.to(device) for tensor in batch)
        # Reset accumulated gradients before this step's backward pass
        optimizer.zero_grad()
        # With labels supplied, the model output carries the loss
        step_output = model(
            input_ids=batch_ids,
            attention_mask=batch_mask,
            labels=batch_labels,
        )
        loss = step_output.loss
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    mean_loss = running_loss / len(train_loader)
    print(f"Epoch {epoch+1} Loss: {mean_loss:.4f}")

#### Evaluation

In [None]:
# Put the model in evaluation mode and collect predictions over the test set
model.eval()
preds_canine, truths_canine = [], []
with torch.no_grad():  # no gradients are needed for inference
    for batch in test_loader:
        b_input_ids, b_attn_mask, b_labels = [x.to(device) for x in batch]
        outputs = model(input_ids=b_input_ids, attention_mask=b_attn_mask)
        logits = outputs.logits
        # The predicted class is the index of the largest logit
        preds_canine.extend(torch.argmax(logits, dim=1).cpu().tolist())
        truths_canine.extend(b_labels.cpu().tolist())

# Class names in label order (0 -> Not Sarcastic, 1 -> Sarcastic),
# shared by the text report and the plot
labels = ['Not Sarcastic', 'Sarcastic']

print(classification_report(truths_canine, preds_canine, target_names=labels))

cm_canine = confusion_matrix(truths_canine, preds_canine)

plt.figure(figsize=(6, 5))
sns.heatmap(cm_canine, annot=True, fmt='d', cmap='Blues',
            xticklabels=labels, yticklabels=labels)
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
# The title previously said "ByT5 ... Dataset 1"; this notebook evaluates
# CANINE on dataset 2, here with context.
plt.title('Confusion Matrix - CANINE Sarcasm Detection Dataset 2 (With Context)')
plt.tight_layout()
plt.show()