# Fine-tune LLMs to do Sarcasm interpretations

In [54]:
pip install nltk comet-ml emoji unbabel-comet datasets evaluate rouge_score



In [55]:
from google.colab import drive
drive.mount('/content/drive')
import os
os.chdir('/content/drive/MyDrive/SarcasmNLP')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [56]:
# model_choice = 'gpt2'
model_choice = 'flan-t5-base'
# model_choice = 't5-base'
classifier_model_choice = 'bert-base-uncased'


In [57]:
mode = 'train'
# mode = 'evaluate'

In [5]:
dataset_ = 'iSarcasm'
# dataset_ = 'GPT-4o-mini'

## Load Model

### Classfication Model: bert-base-uncased

In [58]:
from transformers import BertTokenizer, BertForSequenceClassification

if mode == 'train':
  classifier_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
  classifier_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
else:
  classifier_tokenizer = BertTokenizer.from_pretrained(f'./results/{classifier_model_choice}/my_model')
  classifier_model = BertForSequenceClassification.from_pretrained(f'./results/{classifier_model_choice}/my_model')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Interpretation Models:

### GPT-2 small

In [59]:
if model_choice == 'gpt2':
  from transformers import GPT2Tokenizer, GPT2LMHeadModel
  if mode == 'train':
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    model = GPT2LMHeadModel.from_pretrained('gpt2')
  else:
    tokenizer = GPT2Tokenizer.from_pretrained(f'./results/{model_choice}/my_model')
    model = GPT2LMHeadModel.from_pretrained(f'./results/{model_choice}/my_model')


### Google FLAN-T5-base

In [60]:
if model_choice == 'flan-t5-base':
  from transformers import T5Tokenizer, T5ForConditionalGeneration
  if mode == 'train':
    tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")
    model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base")
  else:
    tokenizer = T5Tokenizer.from_pretrained(f'./results/{model_choice}/my_model')
    model = T5ForConditionalGeneration.from_pretrained(f'./results/{model_choice}/my_model')


### T5-base

In [61]:
if model_choice == 't5-base':
  from transformers import T5Tokenizer, T5ForConditionalGeneration
  if mode == 'train':
    tokenizer = T5Tokenizer.from_pretrained("t5-base")
    model = T5ForConditionalGeneration.from_pretrained("t5-base")
  else:
    tokenizer = T5Tokenizer.from_pretrained(f'./results/{model_choice}/my_model')
    model = T5ForConditionalGeneration.from_pretrained(f'./results/{model_choice}/my_model')


## Load Data

In [91]:
import pandas as pd
def load_data(dataset):
  # if dataset == 'iSarcasm':
  #   return pd.read_csv('iSarcasm_pairs.tsv', sep='\t')
  # else:
  #   return pd.read_csv('GPT_pairs.tsv', sep='\t')
  dataset = pd.read_csv('combined_df.tsv', sep='\t')
  evaluation_dataset = pd.read_csv('iSarcasm_pairs_test.tsv', sep='\t')
  dataset['Translation'] = dataset['Translation'].fillna('')
  evaluation_dataset['Translation'] = evaluation_dataset['Translation'].fillna('')
  return dataset, evaluation_dataset

df, df_eval = load_data(dataset_)

### Classification model:

### Intitialization

In [92]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [93]:
from sklearn.model_selection import train_test_split

train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)
valid_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)


In [65]:
def encode_texts(texts, targets=None, tokenizer=None):
    if targets is None:  # For single input (sarcasm detection)
        return classifier_tokenizer(
            texts.tolist(),
            padding=True,
            truncation=True,
            max_length=128,
            return_tensors="pt"
        )

In [66]:
import torch
from torch.utils.data import Dataset

# Custom dataset class for sarcasm classification
class SarcasmClassificationDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = encode_texts(texts.astype(str))
        self.labels = torch.tensor(labels.values)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            'input_ids': self.texts['input_ids'][idx],
            'attention_mask': self.texts['attention_mask'][idx],
            'labels': self.labels[idx]
        }

# Create datasets
train_dataset = SarcasmClassificationDataset(train_df['Sarcastic'], train_df['IsSarcastic'])
valid_dataset = SarcasmClassificationDataset(valid_df['Sarcastic'], valid_df['IsSarcastic'])
test_dataset = SarcasmClassificationDataset(test_df['Sarcastic'], test_df['IsSarcastic'])

### Training

In [67]:
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback

training_args = TrainingArguments(
    output_dir='./results/{classifier_model_choice}',
    num_train_epochs=4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
    logging_dir='./logs',
    report_to="none",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    weight_decay=0.01,
    learning_rate=2e-5,
    save_strategy="epoch"
)

# Initialize Trainer for sarcasm classification
trainer = Trainer(
    model=classifier_model.to(device),
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)



In [68]:
if mode == 'train':
  # Train the sarcasm classifier
  trainer.train()
  # Save the model
  classifier_model.save_pretrained(f'./results/{classifier_model_choice}/my_model')
  classifier_tokenizer.save_pretrained(f'./results/{classifier_model_choice}/my_model')

Epoch,Training Loss,Validation Loss
1,0.3666,0.337337
2,0.3025,0.407219
3,0.2173,0.571988


In [70]:
# Evaluate the model
eval_results = trainer.evaluate()
print(eval_results)

{'eval_loss': 0.33733686804771423, 'eval_runtime': 0.8989, 'eval_samples_per_second': 632.977, 'eval_steps_per_second': 80.095, 'epoch': 3.0}


### Interpretation Model:

### Initialization

In [116]:
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)
valid_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

In [117]:
add_prefix = lambda x: "Provide straightforward, literal translations for this sarcastic comment: " + str(x)

train_df['Input'] = train_df['Sarcastic'].apply(add_prefix)
valid_df['Input'] = valid_df['Sarcastic'].apply(add_prefix)
test_df['Input'] = test_df['Sarcastic'].apply(add_prefix)


In [118]:
train_df['Translation'] = train_df['Translation'].fillna("")
valid_df['Translation'] = valid_df['Translation'].fillna("")
test_df['Translation'] = test_df['Translation'].fillna("")

def tokenize_data(df):
    inputs = tokenizer(df['Input'].tolist(), padding=True, truncation=True, return_tensors="pt")
    targets = tokenizer(df['Translation'].tolist(), padding=True, truncation=True, return_tensors="pt")

    return {
        'input_ids': inputs['input_ids'],
        'attention_mask': inputs['attention_mask'],
        'labels': targets['input_ids'],
    }

# Tokenize train, validation, and test datasets
train_encodings = tokenize_data(train_df)
valid_encodings = tokenize_data(valid_df)
test_encodings = tokenize_data(test_df)

In [119]:
import torch

class SarcasmTranslationDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        return item

    def __len__(self):
        return len(self.encodings['input_ids'])

# Create datasets
train_dataset = SarcasmTranslationDataset(train_encodings)
valid_dataset = SarcasmTranslationDataset(valid_encodings)
test_dataset = SarcasmTranslationDataset(test_encodings)

In [120]:
import torch

class SarcasmTranslationDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, sources):
        self.encodings = encodings
        self.sources = sources

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['source'] = self.sources[idx]
        return item

    def __len__(self):
        return len(self.encodings['input_ids'])

# Create datasets
train_dataset = SarcasmTranslationDataset(train_encodings, list(train_df['Sarcastic']))
valid_dataset = SarcasmTranslationDataset(valid_encodings, list(valid_df['Sarcastic']))
test_dataset = SarcasmTranslationDataset(test_encodings, list(test_df['Sarcastic']))

## Prepare Metrics

For colab, need to install additional packages (already in conda environment.yml)

In [121]:
import evaluate

# Load the metrics
bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")
comet = evaluate.load("comet")  # Ensure COMET is installed and properly configured
chrf = evaluate.load("chrf")  # ChrF metric



Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.8.3.post1 to v2.4.0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../../../root/.cache/huggingface/hub/models--Unbabel--wmt22-comet-da/snapshots/371e9839ca4e213dde891b066cf3080f75ec7e72/checkpoints/model.ckpt`
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/core/saving.py:195: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']


In [122]:

def compute_metrics(decoded_preds, decoded_labels, sources):
    # # Get predictions and labels
    # predictions = pred.predictions[0]
    # labels = pred.label_ids
    # # if isinstance(predictions, list) and isinstance(predictions[0], list):
    # #     predictions = [pred[0] for pred in predictions]

    # # Decode predictions and labels
    # decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # BLEU
    bleu_result = bleu.compute(predictions=decoded_preds, references=decoded_labels)

    # ChrF
    chrf_result = chrf.compute(predictions=decoded_preds, references=decoded_labels)

    # ROUGE
    rouge_result = rouge.compute(predictions=decoded_preds, references=decoded_labels)

    # COMET
    comet_result = comet.compute(predictions=decoded_preds, references=decoded_labels, sources=sources)



    # Combine the results, including all ROUGE scores
    metrics = {
        "bleu": bleu_result["bleu"],
        "chrf": chrf_result["score"],
        "comet": comet_result.get("mean_score", None),
        "rouge1": rouge_result["rouge1"],
        "rouge2": rouge_result["rouge2"],
        "rougeL": rouge_result["rougeL"],
        "rougeLsum": rouge_result.get("rougeLsum", None),
    }

    return metrics

## Training

In [123]:
model = model.to(device)

In [124]:
model.name_or_path

'google/flan-t5-base'

In [125]:
from transformers import Trainer, TrainingArguments

# Set training arguments
training_args = TrainingArguments(
    output_dir=f'./results/{model_choice}',
    evaluation_strategy="epoch",     # evaluation strategy to adopt during training
    learning_rate=2e-5,              # learning rate
    save_steps=10000,
    save_total_limit=1,              # keep only the most recent checkpoint
    per_device_train_batch_size=8,   # batch size for training
    per_device_eval_batch_size=8,    # batch size for evaluation
    num_train_epochs=10,             # total number of training epochs
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
    report_to="none",                # Disable wandb logging
)

# Create a Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    # compute_metrics=compute_metrics, # TODO change it as batch evaluation
)




In [126]:
if mode == 'train':
  trainer.train()
  # Save the model
  model.save_pretrained(f'./results/{model_choice}/my_model')
  tokenizer.save_pretrained(f'./results/{model_choice}/my_model')

Epoch,Training Loss,Validation Loss
1,0.1504,0.151438
2,0.1343,0.152384
3,0.1447,0.152622
4,0.1487,0.152599
5,0.1598,0.152677
6,0.1348,0.152775
7,0.1147,0.153192
8,0.1243,0.153323
9,0.1327,0.153351
10,0.1239,0.153373


## Evaluation

In [127]:
# Evaluate the model
eval_results = trainer.evaluate()
print(eval_results)

{'eval_loss': 0.15337307751178741, 'eval_runtime': 1.8435, 'eval_samples_per_second': 169.788, 'eval_steps_per_second': 21.698, 'epoch': 10.0}


In [130]:
predictions, labels, _ = trainer.predict(test_dataset)

In [131]:
logits = torch.tensor(predictions[0])

# Select the token with the highest probability
predicted_token_ids = torch.argmax(logits, dim=-1)

# Decode the predicted tokens
decoded_preds = tokenizer.batch_decode(predicted_token_ids, skip_special_tokens=True)
decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

In [132]:
sources = [test_dataset[i]['source'] for i in range(len(test_dataset))]

In [133]:
metrics = compute_metrics(decoded_preds, decoded_labels, sources)
print(metrics)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


{'bleu': 0.29065480751157347, 'chrf': 54.2714459428979, 'comet': 0.6346741736696931, 'rouge1': 0.5952492289752125, 'rouge2': 0.3542191810498126, 'rougeL': 0.5883517830192877, 'rougeLsum': 0.588485333106421}


In [134]:
metrics = compute_metrics(sources, decoded_labels, sources)
print(metrics)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


{'bleu': 0.05456304383967352, 'chrf': 30.46879140238477, 'comet': 0.6295675352549019, 'rouge1': 0.23279939827351492, 'rouge2': 0.08526930014033529, 'rougeL': 0.20343531537645942, 'rougeLsum': 0.20301373500470155}


In [135]:
metrics = compute_metrics(decoded_preds, sources, sources)
print(metrics)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


{'bleu': 0.056556260320132654, 'chrf': 25.74152360437061, 'comet': 0.4818565776934639, 'rouge1': 0.25204569006912597, 'rouge2': 0.09119669403102376, 'rougeL': 0.22355581225684276, 'rougeLsum': 0.2237242007879779}


In [136]:
# def inference(input_text):
#   if not input_text.startswith("Provide straightforward, literal translations for this sarcastic comment: "):
#     input_text = "Provide straightforward, literal translations for this sarcastic comment: " + input_text
#   # input_text = "Provide straightforward, literal translations for this sarcastic comment: I just absolutely LOVE how I've got to work outside for the next 3 days in the heatwave."

#   input_ids = tokenizer(input_text, return_tensors='pt').input_ids.to(device)
#   output_ids = model.generate(input_ids)
#   decoded_output = tokenizer.decode(output_ids[0], skip_special_tokens=True)

#   return decoded_output

def classify_sarcasm(text):
    inputs = classifier_tokenizer(text, return_tensors="pt").to(device)  # Move inputs to the same device as the model
    # Move the model to the same device as the inputs
    classifier_model.to(device)
    outputs = classifier_model(**inputs)
    prediction = torch.argmax(outputs.logits, dim=1).item()
    return prediction == 1

def generate_interpretation(input_text):
    if not input_text.startswith("Provide straightforward, literal translations for this sarcastic comment: "):
      input_text = "Provide straightforward, literal translations for this sarcastic comment: " + input_text
    input_ids = tokenizer(input_text, return_tensors='pt').input_ids.to(device)
    output_ids = model.generate(input_ids)
    decoded_output = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    return decoded_output


In [137]:
# Test on a few examples
test_examples = df_eval.sample(n=20, random_state=42)
for index, row in test_examples.iterrows():
    is_sarcastic = classify_sarcasm(row['Sarcastic'])
    print(f"Original: {row['Sarcastic']} {is_sarcastic}")
    if is_sarcastic:
        interpretation = generate_interpretation(row['Sarcastic'])
        print(f"Interpretation: {interpretation}\n")
        print(f"Ground truth: {row['Translation']}\n")
    else:
        print("Not Sarcastic\n")

Original: i’m dying False
Not Sarcastic

Original: Mental shoot out ends. Silent pause. Sirens start 🚨 Always so perfectly timed 🚔🚓👍🏽 #LineOfDuty #LineofDuty6 #LOD6 #LOD False
Not Sarcastic

Original: using loops, building an idea on that loop, and then deleting the loop has been a fun little exercise recently True




Interpretation: I would say "Using loops, building an idea on that loop, and then deleting

Ground truth: 

Original: "Alexa add small bananas to the shopping list." I've added smooth armours to your shopping list." Gee thanks Alexa! #alexa False
Not Sarcastic

Original: Just rediscovered that my nose is always in my field of vision so I will unfortunately be dropping out as I cannot continue my assigned readings in peace False
Not Sarcastic

Original: I am starting off my day by watching a Dragons Den special on Deborah Meaden and eating a Pot Noodle. My life is perfect. False
Not Sarcastic

Original: not to brag, but my thesis topic was so good that someone else published on it three months ago 🙃 False
Not Sarcastic

Original: @miiniigun I feel that 100%, my anxiety stops me from talking to so many people. I've let so many friendships die because I can't push myself to talk to them. False
Not Sarcastic

Original: Day 38 of quarantine is an interesting time to get a new drum kit, neig

In [139]:
src = "Look at you, finishing all your snacks before dinner. What a healthy choice!"
truth = "Eating snacks before dinner is not a good decision for your health."
print(f" \nsrc: {src} \nisSarcastic: {classify_sarcasm(src)} \ntranslation: {generate_interpretation(src)} \nground_truth: {truth}")


 
src: Look at you, finishing all your snacks before dinner. What a healthy choice! 
isSarcastic: True 
translation: Finishing snacks before dinner is not healthy. 
ground_truth: Eating snacks before dinner is not a good decision for your health.


### Evaluate on GPT-4o-mini pairs

In [143]:
df2, eval_data = load_data("GPT-4o-mini")
df2['Input'] = df2['Sarcastic'].apply(add_prefix)

In [None]:
gpt_pairs = SarcasmTranslationDataset(tokenize_data(df2))

In [None]:
trainer_temp = Trainer(
    model=model,
    args=training_args,
    eval_dataset=gpt_pairs,
    tokenizer=tokenizer,
    # compute_metrics=compute_metrics,
)

In [147]:
eval_results = trainer.evaluate()
print(eval_results)

{'eval_loss': 0.15337307751178741, 'eval_runtime': 1.5312, 'eval_samples_per_second': 204.418, 'eval_steps_per_second': 26.124, 'epoch': 10.0}
