# Fine-tune LLMs to do Sarcasm interpretations

In [1]:
from google.colab import drive
drive.mount('/content/drive')
import os
os.chdir('/content/drive/MyDrive/SarcasmNLP')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# model_choice = 'gpt2'
model_choice = 'flan-t5-base'
# model_choice = 't5-base'


In [3]:
# mode = 'train'
mode = 'evaluate'

In [4]:
dataset_ = 'iSarcasm'
# dataset_ = 'GPT-4o-mini'

## Load Model

### GPT-2 small

In [5]:
if model_choice == 'gpt2':
  from transformers import GPT2Tokenizer, GPT2LMHeadModel
  if mode == 'train':
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    model = GPT2LMHeadModel.from_pretrained('gpt2')
  else:
    tokenizer = GPT2Tokenizer.from_pretrained(f'./results/{model_choice}/my_model')
    model = GPT2LMHeadModel.from_pretrained(f'./results/{model_choice}/my_model')


### Google FLAN-T5-base

In [6]:
if model_choice == 'flan-t5-base':
  from transformers import T5Tokenizer, T5ForConditionalGeneration
  if mode == 'train':
    tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")
    model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base")
  else:
    tokenizer = T5Tokenizer.from_pretrained(f'./results/{model_choice}/my_model')
    model = T5ForConditionalGeneration.from_pretrained(f'./results/{model_choice}/my_model')


### T5-base

In [7]:
if model_choice == 't5-base':
  from transformers import T5Tokenizer, T5ForConditionalGeneration
  if mode == 'train':
    tokenizer = T5Tokenizer.from_pretrained("t5-base")
    model = T5ForConditionalGeneration.from_pretrained("t5-base")
  else:
    tokenizer = T5Tokenizer.from_pretrained(f'./results/{model_choice}/my_model')
    model = T5ForConditionalGeneration.from_pretrained(f'./results/{model_choice}/my_model')


## Load Data

In [8]:
import pandas as pd
def load_data(dataset):
  if dataset == 'iSarcasm':
    return pd.read_csv('iSarcasm_pairs.tsv', sep='\t')
  else:
    return pd.read_csv('GPT_pairs.tsv', sep='\t')

df = load_data(dataset_)

In [9]:
from sklearn.model_selection import train_test_split

train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)
valid_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)


In [10]:
add_prefix = lambda x: "Provide straightforward, literal translations for this sarcastic comment: " + x

train_df['Input'] = train_df['Sarcastic'].apply(add_prefix)
valid_df['Input'] = valid_df['Sarcastic'].apply(add_prefix)
test_df['Input'] = test_df['Sarcastic'].apply(add_prefix)


In [11]:

def tokenize_data(df):
    inputs = tokenizer(df['Input'].tolist(), padding=True, truncation=True, return_tensors="pt")
    targets = tokenizer(df['Translation'].tolist(), padding=True, truncation=True, return_tensors="pt")

    return {
        'input_ids': inputs['input_ids'],
        'attention_mask': inputs['attention_mask'],
        'labels': targets['input_ids'],
    }

# Tokenize train, validation, and test datasets
train_encodings = tokenize_data(train_df)
valid_encodings = tokenize_data(valid_df)
test_encodings = tokenize_data(test_df)

In [12]:
import torch

class SarcasmTranslationDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        return item

    def __len__(self):
        return len(self.encodings['input_ids'])

# Create datasets
train_dataset = SarcasmTranslationDataset(train_encodings)
valid_dataset = SarcasmTranslationDataset(valid_encodings)
test_dataset = SarcasmTranslationDataset(test_encodings)

## Prepare Metrics

For colab, need to install additional packages (already in conda environment.yml)

In [13]:
!pip install datasets evaluate



In [14]:
!pip install rouge_score unbabel-comet #restart maybe needed



In [15]:
import evaluate

# Load the metrics
bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")
comet = evaluate.load("comet")  # Ensure COMET is installed and properly configured
chrf = evaluate.load("chrf")  # ChrF metric



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.8.3.post1 to v2.4.0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../../../root/.cache/huggingface/hub/models--Unbabel--wmt22-comet-da/snapshots/371e9839ca4e213dde891b066cf3080f75ec7e72/checkpoints/model.ckpt`
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/core/saving.py:195: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']


In [16]:

def compute_metrics(pred):
    # Get predictions and labels
    predictions = pred.predictions[0]
    labels = pred.label_ids
    # if isinstance(predictions, list) and isinstance(predictions[0], list):
    #     predictions = [pred[0] for pred in predictions]

    # Decode predictions and labels
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # BLEU
    bleu_result = bleu.compute(predictions=decoded_preds, references=decoded_labels)

    # ChrF
    chrf_result = chrf.compute(predictions=decoded_preds, references=decoded_labels)

    # ROUGE
    rouge_result = rouge.compute(predictions=decoded_preds, references=decoded_labels)

    #COMET
    comet_result = comet.compute(predictions=decoded_preds, references=decoded_labels)



    # Combine the results, including all ROUGE scores
    metrics = {
        "bleu": bleu_result["bleu"],
        "chrf": chrf_result["score"],
        "rouge1": rouge_result["rouge1"],
        "rouge2": rouge_result["rouge2"],
        "rougeL": rouge_result["rougeL"],
        "rougeLsum": rouge_result.get("rougeLsum", None),
        "comet": comet_result.get("score", None),
    }

    return metrics

## Training

In [17]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

In [18]:
model.name_or_path

'./results/flan-t5-base/my_model'

In [19]:
from transformers import Trainer, TrainingArguments

# Set training arguments
training_args = TrainingArguments(
    output_dir=f'./results/{model_choice}',
    evaluation_strategy="epoch",     # evaluation strategy to adopt during training
    learning_rate=2e-5,              # learning rate
    save_steps=10000,
    save_total_limit=1,              # keep only the most recent checkpoint
    per_device_train_batch_size=8,   # batch size for training
    per_device_eval_batch_size=8,    # batch size for evaluation
    num_train_epochs=10,             # total number of training epochs
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
)

# Create a Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    # compute_metrics=compute_metrics, # TODO change it as batch evaluation
)




In [20]:
if mode == 'train':
  trainer.train()
  # Save the model
  model.save_pretrained(f'./results/{model_choice}/my_model')
  tokenizer.save_pretrained(f'./results/{model_choice}/my_model')

In [21]:
# Evaluate the model
eval_results = trainer.evaluate()
print(eval_results)

{'eval_loss': 0.7217645049095154, 'eval_model_preparation_time': 0.0128, 'eval_runtime': 3.7523, 'eval_samples_per_second': 23.186, 'eval_steps_per_second': 2.932}


In [22]:
def inference(input_text):
  if not input_text.startswith("Provide straightforward, literal translations for this sarcastic comment: "):
    input_text = "Provide straightforward, literal translations for this sarcastic comment: " + input_text
  # input_text = "Provide straightforward, literal translations for this sarcastic comment: I just absolutely LOVE how I've got to work outside for the next 3 days in the heatwave."

  input_ids = tokenizer(input_text, return_tensors='pt').input_ids.to(device)
  output_ids = model.generate(input_ids)
  decoded_output = tokenizer.decode(output_ids[0], skip_special_tokens=True)

  return decoded_output

In [23]:
for i in range(5):
    print(f"{i}, \nsrc: {test_df['Sarcastic'].iloc[i]} \ntranslation: {inference(test_df['Sarcastic'].iloc[i])} \nground_truth: {test_df['Translation'].iloc[i]}")



0, 
src: @Mythical So worried about him. But if you're looking to save him, based on the topography I'd say it's somewhere on the east coast. Perhaps the Carolinas? 
translation: I'm not worried about him. 
ground_truth: I suppose I would have simply stated that we all know he is not kidnapped, but it seems like he's just in the woods in North Carolina. 
1, 
src: Damn, imagine being vaxxed and then getting a cold and then losing your taste and smell.

Oh wait. 
translation: I hate being vaxxed and then getting a cold and then losing my taste and 
ground_truth: To make it non-sarcastic I could have said, "I hate having a cold and then losing my taste and smell."
2, 
src: if you see me crying in the self-service car wash in my rosati's uniform, no you didn't ❤️ 
translation: I'm not crying in the self-service car wash in my rosati' 
ground_truth: No.
3, 
src: I miss walking up 3 flights of stairs for class and having to catch my breath in the bathroom 😩 
translation: I miss walking up 3 

In [24]:
src = "Look at you, finishing all your snacks before dinner. What a healthy choice!"
truth = "Eating snacks before dinner is not a good decision for your health."
print(f"{i}, \nsrc: {src} \ntranslation: {inference(src)} \nground_truth: {truth}")


4, 
src: Look at you, finishing all your snacks before dinner. What a healthy choice! 
translation: I would say that eating snacks before dinner is not healthy. 
ground_truth: Eating snacks before dinner is not a good decision for your health.


### Evaluate on GPT-4o-mini pairs

In [25]:
df2 = load_data("GPT-4o-mini")
df2['Input'] = df2['Sarcastic'].apply(add_prefix)

In [26]:
gpt_pairs = SarcasmTranslationDataset(tokenize_data(df2))

In [27]:
trainer_temp = Trainer(
    model=model,
    args=training_args,
    eval_dataset=gpt_pairs,
    tokenizer=tokenizer,
    # compute_metrics=compute_metrics,
)

In [28]:
eval_results = trainer.evaluate()
print(eval_results)

{'eval_loss': 0.7217645049095154, 'eval_model_preparation_time': 0.0128, 'eval_runtime': 1.2685, 'eval_samples_per_second': 68.584, 'eval_steps_per_second': 8.672}
