# Fine-tune LLMs to do Sarcasm interpretations

In [1]:
from google.colab import drive
drive.mount('/content/drive')
import os
os.chdir('/content/drive/MyDrive/Context_Aware')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
#model_choice = 'gpt2'
model_choice = 'flan-t5-base'
# model_choice = 't5-base'


In [3]:
mode = 'train'
#mode = 'evaluate'

In [4]:
#dataset_ = 'iSarcasm'
# dataset_ = 'GPT-4o-mini'
dataset_ = 'combined_train_df'

## Load Model

### GPT-2 small

In [5]:
if model_choice == 'gpt2':
  from transformers import GPT2Tokenizer, GPT2LMHeadModel
  if mode == 'train':
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    model = GPT2LMHeadModel.from_pretrained('gpt2')
  else:
    tokenizer = GPT2Tokenizer.from_pretrained(f'./results/{model_choice}/my_model')
    model = GPT2LMHeadModel.from_pretrained(f'./results/{model_choice}/my_model')


### Google FLAN-T5-base

In [6]:
if model_choice == 'flan-t5-base':
  from transformers import T5Tokenizer, T5ForConditionalGeneration
  if mode == 'train':
    tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")
    model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base")
  else:
    tokenizer = T5Tokenizer.from_pretrained(f'./results/{model_choice}/my_model')
    model = T5ForConditionalGeneration.from_pretrained(f'./results/{model_choice}/my_model')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

### T5-base

In [7]:
if model_choice == 't5-base':
  from transformers import T5Tokenizer, T5ForConditionalGeneration
  if mode == 'train':
    tokenizer = T5Tokenizer.from_pretrained("t5-base")
    model = T5ForConditionalGeneration.from_pretrained("t5-base")
  else:
    tokenizer = T5Tokenizer.from_pretrained(f'./results/{model_choice}/my_model')
    model = T5ForConditionalGeneration.from_pretrained(f'./results/{model_choice}/my_model')


## Load Data

In [8]:
from sklearn.model_selection import train_test_split

import pandas as pd
def load_data(dataset):
  if dataset == 'iSarcasm':
    return pd.read_csv('iSarcasm_pairs.tsv', sep='\t')
  if dataset == 'GPT-4o-mini':
    return pd.read_csv('GPT_pairs.tsv', sep='\t')

  else:
    data = pd.read_csv('combined_df.tsv', sep='\t')
    data = data.drop(data.columns[0], axis=1)  # Drop the first column
    return data

df = load_data(dataset_)

In [9]:

df.head()

Unnamed: 0,Sarcastic,Translation
0,You know the wolves match is boring when you'r...,"This match isn't too interesting, we are using..."
1,How lovely! The same old complaints brought up...,Hearing the same complaints repeatedly is frus...
2,"Wow, you must be proud of your talent for poin...",Focusing on negatives can be frustrating.
3,It's soooo great that I've taken pain medicati...,"I took pain medicine for my back, but it still..."
4,"Yes, because what we really need is more burea...",I believe we do not need additional bureaucrat...


In [10]:
from sklearn.model_selection import train_test_split





train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)
valid_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)


In [11]:
add_prefix = lambda x: "Provide straightforward, literal translations for this sarcastic comment: " + x

train_df['Input'] = train_df['Sarcastic'].apply(add_prefix)
valid_df['Input'] = valid_df['Sarcastic'].apply(add_prefix)
test_df['Input'] = test_df['Sarcastic'].apply(add_prefix)


In [12]:
'''tokenizer.pad_token = tokenizer.eos_token

def tokenize_data(df):
    inputs = tokenizer(df['Input'].tolist(), padding=True, truncation=True, return_tensors="pt")
    targets = tokenizer(df['Translation'].tolist(), padding=True, truncation=True, return_tensors="pt")

    return {
        'input_ids': inputs['input_ids'],
        'attention_mask': inputs['attention_mask'],
        'labels': targets['input_ids'],
    }

# Tokenize train, validation, and test datasets
train_encodings = tokenize_data(train_df)
valid_encodings = tokenize_data(valid_df)
test_encodings = tokenize_data(test_df)'''


tokenizer.pad_token = tokenizer.eos_token

def tokenize_data(df):
    inputs = tokenizer(
        df['Input'].tolist(),
        padding="max_length",
        truncation=True,
        max_length=128,
        return_tensors="pt"
    )
    targets = tokenizer(
        df['Translation'].tolist(),
        padding="max_length",
        truncation=True,
        max_length=128,
        return_tensors="pt"
    )

    # Set padding tokens in targets to -100 to ignore them in loss calculation
    targets['input_ids'][targets['input_ids'] == tokenizer.pad_token_id] = -100

    return {
        'input_ids': inputs['input_ids'],
        'attention_mask': inputs['attention_mask'],
        'labels': targets['input_ids'],
    }

# Tokenize train, validation, and test datasets
train_encodings = tokenize_data(train_df)
valid_encodings = tokenize_data(valid_df)
test_encodings = tokenize_data(test_df)

In [13]:
import torch

class SarcasmTranslationDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        return item

    def __len__(self):
        return len(self.encodings['input_ids'])

# Create datasets
train_dataset = SarcasmTranslationDataset(train_encodings)
valid_dataset = SarcasmTranslationDataset(valid_encodings)
test_dataset = SarcasmTranslationDataset(test_encodings)

## Prepare Metrics

For colab, need to install additional packages (already in conda environment.yml)

In [14]:
!pip install datasets evaluate



In [15]:
!pip install rouge_score unbabel-comet #restart maybe needed



In [16]:
import evaluate

# Load the metrics
bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")
comet = evaluate.load("comet")  # Ensure COMET is installed and properly configured
chrf = evaluate.load("chrf")  # ChrF metric



Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.8.3.post1 to v2.4.0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../../../root/.cache/huggingface/hub/models--Unbabel--wmt22-comet-da/snapshots/371e9839ca4e213dde891b066cf3080f75ec7e72/checkpoints/model.ckpt`
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/core/saving.py:195: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']


In [17]:

def compute_metrics(pred):
    # Get predictions and labels
    predictions = pred.predictions[0]
    labels = pred.label_ids
    # if isinstance(predictions, list) and isinstance(predictions[0], list):
    #     predictions = [pred[0] for pred in predictions]

    # Decode predictions and labels
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # BLEU
    bleu_result = bleu.compute(predictions=decoded_preds, references=decoded_labels)

    # ChrF
    chrf_result = chrf.compute(predictions=decoded_preds, references=decoded_labels)

    # ROUGE
    rouge_result = rouge.compute(predictions=decoded_preds, references=decoded_labels)

    #COMET
    comet_result = comet.compute(predictions=decoded_preds, references=decoded_labels)



    # Combine the results, including all ROUGE scores
    metrics = {
        "bleu": bleu_result["bleu"],
        "chrf": chrf_result["score"],
        "rouge1": rouge_result["rouge1"],
        "rouge2": rouge_result["rouge2"],
        "rougeL": rouge_result["rougeL"],
        "rougeLsum": rouge_result.get("rougeLsum", None),
        "comet": comet_result.get("score", None),
    }

    return metrics

## Training

In [18]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

In [19]:
model.name_or_path

'google/flan-t5-base'

In [20]:
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback


'''device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Set up Training Arguments
training_args = TrainingArguments(
    output_dir=f'./results/{model_choice}',
    eval_strategy="epoch",
    learning_rate=3e-5,                # Lower learning rate for stability
    save_steps=5000,
    save_total_limit=2,                # Keep only recent 2 checkpoints
    per_device_train_batch_size=32,  # Reduced batch size for training
    per_device_eval_batch_size=32,  # Reduced batch size for evaluation
    num_train_epochs=10,                # Reduced epochs to avoid overfitting
    weight_decay=0.05,                 # Increase weight decay for regularization
    logging_dir='./logs',
    logging_steps=100,
    report_to="none",
    lr_scheduler_type="linear",        # Linear decay scheduler
    warmup_steps=500,                  # Warmup for initial stabilization
    load_best_model_at_end=True,  # Keep best model (early stopping benefit)
    save_strategy="epoch",
    # Gradient Accumulation
    gradient_accumulation_steps=2,  # Accumulate gradients over 2 steps
    # Evaluation strategy
    eval_accumulation_steps=2       # Accumulate eval results over 2 steps
)

# Early Stopping Callback
# Stop training if validation loss doesn’t improve after a set number of checks
#early_stopping = EarlyStoppingCallback(early_stopping_patience=2)

# Create Trainer instance with Early Stopping
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    #compute_metrics=compute_metrics,
    #callbacks=[early_stopping]         # Add early stopping callback
)'''

# Set training arguments
training_args = TrainingArguments(
    output_dir=f'./results/{model_choice}',
    eval_strategy="epoch",     # evaluation strategy to adopt during training
    learning_rate= 9e-5, #1e-4,#2e-5,              # learning rate
    save_steps=10000,
    save_total_limit=1,              # keep only the most recent checkpoint
    per_device_train_batch_size=8,   # batch size for training
    per_device_eval_batch_size=8,    # batch size for evaluation
    num_train_epochs=10,             # total number of training epochs
    weight_decay= 0.01, #0.05,                # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=100, #10,
    report_to = 'none',
    lr_scheduler_type= "linear", #"cosine",   #scheduler
    #warmup_steps=500,
)

# Create a Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    # compute_metrics=compute_metrics,  # Add compute_metrics if you have it defined
)




In [21]:
if mode == 'train':
  trainer.train()
  # Save the model
  model.save_pretrained(f'./results/{model_choice}/my_model')
  tokenizer.save_pretrained(f'./results/{model_choice}/my_model')

Epoch,Training Loss,Validation Loss
1,2.2579,2.088128
2,1.9348,2.041402
3,1.7677,2.024657
4,1.5632,2.04517
5,1.4113,2.075633
6,1.3223,2.10992
7,1.2475,2.155303
8,1.1525,2.182495
9,1.1164,2.212116
10,1.043,2.217936


In [22]:
'''# hyperparameter_tuning.py
!pip install ray
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback
from transformers import AutoModelForCausalLM, AutoTokenizer
from ray import tune
import os


# Function to define the model and training configuration
def train_model(config, checkpoint_dir=None):
    model = AutoModelForCausalLM.from_pretrained(model_choice)
    # ... (your code to adjust dropout rates if needed)

    training_args = TrainingArguments(
        output_dir=os.path.join(checkpoint_dir, "results"),
        eval_strategy="epoch",
        learning_rate=config["learning_rate"],
        weight_decay=config["weight_decay"],
        per_device_train_batch_size=config["per_device_train_batch_size"],
        # ... (other training arguments)
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=valid_dataset,
        compute_metrics=compute_metrics,
    )

    # Train and evaluate the model
    train_results = trainer.train(resume_from_checkpoint=checkpoint_dir)
    eval_results = trainer.evaluate()

    # Report the evaluation results to Ray Tune
    tune.report(eval_loss=eval_results["eval_loss"])


# Define the hyperparameter search space
search_space = {
    "learning_rate": tune.loguniform(1e-6, 1e-4),
    "weight_decay": tune.uniform(0.0, 0.3),
    "per_device_train_batch_size": tune.choice([8, 16, 32]),
}

# Configure and run the hyperparameter search
# Replace `local_dir` with `storage_path`
analysis = tune.run(
    tune.with_parameters(train_model),
    config=search_space,
    num_samples=10,  # Number of trials
    resources_per_trial={"cpu": 4, "gpu": 1},  # Adjust resources
    storage_path="./ray_results",  # Directory to store results # Changed to storage_path
)

# Print the best hyperparameters
print("Best hyperparameters:", analysis.best_config)'''

'# hyperparameter_tuning.py\n!pip install ray\nfrom transformers import Trainer, TrainingArguments, EarlyStoppingCallback\nfrom transformers import AutoModelForCausalLM, AutoTokenizer\nfrom ray import tune\nimport os\n\n\n# Function to define the model and training configuration\ndef train_model(config, checkpoint_dir=None):\n    model = AutoModelForCausalLM.from_pretrained(model_choice)\n    # ... (your code to adjust dropout rates if needed)\n\n    training_args = TrainingArguments(\n        output_dir=os.path.join(checkpoint_dir, "results"),\n        eval_strategy="epoch",\n        learning_rate=config["learning_rate"],\n        weight_decay=config["weight_decay"],\n        per_device_train_batch_size=config["per_device_train_batch_size"],\n        # ... (other training arguments)\n    )\n\n    trainer = Trainer(\n        model=model,\n        args=training_args,\n        train_dataset=train_dataset,\n        eval_dataset=valid_dataset,\n        compute_metrics=compute_metrics,\n   

In [23]:
# Evaluate the model
eval_results = trainer.evaluate()
print(eval_results)

{'eval_loss': 2.2179362773895264, 'eval_runtime': 6.4898, 'eval_samples_per_second': 47.459, 'eval_steps_per_second': 6.009, 'epoch': 10.0}


In [24]:
'''def inference(input_text):
  if not input_text.startswith("Provide straightforward, literal translations for this sarcastic comment: "):
    input_text = "Provide straightforward, literal translations for this sarcastic comment: " + input_text
  # input_text = "Provide straightforward, literal translations for this sarcastic comment: I just absolutely LOVE how I've got to work outside for the next 3 days in the heatwave."

  input_ids = tokenizer(input_text, return_tensors='pt').input_ids.to(device)
  #output_ids = model.generate(input_ids)
  #output_ids = model.generate(input_ids, max_length=128)
  output_ids = model.generate(input_ids, max_length=32, early_stopping=True, num_beams=5)  # Adjust parameters
  decoded_output = tokenizer.decode(output_ids[0], skip_special_tokens=True)

  return decoded_output'''

In [32]:
def inference(input_text):
    # Clear prompt for the model
    prompt = "Rewrite this sarcastic comment as a factual statement: "
    if not input_text.startswith(prompt):
        input_text = prompt + input_text

    # Tokenize with padding and attention mask
    inputs = tokenizer(input_text, return_tensors='pt', padding=True, truncation=True).to(device)
    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']

    # Generate with increased diversity
    output_ids = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=64,
        early_stopping=True,
        num_beams=5,              # Beam search for best results
        temperature=0.7,           # Lower temperature for diversity
        top_k=50,                  # Top-k sampling
        pad_token_id=tokenizer.eos_token_id
    )

    # Decode the output, removing the instruction if it is repeated
    decoded_output = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    if decoded_output.startswith(prompt):
        decoded_output = decoded_output[len(prompt):].strip()

    return decoded_output


In [26]:
# Either during training or before calling inference
for i in range(5):
    print(f"{i}, \nsrc: {test_df['Sarcastic'].iloc[i]} \ntranslation: {inference(test_df['Sarcastic'].iloc[i])} \nground_truth: {test_df['Translation'].iloc[i]}")



0, 
src: Isn't this exciting? Navigating through misinformation! 
translation: <pad> Misinformation is frustrating. 
ground_truth: Misinformation is frustrating.
1, 
src: How great! An office party that feels more like a chore! 
translation: <pad> The office party is not enjoyable. 
ground_truth: Office parties can feel obligatory.
2, 
src: Men will literally fake an injury on the football field instead of going to therapy 
translation: <pad> Men will literally fake an injury on the football field instead of going to therapy. It's embarrassing. 
ground_truth: Men should go to therapy!!!
3, 
src: Oh great! Each meeting seems to teach me nothing new. 
translation: <pad> Meetings often lack valuable information. 
ground_truth: Meetings are often uninformative.
4, 
src: Wow, your pajamas at the meeting are really making a statement! 
translation: <pad> Your pajamas at the meeting are uncomfortable. 
ground_truth: Wearing pajamas at a meeting is inappropriate.


In [27]:
src = "Look at you, finishing all your snacks before dinner. What a healthy choice!"
truth = "Eating snacks before dinner is not a good decision for your health."
print(f"{i}, \nsrc: {src} \ntranslation: {inference(src)} \nground_truth: {truth}")


4, 
src: Look at you, finishing all your snacks before dinner. What a healthy choice! 
translation: <pad> Ending snacks before dinner is not a good choice for your health. 
ground_truth: Eating snacks before dinner is not a good decision for your health.


### Evaluate on GPT-4o-mini pairs

In [28]:
df2 = load_data("GPT-4o-mini")
df2['Input'] = df2['Sarcastic'].apply(add_prefix)

In [29]:
gpt_pairs = SarcasmTranslationDataset(tokenize_data(df2))

In [30]:
trainer_temp = Trainer(
    model=model,
    args=training_args,
    eval_dataset=gpt_pairs,
    tokenizer=tokenizer,
    # compute_metrics=compute_metrics,
)

In [31]:
eval_results = trainer.evaluate()
print(eval_results)

{'eval_loss': 2.2179362773895264, 'eval_runtime': 6.5752, 'eval_samples_per_second': 46.843, 'eval_steps_per_second': 5.931, 'epoch': 10.0}
