# Fine-tune LLMs to do Sarcasm interpretations

In [1]:
from google.colab import drive
drive.mount('/content/drive')
import os
os.chdir('/content/drive/MyDrive/Context_Aware')

Mounted at /content/drive


In [2]:
model_choice = 'gpt2'
#model_choice = 'flan-t5-base'
# model_choice = 't5-base'


In [3]:
mode = 'train'
#mode = 'evaluate'

In [4]:
#dataset_ = 'iSarcasm'
# dataset_ = 'GPT-4o-mini'
dataset_ = 'combined_train_df'

## Load Model

### GPT-2 small

In [5]:
if model_choice == 'gpt2':
  from transformers import GPT2Tokenizer, GPT2LMHeadModel
  if mode == 'train':
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    model = GPT2LMHeadModel.from_pretrained('gpt2')
  else:
    tokenizer = GPT2Tokenizer.from_pretrained(f'./results/{model_choice}/my_model')
    model = GPT2LMHeadModel.from_pretrained(f'./results/{model_choice}/my_model')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

### Google FLAN-T5-base

In [6]:
if model_choice == 'flan-t5-base':
  from transformers import T5Tokenizer, T5ForConditionalGeneration
  if mode == 'train':
    tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")
    model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base")
  else:
    tokenizer = T5Tokenizer.from_pretrained(f'./results/{model_choice}/my_model')
    model = T5ForConditionalGeneration.from_pretrained(f'./results/{model_choice}/my_model')


### T5-base

In [7]:
if model_choice == 't5-base':
  from transformers import T5Tokenizer, T5ForConditionalGeneration
  if mode == 'train':
    tokenizer = T5Tokenizer.from_pretrained("t5-base")
    model = T5ForConditionalGeneration.from_pretrained("t5-base")
  else:
    tokenizer = T5Tokenizer.from_pretrained(f'./results/{model_choice}/my_model')
    model = T5ForConditionalGeneration.from_pretrained(f'./results/{model_choice}/my_model')


## Load Data

In [8]:
from sklearn.model_selection import train_test_split

import pandas as pd
def load_data(dataset):
  if dataset == 'iSarcasm':
    return pd.read_csv('iSarcasm_pairs.tsv', sep='\t')
  if dataset == 'GPT-4o-mini':
    return pd.read_csv('GPT_pairs.tsv', sep='\t')

  else:
    data = pd.read_csv('combined_df.tsv', sep='\t')
    data = data.drop(data.columns[0], axis=1)  # Drop the first column
    return data

df = load_data(dataset_)

In [9]:

df.head()

Unnamed: 0,Sarcastic,Translation
0,You know the wolves match is boring when you'r...,"This match isn't too interesting, we are using..."
1,How lovely! The same old complaints brought up...,Hearing the same complaints repeatedly is frus...
2,"Wow, you must be proud of your talent for poin...",Focusing on negatives can be frustrating.
3,It's soooo great that I've taken pain medicati...,"I took pain medicine for my back, but it still..."
4,"Yes, because what we really need is more burea...",I believe we do not need additional bureaucrat...


In [10]:
from sklearn.model_selection import train_test_split





train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)
valid_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)


In [11]:
add_prefix = lambda x: "Provide straightforward, literal translations for this sarcastic comment: " + x

train_df['Input'] = train_df['Sarcastic'].apply(add_prefix)
valid_df['Input'] = valid_df['Sarcastic'].apply(add_prefix)
test_df['Input'] = test_df['Sarcastic'].apply(add_prefix)


In [12]:
'''tokenizer.pad_token = tokenizer.eos_token

def tokenize_data(df):
    inputs = tokenizer(df['Input'].tolist(), padding=True, truncation=True, return_tensors="pt")
    targets = tokenizer(df['Translation'].tolist(), padding=True, truncation=True, return_tensors="pt")

    return {
        'input_ids': inputs['input_ids'],
        'attention_mask': inputs['attention_mask'],
        'labels': targets['input_ids'],
    }

# Tokenize train, validation, and test datasets
train_encodings = tokenize_data(train_df)
valid_encodings = tokenize_data(valid_df)
test_encodings = tokenize_data(test_df)'''


tokenizer.pad_token = tokenizer.eos_token

def tokenize_data(df):
    inputs = tokenizer(
        df['Input'].tolist(),
        padding="max_length",
        truncation=True,
        max_length=128,
        return_tensors="pt"
    )
    targets = tokenizer(
        df['Translation'].tolist(),
        padding="max_length",
        truncation=True,
        max_length=128,
        return_tensors="pt"
    )

    # Set padding tokens in targets to -100 to ignore them in loss calculation
    targets['input_ids'][targets['input_ids'] == tokenizer.pad_token_id] = -100

    return {
        'input_ids': inputs['input_ids'],
        'attention_mask': inputs['attention_mask'],
        'labels': targets['input_ids'],
    }

# Tokenize train, validation, and test datasets
train_encodings = tokenize_data(train_df)
valid_encodings = tokenize_data(valid_df)
test_encodings = tokenize_data(test_df)

In [13]:
import torch

class SarcasmTranslationDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        return item

    def __len__(self):
        return len(self.encodings['input_ids'])

# Create datasets
train_dataset = SarcasmTranslationDataset(train_encodings)
valid_dataset = SarcasmTranslationDataset(valid_encodings)
test_dataset = SarcasmTranslationDataset(test_encodings)

## Prepare Metrics

For colab, need to install additional packages (already in conda environment.yml)

In [14]:
!pip install datasets evaluate

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m30.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m8

In [15]:
!pip install rouge_score unbabel-comet #restart maybe needed

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting unbabel-comet
  Downloading unbabel_comet-2.2.2-py3-none-any.whl.metadata (15 kB)
Collecting entmax<2.0,>=1.1 (from unbabel-comet)
  Downloading entmax-1.3-py3-none-any.whl.metadata (348 bytes)
Collecting jsonargparse==3.13.1 (from unbabel-comet)
  Downloading jsonargparse-3.13.1-py3-none-any.whl.metadata (55 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.5/55.5 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
Collecting protobuf<5.0.0,>=4.24.4 (from unbabel-comet)
  Downloading protobuf-4.25.5-cp37-abi3-manylinux2014_x86_64.whl.metadata (541 bytes)
Collecting pytorch-lightning<3.0.0,>=2.0.0 (from unbabel-comet)
  Downloading pytorch_lightning-2.4.0-py3-none-any.whl.metadata (21 kB)
Collecting sacrebleu<3.0.0,>=2.0.0 (from unbabel-comet)
  Downloading sacrebleu-2.4.3-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━

In [16]:
import evaluate

# Load the metrics
bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")
comet = evaluate.load("comet")  # Ensure COMET is installed and properly configured
chrf = evaluate.load("chrf")  # ChrF metric



Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.97k [00:00<?, ?B/s]

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

LICENSE:   0%|          | 0.00/9.69k [00:00<?, ?B/s]

hparams.yaml:   0%|          | 0.00/567 [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

model.ckpt:   0%|          | 0.00/2.32G [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.53k [00:00<?, ?B/s]

INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.8.3.post1 to v2.4.0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../../../root/.cache/huggingface/hub/models--Unbabel--wmt22-comet-da/snapshots/371e9839ca4e213dde891b066cf3080f75ec7e72/checkpoints/model.ckpt`


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/core/saving.py:195: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']


Downloading builder script:   0%|          | 0.00/9.01k [00:00<?, ?B/s]

In [17]:

def compute_metrics(pred):
    # Get predictions and labels
    predictions = pred.predictions[0]
    labels = pred.label_ids
    # if isinstance(predictions, list) and isinstance(predictions[0], list):
    #     predictions = [pred[0] for pred in predictions]

    # Decode predictions and labels
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # BLEU
    bleu_result = bleu.compute(predictions=decoded_preds, references=decoded_labels)

    # ChrF
    chrf_result = chrf.compute(predictions=decoded_preds, references=decoded_labels)

    # ROUGE
    rouge_result = rouge.compute(predictions=decoded_preds, references=decoded_labels)

    #COMET
    comet_result = comet.compute(predictions=decoded_preds, references=decoded_labels)



    # Combine the results, including all ROUGE scores
    metrics = {
        "bleu": bleu_result["bleu"],
        "chrf": chrf_result["score"],
        "rouge1": rouge_result["rouge1"],
        "rouge2": rouge_result["rouge2"],
        "rougeL": rouge_result["rougeL"],
        "rougeLsum": rouge_result.get("rougeLsum", None),
        "comet": comet_result.get("score", None),
    }

    return metrics

## Training

In [18]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

In [19]:
model.name_or_path

'gpt2'

In [24]:
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback


'''device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Set up Training Arguments
training_args = TrainingArguments(
    output_dir=f'./results/{model_choice}',
    eval_strategy="epoch",
    learning_rate=3e-5,                # Lower learning rate for stability
    save_steps=5000,
    save_total_limit=2,                # Keep only recent 2 checkpoints
    per_device_train_batch_size=32,  # Reduced batch size for training
    per_device_eval_batch_size=32,  # Reduced batch size for evaluation
    num_train_epochs=10,                # Reduced epochs to avoid overfitting
    weight_decay=0.05,                 # Increase weight decay for regularization
    logging_dir='./logs',
    logging_steps=100,
    report_to="none",
    lr_scheduler_type="linear",        # Linear decay scheduler
    warmup_steps=500,                  # Warmup for initial stabilization
    load_best_model_at_end=True,  # Keep best model (early stopping benefit)
    save_strategy="epoch",
    # Gradient Accumulation
    gradient_accumulation_steps=2,  # Accumulate gradients over 2 steps
    # Evaluation strategy
    eval_accumulation_steps=2       # Accumulate eval results over 2 steps
)

# Early Stopping Callback
# Stop training if validation loss doesn’t improve after a set number of checks
#early_stopping = EarlyStoppingCallback(early_stopping_patience=2)

# Create Trainer instance with Early Stopping
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    #compute_metrics=compute_metrics,
    #callbacks=[early_stopping]         # Add early stopping callback
)'''

# Set training arguments
training_args = TrainingArguments(
    output_dir=f'./results/{model_choice}',
    eval_strategy="epoch",     # evaluation strategy to adopt during training
    learning_rate= 9e-5, #1e-4,#2e-5,              # learning rate
    save_steps=10000,
    save_total_limit=1,              # keep only the most recent checkpoint
    per_device_train_batch_size=8,   # batch size for training
    per_device_eval_batch_size=8,    # batch size for evaluation
    num_train_epochs=10,             # total number of training epochs
    weight_decay= 0.01, #0.05,                # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=100, #10,
    report_to = 'none',
    lr_scheduler_type= "linear", #"cosine",   #scheduler
    #warmup_steps=500,
)

# Create a Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    # compute_metrics=compute_metrics,  # Add compute_metrics if you have it defined
)




In [25]:
if mode == 'train':
  trainer.train()
  # Save the model
  model.save_pretrained(f'./results/{model_choice}/my_model')
  tokenizer.save_pretrained(f'./results/{model_choice}/my_model')

Epoch,Training Loss,Validation Loss
1,6.6015,6.601955
2,6.2447,6.650765
3,6.1452,6.728287
4,5.96,6.859212
5,5.8123,7.043456
6,5.573,7.294777
7,5.4002,7.459167
8,5.3094,7.593894
9,5.2056,7.666606
10,5.19,7.701035


In [None]:
'''# hyperparameter_tuning.py
!pip install ray
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback
from transformers import AutoModelForCausalLM, AutoTokenizer
from ray import tune
import os


# Function to define the model and training configuration
def train_model(config, checkpoint_dir=None):
    model = AutoModelForCausalLM.from_pretrained(model_choice)
    # ... (your code to adjust dropout rates if needed)

    training_args = TrainingArguments(
        output_dir=os.path.join(checkpoint_dir, "results"),
        eval_strategy="epoch",
        learning_rate=config["learning_rate"],
        weight_decay=config["weight_decay"],
        per_device_train_batch_size=config["per_device_train_batch_size"],
        # ... (other training arguments)
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=valid_dataset,
        compute_metrics=compute_metrics,
    )

    # Train and evaluate the model
    train_results = trainer.train(resume_from_checkpoint=checkpoint_dir)
    eval_results = trainer.evaluate()

    # Report the evaluation results to Ray Tune
    tune.report(eval_loss=eval_results["eval_loss"])


# Define the hyperparameter search space
search_space = {
    "learning_rate": tune.loguniform(1e-6, 1e-4),
    "weight_decay": tune.uniform(0.0, 0.3),
    "per_device_train_batch_size": tune.choice([8, 16, 32]),
}

# Configure and run the hyperparameter search
# Replace `local_dir` with `storage_path`
analysis = tune.run(
    tune.with_parameters(train_model),
    config=search_space,
    num_samples=10,  # Number of trials
    resources_per_trial={"cpu": 4, "gpu": 1},  # Adjust resources
    storage_path="./ray_results",  # Directory to store results # Changed to storage_path
)

# Print the best hyperparameters
print("Best hyperparameters:", analysis.best_config)'''

In [26]:
# Evaluate the model
eval_results = trainer.evaluate()
print(eval_results)

{'eval_loss': 7.701035499572754, 'eval_runtime': 3.1645, 'eval_samples_per_second': 97.331, 'eval_steps_per_second': 12.324, 'epoch': 10.0}


In [33]:
def inference(input_text):
  if not input_text.startswith("Provide straightforward, literal translations for this sarcastic comment: "):
    input_text = "Provide straightforward, literal translations for this sarcastic comment: " + input_text
  # input_text = "Provide straightforward, literal translations for this sarcastic comment: I just absolutely LOVE how I've got to work outside for the next 3 days in the heatwave."

  input_ids = tokenizer(input_text, return_tensors='pt').input_ids.to(device)
  #output_ids = model.generate(input_ids)
  #output_ids = model.generate(input_ids, max_length=128)
  output_ids = model.generate(input_ids, max_length=32, early_stopping=True, num_beams=5)  # Adjust parameters
  decoded_output = tokenizer.decode(output_ids[0], skip_special_tokens=True)

  return decoded_output

In [29]:
def inference(input_text):
    # Clear prompt for the model
    prompt = "Rewrite this sarcastic comment as a factual statement: "
    if not input_text.startswith(prompt):
        input_text = prompt + input_text

    # Tokenize with padding and attention mask
    inputs = tokenizer(input_text, return_tensors='pt', padding=True, truncation=True).to(device)
    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']

    # Generate with increased diversity
    output_ids = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=64,
        early_stopping=True,
        num_beams=5,              # Beam search for best results
        temperature=0.7,           # Lower temperature for diversity
        top_k=50,                  # Top-k sampling
        pad_token_id=tokenizer.eos_token_id
    )

    # Decode the output, removing the instruction if it is repeated
    decoded_output = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    if decoded_output.startswith(prompt):
        decoded_output = decoded_output[len(prompt):].strip()

    return decoded_output


In [34]:
# Either during training or before calling inference
for i in range(5):
    print(f"{i}, \nsrc: {test_df['Sarcastic'].iloc[i]} \ntranslation: {inference(test_df['Sarcastic'].iloc[i])} \nground_truth: {test_df['Translation'].iloc[i]}")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0, 
src: Isn't this exciting? Navigating through misinformation! 
translation: Provide straightforward, literal translations for this sarcastic comment: Isn't this exciting? Navigating through misinformation! up. down. roads I. I... 
ground_truth: Misinformation is frustrating.
1, 
src: How great! An office party that feels more like a chore! 
translation: Provide straightforward, literal translations for this sarcastic comment: How great! An office party that feels more like a chore!........ 
ground_truth: Office parties can feel obligatory.


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


2, 
src: Men will literally fake an injury on the football field instead of going to therapy 
translation: Provide straightforward, literal translations for this sarcastic comment: Men will literally fake an injury on the football field instead of going to therapy."..... 
ground_truth: Men should go to therapy!!!
3, 
src: Oh great! Each meeting seems to teach me nothing new. 
translation: Provide straightforward, literal translations for this sarcastic comment: Oh great! Each meeting seems to teach me nothing new.......... 
ground_truth: Meetings are often uninformative.
4, 
src: Wow, your pajamas at the meeting are really making a statement! 
translation: Provide straightforward, literal translations for this sarcastic comment: Wow, your pajamas at the meeting are really making a statement!  ?  
ground_truth: Wearing pajamas at a meeting is inappropriate.


In [35]:
src = "Look at you, finishing all your snacks before dinner. What a healthy choice!"
truth = "Eating snacks before dinner is not a good decision for your health."
print(f"{i}, \nsrc: {src} \ntranslation: {inference(src)} \nground_truth: {truth}")


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


4, 
src: Look at you, finishing all your snacks before dinner. What a healthy choice! 
translation: Provide straightforward, literal translations for this sarcastic comment: Look at you, finishing all your snacks before dinner. What a healthy choice!..... 
ground_truth: Eating snacks before dinner is not a good decision for your health.


### Evaluate on GPT-4o-mini pairs

In [None]:
df2 = load_data("GPT-4o-mini")
df2['Input'] = df2['Sarcastic'].apply(add_prefix)

In [None]:
gpt_pairs = SarcasmTranslationDataset(tokenize_data(df2))

In [None]:
trainer_temp = Trainer(
    model=model,
    args=training_args,
    eval_dataset=gpt_pairs,
    tokenizer=tokenizer,
    # compute_metrics=compute_metrics,
)

In [None]:
eval_results = trainer.evaluate()
print(eval_results)