# Fine-tune LLMs to do Sarcasm interpretations

In [1]:
!pip install nltk comet-ml emoji unbabel-comet datasets evaluate rouge_score

Collecting comet-ml
  Downloading comet_ml-3.47.2-py3-none-any.whl.metadata (3.9 kB)
Collecting emoji
  Downloading emoji-2.14.0-py3-none-any.whl.metadata (5.7 kB)
Collecting unbabel-comet
  Downloading unbabel_comet-2.2.2-py3-none-any.whl.metadata (15 kB)
Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting everett<3.2.0,>=1.0.1 (from everett[ini]<3.2.0,>=1.0.1->comet-ml)
  Downloading everett-3.1.0-py2.py3-none-any.whl.metadata (17 kB)
Collecting python-box<7.0.0 (from comet-ml)
  Downloading python_box-6.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.8 kB)
Collecting semantic-version>=2.8.0 (from comet-ml)
  Downloading semantic_version-2.10.0-py2.py3-none-any.whl.metadata (9.7 kB)
Collecting simplejson (from co

In [2]:
from google.colab import drive
import os

# Mount Google Drive
drive.mount('/content/drive')

# Set the target directory path
target_dir = '/content/drive/MyDrive/SarcasmNLP'

# Create the directory if it doesn't exist
if not os.path.exists(target_dir):
    os.makedirs(target_dir)

# Change the working directory to the target directory
os.chdir(target_dir)

print(f"Current working directory: {os.getcwd()}")

Mounted at /content/drive
Current working directory: /content/drive/MyDrive/SarcasmNLP


In [3]:
# model_choice = 'gpt2'
model_choice = 'flan-t5-base'
# model_choice = 't5-base'
classifier_model_choice = 'bert-base-uncased'


In [4]:
mode = 'train'
# mode = 'evaluate'

In [5]:
#dataset_ = 'iSarcasm'
# dataset_ = 'GPT-4o-mini'
dataset_ = 'combined_train_df'

## Load Model

### Classfication Model: bert-base-uncased

In [6]:
# initialize tokenizer and model for Sarcasm Detection
from transformers import BertTokenizer, BertForSequenceClassification

if mode == 'train':
  classifier_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
  classifier_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
else:
  classifier_tokenizer = BertTokenizer.from_pretrained(f'./results/{classifier_model_choice}/my_model')
  classifier_model = BertForSequenceClassification.from_pretrained(f'./results/{classifier_model_choice}/my_model')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Interpretation Models:

### GPT-2 small

In [7]:
# initialize tokenizer and model for Sarcasm Interpretation using GPT-2
if model_choice == 'gpt2':
  from transformers import GPT2Tokenizer, GPT2LMHeadModel
  if mode == 'train':
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    model = GPT2LMHeadModel.from_pretrained('gpt2')
    tokenizer.pad_token = tokenizer.eos_token
  else:
    tokenizer = GPT2Tokenizer.from_pretrained(f'./results/{model_choice}/my_model')
    model = GPT2LMHeadModel.from_pretrained(f'./results/{model_choice}/my_model')


### Google FLAN-T5-base

In [8]:
# initialize tokenizer and model for Sarcasm Interpretation using flan-t5-base
if model_choice == 'flan-t5-base':
  from transformers import T5Tokenizer, T5ForConditionalGeneration
  if mode == 'train':
    tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")
    model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base")
  else:
    tokenizer = T5Tokenizer.from_pretrained(f'./results/{model_choice}/my_model')
    model = T5ForConditionalGeneration.from_pretrained(f'./results/{model_choice}/my_model')


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

### T5-base

In [9]:
# initialize tokenizer and model for Sarcasm Interpretation using t5-base
if model_choice == 't5-base':
  from transformers import T5Tokenizer, T5ForConditionalGeneration
  if mode == 'train':
    tokenizer = T5Tokenizer.from_pretrained("t5-base")
    model = T5ForConditionalGeneration.from_pretrained("t5-base")
  else:
    tokenizer = T5Tokenizer.from_pretrained(f'./results/{model_choice}/my_model')
    model = T5ForConditionalGeneration.from_pretrained(f'./results/{model_choice}/my_model')


## Load Data

In [10]:
# loading combined_df dataset = isarcasm + gpt_pairs
import pandas as pd
def load_data():
  dataset = pd.read_csv('combined_df.tsv', sep='\t')
  evaluation_dataset = pd.read_csv('iSarcasm_pairs_test.tsv', sep='\t')
  dataset['Translation'] = dataset['Translation'].fillna('')
  evaluation_dataset['Translation'] = evaluation_dataset['Translation'].fillna('')
  return dataset, evaluation_dataset

df, df_eval = load_data()

In [11]:
df.head()

Unnamed: 0,Sarcastic,Translation,IsSarcastic
0,You know the wolves match is boring when you'r...,"This match isn't too interesting, we are using...",1
1,How lovely! The same old complaints brought up...,Hearing the same complaints repeatedly is frus...,1
2,"Wow, you must be proud of your talent for poin...",Focusing on negatives can be frustrating.,1
3,It's soooo great that I've taken pain medicati...,"I took pain medicine for my back, but it still...",1
4,"Yes, because what we really need is more burea...",I believe we do not need additional bureaucrat...,1


### Classification model:

### Intitialization

In [12]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [13]:
# train,test and validation dataset splitting
from sklearn.model_selection import train_test_split

train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)
valid_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)


In [14]:
# Function to convert emojis to text, handling float values
import emoji

def convert_emojis(text):
    # Check if text is a float (potentially NaN) and convert to string
    if isinstance(text, float):
        text = str(text)
    return emoji.demojize(text, delimiters=(" ", " "))

# Apply emoji conversion to both input (sarcastic) and output (literal) text
train_df['Sarcastic'] = train_df['Sarcastic'].apply(convert_emojis)
valid_df['Sarcastic'] = valid_df['Sarcastic'].apply(convert_emojis)
test_df['Sarcastic'] = test_df['Sarcastic'].apply(convert_emojis)



In [15]:
# source text encoding with selecetd classifier tokenizer
def encode_texts(texts, targets=None, tokenizer=None):
    if targets is None:  # For single input (sarcasm detection)
        return classifier_tokenizer(
            texts.tolist(),
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors="pt"
        )

In [16]:
import torch
from torch.utils.data import Dataset

# Custom dataset class for sarcasm classification
class SarcasmClassificationDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = encode_texts(texts.astype(str))
        self.labels = torch.tensor(labels.values)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            'input_ids': self.texts['input_ids'][idx],
            'attention_mask': self.texts['attention_mask'][idx],
            'labels': self.labels[idx]
        }

# Create datasets
train_dataset = SarcasmClassificationDataset(train_df['Sarcastic'], train_df['IsSarcastic'])
valid_dataset = SarcasmClassificationDataset(valid_df['Sarcastic'], valid_df['IsSarcastic'])
test_dataset = SarcasmClassificationDataset(test_df['Sarcastic'], test_df['IsSarcastic'])

### Training of Sarcasm Detection model

In [17]:
# metric computation for sarcasm detection
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np

def compute_metrics(pred):
    # Extract predictions and labels
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)

    # Calculate metrics
    accuracy = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')

    # Return as dictionary
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

In [18]:
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback

# Training args for classification/detection model
training_args = TrainingArguments(
    output_dir=f'./results/{classifier_model_choice}',
    num_train_epochs=4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    eval_strategy="epoch",
    logging_dir='./logs',
    report_to="none",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    weight_decay=0.01,
    learning_rate=2e-5,
    save_strategy="epoch"
)

# Initialize Trainer for sarcasm classification
trainer = Trainer(
    model=classifier_model.to(device),
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
    compute_metrics=compute_metrics       # Metrics function
)

In [19]:
if mode == 'train':
  # Train the sarcasm classifier
  trainer.train()
  # Save the model
  classifier_model.save_pretrained(f'./results/{classifier_model_choice}/my_model')
  classifier_tokenizer.save_pretrained(f'./results/{classifier_model_choice}/my_model')

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.000161,1.0,1.0,1.0,1.0
2,0.011300,6.9e-05,1.0,1.0,1.0,1.0
3,0.011300,4.7e-05,1.0,1.0,1.0,1.0
4,0.000100,4.1e-05,1.0,1.0,1.0,1.0


In [20]:
# Evaluate the model
eval_results = trainer.evaluate()
print(eval_results)

{'eval_loss': 4.112016540602781e-05, 'eval_accuracy': 1.0, 'eval_precision': 1.0, 'eval_recall': 1.0, 'eval_f1': 1.0, 'eval_runtime': 1.4613, 'eval_samples_per_second': 210.768, 'eval_steps_per_second': 26.688, 'epoch': 4.0}


### Interpretation Model:

### Initialization

In [21]:
# Data loading for Interpretation model, only using the sarcastic statements
df, df_eval = load_data()
df = df[df['IsSarcastic'] == 1]
print(df.shape)

(3080, 3)


In [22]:
# data splitting
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)
valid_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

In [23]:
# prefix added for prompt
add_prefix = lambda x: "Provide straightforward, literal translations for this sarcastic comment: " + str(x)

train_df['Input'] = train_df['Sarcastic'].apply(add_prefix)
valid_df['Input'] = valid_df['Sarcastic'].apply(add_prefix)
test_df['Input'] = test_df['Sarcastic'].apply(add_prefix)


In [24]:
# handling null Translation values
train_df['Translation'] = train_df['Translation'].fillna("")
valid_df['Translation'] = valid_df['Translation'].fillna("")
test_df['Translation'] = test_df['Translation'].fillna("")

# text tokenization with selected model tokenizer
def tokenize_data(df):
    inputs = tokenizer(df['Input'].tolist(), padding=True, truncation=True, return_tensors="pt")
    targets = tokenizer(df['Translation'].tolist(), padding=True, truncation=True, return_tensors="pt")

    return {
        'input_ids': inputs['input_ids'],
        'attention_mask': inputs['attention_mask'],
        'labels': targets['input_ids'],
    }

# Tokenize train, validation, and test datasets
train_encodings = tokenize_data(train_df)
valid_encodings = tokenize_data(valid_df)
test_encodings = tokenize_data(test_df)

In [25]:
import torch
# custom Dataset initialization
class SarcasmTranslationDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        return item

    def __len__(self):
        return len(self.encodings['input_ids'])

# Create datasets
train_dataset = SarcasmTranslationDataset(train_encodings)
valid_dataset = SarcasmTranslationDataset(valid_encodings)
test_dataset = SarcasmTranslationDataset(test_encodings)

In [26]:
import torch
# custom Dataset initialization
class SarcasmTranslationDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, sources):
        self.encodings = encodings
        self.sources = sources

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['source'] = self.sources[idx]
        return item

    def __len__(self):
        return len(self.encodings['input_ids'])

# Create datasets
train_dataset = SarcasmTranslationDataset(train_encodings, list(train_df['Sarcastic']))
valid_dataset = SarcasmTranslationDataset(valid_encodings, list(valid_df['Sarcastic']))
test_dataset = SarcasmTranslationDataset(test_encodings, list(test_df['Sarcastic']))

## Prepare Metrics

For colab, need to install additional packages (already in conda environment.yml)

In [27]:
import evaluate

# Load the metrics
bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")
comet = evaluate.load("comet")  # Ensure COMET is installed and properly configured
chrf = evaluate.load("chrf")  # ChrF metric



Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.97k [00:00<?, ?B/s]

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

LICENSE:   0%|          | 0.00/9.69k [00:00<?, ?B/s]

hparams.yaml:   0%|          | 0.00/567 [00:00<?, ?B/s]

model.ckpt:   0%|          | 0.00/2.32G [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.53k [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.8.3.post1 to v2.4.0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../../../root/.cache/huggingface/hub/models--Unbabel--wmt22-comet-da/snapshots/371e9839ca4e213dde891b066cf3080f75ec7e72/checkpoints/model.ckpt`


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/core/saving.py:195: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']


Downloading builder script:   0%|          | 0.00/9.01k [00:00<?, ?B/s]

In [28]:
# metrics computation for interpretation evaluation
def compute_metrics(decoded_preds, decoded_labels, sources):

    # BLEU
    bleu_result = bleu.compute(predictions=decoded_preds, references=decoded_labels)

    # ChrF
    chrf_result = chrf.compute(predictions=decoded_preds, references=decoded_labels)

    # ROUGE
    rouge_result = rouge.compute(predictions=decoded_preds, references=decoded_labels)

    # COMET
    comet_result = comet.compute(predictions=decoded_preds, references=decoded_labels, sources=sources)



    # Combine the results, including all ROUGE scores
    metrics = {
        "bleu": bleu_result["bleu"],
        "chrf": chrf_result["score"],
        "comet": comet_result.get("mean_score", None),
        "rouge1": rouge_result["rouge1"],
        "rouge2": rouge_result["rouge2"],
        "rougeL": rouge_result["rougeL"],
        "rougeLsum": rouge_result.get("rougeLsum", None),
    }

    return metrics

## Training

In [29]:
model = model.to(device) # to device

In [30]:
model.name_or_path # selected model check

'google/flan-t5-base'

In [31]:
from transformers import Trainer, TrainingArguments

# Set training arguments
training_args = TrainingArguments(
    output_dir=f'./results/{model_choice}',
    eval_strategy="epoch",     # evaluation strategy to adopt during training
    learning_rate=2e-5,              # learning rate
    save_steps=10000,
    save_total_limit=1,              # keep only the most recent checkpoint
    per_device_train_batch_size=8,   # batch size for training
    per_device_eval_batch_size=8,    # batch size for evaluation
    num_train_epochs=10,             # total number of training epochs
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
    report_to="none",                # Disable wandb logging
)

# Create a Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    # compute_metrics=compute_metrics
)


In [32]:
# start model training
if mode == 'train':
  trainer.train()
  # Save the model
  model.save_pretrained(f'./results/{model_choice}/my_model')
  tokenizer.save_pretrained(f'./results/{model_choice}/my_model')

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,0.3186,0.415667
2,0.2065,0.361094
3,0.185,0.350661
4,0.1962,0.344392
5,0.2013,0.341356
6,0.1975,0.339193
7,0.1767,0.337216
8,0.185,0.336877
9,0.1806,0.33607
10,0.1555,0.335828


## Evaluation

In [33]:
# Evaluate the model
eval_results = trainer.evaluate()
print(eval_results)

{'eval_loss': 0.3358277380466461, 'eval_runtime': 5.5365, 'eval_samples_per_second': 55.631, 'eval_steps_per_second': 7.044, 'epoch': 10.0}


In [34]:
predictions, labels, _ = trainer.predict(test_dataset)

In [35]:
logits = torch.tensor(predictions[0])

# Select the token with the highest probability
predicted_token_ids = torch.argmax(logits, dim=-1)

# Decode the predicted tokens
decoded_preds = tokenizer.batch_decode(predicted_token_ids, skip_special_tokens=True)
decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

In [36]:
sources = [test_dataset[i]['source'] for i in range(len(test_dataset))]

In [37]:
# metrics computation 1
metrics = compute_metrics(decoded_preds, decoded_labels, sources)
print(metrics)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


{'bleu': 0.15973849328759945, 'chrf': 42.41183283121616, 'comet': 0.5354569897829712, 'rouge1': 0.4544658871743198, 'rouge2': 0.2233280347820068, 'rougeL': 0.44964050583907456, 'rougeLsum': 0.4488403977407464}


In [38]:
# metrics computation 2
metrics = compute_metrics(sources, decoded_labels, sources)
print(metrics)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


{'bleu': 0.051321322716817636, 'chrf': 31.35269284664227, 'comet': 0.6380274261940609, 'rouge1': 0.23577408126711463, 'rouge2': 0.09583890840611678, 'rougeL': 0.21291309602735717, 'rougeLsum': 0.21317846784371539}


In [39]:
# metrics computation 3
metrics = compute_metrics(decoded_preds, sources, sources)
print(metrics)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


{'bleu': 0.04821502807041542, 'chrf': 25.87370026699391, 'comet': 0.44903836254175605, 'rouge1': 0.253547541613084, 'rouge2': 0.09075672172198318, 'rougeL': 0.23157331408239534, 'rougeLsum': 0.23219516840103444}


In [40]:
def classify_sarcasm(text):
    inputs = classifier_tokenizer(text, return_tensors="pt").to(device)  # Move inputs to the same device as the model
    # Move the model to the same device as the inputs
    classifier_model.to(device)
    outputs = classifier_model(**inputs)
    prediction = torch.argmax(outputs.logits, dim=1).item()
    return prediction == 1

def generate_interpretation(input_text):
    if not input_text.startswith("Provide straightforward, literal translations for this sarcastic comment: "):
      input_text = "Provide straightforward, literal translations for this sarcastic comment: " + input_text
    input_ids = tokenizer(input_text, return_tensors='pt').input_ids.to(device)
    output_ids = model.generate(input_ids)
    decoded_output = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    return decoded_output


In [41]:
# Inference on a few test examples
evaluation_texts = df_eval.sample(n=5, random_state=42)
count = 0
for index, row in evaluation_texts.iterrows():
    is_sarcastic = classify_sarcasm(row['Sarcastic'])
    print(f"Evaluation {count}:")
    print(f"Original: {row['Sarcastic']}")
    print(f"Sarcastic: {'Yes' if is_sarcastic else 'No'}")
    if row['IsSarcastic'] == 1:
      interpretation = generate_interpretation(row['Sarcastic'])
      print(f"Interpretation: {interpretation}")
      print(f"Ground Truth: {row['Translation']}")
    print("-" * 20)
    count += 1

Evaluation 0:
Original: Schadenfreude x
Sarcastic: Yes




Interpretation: Schadenfreude is a bad feeling.
Ground Truth: After many years it is brilliant to laugh at germany 
--------------------
Evaluation 1:
Original: So, a country that is trillions in debt, with numbers approaching 13 million on surgical waiting lists and hurtling towards facism regime, is talking about rescuing people from a facist regime?
Sarcastic: Yes
Interpretation: I do not believe that a country that is trillions in debt, with numbers approaching 13
Ground Truth: This country is not a "free" country anymore.
--------------------
Evaluation 2:
Original: If anyone wants to know how my nights going I tried making a private story on Snapchat and instead made a group chat... I hate my life
Sarcastic: Yes
Interpretation: I hate my life.
Ground Truth: this was the most embarrassing moment of my life.
--------------------
Evaluation 3:
Original: don't make me rewatch panic ü§®
Sarcastic: Yes
Interpretation: I do not want to rewatch the movie again.
Ground Truth: ‚ÄúI'm goin

In [42]:
#test example
src = "Look at you, finishing all your snacks before dinner. What a healthy choice!"
truth = "Eating snacks before dinner is not a good decision for your health."
print(f" \nsrc: {src} \nisSarcastic: {classify_sarcasm(src)} \ntranslation: {generate_interpretation(src)} \nground_truth: {truth}")


 
src: Look at you, finishing all your snacks before dinner. What a healthy choice! 
isSarcastic: True 
translation: You often end up eating too much food before dinner. 
ground_truth: Eating snacks before dinner is not a good decision for your health.


In [43]:
# inference method with different prompt
def inference(input_text):
    # Clear prompt for the model
    prompt = "Rewrite this sarcastic comment as a factual statement: "
    if not input_text.startswith(prompt):
        input_text = prompt + input_text

    # Tokenize with padding and attention mask
    inputs = tokenizer(input_text, return_tensors='pt', padding=True, truncation=True).to(device)
    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']

    # Generate with increased diversity
    output_ids = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=128,
        early_stopping=True,
        num_beams=5,              # Beam search for best results
        temperature=0.7,           # Lower temperature for diversity
        top_k=50,                  # Top-k sampling
        pad_token_id=tokenizer.eos_token_id
    )

    # Decode the output, removing the instruction if it is repeated
    decoded_output = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    if decoded_output.startswith(prompt):
        decoded_output = decoded_output[len(prompt):].strip()

    return decoded_output

In [58]:
# Inference on a few test examples with different prompt
evaluation_texts = df_eval.sample(n=10, random_state=42)
count = 0
for index, row in evaluation_texts.iterrows():
    is_sarcastic = classify_sarcasm(row['Sarcastic'])
    print(f"Evaluation {count}:")
    print(f"Original: {row['Sarcastic']}")
    print(f"Sarcastic: {'Yes' if is_sarcastic else 'No'}")
    if row['IsSarcastic'] == 1:
      interpretation = inference(row['Sarcastic'])
      print(f"Interpretation: {interpretation}")
      print(f"Ground Truth: {row['Translation']}")
    print("-" * 20)
    count += 1

Evaluation 0:
Original: Schadenfreude x
Sarcastic: Yes




Interpretation: Schadenfreude is a negative emotion.
Ground Truth: After many years it is brilliant to laugh at germany 
--------------------
Evaluation 1:
Original: So, a country that is trillions in debt, with numbers approaching 13 million on surgical waiting lists and hurtling towards facism regime, is talking about rescuing people from a facist regime?
Sarcastic: Yes
Interpretation: A country that is trillions in debt, with numbers approaching 13 million on surgical waiting lists and hurtling towards a facist regime is talking about rescuing people from a facist regime.
Ground Truth: This country is not a "free" country anymore.
--------------------
Evaluation 2:
Original: If anyone wants to know how my nights going I tried making a private story on Snapchat and instead made a group chat... I hate my life
Sarcastic: Yes
Interpretation: I hate my life.
Ground Truth: this was the most embarrassing moment of my life.
--------------------
Evaluation 3:
Original: don't make me rewatch 

### Evaluate on T5-base Model

In [45]:
model = model.to(device)

In [46]:
model.name_or_path

'google/flan-t5-base'

In [47]:
# load data for t5-base
df2, eval_data = load_data()
df2['Input'] = df2['Sarcastic'].apply(add_prefix)

In [48]:
# tokenization and custom dataset creation
train_encodings = tokenize_data(df2)
gpt_pairs = SarcasmTranslationDataset(train_encodings, list(df2['Sarcastic']))

In [49]:
trainer_temp = Trainer(
    model=model,
    args=training_args,
    eval_dataset=gpt_pairs
    # compute_metrics=compute_metrics,
)

In [50]:
# evaluate the model for selected  model
eval_results = trainer_temp.evaluate()
print(eval_results)

{'eval_loss': 0.16150501370429993, 'eval_model_preparation_time': 0.0059, 'eval_runtime': 77.1681, 'eval_samples_per_second': 39.913, 'eval_steps_per_second': 4.989}


### Evaluate on GPT-4o-mini pairs

In [51]:
model = model.to(device)

In [52]:
model.name_or_path

'google/flan-t5-base'

In [53]:
# load data for gp2
df2, eval_data = load_data()
df2['Input'] = df2['Sarcastic'].apply(add_prefix)

In [54]:
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer

class SarcasmTranslationDataset(Dataset):
    def __init__(self, encodings, labels, tokenizer, max_length=128):
        self.encodings = encodings
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

        # Tokenize and process the labels similar to the input
        item['labels'] = item['input_ids'] # Assign input_ids as labels

        return item

    def __len__(self):
        return len(self.labels)

In [55]:
# tokenization and custom dataset creation
train_encodings = tokenize_data(df2)
gpt_pairs = SarcasmTranslationDataset(train_encodings, list(df2['Sarcastic']), tokenizer)

In [56]:
trainer_temp = Trainer(
    model=model,
    args=training_args,
    eval_dataset=gpt_pairs,
    tokenizer=tokenizer,
    # compute_metrics=compute_metrics,
)

  trainer_temp = Trainer(


In [57]:
# evaluate the model for selected  model
eval_results = trainer_temp.evaluate()
print(eval_results)

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'eval_loss': 0.4988267123699188, 'eval_model_preparation_time': 0.006, 'eval_runtime': 77.6221, 'eval_samples_per_second': 39.679, 'eval_steps_per_second': 4.96}
