# Fine-tune LLMs to do Sarcasm interpretations

In [None]:
from google.colab import drive
drive.mount('/content/drive')
import os
os.chdir('/content/drive/MyDrive/SarcasmNLP')

## Load Model

### GPT-2 small

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')




### Google FLAN-T5-base

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load the model and tokenizer
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

### T5-base

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load the T5-Base model and tokenizer
tokenizer = T5Tokenizer.from_pretrained("t5-base")
model = T5ForConditionalGeneration.from_pretrained("t5-base")


## Load Data

In [5]:
import pandas as pd

df = pd.read_csv('iSarcasm_pairs.tsv', sep='\t')


In [6]:
from sklearn.model_selection import train_test_split

train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)
valid_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)


In [None]:
train_df['Input'] = "Provide straightforward, literal translations for this sarcastic comment: " + train_df['Sarcastic']
valid_df['Input'] = "Provide straightforward, literal translations for this sarcastic comment: " + valid_df['Sarcastic']
test_df['Input'] = "Provide straightforward, literal translations for this sarcastic comment: " + test_df['Sarcastic']


In [None]:

def tokenize_data(df):
    inputs = tokenizer(df['Input'].tolist(), padding=True, truncation=True, return_tensors="pt")
    targets = tokenizer(df['Translation'].tolist(), padding=True, truncation=True, return_tensors="pt")

    return {
        'input_ids': inputs['input_ids'],
        'attention_mask': inputs['attention_mask'],
        'labels': targets['input_ids'],
    }

# Tokenize train, validation, and test datasets
train_encodings = tokenize_data(train_df)
valid_encodings = tokenize_data(valid_df)
test_encodings = tokenize_data(test_df)

In [None]:
import torch

class SarcasmTranslationDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        return item

    def __len__(self):
        return len(self.encodings['input_ids'])

# Create datasets
train_dataset = SarcasmTranslationDataset(train_encodings)
valid_dataset = SarcasmTranslationDataset(valid_encodings)
test_dataset = SarcasmTranslationDataset(test_encodings)

## Training

In [None]:
model.name_or_path

'google/flan-t5-base'

In [None]:
from transformers import Trainer, TrainingArguments

# Set training arguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    evaluation_strategy="epoch",     # evaluation strategy to adopt during training
    learning_rate=2e-5,              # learning rate
    per_device_train_batch_size=8,   # batch size for training
    per_device_eval_batch_size=8,    # batch size for evaluation
    num_train_epochs=10,             # total number of training epochs
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
)

# Create a Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
)




In [None]:
trainer.train()

In [None]:
# Evaluate the model
eval_results = trainer.evaluate()
print(eval_results)

# Save the model
model.save_pretrained('./results/my_model')
tokenizer.save_pretrained('./results/my_model')

# Make predictions
predictions = trainer.predict(test_dataset)
decoded_predictions = tokenizer.batch_decode(predictions.predictions.tolist(), skip_special_tokens=True)
print(decoded_predictions)


In [1]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Load the tokenizer
tokenizer = T5Tokenizer.from_pretrained('./results/my_model')

# Load the model
model = T5ForConditionalGeneration.from_pretrained('./results/my_model')

# Now you can use the model and tokenizer for inference


In [14]:

def inference(input_text):

  if not input_text.startswith("Provide straightforward, literal translations for this sarcastic comment: "):
    input_text = "Provide straightforward, literal translations for this sarcastic comment: " + input_text
  # input_text = "Provide straightforward, literal translations for this sarcastic comment: I just absolutely LOVE how I've got to work outside for the next 3 days in the heatwave."

  # Tokenize the input
  input_ids = tokenizer(input_text, return_tensors='pt').input_ids

  # Generate output
  output_ids = model.generate(input_ids)

  # Decode the output
  decoded_output = tokenizer.decode(output_ids[0], skip_special_tokens=True)
  return decoded_output


In [16]:
for i in range(10):
    print(f"{i}, \nsrc: {test_df['Sarcastic'].iloc[i]} \ntranslation: {inference(test_df['Sarcastic'].iloc[i])} \nground_truth: {test_df['Translation'].iloc[i]}")



0, 
src: @Mythical So worried about him. But if you're looking to save him, based on the topography I'd say it's somewhere on the east coast. Perhaps the Carolinas? 
translation: I'm not worried about him. 
ground_truth: I suppose I would have simply stated that we all know he is not kidnapped, but it seems like he's just in the woods in North Carolina. 
1, 
src: Damn, imagine being vaxxed and then getting a cold and then losing your taste and smell.

Oh wait. 
translation: I hate being vaxxed and then getting a cold and then losing my taste and 
ground_truth: To make it non-sarcastic I could have said, "I hate having a cold and then losing my taste and smell."
2, 
src: if you see me crying in the self-service car wash in my rosati's uniform, no you didn't ❤️ 
translation: I'm not crying in the self-service car wash in my rosati' 
ground_truth: No.
3, 
src: I miss walking up 3 flights of stairs for class and having to catch my breath in the bathroom 😩 
translation: I miss walking up 3 

In [22]:
src = "Look at you, finishing all your snacks before dinner. What a healthy choice!"
truth = "Eating snacks before dinner is not a good decision for your health."
print(f"{i}, \nsrc: {src} \ntranslation: {inference(src)} \nground_truth: {truth}")


9, 
src: Look at you, finishing all your snacks before dinner. What a healthy choice! 
translation: I would say that eating snacks before dinner is not healthy. 
ground_truth: Eating snacks before dinner is not a good decision for your health.
