# Standard LLM Metrics

This notebook covers the implementation of standard LLM metrics for various purposes. 
These applications and associated metrics are:
- Translation using [IWSLT17 English-French Dataset](https://huggingface.co/datasets/IWSLT/iwslt2017) - BLUE
- Summarization using [CNN/Daily Mail Dataset](https://huggingface.co/datasets/abisee/cnn_dailymail) - ROGUE
- Sentiment analysis using [IMDB Movie Reviews Dataset](https://huggingface.co/datasets/stanfordnlp/imdb) - Standard classification metrics

# 0. Libraries, contanst and support functions

In [1]:
!pip install torch
!pip install datasets transformers sentencepiece
!pip install tqdm
!pip install sacrebleu



In [2]:
import re

import pandas as pd
import torch

from tqdm import tqdm

from torch.utils.data import DataLoader
from datasets import load_metric
from datasets import load_dataset
from datasets import get_dataset_config_names
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import MarianTokenizer, MarianMTModel

# 1. Translation tasks

For this task, we will evaluate its performance before fine-tuning and after fine-tuning, to evaluate the difference.

In [3]:
# Load translation dataset
translation_full_dataset = load_dataset('iwslt2017', "iwslt2017-en-fr")

# Divide between train, validation and test
ds_trans_train = translation_full_dataset['train']
ds_trans_val = translation_full_dataset['validation']
ds_trans_test = translation_full_dataset['test']

In [4]:
# Clean text from HTML tags and extra whitespaces
def clean_text(text: str) -> str:
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Remove from all elements of the dataset
def clean_dataset(example):
    """
    Applies the clean_text function to both the source and target texts.

    Args:
        example (dict): A single example from the dataset.

    Returns:
        dict: The example with cleaned texts.
    """
    example['translation']['en'] = clean_text(example['translation']['en'])
    example['translation']['fr'] = clean_text(example['translation']['fr'])
    return example


# Apply to dataset
ds_trans_train = ds_trans_train.map(clean_dataset)
ds_trans_val = ds_trans_val.map(clean_dataset)
ds_trans_test = ds_trans_test.map(clean_dataset)

In [5]:
# For each sample, apply a filter that removes sentences too short or too long
def filter_samples(sample):
    source = sample['translation']['en']
    target = sample['translation']['fr']
    # Define length thresholds
    min_length = 5
    max_length = 128
    # Compute lengths
    source_len = len(source.split())
    target_len = len(target.split())
    # Filter condition
    return min_length <= source_len <= max_length and min_length <= target_len <= max_length

ds_trans_train = ds_trans_train.filter(filter_samples)
ds_trans_val = ds_trans_val.filter(filter_samples)
ds_trans_test = ds_trans_test.filter(filter_samples)


In [6]:
model_name = 'Helsinki-NLP/opus-mt-en-fr'

# Load tokenizer and model
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)



In [7]:
def preprocess_function(examples):
    inputs = [ex['en'] for ex in examples['translation']]
    targets = [ex['fr'] for ex in examples['translation']]

    # Tokenize inputs & targets
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding='max_length')
    labels = tokenizer(targets, max_length=128, truncation=True, padding='max_length')

    model_inputs['labels'] = labels['input_ids']
    return model_inputs

# Apply preprocessing to tokenize all input words
tokenized_train_dataset = ds_trans_train.map(
    preprocess_function,
    batched=True,
    remove_columns=['translation']
)
tokenized_val_dataset = ds_trans_val.map(
    preprocess_function,
    batched=True,
    remove_columns=['translation']
)
tokenized_test_dataset = ds_trans_test.map(
    preprocess_function, 
    batched=True, 
    remove_columns=['translation']
)

In [8]:
# Convert Hugging Face dataset to PyTorch tensors
tokenized_train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
tokenized_val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
tokenized_test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

In [9]:
# Create DataLoader
batch_size = 16
train_dataloader = DataLoader(tokenized_train_dataset, shuffle=True, batch_size=batch_size)
val_dataloader = DataLoader(tokenized_val_dataset, shuffle=False, batch_size=batch_size)
test_dataloader = DataLoader(tokenized_test_dataset, shuffle=False, batch_size=batch_size)

## 1.1. Evaluate before fine-tune

In [10]:
# Apply all samples to the model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

translations = []
references = []

model.eval()
with torch.no_grad():
    for batch in tqdm(test_dataloader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        
        # Generate translations
        outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=128)
        
        # Decode translations and references
        decoded_preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        decoded_labels = tokenizer.batch_decode(batch['labels'], skip_special_tokens=True)
        
        translations.extend(decoded_preds)
        references.extend(decoded_labels)


100%|██████████| 497/497 [08:32<00:00,  1.03s/it]


In [11]:
metric = load_metric('sacrebleu')

# Prepare references in the expected format
references_flatten = [[ref] for ref in references]

# Compute BLEU score
bleu = metric.compute(predictions=translations, references=references_flatten)
print(f"BLEU score: {bleu['score']:.2f}")

  metric = load_metric('sacrebleu')


BLEU score: 42.85


## 1.2. Fine-tune the model for better translation

In [None]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy='epoch',
    save_total_limit=2,
    predict_with_generate=True,
    logging_dir='./logs',
    logging_steps=10,
    fp16=torch.cuda.is_available(),  # Enable mixed precision if possible
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    tokenizer=tokenizer,
)

trainer.train()

2024-10-08 18:00:02.883282: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Epoch,Training Loss,Validation Loss
1,0.2622,0.335222


Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[59513]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[59513]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[59513]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[59513]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[59513]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[59513]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[59513]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[59513]], 'forced_eos_token_id': 0}


TrainOutput(global_step=6753, training_loss=0.3434211433189773, metrics={'train_runtime': 3331.0023, 'train_samples_per_second': 64.872, 'train_steps_per_second': 2.027, 'total_flos': 7325063778926592.0, 'train_loss': 0.3434211433189773, 'epoch': 1.0})

In [17]:
# Evaluate on the test set
# test_results = trainer.evaluate(eval_dataset=tokenized_test_dataset)

# Generate predictions
predictions = trainer.predict(test_dataset=tokenized_test_dataset)

# Decode predictions and references
decoded_preds = tokenizer.batch_decode(predictions.predictions, skip_special_tokens=True)
decoded_labels = tokenizer.batch_decode(predictions.label_ids, skip_special_tokens=True)

# Compute BLEU score
metric = load_metric('sacrebleu')
references_formatted = [[ref] for ref in decoded_labels]
bleu = metric.compute(predictions=decoded_preds, references=references_formatted)
print(f"BLEU score: {bleu['score']:.2f}")

BLEU score: 41.49
