# Standard LLM Metrics

This notebook covers the implementation of standard LLM metrics for various purposes. 
These applications and associated metrics are:
- Translation using [IWSLT17 English-French Dataset](https://huggingface.co/datasets/IWSLT/iwslt2017) - BLUE
- Summarization using [CNN/Daily Mail Dataset](https://huggingface.co/datasets/abisee/cnn_dailymail) - ROGUE
- Sentiment analysis using [IMDB Movie Reviews Dataset](https://huggingface.co/datasets/stanfordnlp/imdb) - Standard classification metrics

# 0. Libraries, contanst and support functions

In [2]:
import re

import pandas as pd
import torch

from tqdm import tqdm

from torch.utils.data import DataLoader
from datasets import load_dataset
from datasets import get_dataset_config_names
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import MarianTokenizer, MarianMTModel
from transformers import T5Tokenizer

# 1. Translation tasks

In [3]:
# Load translation dataset
translation_full_dataset = load_dataset('iwslt2017', "iwslt2017-en-fr")

# Divide between train, validation and test
ds_trans_train = translation_full_dataset['train']
ds_trans_val = translation_full_dataset['validation']
ds_trans_test = translation_full_dataset['test']

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [4]:
# Clean text from HTML tags and extra whitespaces
def clean_text(text: str) -> str:
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Remove from all elements of the dataset
def clean_dataset(example):
    """
    Applies the clean_text function to both the source and target texts.

    Args:
        example (dict): A single example from the dataset.

    Returns:
        dict: The example with cleaned texts.
    """
    example['translation']['en'] = clean_text(example['translation']['en'])
    example['translation']['fr'] = clean_text(example['translation']['fr'])
    return example


# Apply to dataset
ds_trans_train = ds_trans_train.map(clean_dataset)
ds_trans_val = ds_trans_val.map(clean_dataset)
ds_trans_test = ds_trans_test.map(clean_dataset)

In [5]:
# For each sample, apply a filter that removes sentences too short or too long
def filter_samples(sample):
    source = sample['translation']['en']
    target = sample['translation']['fr']
    # Define length thresholds
    min_length = 5
    max_length = 128
    # Compute lengths
    source_len = len(source.split())
    target_len = len(target.split())
    # Filter condition
    return min_length <= source_len <= max_length and min_length <= target_len <= max_length

ds_trans_train = ds_trans_train.filter(filter_samples)
ds_val_train = ds_trans_train.filter(filter_samples)
ds_trans_train = ds_trans_train.filter(filter_samples)


Filter:   0%|          | 0/216089 [00:00<?, ? examples/s]

In [7]:
model_name = 'Helsinki-NLP/opus-mt-en-fr'

# Load tokenizer and model
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

pytorch_model.bin:   0%|          | 0.00/301M [00:00<?, ?B/s]

In [8]:
def preprocess_function(examples):
    src_texts = [ex['en'] for ex in examples['translation']]
    tgt_texts = [ex['fr'] for ex in examples['translation']]

    # Tokenize inputs
    model_inputs = tokenizer(src_texts, max_length=128, truncation=True, padding='max_length', return_tensors='pt')

    # Tokenize targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(tgt_texts, max_length=128, truncation=True, padding='max_length', return_tensors='pt')

    model_inputs['labels'] = labels['input_ids']
    return model_inputs

# Apply preprocessing to tokenize all input words
tokenized_dataset = ds_trans_test.map(preprocess_function, batched=True, remove_columns=['translation'])

Map:   0%|          | 0/8597 [00:00<?, ? examples/s]



In [9]:
# Convert Hugging Face dataset to PyTorch tensors
tokenized_dataset.set_format(type='torch')

# Create DataLoader
test_loader = DataLoader(tokenized_dataset, batch_size=16)

In [None]:
# Apply all samples to the model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

translations = []
references = []

model.eval()
with torch.no_grad():
    for batch in tqdm(test_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        
        # Generate translations
        outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=128)
        
        # Decode translations and references
        decoded_preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        decoded_labels = tokenizer.batch_decode(batch['labels'], skip_special_tokens=True)
        
        translations.extend(decoded_preds)
        references.extend(decoded_labels)


In [14]:
from datasets import load_metric

metric = load_metric('sacrebleu')

# Prepare references in the expected format
references = [[ref] for ref in references]

# Compute BLEU score
bleu = metric.compute(predictions=translations, references=references)
print(f"BLEU score: {bleu['score']:.2f}")

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


BLEU score: 35.63


# 1.1. Fine-tune the model for better translation

# TBD