# Deep Learning and Applied AI (DLAI) Project: **Fine-Tuning a Large Language Model (LLM) for Italian-to-Neapolitan Dialect Translation**

# Part III: Performance Evaluation

### Author: Aur Marina Iuliana, 1809715

# 1. Import Libraries

In [2]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.meteor_score import meteor_score 
from rouge_score import rouge_scorer
from bert_score import score as bert_score
import numpy as np
import pandas as pd
import random
import math
import matplotlib.pyplot as plt
import seaborn as sns
from datasets import Dataset, load_dataset
from tqdm import tqdm
import wandb

import warnings
warnings.filterwarnings("ignore")

In [3]:
# Check CUDA availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(torch.cuda.is_available())
print(torch.version.cuda)

# Seed for reproducibility
seed = 1234
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
np.random.seed(seed)
random.seed(seed)

True
12.4


In [4]:
# Initialize wandb for experiment tracking
wandb.init(
    project = "nap-dialect-finetuning",
    name = "nap-dialect-inference",
    reinit = True)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


[34m[1mwandb[0m: Currently logged in as: [33mmarinaaur[0m ([33mmarinaaur-sapienza[0m). Use [1m`wandb login --relogin`[0m to force relogin


# 2. Modelling

## 2.1 Vanilla Gemma-2-2B-it Model

In [4]:
model_id = "google/gemma-2-2b-it"

vanilla_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map = "auto",
    torch_dtype = torch.bfloat16,
    trust_remote_code = True
    )

vanilla_tokenizer = AutoTokenizer.from_pretrained(model_id)
vanilla_tokenizer.padding_side = 'left'
vanilla_tokenizer.pad_token = vanilla_tokenizer.eos_token

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.28it/s]


## 2.2 Fine-Tuned Gemma-2-2B-it Model

In [5]:
fine_tuned_model_path = 'nap-dialect-gemma-2-2b-it-finetuning/checkpoint-176'
fine_tuned_model = AutoModelForCausalLM.from_pretrained(
    fine_tuned_model_path,
    device_map = "auto",
    torch_dtype=torch.float16, low_cpu_mem_usage = True)

fine_tuned_tokenizer = AutoTokenizer.from_pretrained(fine_tuned_model_path)
fine_tuned_tokenizer.padding_side = 'left'
fine_tuned_tokenizer.pad_token = fine_tuned_tokenizer.eos_token

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.49it/s]


In [7]:
fine_tuned_model

Gemma2ForCausalLM(
  (model): Gemma2Model(
    (embed_tokens): Embedding(256000, 2304, padding_idx=0)
    (layers): ModuleList(
      (0-25): 26 x Gemma2DecoderLayer(
        (self_attn): Gemma2Attention(
          (q_proj): lora.Linear(
            (base_layer): Linear(in_features=2304, out_features=2048, bias=False)
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.2, inplace=False)
            )
            (lora_A): ModuleDict(
              (default): Linear(in_features=2304, out_features=16, bias=False)
            )
            (lora_B): ModuleDict(
              (default): Linear(in_features=16, out_features=2048, bias=False)
            )
            (lora_embedding_A): ParameterDict()
            (lora_embedding_B): ParameterDict()
            (lora_magnitude_vector): ModuleDict()
          )
          (k_proj): lora.Linear(
            (base_layer): Linear(in_features=2304, out_features=1024, bias=False)
            (lora_dropout): ModuleDict(
    

# 3. Import Data

In [8]:
dataset = load_dataset("efederici/mt_nap_it", split = ['train[:80%]', 'train[80%:90%]', 'train[90%:]'])

dataset = {
    "train": dataset[0],
    "eval": dataset[1],
    "test": dataset[2]
}

dataset

{'train': Dataset({
     features: ['url', 'napoletano', 'italiano'],
     num_rows: 11320
 }),
 'eval': Dataset({
     features: ['url', 'napoletano', 'italiano'],
     num_rows: 1415
 }),
 'test': Dataset({
     features: ['url', 'napoletano', 'italiano'],
     num_rows: 1415
 })}

In [9]:
def generate_prompts_evaluation(entry, tokenizer):
  user_prompt = '''Translate the provided text from Italian language to Neapolitan dialect. Return only the text translated in Neapolitan, without any additional details.\nItalian Text: {nap_text}
  '''

  chat = [
      {"role": "user", "content": user_prompt.format(nap_text = entry['italiano'])},
  ]

  prompt = tokenizer.apply_chat_template(chat, tokenize = False, add_generation_prompt = True)

  return prompt

In [None]:
test_prompts = [generate_prompts_evaluation(entry, fine_tuned_tokenizer) for entry in dataset['test']]
print(test_prompts[0])

<bos><start_of_turn>user
Translate the provided text from Italian language to Neapolitan dialect. Return only the text translated in Neapolitan, without any additional details.
Italian Text: E' quasi ottobre, mi sembra inverno<end_of_turn>
<start_of_turn>model



In [11]:
test_dataset = [{"text": prompt} for prompt in test_prompts]
test_dataset = Dataset.from_list(test_dataset)
test_dataset

Dataset({
    features: ['text'],
    num_rows: 1415
})

# 4. Performance Evaluation

## 4.1 Perplexity

In [12]:
def compute_ppl(model, encodings, max_length = 2048, stride = 512, device = "cuda"):
  '''
  Compute the perplexity of a given model using the provided encodings.
  soure_code: https://huggingface.co/docs/transformers/en/perplexity
  '''
  if max_length is None:
      max_length = model.config.max_length

  seq_len = encodings.input_ids.size(1)
  nlls = []
  prev_end_loc = 0

  for begin_loc in tqdm(range(0, seq_len, stride)):
    end_loc = min(begin_loc + max_length, seq_len)
    trg_len = end_loc - prev_end_loc  # May differ from stride on the last loop
    input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
    target_ids = input_ids.clone()
    target_ids[:, :-trg_len] = -100  # Only consider trg_len labels

    with torch.no_grad():
      outputs = model(input_ids, labels = target_ids)

      # Loss is calculated using CrossEntropyLoss which averages over valid labels
      neg_log_likelihood = outputs.loss

      nlls.append(neg_log_likelihood)
      prev_end_loc = end_loc
      if end_loc == seq_len:
        break

  ppl = torch.exp(torch.stack(nlls).mean())
  return ppl

In [13]:
# Prepare the test data
vanilla_encodings = vanilla_tokenizer("\n\n".join(dataset['test']['napoletano']), return_tensors = "pt", truncation = True, max_length = 2048).to(device)
fine_tuned_encodings = fine_tuned_tokenizer("\n\n".join(dataset['test']['napoletano']), return_tensors = "pt", truncation = True, max_length = 2048).to(device)

In [14]:
fine_tuned_ppl = compute_ppl(fine_tuned_model, fine_tuned_encodings)

wandb.log({"Fine-tuned Model Perplexity": fine_tuned_ppl})
print(f"Fine-tuned model perplexity: {fine_tuned_ppl}")

  0%|          | 0/4 [00:00<?, ?it/s]

Fine-tuned model perplexity: 68.37694549560547





In [15]:
vanilla_ppl = compute_ppl(vanilla_model, vanilla_encodings)

wandb.log({"Vanilla Model Perplexity": vanilla_ppl})
print(f"Vanilla model perplexity: {vanilla_ppl}")

  0%|          | 0/4 [00:00<?, ?it/s]


Vanilla model perplexity: 180.22314453125


## 4.2 BLEU, ROUGE and BERT Scores

In [14]:
def compute_scores(generated_responses, reference_texts):
  rouge = rouge_scorer.RougeScorer(['rougeL'], use_stemmer = True)
  results = []

  P, R, F1 = bert_score(generated_responses, reference_texts, lang = "it")

  for i, (gen_text, ref_text) in enumerate(zip(generated_responses, reference_texts)):

    # Compute BLEU Score
    bleu_score = sentence_bleu([ref_text.split()], gen_text.split())

    # Compute ROUGE Score
    rouge_scores = rouge.score(ref_text, gen_text)
    rougeL = rouge_scores['rougeL'].fmeasure

    # BERTScore
    bert_f1 = F1[i].item()

    # Save the results
    results.append({
        "bleu": bleu_score,
        "rl": rougeL,
        "bertscore": bert_f1
        })

  return results

In [17]:
def performance_evaluation(model, tokenizer, examples, expected_responses, model_type, batch_size = 64):
    # Set the pad token ID to match the tokenizer's configuration
    model.generation_config.pad_token_id = tokenizer.pad_token_id

    
    # Initialize wandb table to log info for each entry
    table = wandb.Table(columns = ["Italian Text", "Neapolitan Expected Translation", "Neapolitan Generated Translation", 
                                   "BLEU score", "ROUGE-L score", "BERT F1 score"])

    # Initialize accumulators for evaluation metrics
    total_bleu = 0
    total_rougeL = 0
    total_bert_f1 = 0

    # Get the total number of examples
    num_examples = len(examples)

    # Calculate the total number of batches
    num_batches = math.ceil(num_examples / batch_size)

    # Loop through each batch
    for batch_idx in range(num_batches):
        # Slice the examples and expected responses for the current batch
        batch_examples = examples['text'][batch_idx * batch_size:(batch_idx + 1) * batch_size]
        batch_expected_responses = expected_responses[batch_idx * batch_size:(batch_idx + 1) * batch_size]

        # Extract the prompts from the current batch
        prompts = [example for example in batch_examples]

        # Tokenize the batch prompts
        model_inputs = tokenizer(prompts, padding=True, truncation = True, max_length = 1024, return_tensors = "pt").to(device)

        # Generate responses for the batch
        outputs = model.generate(**model_inputs, max_length = 1024)
        responses = [tokenizer.decode(output, skip_special_tokens = True) for output in outputs]


        # Compute scores for each example in the batch
        for prompt, response, expected_response in zip(prompts, responses, batch_expected_responses):
            results = compute_scores([response.splitlines()[-1]], [expected_response])

            # Add row to the wandb table
            italian_text = prompt.splitlines()[2].replace('<end_of_turn>', '').replace('Italian Text:', '')
            expected_translation = expected_response
            generated_translation = response.splitlines()[-1]
            table.add_data(
                italian_text, expected_translation, generated_translation,
                results[0]['bleu'], results[0]['rl'], results[0]['bertscore']
                )

            # Accumulate scores for each metric
            total_bleu += results[0]['bleu']
            total_rougeL += results[0]['rl']
            total_bert_f1 += results[0]['bertscore']

        # Log progress every N batches
        if (batch_idx + 1) % 2 == 0:
            print(f"Processed {batch_idx + 1} batches ({len(batch_examples)} items per batch).")

    # Calculate average metrics
    avg_bleu = total_bleu / num_examples
    avg_rougeL = total_rougeL / num_examples
    avg_bert_f1 = total_bert_f1 / num_examples

    # Log the results to Weights & Biases
    wandb.log({
        f"Average BLEU {model_type}": avg_bleu,
        f"Average ROUGE-L {model_type}": avg_rougeL,
        f"Average BERT F1 {model_type}": avg_bert_f1
    })

    wandb.log({f"{model_type} Translation Table": table})

    # Return a dictionary of the average scores
    return {
        'average_bleu': avg_bleu,
        'average_rougeL': avg_rougeL,
        'average_bert_f1': avg_bert_f1
    }


In [18]:
# Compute scores using Fine-Tuned Gemma Model
examples = test_dataset
expected_responses = dataset['test']['napoletano']

fine_tuned_results = performance_evaluation(fine_tuned_model, fine_tuned_tokenizer, examples, expected_responses, "Fine-Tuned gemma-2-2b-it Model")
fine_tuned_results

Processed 2 batches (64 items per batch).
Processed 4 batches (64 items per batch).
Processed 6 batches (64 items per batch).
Processed 8 batches (64 items per batch).
Processed 10 batches (64 items per batch).
Processed 12 batches (64 items per batch).
Processed 14 batches (64 items per batch).
Processed 16 batches (64 items per batch).
Processed 18 batches (64 items per batch).
Processed 20 batches (64 items per batch).
Processed 22 batches (64 items per batch).


{'average_bleu': 0.2303301895460473,
 'average_rougeL': 0.6569207256284708,
 'average_bert_f1': 0.8765983931167387}

In [19]:
# Compute scores using Vanilla Gemma Model
vanilla_results = performance_evaluation(vanilla_model, vanilla_tokenizer, examples, expected_responses, "Vanilla gemma-2-2b-it Model")
vanilla_results

Processed 2 batches (64 items per batch).
Processed 4 batches (64 items per batch).
Processed 6 batches (64 items per batch).
Processed 8 batches (64 items per batch).
Processed 10 batches (64 items per batch).
Processed 12 batches (64 items per batch).
Processed 14 batches (64 items per batch).




Processed 16 batches (64 items per batch).
Processed 18 batches (64 items per batch).
Processed 20 batches (64 items per batch).
Processed 22 batches (64 items per batch).


{'average_bleu': 0.015180432633587338,
 'average_rougeL': 0.283961063951785,
 'average_bert_f1': 0.7474061743951939}

In [20]:
wandb.finish()

0,1
Average BERT F1 Fine-Tuned gemma-2-2b-it Model,▁
Average BERT F1 Vanilla gemma-2-2b-it Model,▁
Average BLEU Fine-Tuned gemma-2-2b-it Model,▁
Average BLEU Vanilla gemma-2-2b-it Model,▁
Average ROUGE-L Fine-Tuned gemma-2-2b-it Model,▁
Average ROUGE-L Vanilla gemma-2-2b-it Model,▁
Fine-tuned Model Perplexity,▁
Vanilla Model Perplexity,▁

0,1
Average BERT F1 Fine-Tuned gemma-2-2b-it Model,0.8766
Average BERT F1 Vanilla gemma-2-2b-it Model,0.74741
Average BLEU Fine-Tuned gemma-2-2b-it Model,0.23033
Average BLEU Vanilla gemma-2-2b-it Model,0.01518
Average ROUGE-L Fine-Tuned gemma-2-2b-it Model,0.65692
Average ROUGE-L Vanilla gemma-2-2b-it Model,0.28396
Fine-tuned Model Perplexity,68.37695
Vanilla Model Perplexity,180.22314


In [21]:
# Build the evaluation Dataframe
fine_tuned_results['perplexity'] = fine_tuned_ppl.item()
vanilla_results['perplexity'] = vanilla_ppl.item()

evaluation_df = pd.DataFrame([fine_tuned_results, vanilla_results], index=['Fine-Tuned gemma-2-2b-it Model', 'Vanilla gemma-2-2b-it Model'])
evaluation_df = evaluation_df.round(3)
evaluation_df

Unnamed: 0,average_bleu,average_rougeL,average_bert_f1,perplexity
Fine-Tuned gemma-2-2b-it Model,0.23,0.657,0.877,68.377
Vanilla gemma-2-2b-it Model,0.015,0.284,0.747,180.223


# 5. Qualitative Assessment



In [15]:
def generate_response(model, tokenizer, examples, expected_responses):
  model.generation_config.pad_token_id = tokenizer.pad_token_id

  for example, expected_response in zip(examples, expected_responses):
    # Genrate Response
    prompt = example['text']
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    output = model.generate(**inputs, max_length = 2048)
    response = tokenizer.decode(output[0], skip_special_tokens=True)

    # Compute_metrics
    results = compute_scores([response], [expected_response])

    print(f'{prompt.splitlines()[2].replace('<end_of_turn>', '')}\n')
    print(f'Generated Response: \n{response.splitlines()[-1]}\n')
    print(f'Expected Response: \n{expected_response}\n')
    print(f'BLEU Score: {results[0]["bleu"]}')
    print(f'ROUGE-L Score: {results[0]["rl"]}')
    print(f'BERTScore: {results[0]["bertscore"]}\n')
    print('------------------------------------------------\n')

In [16]:
examples_2 = test_dataset.select(range(0, 3))
expected_responses_2 = [output for output in dataset['test']['napoletano'][0:3]]

print("--- Qualitative Assessment using Fine-Tuned gemma-2-2b-it Model --- \n ")
generate_response(fine_tuned_model, fine_tuned_tokenizer, examples_2, expected_responses_2)

--- Qualitative Assessment using Fine-Tuned gemma-2-2b-it Model --- 
 
Italian Text: E' quasi ottobre, mi sembra inverno

Generated Response: 
E' quasi ottobre, mme pare 'nvern'

Expected Response: 
E' uttombre quase, vierno mme pare

BLEU Score: 3.2495743136156002e-155
ROUGE-L Score: 0.13953488372093023
BERTScore: 0.6048598885536194

------------------------------------------------

Italian Text: "Qua si deve dormire"

Generated Response: 
"Cca s'ha dda durmí"

Expected Response: 
"Cca s'ha da durmì"

BLEU Score: 3.1118147503396668e-155
ROUGE-L Score: 0.20512820512820512
BERTScore: 0.6758249998092651

------------------------------------------------

Italian Text: E un'esplosione ci ha colpiti là

Generated Response: 
E 'n'esplosione ce ha cugliuta lla

Expected Response: 
E 'na botta c'ha cugliuto lla

BLEU Score: 8.784466509829401e-232
ROUGE-L Score: 0.13043478260869565
BERTScore: 0.6556634306907654

------------------------------------------------



In [17]:
print("--- Qualitative Assessment using Vanilla gemma-2-2b-it Model --- \n")
generate_response(vanilla_model, vanilla_tokenizer, examples_2, expected_responses_2)

--- Qualitative Assessment using Vanilla gemma-2-2b-it Model --- 



Italian Text: E' quasi ottobre, mi sembra inverno

Generated Response: 
È quasi 'ottobre, mi s'è 'n'inverno. 

Expected Response: 
E' uttombre quase, vierno mme pare

BLEU Score: 7.386826398032373e-232
ROUGE-L Score: 0.046511627906976744
BERTScore: 0.5608353614807129

------------------------------------------------

Italian Text: "Qua si deve dormire"

Generated Response: 
"Qua si deve dormire" 

Expected Response: 
"Cca s'ha da durmì"

BLEU Score: 0
ROUGE-L Score: 0.0
BERTScore: 0.601401686668396

------------------------------------------------

Italian Text: E un'esplosione ci ha colpiti là

Generated Response: 
E 'n' esplosione ci 'a colpità là 

Expected Response: 
E 'na botta c'ha cugliuto lla

BLEU Score: 7.337741777064293e-232
ROUGE-L Score: 0.08695652173913043
BERTScore: 0.6001957058906555

------------------------------------------------



**NOTE:** More translation examples can be found on **Weights & Biases (Wandb):** [Nap-Dialect-Finetuning-Inference-Wandb](https://wandb.ai/marinaaur-sapienza/nap-dialect-finetuning/runs/2kped5jt?nw=nwusermarinaaur).