In [None]:
import pandas as pd

# Load the cleaned Spanish -> English dataset.
# keep_default_na=False keeps every text cell as a string: with the pandas
# defaults, empty cells and tokens such as "NA", "N/A", "null" or "nan" are
# parsed as float NaN, and the .lower() calls below would then raise
# AttributeError. With this flag an empty cell is read as ''.
df = pd.read_csv(
    '../datasets/cleaned_es_en_dataset.csv',
    delimiter=';',
    encoding='utf-8',
    keep_default_na=False,
)
inputs = df['input_text'].tolist()
references = df['reference_translation'].tolist()
# Lower-cased copy so that later comparisons ignore letter case.
references_lower = [s.lower() for s in references]

# Load the translated text dataset (parsed with the same NA handling, since a
# model may emit an empty or "NA"-like translation).
tr_df = pd.read_csv(
    '../translated-datasets/mbart-large-finetuned-opus-es-en-translated_es_en_dataset.csv',
    delimiter=';',
    encoding='utf-8',
    keep_default_na=False,
)
translations = tr_df['translated_text'].tolist()
# Lower-cased copy so that later comparisons ignore letter case.
translations_lower = [s.lower() for s in translations]

In [None]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# Function to calculate BLEU score for each pair of sentences
def calculate_bleu(reference, translation):
    """Return the smoothed sentence-level BLEU of one translation against one reference.

    Both strings are tokenised by splitting on whitespace. No case folding or
    punctuation handling happens here, so any normalisation must be done by
    the caller beforehand.

    Args:
        reference: Reference sentence as a string.
        translation: Candidate sentence as a string.

    Returns:
        The value produced by ``sentence_bleu`` when smoothed with
        ``SmoothingFunction().method4``.
    """
    # sentence_bleu takes a list of tokenised references; there is exactly one here.
    ref_token_lists = [reference.split()]
    hypothesis = translation.split()
    return sentence_bleu(
        ref_token_lists,
        hypothesis,
        smoothing_function=SmoothingFunction().method4,
    )

# Calculate BLEU scores for every (reference, translation) pair.
# zip() stops at the shorter list, so rows without a counterpart in the other
# dataset are silently left unscored.
bleu_scores = [calculate_bleu(ref, trans) for ref, trans in zip(references_lower, translations_lower)]

# Average BLEU score (the original note here read 0.09 -- presumably the value
# from an earlier run; re-run to confirm).
# With no scored pairs the mean is undefined, so report NaN instead of failing
# with ZeroDivisionError.
average_bleu = sum(bleu_scores) / len(bleu_scores) if bleu_scores else float('nan')
print(f'Average BLEU score: {average_bleu:.4f}')
print('===================================================')

# Print individual translations and their BLEU scores
for input_text, ref, trans, bleu in zip(inputs, references_lower, translations_lower, bleu_scores):
    print(f'Input: {input_text}')
    print(f'Reference: {ref}')
    print(f'Translation: {trans}')
    print(f'BLEU score: {bleu:.4f}\n')