In [None]:
pip install deep-translator sacrebleu datasets #install if needed

Collecting deep-translator
  Downloading deep_translator-1.11.4-py3-none-any.whl.metadata (30 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/51.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-3.5.1-py3-none-any.whl.metadata (19 kB)
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.1.1-py3-none-any.whl.metadata (8.6 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from dat

In [None]:
# Colab-specific: mount Google Drive at /content/drive so the data files
# referenced later under /content/drive/MyDrive/... can be opened by path.
from google.colab import drive
drive.mount('/content/drive') # mount point must match the prefix used in base_path

Mounted at /content/drive


In [None]:
import os
from deep_translator import GoogleTranslator
import sacrebleu
from datasets import Dataset
import xml.etree.ElementTree as ET
# initialize dataset as in NLLB
# Folder on the mounted Drive that holds the two devtest files; the English
# and Guarani paths are derived from it in one step.
base_path = "/content/drive/MyDrive/Colab Notebooks/CompLing"
en_devtest_path, gn_devtest_path = (
    os.path.join(base_path, filename)
    for filename in ("eng_Latn.devtest", "grn_Latn.devtest")
)
# Load the FLORES devtest sentence pairs (English and Guarani) from the two text files.
def load_flores_for_bleu(english_path: str, guarani_path: str) -> Dataset:
    """Build a parallel English-Guarani dataset from two text files.

    Both files are read as UTF-8. Every line is stripped of surrounding
    whitespace and lines that end up empty are dropped; the i-th remaining
    English line is then paired with the i-th remaining Guarani line.

    Args:
        english_path: Path to the English text file.
        guarani_path: Path to the Guarani text file.

    Returns:
        A ``Dataset`` built from one record per sentence pair, each shaped
        ``{"translation": {"en": ..., "gn": ...}}``.

    Raises:
        ValueError: If the files hold different numbers of non-blank lines.
    """

    def read_nonblank(path):
        # Strip each line first, then keep only the ones with content left.
        with open(path, encoding="utf-8") as handle:
            stripped = (raw.strip() for raw in handle)
            return [text for text in stripped if text]

    en_lines = read_nonblank(english_path)
    gn_lines = read_nonblank(guarani_path)

    # Pairing is purely positional, so unequal counts mean the files
    # cannot be aligned.
    if len(en_lines) != len(gn_lines):
        raise ValueError(f"Line mismatch: {len(en_lines)} English vs {len(gn_lines)} Guarani.")

    records = []
    for en, gn in zip(en_lines, gn_lines):
        records.append({"translation": {"en": en, "gn": gn}})
    return Dataset.from_list(records)

def extract_translation_columns(dataset):
    """Map every row to its ``en``/``gn`` strings taken from the nested ``translation`` dict.

    Args:
        dataset: Object exposing ``map(function)`` whose rows contain a
            ``"translation"`` mapping with ``"en"`` and ``"gn"`` entries.

    Returns:
        The result of ``dataset.map`` applied with the per-row function.
    """

    def lift_pair(row):
        pair = row["translation"]
        return {"en": pair["en"], "gn": pair["gn"]}

    return dataset.map(lift_pair)
# Normalize apostrophe variants to a single character (ʼ) so the references match Google Translate's output.
def normalize_glottal_stop(text):
    """Replace apostrophe variants in ``text`` with the single character ``ʼ``.

    Three spellings are rewritten, in this order: a backslash followed by a
    straight apostrophe, a bare straight apostrophe, and the curly
    apostrophe ``’``.

    Args:
        text: String to normalize.

    Returns:
        A copy of ``text`` with every listed variant replaced by ``ʼ``.
    """
    normalized = text
    # The backslash-escaped form must go first; otherwise the bare-apostrophe
    # pass would consume its quote and leave a stray backslash behind.
    for variant in ("\\'", "'", "’"):
        normalized = normalized.replace(variant, "ʼ")
    return normalized

def normalize_dataset(dataset):
    """Map every row to its ``en`` text unchanged and its ``gn`` text normalized.

    Args:
        dataset: Object exposing ``map(function)`` whose rows have ``"en"``
            and ``"gn"`` string entries.

    Returns:
        The result of ``dataset.map`` applied with the per-row function.
    """

    def normalize_row(row):
        # Only the Guarani side is passed through normalize_glottal_stop.
        return {"en": row["en"], "gn": normalize_glottal_stop(row["gn"])}

    return dataset.map(normalize_row)
# Build the evaluation set: the first NUM_EVAL_SENTENCES sentence pairs, with
# the Guarani references normalized the same way the system output will be.
NUM_EVAL_SENTENCES = 100
test_dataset = load_flores_for_bleu(en_devtest_path, gn_devtest_path)
test_dataset = extract_translation_columns(test_dataset)
# Clamp to the dataset size so a file with fewer rows than requested does not
# make select() fail on out-of-range indices.
test_dataset = test_dataset.select(range(min(NUM_EVAL_SENTENCES, len(test_dataset))))
test_dataset = normalize_dataset(test_dataset)
source_sentences = test_dataset["en"]
# The "gn" column was already normalized by normalize_dataset; list() just
# materializes it as a plain list for the scorer.
reference_sentences = list(test_dataset["gn"])

translator = GoogleTranslator(source='en', target='gn')
translated_sentences = []
failed_count = 0  # sentences recorded as "" because translation failed
# Translate one sentence at a time. A failure is stored as an empty hypothesis
# so that hypotheses stay index-aligned with the references.
for idx, sentence in enumerate(source_sentences, 1):
    try:
        translation = translator.translate(sentence)
        if not translation:
            # Report a None/empty reply as a clear error rather than letting it
            # surface as an unrelated AttributeError inside the normalizer.
            raise ValueError("translator returned no text")
        translated_sentences.append(normalize_glottal_stop(translation))
        print(f"Translated {idx}/{len(source_sentences)}", end='\r')
    except Exception as e:
        # Best-effort by design: log the problem and keep going.
        print(f"\nError translating: {sentence} -> {e}")
        translated_sentences.append("")
        failed_count += 1

if failed_count:
    # Empty hypotheses lower the corpus score, so make the count visible
    # next to the reported metric.
    print(f"\nWarning: {failed_count}/{len(source_sentences)} sentences failed and are scored as empty strings.")

# Corpus-level evaluation of the system output against a single reference
# stream. word_order=2 adds word n-grams to the character n-grams, which is
# the chrF++ variant named in the printed message.
chrf_score = sacrebleu.corpus_chrf(
    translated_sentences,
    [reference_sentences],
    word_order=2,
)
print(f"\nChrF++ score for Google Translate: {chrf_score.score:.2f}")

Map:   0%|          | 0/1012 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Translated 100/100
ChrF++ score for Google Translate: 32.24
