In [1]:
# Show the GPU, driver/CUDA version and current memory usage before running the evaluation.
!nvidia-smi

Thu May 22 08:21:31 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 570.133.20             Driver Version: 570.133.20     CUDA Version: 12.8     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-80GB          Off |   00000000:00:10.0 Off |                    0 |
| N/A   31C    P0             47W /  400W |   62537MiB /  81920MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
import os
import logging

# Configure logging: set the root logger threshold to INFO so that progress
# messages from this notebook (and from libraries that log at INFO) are shown.
logging.basicConfig(level=logging.INFO)
# Logger for this notebook's own messages, named after the current module.
logger = logging.getLogger(__name__)


In [3]:
!pip install -q transformers datasets evaluate accelerate bert-score

In [4]:
from dotenv import load_dotenv
from huggingface_hub import login

# Load variables from a local .env file (if present), then authenticate with the
# Hugging Face Hub only when a token is actually available.
load_dotenv()
hf_token = os.getenv("HF_TOKEN")
if not hf_token:
    logger.warning("HF_TOKEN not found in environment variables. Some operations may fail.")
else:
    login(token=hf_token)
    logger.info("Successfully authenticated with Hugging Face.")

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.
INFO:__main__:Successfully authenticated with Hugging Face.


In [5]:
import torch
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, GenerationConfig
import evaluate
from tqdm import tqdm

# Evaluation settings shared by the cells below: generation length cap, language
# codes for the tokenizer, translation batch size, and the checkpoints to load.
CONFIG = dict(
    max_length=128,
    source_lang="eng_Latn",
    target_lang="khm_Khmr",
    batch_size=32,
    model_name="lyfeyvutha/nllb_350M_en_km_v10",
    tokenizer_name="facebook/nllb-200-distilled-600M",
)

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


INFO:datasets:PyTorch version 2.4.1+cu121 available.


In [6]:
# Build the English -> Khmer evaluation set from every split of the ALT corpus.
# NOTE(review): all splits are pooled here; confirm the evaluated model was not
# trained on any of them, otherwise the reported scores are optimistic.
alt_dataset = load_dataset("mutiyama/alt")
eval_data = []
for split_name in alt_dataset.keys():
    for item in alt_dataset[split_name]:
        translations = item.get("translation") or {}
        english = translations.get("en")
        khmer = translations.get("khm")
        # Checking only for key presence lets pairs with a missing (None) or blank
        # side through; those would later be scored as empty strings and distort
        # the metrics. Keep a pair only when both sides contain real text.
        if (
            isinstance(english, str) and english.strip()
            and isinstance(khmer, str) and khmer.strip()
        ):
            eval_data.append({
                'eng': english,
                'khm': khmer
            })

# Fail early with a clear message instead of evaluating on an empty set.
assert eval_data, "No usable en/khm sentence pairs found in mutiyama/alt"
eval_dataset = Dataset.from_list(eval_data)

In [7]:
# Tokenizer is loaded from the checkpoint named in CONFIG["tokenizer_name"] and
# configured with the source/target language codes.
tokenizer = AutoTokenizer.from_pretrained(
    CONFIG["tokenizer_name"],
    src_lang=CONFIG["source_lang"],
    tgt_lang=CONFIG["target_lang"],
)

# Translation model under evaluation, moved to the selected device.
model = AutoModelForSeq2SeqLM.from_pretrained(CONFIG["model_name"])
model.to(device)

# Force decoding to begin with the target-language token and cap the output length.
khm_token_id = tokenizer.convert_tokens_to_ids(CONFIG["target_lang"])
generation_config = GenerationConfig(
    forced_bos_token_id=khm_token_id,
    max_length=CONFIG["max_length"],
)



In [8]:
# Split the collected pairs into parallel lists: model inputs and gold references.
english_sentences = [pair["eng"] for pair in eval_data]
khmer_references = [pair["khm"] for pair in eval_data]

In [9]:
def translate_batch(sentences, model, tokenizer, generation_config, device, batch_size=32):
    """Translate ``sentences`` in fixed-size batches, preserving input order.

    Args:
        sentences: Sequence of source texts. Entries that are not strings
            (e.g. ``None``) or that are empty/whitespace-only are not sent to
            the model and produce ``""`` at the same position.
        model: Object whose ``generate(**inputs, generation_config=...)``
            returns a 2-D tensor of token ids, one row per input sentence.
        tokenizer: Callable that tokenizes a list of strings into a mapping of
            tensors, and provides ``batch_decode``.
        generation_config: Passed through unchanged to ``model.generate``.
        device: Device the tokenized inputs are moved to before generation.
        batch_size: Number of sentences per batch; must be >= 1.

    Returns:
        A list of translations with the same length and order as
        ``sentences``. Positions belonging to a batch that failed hold ``""``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1. (A negative value
            would otherwise silently yield an empty result.)
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    all_translations = []
    failed_batches = 0
    for i in tqdm(range(0, len(sentences), batch_size), desc="Translating"):
        batch = sentences[i:i+batch_size]

        # Positions (within this batch) of entries that can actually be translated.
        valid_indices = [
            idx for idx, sentence in enumerate(batch)
            if isinstance(sentence, str) and sentence.strip()
        ]
        valid_sentences = [batch[idx] for idx in valid_indices]

        batch_translations = [""] * len(batch)
        if valid_sentences:
            try:
                inputs = tokenizer(valid_sentences, return_tensors="pt", padding=True, truncation=True)
                inputs = {k: v.to(device) for k, v in inputs.items()}
                with torch.no_grad():
                    output_ids = model.generate(**inputs, generation_config=generation_config)
                # Drop the first generated position of every row before decoding,
                # then decode the whole batch in one call.
                decoded = tokenizer.batch_decode(output_ids[:, 1:], skip_special_tokens=True)
                for idx, translation in zip(valid_indices, decoded):
                    batch_translations[idx] = translation
            except Exception as e:
                # Best effort: keep going, but record the traceback so the cause
                # of the empty translations can be diagnosed afterwards.
                failed_batches += 1
                logger.exception(f"Error in batch {i//batch_size}: {e}")
        all_translations.extend(batch_translations)

    if failed_batches:
        logger.warning(
            "%d batch(es) failed; their translations are empty and will affect any scores.",
            failed_batches,
        )
    return all_translations


In [10]:
# Translate every English sentence; output order matches `english_sentences`.
logger.info("Starting batch translation...")
predictions = translate_batch(
    sentences=english_sentences,
    model=model,
    tokenizer=tokenizer,
    generation_config=generation_config,
    device=device,
    batch_size=CONFIG["batch_size"],
)

INFO:__main__:Starting batch translation...
Translating: 100%|██████████| 629/629 [04:32<00:00,  2.31it/s]


In [11]:
# Replace any None entry with an empty string before handing the lists to the metrics.
khmer_references_clean = ["" if ref is None else ref for ref in khmer_references]
predictions_clean = ["" if pred is None else pred for pred in predictions]

In [12]:
# Score the predictions against the references with the chrF metric.
logger.info("Calculating chrF score...")
chrf_metric = evaluate.load("chrf")
chrf_result = chrf_metric.compute(
    references=khmer_references_clean,
    predictions=predictions_clean,
)
print(f"chrF score: {chrf_result['score']:.4f}")

INFO:__main__:Calculating chrF score...


chrF score: 38.8338


In [16]:
# Score the predictions with BERTScore using a multilingual BERT encoder and
# report the mean F1 over all returned values.
logger.info("Calculating BERTScore...")
from bert_score import score
P, R, F1 = score(
    cands=predictions_clean,
    refs=khmer_references_clean,
    model_type="bert-base-multilingual-cased",
    lang="other",
)
print(f"BERTScore F1: {F1.mean().item():.4f}")


INFO:__main__:Calculating BERTScore...


BERTScore F1: 0.8608


