In [1]:
!pip install --no-cache-dir transformers==4.51.3 datasets==3.5.0 peft==0.15.2 evaluate==0.4.3 sacrebleu==2.5.1
!pip install --no-cache-dir accelerate==1.6.0 -U
!pip install --no-cache-dir tensorboard==2.18.0 matplotlib==3.10.1

Collecting datasets==3.5.0
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate==0.4.3
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting sacrebleu==2.5.1
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets==3.5.0)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets==3.5.0)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets==3.5.0)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets==3.5.0)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting portalocker (from sacrebleu==2.5.1)
  Downloading

# Imports

In [2]:
import os
import numpy as np
import torch
from datasets import load_dataset

from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
    EarlyStoppingCallback
)
from peft import (
    LoraConfig,
    get_peft_model,
    TaskType
)
import evaluate
import matplotlib.pyplot as plt
import pandas as pd
from transformers.modeling_utils import PreTrainedModel
import random

# Set device


In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


# Define globals


In [4]:
SRC_LANG = "tuk_Latn"  # Turkmen in Latin script
TGT_LANG = "eng_Latn"  # English in Latin script
MODEL_NAME = "facebook/nllb-200-distilled-600M"
TUNNED_MODEL_NAME = "./final-nllb-turkmen-english-lora"
OUTPUT_DIR = "./nllb-200-turkmen-english-lora"
TEST_LIMIT = 20


# Load model and tokenizer


In [5]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

tokenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/3.55k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/846 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

# Load the dataset from Hugging Face


In [6]:
dataset = load_dataset("XSkills/turkmen_english_s500")
print(dataset)

# Check the first example to understand the structure
print("Sample data:", dataset["train"][0])

README.md:   0%|          | 0.00/7.21k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/101k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/19.2k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/17.2k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/495 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/62 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/62 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 495
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 62
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 62
    })
})
Sample data: {'translation': {'en': 'Philosophical sonorous poems of great Magtymguly, passed through centuries and reached our days, his appeals to humanism and love to native land, his Fatherland, his wise precepts enriched the spiritual life of whole humanity.', 'tk': 'Magtymguly Pyragynyñ asyrlar aşyp, biziñ döwrümize gelip ýeten müñ dürli öwüşginli şygyrlary, ene topragy, ata Watany ýürekden söýmäge we ynsanperwerlige çagyryşlary, parasatly sargytlary bütin adamzadyñ ruhy gymmatlygyna öwrüldi.'}}


# Preprocess the datasets

In [7]:
def preprocess_function(examples):
    inputs = [example["tk"] for example in examples["translation"]]
    targets = [example["en"] for example in examples["translation"]]

    # Set source and target languages
    tokenizer.src_lang = SRC_LANG
    tokenizer.tgt_lang = TGT_LANG

    # Tokenize inputs and targets in a single call
    model_inputs = tokenizer(
        inputs,
        text_target=targets,  # Use text_target parameter
        max_length=128,
        truncation=True,
        padding="max_length",
        return_tensors="pt"
    )

    # Replace pad token id with -100 in labels so it's ignored in loss calculation
    labels = model_inputs["labels"].clone()
    model_inputs["labels"] = [
        [(label if label != tokenizer.pad_token_id else -100) for label in labels_example]
        for labels_example in labels
    ]

    return model_inputs

# Preprocess the datasets
tokenized_datasets = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset["train"].column_names
)


Map:   0%|          | 0/495 [00:00<?, ? examples/s]

Map:   0%|          | 0/62 [00:00<?, ? examples/s]

Map:   0%|          | 0/62 [00:00<?, ? examples/s]

# Configure LoRA


In [8]:
# Configure LoRA
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM,
)


# Custom model preparation function that doesn't rely on PEFT-specific functions
print("Preparing model for LoRA fine-tuning...")
# Make most parameters non-trainable
for param in model.parameters():
    param.requires_grad = False

# Make only attention layers trainable
for name, module in model.named_modules():
    # if any(target_name in name for target_name in ["q_proj", "v_proj", "k_proj", "o_proj"]):
    if any(target_name in name for target_name in ["q_proj", "v_proj"]):
        for param_name, param in module.named_parameters():
            param.requires_grad = True

print("Applying LoRA configuration...")
# Apply LoRA
model = get_peft_model(model, lora_config)

# Check trainable parameters
trainable_params = 0
all_params = 0
for _, param in model.named_parameters():
    all_params += param.numel()
    if param.requires_grad:
        trainable_params += param.numel()
print(f"Trainable params: {trainable_params} | All params: {all_params} | Trainable%: {100 * trainable_params / all_params:.2f}%")

# Data collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    padding="max_length",
    max_length=128,
    return_tensors="pt"
)


Preparing model for LoRA fine-tuning...
Applying LoRA configuration...
Trainable params: 2359296 | All params: 617433088 | Trainable%: 0.38%


# Evaluation metrics

In [9]:
# Load multiple evaluation metrics
bleu_metric = evaluate.load("sacrebleu")
chrf_metric = evaluate.load("chrf")
ter_metric = evaluate.load("ter")

def postprocess_text(preds, labels):
    """Helper function to postprocess text"""
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]
    return preds, labels

def compute_metrics(eval_preds):
    """Compute comprehensive translation metrics"""
    preds, labels = eval_preds

    # Decode predictions
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # Replace -100 with pad_token_id before decoding
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Postprocess
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    # BLEU expects list of lists for references
    references = [[label] for label in decoded_labels]

    # Calculate metrics
    bleu_result = bleu_metric.compute(predictions=decoded_preds, references=references)
    chrf_result = chrf_metric.compute(predictions=decoded_preds, references=decoded_labels)
    ter_result = ter_metric.compute(predictions=decoded_preds, references=decoded_labels)

    # Return combined results
    results = {
        "bleu": bleu_result["score"],
        "chrf": chrf_result["score"],
        "ter": ter_result["score"]
    }

    # Save some example translations for qualitative review
    if not hasattr(compute_metrics, "examples_saved"):
        n_examples = min(10, len(decoded_preds))
        examples_df = pd.DataFrame({
            "prediction": decoded_preds[:n_examples],
            "reference": decoded_labels[:n_examples]
        })
        examples_df.to_csv("translation_examples.csv", index=False)
        compute_metrics.examples_saved = True

    return results


Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/9.01k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/9.99k [00:00<?, ?B/s]

# Training

In [11]:
# Training arguments with improved settings
training_args = Seq2SeqTrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    weight_decay=0.005,
    save_total_limit=3,
    learning_rate=1e-5,
    num_train_epochs=5,
    lr_scheduler_type="cosine",
    predict_with_generate=True,
    fp16=True if torch.cuda.is_available() else False,
    logging_dir="./logs",
    logging_steps=50,
    eval_steps=50,
    save_steps=100,
    eval_accumulation_steps=2,
    report_to="tensorboard",
    warmup_ratio=0.1,
    metric_for_best_model="eval_bleu",  # Use BLEU for model selection
    greater_is_better=True,
)

# Initialize trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Train model
print("Starting training...")
trainer.train()

  trainer = Seq2SeqTrainer(
No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Starting training...


Step,Training Loss
50,2.0983
100,2.0223
150,2.0692
200,2.0208
250,2.0339
300,2.0342
350,1.9331
400,1.993
450,2.0064
500,1.9721


TrainOutput(global_step=620, training_loss=2.011208958779612, metrics={'train_runtime': 91.0354, 'train_samples_per_second': 27.187, 'train_steps_per_second': 6.811, 'total_flos': 674932536115200.0, 'train_loss': 2.011208958779612, 'epoch': 5.0})

# Evaluation

## Evaluate on the test set


In [12]:
test_results = trainer.evaluate(tokenized_datasets["test"], metric_key_prefix="test")
print(f"Test results: {test_results}")

Test results: {'test_loss': 1.7573912143707275, 'test_bleu': 0.06753808912680619, 'test_chrf': 1.3068834590475353, 'test_ter': 101.03866128101558, 'test_runtime': 12.5003, 'test_samples_per_second': 4.96, 'test_steps_per_second': 0.64, 'epoch': 5.0}


## Save the fine-tuned model


In [13]:
trainer.save_model(TUNNED_MODEL_NAME)

## Define an improved translation function for inference


In [14]:

# --- Helper Functions ---

# Mapping from simple codes (like in dataset) to NLLB codes
LANG_CODE_MAP = {
    "en": "eng_Latn",
    "tk": "tuk_Latn",
}

def translate(text, model, tokenizer, src_lang_code, tgt_lang_code):
    """
    Translate text from source language to target language using NLLB format.
    """
    try:
        tokenizer.src_lang = src_lang_code
        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)

        # Move inputs to the same device the model is on
        inputs = {k: v.to(model.device) for k, v in inputs.items()}

        # Get the target language token ID
        tgt_lang_token_id = tokenizer.convert_tokens_to_ids(tgt_lang_code)
        if tgt_lang_token_id is None:
            print(f"Warning: Target language code '{tgt_lang_code}' not found in tokenizer. Using default generation.")
            tgt_lang_token_id = tokenizer.eos_token_id

        # Generate translation
        with torch.no_grad():
            translated_tokens = model.generate(
                **inputs,
                forced_bos_token_id=tgt_lang_token_id,
                max_length=128,
                num_beams=5,
                early_stopping=True
            )

        translation = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
        return translation
    except Exception as e:
        print(f"Error during translation of '{text[:50]}...': {e}")
        return "[Translation Error]"


# --- Evaluation Functions --- (evaluate_model_performance remains largely the same)
def evaluate_model_performance(ft_model, orig_model, tokenizer, test_data, src_key, tgt_key, max_examples=50):
    """
    Evaluate and compare fine-tuned vs original model on the test set
    for a specific translation direction (src_key -> tgt_key).
    """
    print(f"\n--- Evaluating Translation: {src_key} -> {tgt_key} ---")

    # --- Add Checks for Model Types ---
    if not isinstance(ft_model, PreTrainedModel):
        print(f"ERROR: Fine-tuned model provided to evaluate_model_performance is not a valid model. Type: {type(ft_model)}")
        return None, None
    if not isinstance(orig_model, PreTrainedModel):
        print(f"ERROR: Original model provided to evaluate_model_performance is not a valid model. Type: {type(orig_model)}")
        return None, None
    # --- End Checks ---


    if src_key not in LANG_CODE_MAP or tgt_key not in LANG_CODE_MAP:
        print(f"Error: Language keys '{src_key}' or '{tgt_key}' not found in LANG_CODE_MAP.")
        return None, None

    src_lang_code = LANG_CODE_MAP[src_key]
    tgt_lang_code = LANG_CODE_MAP[tgt_key]

    num_examples = min(len(test_data), max_examples)
    print(f"Evaluating on {num_examples} test examples...")

    results = []
    successful_translations = 0
    for i in range(num_examples):
        example = test_data[i]
        if "translation" not in example or src_key not in example["translation"] or tgt_key not in example["translation"]:
            print(f"Warning: Skipping example {i} due to missing keys.")
            continue

        source_text = example["translation"][src_key]
        reference_translation = example["translation"][tgt_key]

        fine_tuned_trans = translate(source_text, ft_model, tokenizer, src_lang_code, tgt_lang_code)
        original_trans = translate(source_text, orig_model, tokenizer, src_lang_code, tgt_lang_code)

        is_ft_error = "[Translation Error" in fine_tuned_trans # Check if error placeholder returned
        is_orig_error = "[Translation Error" in original_trans

        if not is_ft_error and not is_orig_error:
            results.append({
                "source": source_text,
                "reference": reference_translation,
                "fine_tuned": fine_tuned_trans,
                "original": original_trans
            })
            successful_translations += 1
        else:
             print(f"Skipping results for example {i} due to translation error (FT: {is_ft_error}, Orig: {is_orig_error}).")

    if not results:
        print("No successful translations were generated for evaluation.")
        return None, None

    print(f"Successfully translated {successful_translations}/{num_examples} examples.")
    results_df = pd.DataFrame(results)

    # Calculate metrics
    predictions_ft = results_df["fine_tuned"].tolist()
    predictions_orig = results_df["original"].tolist()
    references_list = [[ref] for ref in results_df["reference"].tolist()]

    metrics_summary = {}
    try:
        ft_bleu = bleu_metric.compute(predictions=predictions_ft, references=references_list)['score']
        orig_bleu = bleu_metric.compute(predictions=predictions_orig, references=references_list)['score']

        references_flat = results_df["reference"].tolist()
        ft_chrf = chrf_metric.compute(predictions=predictions_ft, references=references_flat)['score']
        orig_chrf = chrf_metric.compute(predictions=predictions_orig, references=references_flat)['score']

        ft_ter = ter_metric.compute(predictions=predictions_ft, references=references_flat)['score']
        orig_ter = ter_metric.compute(predictions=predictions_orig, references=references_flat)['score']

        metrics_summary = {
            "BLEU": {"Fine-tuned": ft_bleu, "Original": orig_bleu},
            "chrF": {"Fine-tuned": ft_chrf, "Original": orig_chrf},
            "TER": {"Fine-tuned": ft_ter, "Original": orig_ter}
        }

        print("\nModel Comparison Metrics:")
        print(f"BLEU:  Fine-tuned: {ft_bleu:.2f}  Original: {orig_bleu:.2f}")
        print(f"chrF:  Fine-tuned: {ft_chrf:.2f}  Original: {orig_chrf:.2f}")
        print(f"TER:   Fine-tuned: {ft_ter:.2f}  Original: {orig_ter:.2f}  (lower is better)")

    except Exception as e:
        print(f"Error calculating metrics: {e}")
        return results_df, None # Return results but indicate metric error


    # Save results
    results_filename = f"model_comparison_results_{src_key}_to_{tgt_key}.csv"
    results_df.to_csv(results_filename, index=False)
    print(f"Saved comparison results to {results_filename}")

    # Create comparison visualization (only if metrics were calculated)
    if metrics_summary:
        metrics_labels = ["BLEU", "chrF", "TER"]
        ft_scores = [metrics_summary["BLEU"]["Fine-tuned"], metrics_summary["chrF"]["Fine-tuned"], metrics_summary["TER"]["Fine-tuned"]]
        orig_scores = [metrics_summary["BLEU"]["Original"], metrics_summary["chrF"]["Original"], metrics_summary["TER"]["Original"]]

        plt.figure(figsize=(10, 6))
        x = np.arange(len(metrics_labels))
        width = 0.35

        plt.bar(x - width/2, ft_scores, width, label='Fine-tuned Model')
        plt.bar(x + width/2, orig_scores, width, label='Original Model')

        plt.ylabel('Score')
        plt.title(f'Model Performance Comparison ({src_key.upper()} -> {tgt_key.upper()})')
        plt.xticks(x, metrics_labels)
        plt.legend()
        plt.figtext(0.5, 0.01, "Note: For TER, lower scores are better",
                    horizontalalignment='center', fontsize=10)
        plt.tight_layout(rect=[0, 0.03, 1, 0.95])
        plot_filename = f"model_comparison_{src_key}_to_{tgt_key}.png"
        plt.savefig(plot_filename)
        print(f"Saved comparison plot to {plot_filename}")
        plt.close()

    # Display example translations
    print("\nExample Translations:")
    for i in range(min(5, len(results_df))):
        print(f"\nExample {i+1}:")
        print(f"Source ({src_key}): {results_df.iloc[i]['source']}")
        print(f"Reference ({tgt_key}): {results_df.iloc[i]['reference']}")
        print(f"Fine-tuned: {results_df.iloc[i]['fine_tuned']}")
        print(f"Original:   {results_df.iloc[i]['original']}")

    return results_df, metrics_summary


# --- Human Evaluation Function --- (human_evaluation_samples remains largely the same)
def human_evaluation_samples(ft_model, orig_model, tokenizer, test_data, num_samples=10):
    """
    Generates translations for a sample of sentences in both directions
    for manual human review.
    """
    print("\n--- Generating Samples for Human Evaluation ---")

    # --- Add Checks for Model Types ---
    if not isinstance(ft_model, PreTrainedModel):
        print(f"ERROR: Fine-tuned model provided to human_evaluation_samples is not a valid model. Type: {type(ft_model)}")
        return
    if not isinstance(orig_model, PreTrainedModel):
        print(f"ERROR: Original model provided to human_evaluation_samples is not a valid model. Type: {type(orig_model)}")
        return
    # --- End Checks ---

    if not test_data:
        print("No test data available for human evaluation.")
        return

    try:
        keys = list(test_data[0]["translation"].keys())
        if len(keys) != 2:
            print("Error: Expected exactly two language keys in dataset 'translation' field.")
            return
        key1, key2 = keys[0], keys[1]
        print(f"Using language keys for human eval: {key1}, {key2}")
    except (KeyError, IndexError, TypeError):
        print("Error: Could not determine language keys from test_data[0]['translation'].")
        return

    if key1 not in LANG_CODE_MAP or key2 not in LANG_CODE_MAP:
         print(f"Error: Language keys '{key1}' or '{key2}' not found in LANG_CODE_MAP for human eval.")
         return

    lang_code1 = LANG_CODE_MAP[key1]
    lang_code2 = LANG_CODE_MAP[key2]

    num_available = len(test_data)
    if num_available == 0:
        print("Test data is empty.")
        return

    actual_num_samples = min(num_samples, num_available)
    if actual_num_samples <= 0:
        print("No samples requested or available for human evaluation.")
        return

    # Select random indices using the imported random module
    sample_indices = random.sample(range(num_available), actual_num_samples)

    print(f"\n--- {key1.upper()} -> {key2.upper()} Translation Samples ({actual_num_samples} samples)---")
    for i, index in enumerate(sample_indices):
        example = test_data[index]
        source_text = example["translation"][key1]
        reference_text = example["translation"][key2]

        ft_translation = translate(source_text, ft_model, tokenizer, lang_code1, lang_code2)
        orig_translation = translate(source_text, orig_model, tokenizer, lang_code1, lang_code2)

        print(f"\nSample {i+1} ({key1} -> {key2}):")
        print(f"  Source ({key1}):    {source_text}")
        print(f"  Reference ({key2}): {reference_text}")
        print(f"  Fine-tuned:  {ft_translation}")
        print(f"  Original:    {orig_translation}")
        print("-" * 20)

    print(f"\n--- {key2.upper()} -> {key1.upper()} Translation Samples ({actual_num_samples} samples)---")
    for i, index in enumerate(sample_indices):
        example = test_data[index]
        source_text = example["translation"][key2]
        reference_text = example["translation"][key1]

        ft_translation = translate(source_text, ft_model, tokenizer, lang_code2, lang_code1)
        orig_translation = translate(source_text, orig_model, tokenizer, lang_code2, lang_code1)

        print(f"\nSample {i+1} ({key2} -> {key1}):")
        print(f"  Source ({key2}):    {source_text}")
        print(f"  Reference ({key1}): {reference_text}")
        print(f"  Fine-tuned:  {ft_translation}")
        print(f"  Original:    {orig_translation}")
        print("-" * 20)


# --- Main Execution ---
def run_evaluation():
    """Loads models and runs the full evaluation suite."""
    print("Running comprehensive model evaluation...")

    # Load models
    ft_model = AutoModelForSeq2SeqLM.from_pretrained(TUNNED_MODEL_NAME)
    orig_model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

    # Load tokenizer if needed (if not already loaded)
    # tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    # Validation checks
    print("Checking loaded model types before evaluation...")
    if not isinstance(ft_model, PreTrainedModel):
        print(f"CRITICAL ERROR: ft_model is not a valid model. Type: {type(ft_model)}")
        return
    if not isinstance(orig_model, PreTrainedModel):
        print(f"CRITICAL ERROR: orig_model is not a valid model. Type: {type(orig_model)}")
        return
    print("Model types verified.")

    # IMPORTANT: Load your dataset directly here rather than using a global variable
    # This ensures we're using the real dataset and not any dummy data
    from datasets import load_dataset
    real_dataset = load_dataset("XSkills/turkmen_english_s500")
    test_dataset = real_dataset["test"]  # You can also use validation if preferred

    # Print a sample to verify we're using the real dataset
    print("Verifying real test data is loaded:")
    for i in range(min(2, len(test_dataset))):
        print(f"Sample {i} (first 50 chars): {test_dataset[i]['translation']['en'][:50]}...")

    # Determine language direction
    try:
        example_keys = list(test_dataset[0]["translation"].keys())
        if len(example_keys) != 2:
            raise ValueError("Dataset must contain exactly two language keys.")
        src_key, tgt_key = example_keys[0], example_keys[1]
        print(f"Detected language keys for evaluation: {src_key} -> {tgt_key}")
    except (IndexError, KeyError, TypeError, ValueError) as e:
        print(f"Error determining language keys from dataset: {e}")
        return

    # Run evaluation
    evaluate_model_performance(ft_model, orig_model, tokenizer, test_dataset, src_key, tgt_key)
    print("\nChecking for reverse direction evaluation...")
    evaluate_model_performance(ft_model, orig_model, tokenizer, test_dataset, tgt_key, src_key)

    # Generate human evaluation samples
    human_evaluation_samples(ft_model, orig_model, tokenizer, test_dataset, num_samples=10)

    print("\nEvaluation complete!")

## Run the evaluation


In [None]:
print("\nRunning comprehensive model evaluation...")
run_evaluation()


print("\nTraining and evaluation complete!")


Running comprehensive model evaluation...
Running comprehensive model evaluation...
Checking loaded model types before evaluation...
Model types verified.
Verifying real test data is loaded:
Sample 0 (first 50 chars): A report was also made on the practical steps take...
Sample 1 (first 50 chars): The plan for the production of carpet items by the...
Detected language keys for evaluation: en -> tk

--- Evaluating Translation: en -> tk ---
Evaluating on 50 test examples...
Successfully translated 50/50 examples.

Model Comparison Metrics:
BLEU:  Fine-tuned: 8.24  Original: 8.12
chrF:  Fine-tuned: 39.55  Original: 39.46
TER:   Fine-tuned: 87.20  Original: 87.30  (lower is better)
Saved comparison results to model_comparison_results_en_to_tk.csv
Saved comparison plot to model_comparison_en_to_tk.png

Example Translations:

Example 1:
Source (en): A report was also made on the practical steps taken to widely use the digital system and improve the professionalism of the employees of the pr