In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import json
from datasets import Dataset
from transformers import AutoTokenizer
import os

def load_and_process_xquad(file_path, tokenizer):
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    processed_data = []

    for article in data["data"]:
        for paragraph in article["paragraphs"]:
            context = paragraph["context"]
            for qa in paragraph["qas"]:
                question = qa["question"]
                answer = qa["answers"][0]["text"]

                # Create Q&A format with special tokens
                formatted_text = (
                    f"<|begin_of_text|><sah>Контекст: {context}\n"
                    f"Соруйаан: {question}\n"
                    f"Эппиэт: {answer}<|end_of_text|>"
                )

                # Also store components separately for evaluation
                processed_data.append({
                    "context": context,
                    "question": question,
                    "answer": answer,
                    "text": formatted_text,
                    "context_length": len(tokenizer.encode(context))
                })

    return processed_data

# Load tokenizer and process dataset
tokenizer = AutoTokenizer.from_pretrained("/content/drive/MyDrive/yakut-llama2-finetuned")
xquad_data = load_and_process_xquad("/content/drive/MyDrive/Phase3_train_data/xquad_train_sah.json", tokenizer)
dataset = Dataset.from_dict({
    "text": [d["text"] for d in xquad_data],
    "context": [d["context"] for d in xquad_data],
    "question": [d["question"] for d in xquad_data],
    "answer": [d["answer"] for d in xquad_data],
    "context_length": [d["context_length"] for d in xquad_data]
})

# Tokenization function
def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=1024, 
        padding="max_length",
    )

# Apply tokenization
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    num_proc=4,
    remove_columns=["text", "context", "question", "answer"]
)

Map (num_proc=4):   0%|          | 0/952 [00:00<?, ? examples/s]

  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


In [None]:
!pip install transformers accelerate --upgrade




In [None]:
import json
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from datasets import Dataset
import numpy as np

# 1. Load model and tokenizer
model_path = "/content/drive/MyDrive/yakut-llama2-finetuned"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)

# 2. Enable RoPE scaling for longer contexts
model.config.rope_scaling = {
    "type": "dynamic",  # Dynamic NTK-aware scaling
    "factor": 2.0       # Extend context length by 2x
}
print(f"Updated RoPE scaling: {model.config.rope_scaling}")

# 3. Load and process XQuAD dataset
def load_and_process_xquad(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    processed_data = []

    for article in data["data"]:
        for paragraph in article["paragraphs"]:
            context = paragraph["context"]
            for qa in paragraph["qas"]:
                question = qa["question"]
                answer = qa["answers"][0]["text"]

                # Create Q&A format with special tokens
                formatted_text = (
                    f"<|begin_of_text|><sah>Контекст: {context}\n"
                    f"Соруйаан: {question}\n"
                    f"Эппиэт: {answer}<|end_of_text|>"
                )

                # Store components
                processed_data.append({
                    "text": formatted_text,
                    "context": context,
                    "question": question,
                    "answer": answer
                })

    return processed_data

# Load dataset
xquad_data = load_and_process_xquad("/content/drive/MyDrive/Phase3_train_data/xquad_train_sah.json")
dataset = Dataset.from_list(xquad_data)

# 4. Tokenization function
def tokenize_function(examples):
    tokenized = tokenizer(
        examples["text"],
        truncation=True,
        max_length=4096,  # Increased context length
        padding=False
    )

    # Add answer start positions
    tokenized["answer_starts"] = []
    for text in examples["text"]:
        # Find position of "Эппиэт:" in the text
        answer_start = text.find("Эппиэт:") + len("Эппиэт:")
        # Convert to token position
        char_to_token = tokenized.char_to_token(0, answer_start)
        tokenized["answer_starts"].append(char_to_token if char_to_token is not None else 0)

    return tokenized

# Apply tokenization
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    batch_size=4,
    num_proc=4,
    remove_columns=["text", "context", "question", "answer"]
)

# 5. Data Collator with Answer Focus
class QADataCollator(DataCollatorForLanguageModeling):
    def __call__(self, features):
        batch = super().__call__(features)

        # Create loss mask to focus only on answer part
        loss_mask = torch.zeros_like(batch["input_ids"])
        for i, feature in enumerate(features):
            if "answer_starts" in feature and feature["answer_starts"] > 0:
                start_idx = feature["answer_starts"]
                loss_mask[i, start_idx:] = 1
            else:
                # If answer start not found, use whole sequence
                loss_mask[i, :] = 1

        batch["labels"] = batch["input_ids"] * loss_mask
        return batch

# 6. Training Configuration (compatible with older versions)
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/yakut-qa-finetuned",
    num_train_epochs=10, 
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,
    learning_rate=3e-5,
    warmup_steps=200,
    weight_decay=0.01,
    fp16=True,
    gradient_checkpointing=True,
    logging_steps=50,
    save_steps=500,
    eval_strategy="no",  # Disable evaluation during training
    load_best_model_at_end=False,
    report_to="none",
    dataloader_num_workers=4,
    max_grad_norm=1.0,
)

# Initialize data collator
data_collator = QADataCollator(
    tokenizer=tokenizer,
    mlm=False
)

# Split dataset
train_test_split = tokenized_dataset.train_test_split(test_size=0.1)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_test_split["train"],
)

# 7. Start training
print("Starting Q&A fine-tuning...")
trainer.train()
trainer.save_model("/content/drive/MyDrive/yakut-qa-finetuned")
print("Fine-tuning complete!")

# 8. Q&A Generation Function
def generate_answer(model, tokenizer, context, question, max_length=256):
    prompt = (
        f"<|begin_of_text|><sah>Контекст: {context}\n"
        f"Соруйаан: {question}\n"
        f"Эппиэт:"
    )

    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        max_length=4096,
        truncation=True
    ).to(model.device)

    if 'token_type_ids' in inputs:
        del inputs['token_type_ids']

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_length,
            temperature=0.7,
            top_k=40,
            top_p=0.9,
            repetition_penalty=1.1,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id,
            do_sample=True,
        )

    # Extract only the answer part
    full_output = tokenizer.decode(outputs[0], skip_special_tokens=False)
    answer_start = full_output.find("Эппиэт:") + len("Эппиэт:")
    answer = full_output[answer_start:].split("<|end_of_text|>")[0].strip()

    return answer

Updated RoPE scaling: {'type': 'dynamic', 'factor': 2.0}


Map (num_proc=4):   0%|          | 0/952 [00:00<?, ? examples/s]

  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


Starting Q&A fine-tuning...


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
50,4.0175
100,0.3002
150,0.097
200,0.0833
250,0.0651


Fine-tuning complete!


In [None]:
test_context = "<|begin_of_text|> <sah> Пантердар көмүскэллэрэ 308 эрэ очукуону биэрэн, лигаҕа алтыс миэстэҕэ таҕыста, ону тэҥэ НФЛ-ы 24 очукуолаах уонна түөрт Про Боул талыытынан киэн туттар. Про Боул көмүскэнэр атааката Каванн Шорт 11 охсуунан хамаанданы салайбыта, ону тэҥэ үс охсууну күһэйбитэ уонна иккини ылбыта. Бииргэ үлэлиир линейщик Марио Эддисон 61⁄2 мешогу эбии киллэрдэ. Пантерс линиятыгар өссө бэтэрээн көмүскэнэр Джаред Аллен, 5 төгүллээх профессиональнай боулер, НФЛ 136-нан актыыбынай карьератын салайааччыта, уонна көмүскэнэр Кони Или, 9 эрэ стартка 5 сактаах этэ. Кинилэр кэннилэриттэн, Пантерс үс стартовай лайнбекердарыттан иккитэ эмиэ Про Боулга оонньуурга талыллыбыттара: Томас Дэвис уонна Люк Куечли. Дэвис 51⁄2 сактары, түөрт күһэллибит охсууну уонна түөрт охсууну хомуйбута, оттон Куечли хамаанданы охсуһууга (118) салайбыта, икки күһэллибит охсууну оҥорбута уонна бэйэтин түөрт пастарын быһа охсубута. Каролина иккис хамаандатыгар Pro Bowl куттала суох буолуутун Курт Коулман кыттыбыта, кини хамаанданы карьератын үрдүк көрдөрүүтүнэн сэттэ перехваттаах салайбыта, ону тэҥэ 88 атааканы оҥорбута уонна Pro Bowl угловой защитнига Джош Норман, кини сезоҥҥа хааччахтааһын муннугар кубулуйбута уонна түөрт перехваттаах этэ, олортон иккитэ тачдауннарга төннүбүттэрэ."

test_question = " <|begin_of_text|> <sah> Бу сезоҥҥа хамаандаҕа ким саамай элбэх мешогу суруйда?"

answer = generate_answer(model, tokenizer, test_context, test_question)
print(f"Question: {test_question}")
print(f"Answer: {answer}")




Question:  <|begin_of_text|> <sah> Бу сезоҥҥа хамаандаҕа ким саамай элбэх мешогу суруйда?
Answer: КаваннШорт


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import re

# 1. Enhanced Yakut Text Reconstruction
def reconstruct_yakut_text(text):
    """Reconstruct Yakut text with proper word boundaries"""
    # Step 1: Add spaces around special tokens
    text = re.sub(r'(<\|[^>]+\|>)', r' \1 ', text)

    # Step 2: Handle known Yakut words and patterns
    yakut_words = [
        "хаһан", "туох", "хэн", "хайдах", "тоҕо", "ылыллыбытай", "сыллаах",
        "сэриитин", "Америка", "Европаҕа", "Монреаль", "Саха", "Республикатын",
        "Дьокуускай", "куоратка", "баар", "Хотугу", "кыргыһыытыттан", "түһэрсиигэ",
        "сыллаахха", "ылыллыбытай"
    ]

    for word in sorted(yakut_words, key=len, reverse=True):
        pattern = re.compile(re.escape(word), re.IGNORECASE)
        text = pattern.sub(r' \g<0> ', text)

    # Step 3: Handle numbers and common suffixes
    text = re.sub(r'(\d+)([а-яё])', r'\1 \2', text)  
    text = re.sub(r'([а-яё])(\d+)', r'\1 \2', text) 
    # Step 4: Handle punctuation
    text = re.sub(r'([.,!?;:])([^\s])', r'\1 \2', text) 
    text = re.sub(r'\s+([.,!?;:])', r'\1', text)         

    # Step 5: Clean up spaces
    text = re.sub(r'\s+', ' ', text) 

    return text.strip()

# 2. Improved Q&A Generation Function
def generate_qa_answer(model, tokenizer, context, question, max_new_tokens=30):
    # Build prompt with proper spacing
    prompt = (
        f"<|begin_of_text|> <sah> Контекст: {context}\n"
        f"Соруйаан: {question}\n"
        f"Эппиэт:"
    )

    # Tokenize with truncation for long contexts
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        max_length=2048,
        truncation=True
    ).to(model.device)

    # Explicitly remove token_type_ids if present
    if 'token_type_ids' in inputs:
        del inputs['token_type_ids']

    # Generate answer
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=0.7, 
            top_k=40,
            top_p=0.95,
            repetition_penalty=1.1,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id,
            num_beams=5,  
            early_stopping=True,
            do_sample=True,
        )

    # Decode only the generated part
    input_length = inputs.input_ids.shape[1]
    generated_ids = outputs[0][input_length:]
    generated_text = tokenizer.decode(generated_ids, skip_special_tokens=True)

    # Remove any trailing special tokens
    generated_text = generated_text.replace("<|end_of_text|>", "").strip()

    # Apply Yakut text reconstruction
    return reconstruct_yakut_text(generated_text)

# 3. Load Model and Tokenizer
model_path = "/content/drive/MyDrive/yakut-qa-finetuned"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path).to("cuda")

# 4. Test with your example
context = (
    "Пантердар көмүскэллэрэ 308 эрэ очукуону биэрэн, лигаҕа алтыс миэстэҕэ таҕыста, ону тэҥэ НФЛ-ы 24 очукуолаах уонна түөрт Про Боул талыытынан киэн туттар. Про Боул көмүскэнэр атааката Каванн Шорт 11 охсуунан хамаанданы салайбыта, ону тэҥэ үс охсууну күһэйбитэ уонна иккини ылбыта. Бииргэ үлэлиир линейщик Марио Эддисон 61⁄2 мешогу эбии киллэрдэ. Пантерс линиятыгар өссө бэтэрээн көмүскэнэр Джаред Аллен, 5 төгүллээх профессиональнай боулер, НФЛ 136-нан актыыбынай карьератын салайааччыта, уонна көмүскэнэр Кони Или, 9 эрэ стартка 5 сактаах этэ. Кинилэр кэннилэриттэн, Пантерс үс стартовай лайнбекердарыттан иккитэ эмиэ Про Боулга оонньуурга талыллыбыттара: Томас Дэвис уонна Люк Куечли. Дэвис 51⁄2 сактары, түөрт күһэллибит охсууну уонна түөрт охсууну хомуйбута, оттон Куечли хамаанданы охсуһууга (118) салайбыта, икки күһэллибит охсууну оҥорбута уонна бэйэтин түөрт пастарын быһа охсубута. Каролина иккис хамаандатыгар Pro Bowl куттала суох буолуутун Курт Коулман кыттыбыта, кини хамаанданы карьератын үрдүк көрдөрүүтүнэн сэттэ перехваттаах салайбыта, ону тэҥэ 88 атааканы оҥорбута уонна Pro Bowl угловой защитнига Джош Норман, кини сезоҥҥа хааччахтааһын муннугар кубулуйбута уонна түөрт перехваттаах этэ, олортон иккитэ тачдауннарга төннүбүттэрэ."
)
question = "Пантердар көмүскэллэрэ хас очукуону биэрдэ?"

# Generate and print answer
answer = generate_qa_answer(model, tokenizer, context, question)
print(f"Question: {question}")
print(f"Answer: {answer}")

# 5. Test tokenizer decoding
test_cases = [
    "<|begin_of_text|><sah>Монреальхаһанылыллыбытай?",
    "<|begin_of_text|><sah>СахаРеспубликатынкиинэтэДьокуускайкуораткабаар.",
    "<|begin_of_text|><sah>1760сыллаахха",
    "<|begin_of_text|><sah>ХотугуАмерикасэттэсыллаахсэриитинтыйаатырыгар"
]

print("\nTokenizer reconstruction tests:")
for test_text in test_cases:
    decoded = tokenizer.decode(tokenizer.encode(test_text))
    fixed = reconstruct_yakut_text(decoded)
    print(f"Original: {decoded}")
    print(f"Fixed:    {fixed}")
    print("-" * 50)

Question: Монреаль хаһан ылыллыбытай?
Answer: 1756

Tokenizer reconstruction tests:
Original: <|begin_of_text|><sah>Монреальхаһанылыллыбытай?
Fixed:    <|begin_of_text|> <sah> Монреаль хаһан ылыллыбытай?
--------------------------------------------------
Original: <|begin_of_text|><sah>СахаРеспубликатынкиинэтэДьокуускайкуораткабаар.
Fixed:    <|begin_of_text|> <sah> Саха Республикатын киинэтэ Дьокуускай куоратка баар.
--------------------------------------------------
Original: <|begin_of_text|><sah>1760сыллаахха
Fixed:    <|begin_of_text|> <sah>1760 сыллаах ха
--------------------------------------------------
Original: <|begin_of_text|><sah>ХотугуАмерикасэттэсыллаахсэриитинтыйаатырыгар
Fixed:    <|begin_of_text|> <sah> Хотугу Америка сэттэ сыллаах сэриитин тыйаатырыгар
--------------------------------------------------


In [None]:
import json
import re
import torch
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer

# 1. Enhanced Yakut Text Reconstruction
def reconstruct_yakut_text(text):
    """Reconstruct Yakut text with proper word boundaries"""
    # Step 1: Add spaces around special tokens
    text = re.sub(r'(<\|[^>]+\|>)', r' \1 ', text)

    # Step 2: Handle known Yakut words and patterns
    yakut_words = [
        "хаһан", "туох", "хэн", "хайдах", "тоҕо", "ылыллыбытай", "сыллаах",
        "сэриитин", "Америка", "Европаҕа", "Монреаль", "Саха", "Республикатын",
        "Дьокуускай", "куоратка", "баар", "Хотугу", "кыргыһыытыттан", "түһэрсиигэ",
        "сыллаахха", "ылыллыбытай", "контекст", "соруйаан", "эппиэт"
    ]

    for word in sorted(yakut_words, key=len, reverse=True):
        pattern = re.compile(re.escape(word), re.IGNORECASE)
        text = pattern.sub(r' \g<0> ', text)

    # Step 3: Handle numbers and common suffixes
    text = re.sub(r'(\d+)([а-яё])', r'\1 \2', text)  
    text = re.sub(r'([а-яё])(\d+)', r'\1 \2', text)  

    # Step 4: Handle punctuation
    text = re.sub(r'([.,!?;:])([^\s])', r'\1 \2', text)  
    text = re.sub(r'\s+([.,!?;:])', r'\1', text)        

    # Step 5: Clean up spaces
    text = re.sub(r'\s+', ' ', text) 

    return text.strip()

# 2. Improved Q&A Generation Function
def generate_qa_answer(model, tokenizer, context, question, max_new_tokens=30):
    # Build prompt
    prompt = (
        f"<|begin_of_text|> <sah> Контекст: {context}\n"
        f"Соруйаан: {question}\n"
        f"Эппиэт:"
    )

    # Tokenize with truncation for long contexts
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        max_length=2048,
        truncation=True
    ).to(model.device)

    # **Explicitly remove token_type_ids if present**
    if 'token_type_ids' in inputs:
        del inputs['token_type_ids']


    # Generate answer
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=0.7,
            top_k=40,
            top_p=0.95,
            repetition_penalty=1.1,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id,
            num_beams=5,
            early_stopping=True,
            do_sample=True,
        )

    # Decode only the generated part
    input_length = inputs.input_ids.shape[1]
    generated_ids = outputs[0][input_length:]
    generated_text = tokenizer.decode(generated_ids, skip_special_tokens=True)

    # Remove any trailing special tokens
    generated_text = generated_text.replace("<|end_of_text|>", "").strip()

    # Apply Yakut text reconstruction
    return reconstruct_yakut_text(generated_text)

# 3. Evaluation Metrics
def normalize_text(text):
    """Normalize text for evaluation"""
    # Convert to lowercase
    text = text.lower()

    # Remove punctuation
    text = re.sub(r'[.,!?;:()\[\]{}"\'«»]', '', text)

    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()

    return text

def calculate_em(prediction, ground_truth):
    """Calculate Exact Match score"""
    pred_norm = normalize_text(prediction)
    gt_norm = normalize_text(ground_truth)
    return int(pred_norm == gt_norm)

def calculate_f1(prediction, ground_truth):
    """Calculate token-level F1 score"""
    pred_tokens = normalize_text(prediction).split()
    gt_tokens = normalize_text(ground_truth).split()

    # If both are empty, return 1.0
    if len(pred_tokens) == 0 and len(gt_tokens) == 0:
        return 1.0

    # If one is empty, return 0.0
    if len(pred_tokens) == 0 or len(gt_tokens) == 0:
        return 0.0

    # Calculate common tokens
    common_tokens = set(pred_tokens) & set(gt_tokens)

    # Calculate precision and recall
    precision = len(common_tokens) / len(pred_tokens)
    recall = len(common_tokens) / len(gt_tokens)

    # Calculate F1
    if precision + recall > 0:
        f1 = 2 * (precision * recall) / (precision + recall)
    else:
        f1 = 0.0

    return f1

# 4. Load Model and Tokenizer
model_path = "/content/drive/MyDrive/yakut-qa-finetuned"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path).to("cuda")

# 5. Load XQuAD Dataset
def load_xquad_dataset(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    dataset = []
    for article in data["data"]:
        for paragraph in article["paragraphs"]:
            context = paragraph["context"]
            for qa in paragraph["qas"]:
                dataset.append({
                    "id": qa["id"],
                    "context": context,
                    "question": qa["question"],
                    "answer": qa["answers"][0]["text"]
                })
    return dataset

xquad_path = "/content/drive/MyDrive/Phase3_train_data/xquad_test_sah.json" 
xquad_dataset = load_xquad_dataset(xquad_path)
print(f"Loaded {len(xquad_dataset)} QA pairs")

# 6. Run Evaluation
results = []
total_em = 0
total_f1 = 0

for item in tqdm(xquad_dataset, desc="Evaluating"):
    try:
        # Generate answer
        pred_answer = generate_qa_answer(
            model,
            tokenizer,
            item["context"],
            item["question"]
        )

        # Calculate metrics
        em = calculate_em(pred_answer, item["answer"])
        f1 = calculate_f1(pred_answer, item["answer"])

        # Store results
        results.append({
            "id": item["id"],
            "context": item["context"],
            "question": item["question"],
            "predicted_answer": pred_answer,
            "true_answer": item["answer"],
            "em": em,
            "f1": f1
        })

        # Accumulate scores
        total_em += em
        total_f1 += f1

    except Exception as e:
        print(f"Error processing {item['id']}: {str(e)}")
        results.append({
            "id": item["id"],
            "error": str(e)
        })

# 7. Calculate Final Scores
num_samples = len([r for r in results if "em" in r])
avg_em = total_em / num_samples if num_samples > 0 else 0
avg_f1 = total_f1 / num_samples if num_samples > 0 else 0

print("\nEvaluation Results:")
print(f"Exact Match (EM): {avg_em:.4f}")
print(f"F1 Score: {avg_f1:.4f}")

# 8. Save Detailed Results
results_path = "xquad_evaluation_results.json"
with open(results_path, 'w', encoding='utf-8') as f:
    json.dump({
        "metrics": {"em": avg_em, "f1": avg_f1},
        "details": results
    }, f, ensure_ascii=False, indent=2)

print(f"Detailed results saved to {results_path}")

# 9. Print Sample Results
print("\nSample Predictions:")
for i, result in enumerate(results[:5]):
    if "error" not in result:
        print(f"Question: {result['question']}")
        print(f"True Answer: {result['true_answer']}")
        print(f"Predicted Answer: {result['predicted_answer']}")
        print(f"EM: {result['em']}, F1: {result['f1']:.4f}")
        print("-" * 80)

Loaded 238 QA pairs


Evaluating: 100%|██████████| 238/238 [01:50<00:00,  2.15it/s]


Evaluation Results:
Exact Match (EM): 0.0588
F1 Score: 0.0841
Detailed results saved to xquad_evaluation_results.json

Sample Predictions:
Question: Монреаль хаһан ылыллыбытай?
True Answer: 1760
Predicted Answer: 1754
EM: 0, F1: 0.0000
--------------------------------------------------------------------------------
Question: Дойду тугунан биллэрий?
True Answer: орто уонна уһун дистанциялаах чэпчэки атлетикаҕа баһыйар оруола
Predicted Answer: Кения
EM: 0, F1: 0.0000
--------------------------------------------------------------------------------
Question: Ханнык хамаанда МЛС-тан уһуллубутай?
True Answer: Чивас
Predicted Answer: 2018
EM: 0, F1: 0.0000
--------------------------------------------------------------------------------
Question: 2005 сылтан ыла, Доктор Ким сүрүн айанныыр киһитэ хайдаҕый?
True Answer: дьахтар
Predicted Answer: СтивенМоффат
EM: 0, F1: 0.0000
--------------------------------------------------------------------------------
Question: Культураҕа өрөбөлүүссүйэлээх 




In [None]:
# Print all question/true answer and generated answer pairs
print("\nAll Question/Answer Pairs:")
for result in results:
    if "error" not in result:
        print(f"Question: {result['question']}")
        print(f"True Answer: {result['true_answer']}")
        print(f"Predicted Answer: {result['predicted_answer']}")
        print("-" * 80)
    else:
        print(f"Skipping item {result['id']} due to error: {result['error']}")
        print("-" * 80)


All Question/Answer Pairs:
Question: Монреаль хаһан ылыллыбытай?
True Answer: 1760
Predicted Answer: 1754
--------------------------------------------------------------------------------
Question: Дойду тугунан биллэрий?
True Answer: орто уонна уһун дистанциялаах чэпчэки атлетикаҕа баһыйар оруола
Predicted Answer: Кения
--------------------------------------------------------------------------------
Question: Ханнык хамаанда МЛС-тан уһуллубутай?
True Answer: Чивас
Predicted Answer: 2018
--------------------------------------------------------------------------------
Question: 2005 сылтан ыла, Доктор Ким сүрүн айанныыр киһитэ хайдаҕый?
True Answer: дьахтар
Predicted Answer: СтивенМоффат
--------------------------------------------------------------------------------
Question: Культураҕа өрөбөлүүссүйэлээх гражданскай истибэт буолуу ким холобурунан бэлиэтэнэр?
True Answer: Ганди
Predicted Answer: Венгрдар
--------------------------------------------------------------------------------
Qu

In [4]:
import json
import re
import torch
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer
from collections import defaultdict

# Configuration
MODEL_PATH = "meta-llama/Llama-3.2-1B"
DATASET_PATH = "/content/drive/MyDrive/Phase3_train_data/xquad_test_sah.json"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
TORCH_DTYPE = torch.float16 if DEVICE == "cuda" else torch.float32
token = ""  #Token removed due to obvious reasons. 

# 1. Text Processing Functions
def normalize_yakut_text(text):
    """Normalize Yakut text for comparison"""
    text = text.lower()
    text = re.sub(r'[.,!?;:]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# 2. Model Generation Function for Open-Ended QA
def generate_qa_answer(model, tokenizer, context, question):
    prompt = (
        f"<|begin_of_text|><sah>Контекст: {context}\n"
        f"Соруйаан: {question}\n"
        f"Эппиэт:"
    )

    inputs = tokenizer(prompt, return_tensors="pt", max_length=2048, truncation=True).to(DEVICE)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=50,  # Increased for open-ended answers
            temperature=0.7,    
            top_p=0.9,
            repetition_penalty=1.1,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id,
            do_sample=True,
        )

    # Extract just the generated answer
    full_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    answer_start = full_text.find("Эппиэт:") + len("Эппиэт:")
    answer_text = full_text[answer_start:].split('<|end_of_text|>')[0].strip()

    return answer_text

# 3. Evaluation Functions for Open-Ended QA
def evaluate_predictions(dataset, predictions):
    results = {
        'total': len(dataset),
        'exact_match': 0,
        'f1_score': 0,
        'per_question_type': defaultdict(lambda: {'exact_match': 0, 'total': 0}),
    }

    for item, pred in zip(dataset, predictions):
        question_type = item['question'].split()[0]  # First word as question type
        results['per_question_type'][question_type]['total'] += 1

        # Normalize both prediction and answer
        norm_pred = normalize_yakut_text(pred)
        norm_answer = normalize_yakut_text(item['answer'])

        # Exact match evaluation
        if norm_pred == norm_answer:
            results['exact_match'] += 1
            results['per_question_type'][question_type]['exact_match'] += 1

        # F1 score calculation (simplified)
        pred_tokens = norm_pred.split()
        answer_tokens = norm_answer.split()

        common_tokens = set(pred_tokens) & set(answer_tokens)
        precision = len(common_tokens) / len(pred_tokens) if pred_tokens else 0
        recall = len(common_tokens) / len(answer_tokens) if answer_tokens else 0

        if (precision + recall) > 0:
            f1 = 2 * (precision * recall) / (precision + recall)
        else:
            f1 = 0

        results['f1_score'] += f1

    # Calculate averages
    results['exact_match'] /= results['total']
    results['f1_score'] /= results['total']

    # Calculate accuracy per question type
    for q_type in results['per_question_type']:
        q_stats = results['per_question_type'][q_type]
        q_stats['exact_match'] = q_stats['exact_match'] / q_stats['total'] if q_stats['total'] > 0 else 0

    return results

# 4. Load and Process XQuAD Dataset
def load_xquad_dataset(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    processed_data = []

    for article in data["data"]:
        for paragraph in article["paragraphs"]:
            context = paragraph["context"]
            for qa in paragraph["qas"]:
                question = qa["question"]
                answer = qa["answers"][0]["text"]

                processed_data.append({
                    'context': context,
                    'question': question,
                    'answer': answer,
                    'id': qa["id"]
                })

    return processed_data

# 5. Main Evaluation Pipeline
def evaluate_model():
    # Load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, token=token)
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_PATH,
        token=token,
        torch_dtype=TORCH_DTYPE
    ).to(DEVICE)

    # Load and process XQuAD dataset
    dataset = load_xquad_dataset(DATASET_PATH)

    # Generate predictions
    predictions = []
    for item in tqdm(dataset, desc="Evaluating XQuAD"):
        try:
            pred = generate_qa_answer(
                model, tokenizer,
                item['context'],
                item['question']
            )
            predictions.append(pred)
        except Exception as e:
            print(f"Error processing item {item['id']}: {str(e)}")
            predictions.append("[ERROR]")

    # Evaluate results
    evaluation_results = evaluate_predictions(dataset, predictions)

    return evaluation_results, predictions, dataset

# 6. Run and Report Results
if __name__ == "__main__":
    # Call evaluate_model and receive the dataset
    results, predictions, original_dataset = evaluate_model()

    print("\nEvaluation Results:")
    print(f"Model: {MODEL_PATH}")
    print(f"Dataset: XQuAD Sakha")
    print(f"Device: {DEVICE}")
    print(f"Total Questions: {results['total']}")
    print(f"Exact Match: {results['exact_match']:.2%}")
    print(f"Average F1 Score: {results['f1_score']:.2%}")

    print("\nPerformance by Question Type:")
    for q_type, stats in sorted(results['per_question_type'].items()):
        print(f"{q_type}: EM {stats['exact_match']:.2%} ({stats['exact_match']*stats['total']:.0f}/{stats['total']})")

    # Save detailed results
    output = {
        'model': MODEL_PATH,
        'dataset': "XQuAD Sakha",
        'config': {
            'device': DEVICE,
            'dtype': str(TORCH_DTYPE)
        },
        'metrics': {
            'exact_match': results['exact_match'],
            'f1_score': results['f1_score'],
            'total_questions': results['total']
        },
        'per_question_type': results['per_question_type'],
        'predictions': [
            {
                'id': item['id'],
                'question': item['question'],
                'context': item['context'],
                'predicted': pred,
                'true_answer': item['answer'],
                'is_correct': normalize_yakut_text(pred) == normalize_yakut_text(item['answer'])
            }
            for item, pred in zip(original_dataset, predictions)
        ]
    }

    with open('llama3_xquad_results.json', 'w', encoding='utf-8') as f:
        json.dump(output, f, ensure_ascii=False, indent=2)

    print("\nSample Predictions:")
    for i, (item, pred) in enumerate(zip(original_dataset[:3], predictions[:3])):
        print(f"\nQuestion {item['id']}: {item['question']}")
        print(f"Context: {item['context'][:100]}...")
        print(f"Predicted: {pred}")
        print(f"Correct: {item['answer']}")
        print(f"Result: {'✓' if normalize_yakut_text(pred) == normalize_yakut_text(item['answer']) else '✗'}")

    print("\nEvaluation complete. Detailed results saved to 'llama3_xquad_results.json'")

Evaluating XQuAD:   0%|          | 0/238 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Evaluating XQuAD:   0%|          | 1/238 [00:01<04:16,  1.08s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Evaluating XQuAD:   1%|          | 2/238 [00:02<04:13,  1.08s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Evaluating XQuAD:   1%|▏         | 3/238 [00:03<04:09,  1.06s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Evaluating XQuAD:   2%|▏         | 4/238 [00:04<04:12,  1.08s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Evaluating XQuAD:   2%|▏         | 5/238 [00:05<04:14,  1.09s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Evaluating XQuAD:   3%|▎         | 6/238 [00:06<04:10,  1.08s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Evaluating XQuAD:   3%|▎         | 7/238 [


Evaluation Results:
Model: meta-llama/Llama-3.2-1B
Dataset: XQuAD Sakha
Device: cuda
Total Questions: 238
Exact Match: 0.00%
Average F1 Score: 1.20%

Performance by Question Type:
"Тыһыынча: EM 0.00% (0/1)
1700-с: EM 0.00% (0/1)
1915: EM 0.00% (0/1)
1935: EM 0.00% (0/1)
1955: EM 0.00% (0/1)
1992: EM 0.00% (0/1)
1999: EM 0.00% (0/1)
2000: EM 0.00% (0/3)
2005: EM 0.00% (0/1)
2015: EM 0.00% (0/1)
ABC: EM 0.00% (0/1)
ARPNET: EM 0.00% (0/1)
BSkyB: EM 0.00% (0/2)
Ctenophora: EM 0.00% (0/1)
DEC: EM 0.00% (0/1)
DECnet: EM 0.00% (0/1)
NTL: EM 0.00% (0/1)
Ploughshares: EM 0.00% (0/1)
RP,: EM 0.00% (0/1)
SR: EM 0.00% (0/1)
Sentanta: EM 0.00% (0/1)
Sky: EM 0.00% (0/1)
UserDatagram: EM 0.00% (0/1)
V&A: EM 0.00% (0/1)
WG: EM 0.00% (0/1)
Y: EM 0.00% (0/1)
n: EM 0.00% (0/1)
АХШ-ка: EM 0.00% (0/1)
Аан: EM 0.00% (0/1)
Аарон: EM 0.00% (0/1)
Адаптивнай: EM 0.00% (0/1)
Амазонка: EM 0.00% (0/5)
Америкаҕа: EM 0.00% (0/1)
Анархистар: EM 0.00% (0/1)
Англия: EM 0.00% (0/1)
Аюрбарвада: EM 0.00% (0/1)
Багдадка: 




In [5]:
print("\nAll Question/Answer Pairs:")
for item, pred in zip(original_dataset, predictions):
    print(f"Question: {item['question']}")
    print(f"True Answer: {item['answer']}")
    print(f"Predicted Answer: {pred}")
    print("-" * 80)



All Question/Answer Pairs:
Question: Монреаль хаһан ылыллыбытай?
True Answer: 1760
Predicted Answer: Түҥэ тулкууну үөскүүттэр киирэллэр. Дьоҕо бэйэлэрэ сэрии бары
--------------------------------------------------------------------------------
Question: Дойду тугунан биллэрий?
True Answer: орто уонна уһун дистанциялаах чэпчэки атлетикаҕа баһыйар оруола
Predicted Answer: Лисе быһаарар сайдарын үксүүрээх?
Помпел: Уйаан-Тумускаан дойду сайдарын �
--------------------------------------------------------------------------------
Question: Ханнык хамаанда МЛС-тан уһуллубутай?
True Answer: Чивас
Predicted Answer: Лос-Анджелестээх Сидней финалын килдьэтэ уонна Кубок эйдэҕэр атын үрдэҕэ ЛАЛИг
--------------------------------------------------------------------------------
Question: 2005 сылтан ыла, Доктор Ким сүрүн айанныыр киһитэ хайдаҕый?
True Answer: дьахтар
Predicted Answer: Стивен Моффат Докторун айанныырын түмүллээһин үүнээн, үчүйээхэ "Тех
-------------------------------------------------