In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [14]:
import json
import re
import torch
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer
from collections import defaultdict

# 1. Text Processing Functions
def reconstruct_yakut_text(text):
    """Reconstruct Yakut text with proper word boundaries"""
    # Add spaces around special tokens
    text = re.sub(r'(<\|[^>]+\|>)', r' \1 ', text)

    # Handle common Yakut word boundaries
    yakut_words = ["куоска", "ардах", "сылаас", "кыыл", "элбэх", "утуйарый",
                  "былыт", "сиргэ", "түһэрий", "инчэҕэй", "күн", "аайы"]

    for word in sorted(yakut_words, key=len, reverse=True):
        text = re.sub(re.escape(word), r' \g<0> ', text, flags=re.IGNORECASE)

    # Clean up spaces
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def normalize_answer(text):
    """Normalize text for answer comparison"""
    text = text.lower()
    text = re.sub(r'[.,!?;:]', '', text)
    return text.strip()

# 2. Model Generation Function
def generate_answer(model, tokenizer, context, question, options):
    prompt = (
        f"<|begin_of_text|><sah>Контекст: {context}\n"
        f"Соруйаан: {question}\n"
        f"Таллар:\n"
        f"A) {options[0]}\n"
        f"B) {options[1]}\n"
        f"C) {options[2]}\n"
        f"D) {options[3]}\n"
        f"Эппиэт:"
    )

    inputs = tokenizer(prompt, return_tensors="pt", max_length=1024, truncation=True).to(model.device)

    
    if 'token_type_ids' in inputs:
        del inputs['token_type_ids']

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=5,
            temperature=0.9,
            num_beams=3,
            early_stopping=True,
            pad_token_id=tokenizer.eos_token_id
        )

    # Extract just the generated answer
    full_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    answer_start = full_text.find("Эппиэт:") + len("Эппиэт:")
    answer_text = full_text[answer_start:].split('\n')[0].strip()

    # Match generated answer to options
    normalized_options = {normalize_answer(opt): opt for opt in options}
    normalized_gen = normalize_answer(answer_text)

    # Find best matching option
    best_match = None
    for norm_opt, orig_opt in normalized_options.items():
        if norm_opt in normalized_gen or normalized_gen in norm_opt:
            best_match = orig_opt
            break

    return best_match if best_match else answer_text

# 3. Evaluation Functions
def evaluate_predictions(dataset, predictions):
    results = {
        'total': len(dataset),
        'correct': 0,
        'accuracy': 0,
        'per_question_type': defaultdict(lambda: {'correct': 0, 'total': 0}),
        'confusion_matrix': defaultdict(lambda: defaultdict(int))
    }

    for item, pred in zip(dataset, predictions):
        question_type = item['question'].split()[0]
        results['per_question_type'][question_type]['total'] += 1

        if pred == item['answer']:
            results['correct'] += 1
            results['per_question_type'][question_type]['correct'] += 1

        results['confusion_matrix'][item['answer']][pred] += 1

    results['accuracy'] = results['correct'] / results['total']

    # Calculate accuracy per question type
    for q_type in results['per_question_type']:
        q_stats = results['per_question_type'][q_type]
        q_stats['accuracy'] = q_stats['correct'] / q_stats['total'] if q_stats['total'] > 0 else 0

    return results

# 4. Main Evaluation Pipeline
def evaluate_model_on_dataset(model, tokenizer, dataset_path):
    # Load dataset
    with open(dataset_path, 'r', encoding='utf-8') as f:
        dataset = json.load(f)

    # Generate predictions
    predictions = []
    for item in tqdm(dataset, desc="Evaluating"):
        try:
            pred = generate_answer(
                model, tokenizer,
                item['context'],
                item['question'],
                item['options']
            )
            predictions.append(pred)
        except Exception as e:
            print(f"Error processing item {item['id']}: {str(e)}")
            predictions.append(None)


    evaluated_pairs = []
    for i, (item, pred) in enumerate(zip(dataset, predictions)):
        if pred is not None:
            evaluated_pairs.append((item, pred))

    # Separate dataset items and predictions for the evaluation function
    evaluated_dataset = [d for d, p in evaluated_pairs]
    evaluated_predictions = [p for d, p in evaluated_pairs]

    evaluation_results = evaluate_predictions(evaluated_dataset, evaluated_predictions)

    # Return the original dataset along with results and predictions
    return evaluation_results, predictions, dataset

# 5. Run Evaluation
if __name__ == "__main__":
    # Load model
    model_path = "/content/drive/MyDrive/yakut-qa-finetuned"
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForCausalLM.from_pretrained(model_path).to("cuda")

    # Run evaluation
    dataset_path = "/content/drive/MyDrive/Phase4_train_data/synthetic_dataset_yakut.json"
    results, predictions, original_dataset = evaluate_model_on_dataset(model, tokenizer, dataset_path)

    # Print results
    print("\nEvaluation Results:")
    print(f"Total Questions: {results['total']}")
    print(f"Correct Answers: {results['correct']}")
    print(f"Accuracy: {results['accuracy']:.2%}")

    print("\nAccuracy by Question Type:")
    for q_type, stats in results['per_question_type'].items():
        print(f"{q_type}: {stats['accuracy']:.2%} ({stats['correct']}/{stats['total']})")

    # Save detailed results
    output = {
        'model': model_path,
        'dataset': dataset_path,
        'overall_accuracy': results['accuracy'],
        'per_question_type': results['per_question_type'],
        'confusion_matrix': results['confusion_matrix'],
        'predictions': [
            {'id': item['id'],
             'question': item['question'],
             'predicted': pred,
             'correct': item['answer']}
            for item, pred in zip(original_dataset, predictions)
        ]
    }

    with open('evaluation_results.json', 'w', encoding='utf-8') as f:
        json.dump(output, f, ensure_ascii=False, indent=2)

    print("\nSample Predictions:")
    # Use original_dataset here as well
    for i, (item, pred) in enumerate(zip(original_dataset[:5], predictions[:5])):
        print(f"\nQuestion {item['id']}: {item['question']}")
        print(f"Options: {', '.join(item['options'])}")
        print(f"Predicted: {pred}")
        print(f"Correct: {item['answer']}")
        print(f"Result: {'✓' if pred == item['answer'] else '✗'}")

Evaluating: 100%|██████████| 100/100 [00:58<00:00,  1.70it/s]


Evaluation Results:
Total Questions: 100
Correct Answers: 9
Accuracy: 9.00%

Accuracy by Question Type:
Ханнык: 10.00% (2/20)
Былыттан: 0.00% (0/1)
Тимир: 0.00% (0/1)
Күнүс: 0.00% (0/1)
Кустар: 0.00% (0/1)
Ынахтан: 0.00% (0/1)
Тигээйилэр: 0.00% (0/1)
Туох: 0.00% (0/9)
Күһүн: 0.00% (0/1)
Төгүрүк: 100.00% (1/1)
Атаххын: 0.00% (0/1)
Муус: 0.00% (0/1)
Үүттэн: 0.00% (0/2)
Туохха: 0.00% (0/2)
Бириэмэни: 0.00% (0/1)
Суолга: 0.00% (0/2)
Суруйарга: 100.00% (1/1)
Халлаан: 0.00% (0/1)
Үрдүк: 0.00% (0/1)
Илиини: 100.00% (1/1)
Түүн: 0.00% (0/1)
Ороҥҥо: 0.00% (0/1)
Туустаах: 0.00% (0/1)
Тыалга: 0.00% (0/1)
Оскуолаҕа: 0.00% (0/1)
Итии: 100.00% (1/1)
Садтарга: 0.00% (0/1)
Эн: 0.00% (0/1)
Атах: 0.00% (0/1)
Куоракка: 0.00% (0/1)
Сылыттахха: 0.00% (0/1)
Дьиэ: 0.00% (0/1)
Төбөҕө: 0.00% (0/1)
Кууруссалар: 0.00% (0/1)
Кыһын: 0.00% (0/1)
Эйигин: 0.00% (0/1)
Минньигэс: 0.00% (0/1)
Аһы: 0.00% (0/1)
Дьон: 0.00% (0/1)
Сытыытык: 0.00% (0/1)
Ардах: 0.00% (0/2)
Умайдаҕына: 0.00% (0/1)
Халлааҥҥа: 0.00% (0/2)
Мас: 0




In [15]:


import itertools

def generate_answer_with_params(model, tokenizer, context, question, options, temperature, num_beams):
    """Generates an answer with specific temperature and num_beams settings."""
    prompt = (
        f"<|begin_of_text|><sah>Контекст: {context}\n"
        f"Соруйаан: {question}\n"
        f"Таллар:\n"
        f"A) {options[0]}\n"
        f"B) {options[1]}\n"
        f"C) {options[2]}\n"
        f"D) {options[3]}\n"
        f"Эппиэт:"
    )

    inputs = tokenizer(prompt, return_tensors="pt", max_length=1024, truncation=True).to(model.device)

    if 'token_type_ids' in inputs:
        del inputs['token_type_ids']

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=5,
            temperature=temperature,
            num_beams=num_beams,
            early_stopping=True,
            pad_token_id=tokenizer.eos_token_id
        )

    # Extract just the generated answer
    full_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    answer_start = full_text.find("Эппиэт:") + len("Эппиэт:")
    answer_text = full_text[answer_start:].split('\n')[0].strip()

    # Match generated answer to options
    normalized_options = {normalize_answer(opt): opt for opt in options}
    normalized_gen = normalize_answer(answer_text)

    # Find best matching option
    best_match = None
    for norm_opt, orig_opt in normalized_options.items():
        if norm_opt in normalized_gen or normalized_gen in norm_opt:
            best_match = orig_opt
            break

    return best_match if best_match else answer_text

def evaluate_model_with_params(model, tokenizer, dataset, temperature, num_beams):
    """Evaluates the model on the dataset with specified temperature and num_beams."""
    predictions = []
    # Use the provided dataset directly
    for item in tqdm(dataset, desc=f"Evaluating (Temp: {temperature}, Beams: {num_beams})"):
        try:
            pred = generate_answer_with_params(
                model, tokenizer,
                item['context'],
                item['question'],
                item['options'],
                temperature=temperature,
                num_beams=num_beams
            )
            predictions.append(pred)
        except Exception as e:
            print(f"Error processing item {item['id']} with params (T={temperature}, B={num_beams}): {str(e)}")
            predictions.append(None)

    # Evaluate results
    evaluated_pairs = []
    for i, (item, pred) in enumerate(zip(dataset, predictions)):
        if pred is not None:
            evaluated_pairs.append((item, pred))

    evaluated_dataset = [d for d, p in evaluated_pairs]
    evaluated_predictions = [p for d, p in evaluated_pairs]

    evaluation_results = evaluate_predictions(evaluated_dataset, evaluated_predictions)

    return evaluation_results['accuracy']

# 6. Hyperparameter Tuning Script
if __name__ == "__main__":
    model_path = "/content/drive/MyDrive/yakut-qa-finetuned"
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForCausalLM.from_pretrained(model_path).to("cuda")

    dataset_path = "/content/drive/MyDrive/Phase4_train_data/synthetic_dataset_yakut.json"
    with open(dataset_path, 'r', encoding='utf-8') as f:
        tuning_dataset = json.load(f)

    # Define parameter ranges
    temperature_range = [round(i * 0.1, 1) for i in range(1, 11)] # 0.1 to 1.0
    num_beams_range = list(range(1, 21)) # 1 to 20

    best_accuracy = -1
    best_params = {}

    # Iterate through all combinations
    print("Starting hyperparameter tuning...")
    for temp, beams in itertools.product(temperature_range, num_beams_range):
        print(f"\nTesting parameters: Temperature={temp}, Num Beams={beams}")
        accuracy = evaluate_model_with_params(model, tokenizer, tuning_dataset, temp, beams)

        print(f"Resulting Accuracy: {accuracy:.2%}")

        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_params = {'temperature': temp, 'num_beams': beams}
            print(f"New Best Accuracy found: {best_accuracy:.2%} with params: {best_params}")

    print("\n--- Tuning Complete ---")
    print(f"Best Parameters: {best_params}")
    print(f"Highest Accuracy Achieved: {best_accuracy:.2%}")



Starting hyperparameter tuning...

Testing parameters: Temperature=0.1, Num Beams=1


Evaluating (Temp: 0.1, Beams: 1):   0%|          | 0/100 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating (Temp: 0.1, Beams: 1):   1%|          | 1/100 [00:00<00:16,  5.92it/s]The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating (Temp: 0.1, Beams: 1):   2%|▏         | 2/100 [00:00<00:13,  7.42it/s]The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating (Temp: 0.1, Beams: 1):   3%|▎         | 3/100 [00:00<00:12,  7.91it/s]The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating (Temp: 0.1, Beams: 1):   4%|▍         | 4/100 [00:00<00:11,  8.15it/s]The following generation flags are not valid and may be

Resulting Accuracy: 6.00%
New Best Accuracy found: 6.00% with params: {'temperature': 0.1, 'num_beams': 1}

Testing parameters: Temperature=0.1, Num Beams=2


Evaluating (Temp: 0.1, Beams: 2): 100%|██████████| 100/100 [00:13<00:00,  7.35it/s]


Resulting Accuracy: 6.00%

Testing parameters: Temperature=0.1, Num Beams=3


Evaluating (Temp: 0.1, Beams: 3): 100%|██████████| 100/100 [00:15<00:00,  6.48it/s]


Resulting Accuracy: 7.00%
New Best Accuracy found: 7.00% with params: {'temperature': 0.1, 'num_beams': 3}

Testing parameters: Temperature=0.1, Num Beams=4


Evaluating (Temp: 0.1, Beams: 4): 100%|██████████| 100/100 [00:17<00:00,  5.77it/s]


Resulting Accuracy: 7.00%

Testing parameters: Temperature=0.1, Num Beams=5


Evaluating (Temp: 0.1, Beams: 5): 100%|██████████| 100/100 [00:18<00:00,  5.27it/s]


Resulting Accuracy: 7.00%

Testing parameters: Temperature=0.1, Num Beams=6


Evaluating (Temp: 0.1, Beams: 6): 100%|██████████| 100/100 [00:20<00:00,  4.87it/s]


Resulting Accuracy: 7.00%

Testing parameters: Temperature=0.1, Num Beams=7


Evaluating (Temp: 0.1, Beams: 7): 100%|██████████| 100/100 [00:21<00:00,  4.56it/s]


Resulting Accuracy: 7.00%

Testing parameters: Temperature=0.1, Num Beams=8


Evaluating (Temp: 0.1, Beams: 8): 100%|██████████| 100/100 [00:23<00:00,  4.20it/s]


Resulting Accuracy: 7.00%

Testing parameters: Temperature=0.1, Num Beams=9


Evaluating (Temp: 0.1, Beams: 9): 100%|██████████| 100/100 [00:25<00:00,  3.86it/s]


Resulting Accuracy: 7.00%

Testing parameters: Temperature=0.1, Num Beams=10


Evaluating (Temp: 0.1, Beams: 10): 100%|██████████| 100/100 [00:27<00:00,  3.62it/s]


Resulting Accuracy: 7.00%

Testing parameters: Temperature=0.1, Num Beams=11


Evaluating (Temp: 0.1, Beams: 11): 100%|██████████| 100/100 [00:29<00:00,  3.42it/s]


Resulting Accuracy: 7.00%

Testing parameters: Temperature=0.1, Num Beams=12


Evaluating (Temp: 0.1, Beams: 12): 100%|██████████| 100/100 [00:31<00:00,  3.22it/s]


Resulting Accuracy: 7.00%

Testing parameters: Temperature=0.1, Num Beams=13


Evaluating (Temp: 0.1, Beams: 13): 100%|██████████| 100/100 [00:33<00:00,  3.03it/s]


Resulting Accuracy: 7.00%

Testing parameters: Temperature=0.1, Num Beams=14


Evaluating (Temp: 0.1, Beams: 14): 100%|██████████| 100/100 [00:34<00:00,  2.87it/s]


Resulting Accuracy: 7.00%

Testing parameters: Temperature=0.1, Num Beams=15


Evaluating (Temp: 0.1, Beams: 15): 100%|██████████| 100/100 [00:36<00:00,  2.74it/s]


Resulting Accuracy: 7.00%

Testing parameters: Temperature=0.1, Num Beams=16


Evaluating (Temp: 0.1, Beams: 16): 100%|██████████| 100/100 [00:37<00:00,  2.65it/s]


Resulting Accuracy: 7.00%

Testing parameters: Temperature=0.1, Num Beams=17


Evaluating (Temp: 0.1, Beams: 17): 100%|██████████| 100/100 [00:38<00:00,  2.56it/s]


Resulting Accuracy: 7.00%

Testing parameters: Temperature=0.1, Num Beams=18


Evaluating (Temp: 0.1, Beams: 18): 100%|██████████| 100/100 [00:39<00:00,  2.51it/s]


Resulting Accuracy: 7.00%

Testing parameters: Temperature=0.1, Num Beams=19


Evaluating (Temp: 0.1, Beams: 19): 100%|██████████| 100/100 [00:41<00:00,  2.43it/s]


Resulting Accuracy: 7.00%

Testing parameters: Temperature=0.1, Num Beams=20


Evaluating (Temp: 0.1, Beams: 20): 100%|██████████| 100/100 [00:42<00:00,  2.37it/s]


Resulting Accuracy: 7.00%

Testing parameters: Temperature=0.2, Num Beams=1


Evaluating (Temp: 0.2, Beams: 1):   0%|          | 0/100 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating (Temp: 0.2, Beams: 1):   1%|          | 1/100 [00:00<00:10,  9.12it/s]The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating (Temp: 0.2, Beams: 1):   2%|▏         | 2/100 [00:00<00:10,  9.16it/s]The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating (Temp: 0.2, Beams: 1):   3%|▎         | 3/100 [00:00<00:10,  9.15it/s]The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating (Temp: 0.2, Beams: 1):   4%|▍         | 4/100 [00:00<00:10,  9.16it/s]The following generation flags are not valid and may be

Resulting Accuracy: 6.00%

Testing parameters: Temperature=0.2, Num Beams=2


Evaluating (Temp: 0.2, Beams: 2): 100%|██████████| 100/100 [00:13<00:00,  7.35it/s]


Resulting Accuracy: 6.00%

Testing parameters: Temperature=0.2, Num Beams=3


Evaluating (Temp: 0.2, Beams: 3): 100%|██████████| 100/100 [00:15<00:00,  6.50it/s]


Resulting Accuracy: 7.00%

Testing parameters: Temperature=0.2, Num Beams=4


Evaluating (Temp: 0.2, Beams: 4): 100%|██████████| 100/100 [00:17<00:00,  5.77it/s]


Resulting Accuracy: 7.00%

Testing parameters: Temperature=0.2, Num Beams=5


Evaluating (Temp: 0.2, Beams: 5): 100%|██████████| 100/100 [00:19<00:00,  5.26it/s]


Resulting Accuracy: 7.00%

Testing parameters: Temperature=0.2, Num Beams=6


Evaluating (Temp: 0.2, Beams: 6): 100%|██████████| 100/100 [00:20<00:00,  4.85it/s]


Resulting Accuracy: 7.00%

Testing parameters: Temperature=0.2, Num Beams=7


Evaluating (Temp: 0.2, Beams: 7): 100%|██████████| 100/100 [00:21<00:00,  4.55it/s]


Resulting Accuracy: 7.00%

Testing parameters: Temperature=0.2, Num Beams=8


Evaluating (Temp: 0.2, Beams: 8): 100%|██████████| 100/100 [00:23<00:00,  4.21it/s]


Resulting Accuracy: 7.00%

Testing parameters: Temperature=0.2, Num Beams=9


Evaluating (Temp: 0.2, Beams: 9): 100%|██████████| 100/100 [00:25<00:00,  3.87it/s]


Resulting Accuracy: 7.00%

Testing parameters: Temperature=0.2, Num Beams=10


Evaluating (Temp: 0.2, Beams: 10): 100%|██████████| 100/100 [00:27<00:00,  3.62it/s]


Resulting Accuracy: 7.00%

Testing parameters: Temperature=0.2, Num Beams=11


Evaluating (Temp: 0.2, Beams: 11): 100%|██████████| 100/100 [00:29<00:00,  3.42it/s]


Resulting Accuracy: 7.00%

Testing parameters: Temperature=0.2, Num Beams=12


Evaluating (Temp: 0.2, Beams: 12): 100%|██████████| 100/100 [00:31<00:00,  3.21it/s]


Resulting Accuracy: 7.00%

Testing parameters: Temperature=0.2, Num Beams=13


Evaluating (Temp: 0.2, Beams: 13): 100%|██████████| 100/100 [00:33<00:00,  3.03it/s]


Resulting Accuracy: 7.00%

Testing parameters: Temperature=0.2, Num Beams=14


Evaluating (Temp: 0.2, Beams: 14): 100%|██████████| 100/100 [00:34<00:00,  2.89it/s]


Resulting Accuracy: 7.00%

Testing parameters: Temperature=0.2, Num Beams=15


Evaluating (Temp: 0.2, Beams: 15): 100%|██████████| 100/100 [00:36<00:00,  2.74it/s]


Resulting Accuracy: 7.00%

Testing parameters: Temperature=0.2, Num Beams=16


Evaluating (Temp: 0.2, Beams: 16): 100%|██████████| 100/100 [00:37<00:00,  2.65it/s]


Resulting Accuracy: 7.00%

Testing parameters: Temperature=0.2, Num Beams=17


Evaluating (Temp: 0.2, Beams: 17): 100%|██████████| 100/100 [00:38<00:00,  2.57it/s]


Resulting Accuracy: 7.00%

Testing parameters: Temperature=0.2, Num Beams=18


Evaluating (Temp: 0.2, Beams: 18): 100%|██████████| 100/100 [00:39<00:00,  2.51it/s]


Resulting Accuracy: 7.00%

Testing parameters: Temperature=0.2, Num Beams=19


Evaluating (Temp: 0.2, Beams: 19): 100%|██████████| 100/100 [00:41<00:00,  2.43it/s]


Resulting Accuracy: 7.00%

Testing parameters: Temperature=0.2, Num Beams=20


Evaluating (Temp: 0.2, Beams: 20): 100%|██████████| 100/100 [00:42<00:00,  2.37it/s]


Resulting Accuracy: 7.00%

Testing parameters: Temperature=0.3, Num Beams=1


Evaluating (Temp: 0.3, Beams: 1):   0%|          | 0/100 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating (Temp: 0.3, Beams: 1):   1%|          | 1/100 [00:00<00:10,  9.07it/s]The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating (Temp: 0.3, Beams: 1):   2%|▏         | 2/100 [00:00<00:10,  9.07it/s]The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating (Temp: 0.3, Beams: 1):   3%|▎         | 3/100 [00:00<00:10,  9.09it/s]The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating (Temp: 0.3, Beams: 1):   4%|▍         | 4/100 [00:00<00:10,  9.11it/s]The following generation flags are not valid and may be

Resulting Accuracy: 6.00%

Testing parameters: Temperature=0.3, Num Beams=2


Evaluating (Temp: 0.3, Beams: 2): 100%|██████████| 100/100 [00:13<00:00,  7.36it/s]


Resulting Accuracy: 6.00%

Testing parameters: Temperature=0.3, Num Beams=3


Evaluating (Temp: 0.3, Beams: 3): 100%|██████████| 100/100 [00:15<00:00,  6.50it/s]


Resulting Accuracy: 8.00%
New Best Accuracy found: 8.00% with params: {'temperature': 0.3, 'num_beams': 3}

Testing parameters: Temperature=0.3, Num Beams=4


Evaluating (Temp: 0.3, Beams: 4): 100%|██████████| 100/100 [00:17<00:00,  5.77it/s]


Resulting Accuracy: 8.00%

Testing parameters: Temperature=0.3, Num Beams=5


Evaluating (Temp: 0.3, Beams: 5): 100%|██████████| 100/100 [00:19<00:00,  5.26it/s]


Resulting Accuracy: 8.00%

Testing parameters: Temperature=0.3, Num Beams=6


Evaluating (Temp: 0.3, Beams: 6): 100%|██████████| 100/100 [00:20<00:00,  4.84it/s]


Resulting Accuracy: 8.00%

Testing parameters: Temperature=0.3, Num Beams=7


Evaluating (Temp: 0.3, Beams: 7): 100%|██████████| 100/100 [00:22<00:00,  4.54it/s]


Resulting Accuracy: 8.00%

Testing parameters: Temperature=0.3, Num Beams=8


Evaluating (Temp: 0.3, Beams: 8): 100%|██████████| 100/100 [00:23<00:00,  4.20it/s]


Resulting Accuracy: 8.00%

Testing parameters: Temperature=0.3, Num Beams=9


Evaluating (Temp: 0.3, Beams: 9): 100%|██████████| 100/100 [00:25<00:00,  3.86it/s]


Resulting Accuracy: 8.00%

Testing parameters: Temperature=0.3, Num Beams=10


Evaluating (Temp: 0.3, Beams: 10): 100%|██████████| 100/100 [00:27<00:00,  3.62it/s]


Resulting Accuracy: 8.00%

Testing parameters: Temperature=0.3, Num Beams=11


Evaluating (Temp: 0.3, Beams: 11): 100%|██████████| 100/100 [00:29<00:00,  3.41it/s]


Resulting Accuracy: 8.00%

Testing parameters: Temperature=0.3, Num Beams=12


Evaluating (Temp: 0.3, Beams: 12): 100%|██████████| 100/100 [00:31<00:00,  3.22it/s]


Resulting Accuracy: 8.00%

Testing parameters: Temperature=0.3, Num Beams=13


Evaluating (Temp: 0.3, Beams: 13): 100%|██████████| 100/100 [00:33<00:00,  3.03it/s]


Resulting Accuracy: 8.00%

Testing parameters: Temperature=0.3, Num Beams=14


Evaluating (Temp: 0.3, Beams: 14): 100%|██████████| 100/100 [00:34<00:00,  2.86it/s]


Resulting Accuracy: 8.00%

Testing parameters: Temperature=0.3, Num Beams=15


Evaluating (Temp: 0.3, Beams: 15): 100%|██████████| 100/100 [00:36<00:00,  2.74it/s]


Resulting Accuracy: 8.00%

Testing parameters: Temperature=0.3, Num Beams=16


Evaluating (Temp: 0.3, Beams: 16): 100%|██████████| 100/100 [00:37<00:00,  2.65it/s]


Resulting Accuracy: 8.00%

Testing parameters: Temperature=0.3, Num Beams=17


Evaluating (Temp: 0.3, Beams: 17): 100%|██████████| 100/100 [00:38<00:00,  2.57it/s]


Resulting Accuracy: 8.00%

Testing parameters: Temperature=0.3, Num Beams=18


Evaluating (Temp: 0.3, Beams: 18): 100%|██████████| 100/100 [00:39<00:00,  2.51it/s]


Resulting Accuracy: 8.00%

Testing parameters: Temperature=0.3, Num Beams=19


Evaluating (Temp: 0.3, Beams: 19): 100%|██████████| 100/100 [00:41<00:00,  2.43it/s]


Resulting Accuracy: 8.00%

Testing parameters: Temperature=0.3, Num Beams=20


Evaluating (Temp: 0.3, Beams: 20): 100%|██████████| 100/100 [00:42<00:00,  2.36it/s]


Resulting Accuracy: 8.00%

Testing parameters: Temperature=0.4, Num Beams=1


Evaluating (Temp: 0.4, Beams: 1):   0%|          | 0/100 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating (Temp: 0.4, Beams: 1):   1%|          | 1/100 [00:00<00:10,  9.12it/s]The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating (Temp: 0.4, Beams: 1):   2%|▏         | 2/100 [00:00<00:10,  9.16it/s]The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating (Temp: 0.4, Beams: 1):   3%|▎         | 3/100 [00:00<00:10,  9.12it/s]The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating (Temp: 0.4, Beams: 1):   4%|▍         | 4/100 [00:00<00:10,  9.12it/s]The following generation flags are not valid and may be

Resulting Accuracy: 4.00%

Testing parameters: Temperature=0.4, Num Beams=2


Evaluating (Temp: 0.4, Beams: 2): 100%|██████████| 100/100 [00:13<00:00,  7.35it/s]


Resulting Accuracy: 6.00%

Testing parameters: Temperature=0.4, Num Beams=3


Evaluating (Temp: 0.4, Beams: 3): 100%|██████████| 100/100 [00:15<00:00,  6.51it/s]


Resulting Accuracy: 8.00%

Testing parameters: Temperature=0.4, Num Beams=4


Evaluating (Temp: 0.4, Beams: 4): 100%|██████████| 100/100 [00:17<00:00,  5.78it/s]


Resulting Accuracy: 8.00%

Testing parameters: Temperature=0.4, Num Beams=5


Evaluating (Temp: 0.4, Beams: 5): 100%|██████████| 100/100 [00:19<00:00,  5.25it/s]


Resulting Accuracy: 8.00%

Testing parameters: Temperature=0.4, Num Beams=6


Evaluating (Temp: 0.4, Beams: 6): 100%|██████████| 100/100 [00:20<00:00,  4.85it/s]


Resulting Accuracy: 8.00%

Testing parameters: Temperature=0.4, Num Beams=7


Evaluating (Temp: 0.4, Beams: 7): 100%|██████████| 100/100 [00:21<00:00,  4.55it/s]


Resulting Accuracy: 8.00%

Testing parameters: Temperature=0.4, Num Beams=8


Evaluating (Temp: 0.4, Beams: 8): 100%|██████████| 100/100 [00:23<00:00,  4.21it/s]


Resulting Accuracy: 8.00%

Testing parameters: Temperature=0.4, Num Beams=9


Evaluating (Temp: 0.4, Beams: 9): 100%|██████████| 100/100 [00:25<00:00,  3.87it/s]


Resulting Accuracy: 8.00%

Testing parameters: Temperature=0.4, Num Beams=10


Evaluating (Temp: 0.4, Beams: 10): 100%|██████████| 100/100 [00:27<00:00,  3.61it/s]


Resulting Accuracy: 8.00%

Testing parameters: Temperature=0.4, Num Beams=11


Evaluating (Temp: 0.4, Beams: 11): 100%|██████████| 100/100 [00:29<00:00,  3.42it/s]


Resulting Accuracy: 8.00%

Testing parameters: Temperature=0.4, Num Beams=12


Evaluating (Temp: 0.4, Beams: 12): 100%|██████████| 100/100 [00:31<00:00,  3.21it/s]


Resulting Accuracy: 8.00%

Testing parameters: Temperature=0.4, Num Beams=13


Evaluating (Temp: 0.4, Beams: 13): 100%|██████████| 100/100 [00:33<00:00,  3.03it/s]


Resulting Accuracy: 8.00%

Testing parameters: Temperature=0.4, Num Beams=14


Evaluating (Temp: 0.4, Beams: 14): 100%|██████████| 100/100 [00:34<00:00,  2.87it/s]


Resulting Accuracy: 8.00%

Testing parameters: Temperature=0.4, Num Beams=15


Evaluating (Temp: 0.4, Beams: 15): 100%|██████████| 100/100 [00:36<00:00,  2.74it/s]


Resulting Accuracy: 8.00%

Testing parameters: Temperature=0.4, Num Beams=16


Evaluating (Temp: 0.4, Beams: 16): 100%|██████████| 100/100 [00:37<00:00,  2.65it/s]


Resulting Accuracy: 8.00%

Testing parameters: Temperature=0.4, Num Beams=17


Evaluating (Temp: 0.4, Beams: 17): 100%|██████████| 100/100 [00:38<00:00,  2.56it/s]


Resulting Accuracy: 8.00%

Testing parameters: Temperature=0.4, Num Beams=18


Evaluating (Temp: 0.4, Beams: 18): 100%|██████████| 100/100 [00:39<00:00,  2.51it/s]


Resulting Accuracy: 8.00%

Testing parameters: Temperature=0.4, Num Beams=19


Evaluating (Temp: 0.4, Beams: 19): 100%|██████████| 100/100 [00:41<00:00,  2.43it/s]


Resulting Accuracy: 8.00%

Testing parameters: Temperature=0.4, Num Beams=20


Evaluating (Temp: 0.4, Beams: 20): 100%|██████████| 100/100 [00:42<00:00,  2.36it/s]


Resulting Accuracy: 8.00%

Testing parameters: Temperature=0.5, Num Beams=1


Evaluating (Temp: 0.5, Beams: 1):   0%|          | 0/100 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating (Temp: 0.5, Beams: 1):   1%|          | 1/100 [00:00<00:11,  8.58it/s]The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating (Temp: 0.5, Beams: 1):   2%|▏         | 2/100 [00:00<00:11,  8.85it/s]The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating (Temp: 0.5, Beams: 1):   3%|▎         | 3/100 [00:00<00:10,  8.94it/s]The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating (Temp: 0.5, Beams: 1):   4%|▍         | 4/100 [00:00<00:10,  9.03it/s]The following generation flags are not valid and may be

Resulting Accuracy: 7.00%

Testing parameters: Temperature=0.5, Num Beams=2


Evaluating (Temp: 0.5, Beams: 2): 100%|██████████| 100/100 [00:13<00:00,  7.34it/s]


Resulting Accuracy: 6.00%

Testing parameters: Temperature=0.5, Num Beams=3


Evaluating (Temp: 0.5, Beams: 3): 100%|██████████| 100/100 [00:15<00:00,  6.51it/s]


Resulting Accuracy: 8.00%

Testing parameters: Temperature=0.5, Num Beams=4


Evaluating (Temp: 0.5, Beams: 4): 100%|██████████| 100/100 [00:17<00:00,  5.78it/s]


Resulting Accuracy: 8.00%

Testing parameters: Temperature=0.5, Num Beams=5


Evaluating (Temp: 0.5, Beams: 5): 100%|██████████| 100/100 [00:19<00:00,  5.25it/s]


Resulting Accuracy: 8.00%

Testing parameters: Temperature=0.5, Num Beams=6


Evaluating (Temp: 0.5, Beams: 6): 100%|██████████| 100/100 [00:20<00:00,  4.84it/s]


Resulting Accuracy: 8.00%

Testing parameters: Temperature=0.5, Num Beams=7


Evaluating (Temp: 0.5, Beams: 7): 100%|██████████| 100/100 [00:21<00:00,  4.55it/s]


Resulting Accuracy: 8.00%

Testing parameters: Temperature=0.5, Num Beams=8


Evaluating (Temp: 0.5, Beams: 8): 100%|██████████| 100/100 [00:23<00:00,  4.20it/s]


Resulting Accuracy: 8.00%

Testing parameters: Temperature=0.5, Num Beams=9


Evaluating (Temp: 0.5, Beams: 9): 100%|██████████| 100/100 [00:25<00:00,  3.86it/s]


Resulting Accuracy: 8.00%

Testing parameters: Temperature=0.5, Num Beams=10


Evaluating (Temp: 0.5, Beams: 10): 100%|██████████| 100/100 [00:27<00:00,  3.62it/s]


Resulting Accuracy: 8.00%

Testing parameters: Temperature=0.5, Num Beams=11


Evaluating (Temp: 0.5, Beams: 11): 100%|██████████| 100/100 [00:29<00:00,  3.41it/s]


Resulting Accuracy: 8.00%

Testing parameters: Temperature=0.5, Num Beams=12


Evaluating (Temp: 0.5, Beams: 12): 100%|██████████| 100/100 [00:31<00:00,  3.21it/s]


Resulting Accuracy: 8.00%

Testing parameters: Temperature=0.5, Num Beams=13


Evaluating (Temp: 0.5, Beams: 13): 100%|██████████| 100/100 [00:32<00:00,  3.03it/s]


Resulting Accuracy: 8.00%

Testing parameters: Temperature=0.5, Num Beams=14


Evaluating (Temp: 0.5, Beams: 14): 100%|██████████| 100/100 [00:34<00:00,  2.87it/s]


Resulting Accuracy: 8.00%

Testing parameters: Temperature=0.5, Num Beams=15


Evaluating (Temp: 0.5, Beams: 15): 100%|██████████| 100/100 [00:36<00:00,  2.74it/s]


Resulting Accuracy: 8.00%

Testing parameters: Temperature=0.5, Num Beams=16


Evaluating (Temp: 0.5, Beams: 16): 100%|██████████| 100/100 [00:37<00:00,  2.64it/s]


Resulting Accuracy: 8.00%

Testing parameters: Temperature=0.5, Num Beams=17


Evaluating (Temp: 0.5, Beams: 17): 100%|██████████| 100/100 [00:38<00:00,  2.56it/s]


Resulting Accuracy: 8.00%

Testing parameters: Temperature=0.5, Num Beams=18


Evaluating (Temp: 0.5, Beams: 18): 100%|██████████| 100/100 [00:39<00:00,  2.51it/s]


Resulting Accuracy: 8.00%

Testing parameters: Temperature=0.5, Num Beams=19


Evaluating (Temp: 0.5, Beams: 19): 100%|██████████| 100/100 [00:41<00:00,  2.43it/s]


Resulting Accuracy: 8.00%

Testing parameters: Temperature=0.5, Num Beams=20


Evaluating (Temp: 0.5, Beams: 20): 100%|██████████| 100/100 [00:42<00:00,  2.36it/s]


Resulting Accuracy: 8.00%

Testing parameters: Temperature=0.6, Num Beams=1


Evaluating (Temp: 0.6, Beams: 1):   0%|          | 0/100 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating (Temp: 0.6, Beams: 1):   1%|          | 1/100 [00:00<00:10,  9.11it/s]The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating (Temp: 0.6, Beams: 1):   2%|▏         | 2/100 [00:00<00:11,  8.88it/s]The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating (Temp: 0.6, Beams: 1):   3%|▎         | 3/100 [00:00<00:10,  8.99it/s]The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating (Temp: 0.6, Beams: 1):   4%|▍         | 4/100 [00:00<00:10,  8.99it/s]The following generation flags are not valid and may be

Resulting Accuracy: 5.00%

Testing parameters: Temperature=0.6, Num Beams=2


Evaluating (Temp: 0.6, Beams: 2): 100%|██████████| 100/100 [00:13<00:00,  7.35it/s]


Resulting Accuracy: 7.00%

Testing parameters: Temperature=0.6, Num Beams=3


Evaluating (Temp: 0.6, Beams: 3): 100%|██████████| 100/100 [00:15<00:00,  6.51it/s]


Resulting Accuracy: 10.00%
New Best Accuracy found: 10.00% with params: {'temperature': 0.6, 'num_beams': 3}

Testing parameters: Temperature=0.6, Num Beams=4


Evaluating (Temp: 0.6, Beams: 4): 100%|██████████| 100/100 [00:17<00:00,  5.76it/s]


Resulting Accuracy: 9.00%

Testing parameters: Temperature=0.6, Num Beams=5


Evaluating (Temp: 0.6, Beams: 5): 100%|██████████| 100/100 [00:19<00:00,  5.25it/s]


Resulting Accuracy: 9.00%

Testing parameters: Temperature=0.6, Num Beams=6


Evaluating (Temp: 0.6, Beams: 6): 100%|██████████| 100/100 [00:20<00:00,  4.83it/s]


Resulting Accuracy: 9.00%

Testing parameters: Temperature=0.6, Num Beams=7


Evaluating (Temp: 0.6, Beams: 7): 100%|██████████| 100/100 [00:22<00:00,  4.54it/s]


Resulting Accuracy: 9.00%

Testing parameters: Temperature=0.6, Num Beams=8


Evaluating (Temp: 0.6, Beams: 8): 100%|██████████| 100/100 [00:23<00:00,  4.20it/s]


Resulting Accuracy: 9.00%

Testing parameters: Temperature=0.6, Num Beams=9


Evaluating (Temp: 0.6, Beams: 9): 100%|██████████| 100/100 [00:25<00:00,  3.85it/s]


Resulting Accuracy: 9.00%

Testing parameters: Temperature=0.6, Num Beams=10


Evaluating (Temp: 0.6, Beams: 10): 100%|██████████| 100/100 [00:27<00:00,  3.61it/s]


Resulting Accuracy: 9.00%

Testing parameters: Temperature=0.6, Num Beams=11


Evaluating (Temp: 0.6, Beams: 11): 100%|██████████| 100/100 [00:29<00:00,  3.42it/s]


Resulting Accuracy: 9.00%

Testing parameters: Temperature=0.6, Num Beams=12


Evaluating (Temp: 0.6, Beams: 12): 100%|██████████| 100/100 [00:31<00:00,  3.21it/s]


Resulting Accuracy: 9.00%

Testing parameters: Temperature=0.6, Num Beams=13


Evaluating (Temp: 0.6, Beams: 13): 100%|██████████| 100/100 [00:33<00:00,  3.02it/s]


Resulting Accuracy: 9.00%

Testing parameters: Temperature=0.6, Num Beams=14


Evaluating (Temp: 0.6, Beams: 14): 100%|██████████| 100/100 [00:34<00:00,  2.86it/s]


Resulting Accuracy: 9.00%

Testing parameters: Temperature=0.6, Num Beams=15


Evaluating (Temp: 0.6, Beams: 15): 100%|██████████| 100/100 [00:36<00:00,  2.74it/s]


Resulting Accuracy: 9.00%

Testing parameters: Temperature=0.6, Num Beams=16


Evaluating (Temp: 0.6, Beams: 16): 100%|██████████| 100/100 [00:37<00:00,  2.65it/s]


Resulting Accuracy: 9.00%

Testing parameters: Temperature=0.6, Num Beams=17


Evaluating (Temp: 0.6, Beams: 17): 100%|██████████| 100/100 [00:38<00:00,  2.56it/s]


Resulting Accuracy: 9.00%

Testing parameters: Temperature=0.6, Num Beams=18


Evaluating (Temp: 0.6, Beams: 18): 100%|██████████| 100/100 [00:39<00:00,  2.51it/s]


Resulting Accuracy: 9.00%

Testing parameters: Temperature=0.6, Num Beams=19


Evaluating (Temp: 0.6, Beams: 19): 100%|██████████| 100/100 [00:41<00:00,  2.43it/s]


Resulting Accuracy: 9.00%

Testing parameters: Temperature=0.6, Num Beams=20


Evaluating (Temp: 0.6, Beams: 20): 100%|██████████| 100/100 [00:42<00:00,  2.36it/s]


Resulting Accuracy: 9.00%

Testing parameters: Temperature=0.7, Num Beams=1


Evaluating (Temp: 0.7, Beams: 1):   0%|          | 0/100 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating (Temp: 0.7, Beams: 1):   1%|          | 1/100 [00:00<00:10,  9.10it/s]The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating (Temp: 0.7, Beams: 1):   2%|▏         | 2/100 [00:00<00:10,  9.05it/s]The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating (Temp: 0.7, Beams: 1):   3%|▎         | 3/100 [00:00<00:10,  8.96it/s]The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating (Temp: 0.7, Beams: 1):   4%|▍         | 4/100 [00:00<00:10,  9.01it/s]The following generation flags are not valid and may be

Resulting Accuracy: 2.00%

Testing parameters: Temperature=0.7, Num Beams=2


Evaluating (Temp: 0.7, Beams: 2): 100%|██████████| 100/100 [00:13<00:00,  7.36it/s]


Resulting Accuracy: 6.00%

Testing parameters: Temperature=0.7, Num Beams=3


Evaluating (Temp: 0.7, Beams: 3): 100%|██████████| 100/100 [00:15<00:00,  6.51it/s]


Resulting Accuracy: 8.00%

Testing parameters: Temperature=0.7, Num Beams=4


Evaluating (Temp: 0.7, Beams: 4): 100%|██████████| 100/100 [00:17<00:00,  5.77it/s]


Resulting Accuracy: 9.00%

Testing parameters: Temperature=0.7, Num Beams=5


Evaluating (Temp: 0.7, Beams: 5): 100%|██████████| 100/100 [00:19<00:00,  5.25it/s]


Resulting Accuracy: 8.00%

Testing parameters: Temperature=0.7, Num Beams=6


Evaluating (Temp: 0.7, Beams: 6): 100%|██████████| 100/100 [00:20<00:00,  4.84it/s]


Resulting Accuracy: 9.00%

Testing parameters: Temperature=0.7, Num Beams=7


Evaluating (Temp: 0.7, Beams: 7): 100%|██████████| 100/100 [00:21<00:00,  4.55it/s]


Resulting Accuracy: 9.00%

Testing parameters: Temperature=0.7, Num Beams=8


Evaluating (Temp: 0.7, Beams: 8): 100%|██████████| 100/100 [00:23<00:00,  4.21it/s]


Resulting Accuracy: 9.00%

Testing parameters: Temperature=0.7, Num Beams=9


Evaluating (Temp: 0.7, Beams: 9): 100%|██████████| 100/100 [00:25<00:00,  3.86it/s]


Resulting Accuracy: 9.00%

Testing parameters: Temperature=0.7, Num Beams=10


Evaluating (Temp: 0.7, Beams: 10): 100%|██████████| 100/100 [00:27<00:00,  3.62it/s]


Resulting Accuracy: 9.00%

Testing parameters: Temperature=0.7, Num Beams=11


Evaluating (Temp: 0.7, Beams: 11): 100%|██████████| 100/100 [00:29<00:00,  3.41it/s]


Resulting Accuracy: 9.00%

Testing parameters: Temperature=0.7, Num Beams=12


Evaluating (Temp: 0.7, Beams: 12): 100%|██████████| 100/100 [00:31<00:00,  3.21it/s]


Resulting Accuracy: 9.00%

Testing parameters: Temperature=0.7, Num Beams=13


Evaluating (Temp: 0.7, Beams: 13): 100%|██████████| 100/100 [00:32<00:00,  3.03it/s]


Resulting Accuracy: 9.00%

Testing parameters: Temperature=0.7, Num Beams=14


Evaluating (Temp: 0.7, Beams: 14): 100%|██████████| 100/100 [00:34<00:00,  2.87it/s]


Resulting Accuracy: 9.00%

Testing parameters: Temperature=0.7, Num Beams=15


Evaluating (Temp: 0.7, Beams: 15): 100%|██████████| 100/100 [00:36<00:00,  2.74it/s]


Resulting Accuracy: 9.00%

Testing parameters: Temperature=0.7, Num Beams=16


Evaluating (Temp: 0.7, Beams: 16): 100%|██████████| 100/100 [00:37<00:00,  2.65it/s]


Resulting Accuracy: 9.00%

Testing parameters: Temperature=0.7, Num Beams=17


Evaluating (Temp: 0.7, Beams: 17): 100%|██████████| 100/100 [00:38<00:00,  2.57it/s]


Resulting Accuracy: 9.00%

Testing parameters: Temperature=0.7, Num Beams=18


Evaluating (Temp: 0.7, Beams: 18): 100%|██████████| 100/100 [00:39<00:00,  2.51it/s]


Resulting Accuracy: 9.00%

Testing parameters: Temperature=0.7, Num Beams=19


Evaluating (Temp: 0.7, Beams: 19): 100%|██████████| 100/100 [00:41<00:00,  2.43it/s]


Resulting Accuracy: 9.00%

Testing parameters: Temperature=0.7, Num Beams=20


Evaluating (Temp: 0.7, Beams: 20): 100%|██████████| 100/100 [00:42<00:00,  2.36it/s]


Resulting Accuracy: 9.00%

Testing parameters: Temperature=0.8, Num Beams=1


Evaluating (Temp: 0.8, Beams: 1):   0%|          | 0/100 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating (Temp: 0.8, Beams: 1):   1%|          | 1/100 [00:00<00:11,  8.97it/s]The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating (Temp: 0.8, Beams: 1):   2%|▏         | 2/100 [00:00<00:10,  8.99it/s]The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating (Temp: 0.8, Beams: 1):   3%|▎         | 3/100 [00:00<00:10,  9.02it/s]The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating (Temp: 0.8, Beams: 1):   4%|▍         | 4/100 [00:00<00:10,  8.96it/s]The following generation flags are not valid and may be

Resulting Accuracy: 4.00%

Testing parameters: Temperature=0.8, Num Beams=2


Evaluating (Temp: 0.8, Beams: 2): 100%|██████████| 100/100 [00:13<00:00,  7.34it/s]


Resulting Accuracy: 6.00%

Testing parameters: Temperature=0.8, Num Beams=3


Evaluating (Temp: 0.8, Beams: 3): 100%|██████████| 100/100 [00:15<00:00,  6.51it/s]


Resulting Accuracy: 9.00%

Testing parameters: Temperature=0.8, Num Beams=4


Evaluating (Temp: 0.8, Beams: 4): 100%|██████████| 100/100 [00:17<00:00,  5.74it/s]


Resulting Accuracy: 9.00%

Testing parameters: Temperature=0.8, Num Beams=5


Evaluating (Temp: 0.8, Beams: 5): 100%|██████████| 100/100 [00:19<00:00,  5.24it/s]


Resulting Accuracy: 9.00%

Testing parameters: Temperature=0.8, Num Beams=6


Evaluating (Temp: 0.8, Beams: 6): 100%|██████████| 100/100 [00:20<00:00,  4.84it/s]


Resulting Accuracy: 10.00%

Testing parameters: Temperature=0.8, Num Beams=7


Evaluating (Temp: 0.8, Beams: 7): 100%|██████████| 100/100 [00:21<00:00,  4.55it/s]


Resulting Accuracy: 9.00%

Testing parameters: Temperature=0.8, Num Beams=8


Evaluating (Temp: 0.8, Beams: 8): 100%|██████████| 100/100 [00:23<00:00,  4.18it/s]


Resulting Accuracy: 8.00%

Testing parameters: Temperature=0.8, Num Beams=9


Evaluating (Temp: 0.8, Beams: 9): 100%|██████████| 100/100 [00:25<00:00,  3.85it/s]


Resulting Accuracy: 9.00%

Testing parameters: Temperature=0.8, Num Beams=10


Evaluating (Temp: 0.8, Beams: 10): 100%|██████████| 100/100 [00:27<00:00,  3.61it/s]


Resulting Accuracy: 10.00%

Testing parameters: Temperature=0.8, Num Beams=11


Evaluating (Temp: 0.8, Beams: 11): 100%|██████████| 100/100 [00:29<00:00,  3.42it/s]


Resulting Accuracy: 9.00%

Testing parameters: Temperature=0.8, Num Beams=12


Evaluating (Temp: 0.8, Beams: 12): 100%|██████████| 100/100 [00:31<00:00,  3.21it/s]


Resulting Accuracy: 9.00%

Testing parameters: Temperature=0.8, Num Beams=13


Evaluating (Temp: 0.8, Beams: 13): 100%|██████████| 100/100 [00:33<00:00,  3.02it/s]


Resulting Accuracy: 9.00%

Testing parameters: Temperature=0.8, Num Beams=14


Evaluating (Temp: 0.8, Beams: 14): 100%|██████████| 100/100 [00:34<00:00,  2.87it/s]


Resulting Accuracy: 9.00%

Testing parameters: Temperature=0.8, Num Beams=15


Evaluating (Temp: 0.8, Beams: 15): 100%|██████████| 100/100 [00:36<00:00,  2.73it/s]


Resulting Accuracy: 9.00%

Testing parameters: Temperature=0.8, Num Beams=16


Evaluating (Temp: 0.8, Beams: 16): 100%|██████████| 100/100 [00:37<00:00,  2.65it/s]


Resulting Accuracy: 9.00%

Testing parameters: Temperature=0.8, Num Beams=17


Evaluating (Temp: 0.8, Beams: 17): 100%|██████████| 100/100 [00:38<00:00,  2.57it/s]


Resulting Accuracy: 9.00%

Testing parameters: Temperature=0.8, Num Beams=18


Evaluating (Temp: 0.8, Beams: 18): 100%|██████████| 100/100 [00:39<00:00,  2.51it/s]


Resulting Accuracy: 9.00%

Testing parameters: Temperature=0.8, Num Beams=19


Evaluating (Temp: 0.8, Beams: 19): 100%|██████████| 100/100 [00:41<00:00,  2.43it/s]


Resulting Accuracy: 9.00%

Testing parameters: Temperature=0.8, Num Beams=20


Evaluating (Temp: 0.8, Beams: 20): 100%|██████████| 100/100 [00:42<00:00,  2.36it/s]


Resulting Accuracy: 9.00%

Testing parameters: Temperature=0.9, Num Beams=1


Evaluating (Temp: 0.9, Beams: 1):   0%|          | 0/100 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating (Temp: 0.9, Beams: 1):   1%|          | 1/100 [00:00<00:10,  9.04it/s]The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating (Temp: 0.9, Beams: 1):   2%|▏         | 2/100 [00:00<00:10,  9.10it/s]The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating (Temp: 0.9, Beams: 1):   3%|▎         | 3/100 [00:00<00:10,  9.11it/s]The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating (Temp: 0.9, Beams: 1):   4%|▍         | 4/100 [00:00<00:10,  9.12it/s]The following generation flags are not valid and may be

Resulting Accuracy: 5.00%

Testing parameters: Temperature=0.9, Num Beams=2


Evaluating (Temp: 0.9, Beams: 2): 100%|██████████| 100/100 [00:13<00:00,  7.33it/s]


Resulting Accuracy: 3.00%

Testing parameters: Temperature=0.9, Num Beams=3


Evaluating (Temp: 0.9, Beams: 3): 100%|██████████| 100/100 [00:15<00:00,  6.50it/s]


Resulting Accuracy: 8.00%

Testing parameters: Temperature=0.9, Num Beams=4


Evaluating (Temp: 0.9, Beams: 4): 100%|██████████| 100/100 [00:17<00:00,  5.75it/s]


Resulting Accuracy: 9.00%

Testing parameters: Temperature=0.9, Num Beams=5


Evaluating (Temp: 0.9, Beams: 5): 100%|██████████| 100/100 [00:19<00:00,  5.24it/s]


Resulting Accuracy: 11.00%
New Best Accuracy found: 11.00% with params: {'temperature': 0.9, 'num_beams': 5}

Testing parameters: Temperature=0.9, Num Beams=6


Evaluating (Temp: 0.9, Beams: 6): 100%|██████████| 100/100 [00:20<00:00,  4.83it/s]


Resulting Accuracy: 9.00%

Testing parameters: Temperature=0.9, Num Beams=7


Evaluating (Temp: 0.9, Beams: 7): 100%|██████████| 100/100 [00:22<00:00,  4.53it/s]


Resulting Accuracy: 9.00%

Testing parameters: Temperature=0.9, Num Beams=8


Evaluating (Temp: 0.9, Beams: 8): 100%|██████████| 100/100 [00:23<00:00,  4.20it/s]


Resulting Accuracy: 9.00%

Testing parameters: Temperature=0.9, Num Beams=9


Evaluating (Temp: 0.9, Beams: 9): 100%|██████████| 100/100 [00:25<00:00,  3.87it/s]


Resulting Accuracy: 9.00%

Testing parameters: Temperature=0.9, Num Beams=10


Evaluating (Temp: 0.9, Beams: 10): 100%|██████████| 100/100 [00:27<00:00,  3.63it/s]


Resulting Accuracy: 9.00%

Testing parameters: Temperature=0.9, Num Beams=11


Evaluating (Temp: 0.9, Beams: 11): 100%|██████████| 100/100 [00:29<00:00,  3.41it/s]


Resulting Accuracy: 9.00%

Testing parameters: Temperature=0.9, Num Beams=12


Evaluating (Temp: 0.9, Beams: 12): 100%|██████████| 100/100 [00:31<00:00,  3.21it/s]


Resulting Accuracy: 9.00%

Testing parameters: Temperature=0.9, Num Beams=13


Evaluating (Temp: 0.9, Beams: 13): 100%|██████████| 100/100 [00:32<00:00,  3.03it/s]


Resulting Accuracy: 9.00%

Testing parameters: Temperature=0.9, Num Beams=14


Evaluating (Temp: 0.9, Beams: 14): 100%|██████████| 100/100 [00:34<00:00,  2.87it/s]


Resulting Accuracy: 9.00%

Testing parameters: Temperature=0.9, Num Beams=15


Evaluating (Temp: 0.9, Beams: 15): 100%|██████████| 100/100 [00:36<00:00,  2.73it/s]


Resulting Accuracy: 9.00%

Testing parameters: Temperature=0.9, Num Beams=16


Evaluating (Temp: 0.9, Beams: 16): 100%|██████████| 100/100 [00:37<00:00,  2.65it/s]


Resulting Accuracy: 9.00%

Testing parameters: Temperature=0.9, Num Beams=17


Evaluating (Temp: 0.9, Beams: 17): 100%|██████████| 100/100 [00:38<00:00,  2.57it/s]


Resulting Accuracy: 9.00%

Testing parameters: Temperature=0.9, Num Beams=18


Evaluating (Temp: 0.9, Beams: 18): 100%|██████████| 100/100 [00:39<00:00,  2.51it/s]


Resulting Accuracy: 9.00%

Testing parameters: Temperature=0.9, Num Beams=19


Evaluating (Temp: 0.9, Beams: 19): 100%|██████████| 100/100 [00:41<00:00,  2.43it/s]


Resulting Accuracy: 9.00%

Testing parameters: Temperature=0.9, Num Beams=20


Evaluating (Temp: 0.9, Beams: 20): 100%|██████████| 100/100 [00:42<00:00,  2.36it/s]


Resulting Accuracy: 9.00%

Testing parameters: Temperature=1.0, Num Beams=1


Evaluating (Temp: 1.0, Beams: 1):   0%|          | 0/100 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating (Temp: 1.0, Beams: 1):   1%|          | 1/100 [00:00<00:11,  8.93it/s]The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating (Temp: 1.0, Beams: 1):   2%|▏         | 2/100 [00:00<00:10,  8.97it/s]The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating (Temp: 1.0, Beams: 1):   3%|▎         | 3/100 [00:00<00:10,  8.96it/s]The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating (Temp: 1.0, Beams: 1):   4%|▍         | 4/100 [00:00<00:10,  8.97it/s]The following generation flags are not valid and may be

Resulting Accuracy: 2.00%

Testing parameters: Temperature=1.0, Num Beams=2


Evaluating (Temp: 1.0, Beams: 2): 100%|██████████| 100/100 [00:13<00:00,  7.32it/s]


Resulting Accuracy: 6.00%

Testing parameters: Temperature=1.0, Num Beams=3


Evaluating (Temp: 1.0, Beams: 3): 100%|██████████| 100/100 [00:15<00:00,  6.49it/s]


Resulting Accuracy: 7.00%

Testing parameters: Temperature=1.0, Num Beams=4


Evaluating (Temp: 1.0, Beams: 4): 100%|██████████| 100/100 [00:17<00:00,  5.77it/s]


Resulting Accuracy: 8.00%

Testing parameters: Temperature=1.0, Num Beams=5


Evaluating (Temp: 1.0, Beams: 5): 100%|██████████| 100/100 [00:19<00:00,  5.25it/s]


Resulting Accuracy: 8.00%

Testing parameters: Temperature=1.0, Num Beams=6


Evaluating (Temp: 1.0, Beams: 6): 100%|██████████| 100/100 [00:20<00:00,  4.84it/s]


Resulting Accuracy: 7.00%

Testing parameters: Temperature=1.0, Num Beams=7


Evaluating (Temp: 1.0, Beams: 7): 100%|██████████| 100/100 [00:21<00:00,  4.55it/s]


Resulting Accuracy: 10.00%

Testing parameters: Temperature=1.0, Num Beams=8


Evaluating (Temp: 1.0, Beams: 8): 100%|██████████| 100/100 [00:23<00:00,  4.21it/s]


Resulting Accuracy: 9.00%

Testing parameters: Temperature=1.0, Num Beams=9


Evaluating (Temp: 1.0, Beams: 9): 100%|██████████| 100/100 [00:25<00:00,  3.87it/s]


Resulting Accuracy: 9.00%

Testing parameters: Temperature=1.0, Num Beams=10


Evaluating (Temp: 1.0, Beams: 10): 100%|██████████| 100/100 [00:27<00:00,  3.62it/s]


Resulting Accuracy: 9.00%

Testing parameters: Temperature=1.0, Num Beams=11


Evaluating (Temp: 1.0, Beams: 11): 100%|██████████| 100/100 [00:29<00:00,  3.41it/s]


Resulting Accuracy: 9.00%

Testing parameters: Temperature=1.0, Num Beams=12


Evaluating (Temp: 1.0, Beams: 12): 100%|██████████| 100/100 [00:31<00:00,  3.21it/s]


Resulting Accuracy: 9.00%

Testing parameters: Temperature=1.0, Num Beams=13


Evaluating (Temp: 1.0, Beams: 13): 100%|██████████| 100/100 [00:33<00:00,  3.03it/s]


Resulting Accuracy: 9.00%

Testing parameters: Temperature=1.0, Num Beams=14


Evaluating (Temp: 1.0, Beams: 14): 100%|██████████| 100/100 [00:34<00:00,  2.87it/s]


Resulting Accuracy: 9.00%

Testing parameters: Temperature=1.0, Num Beams=15


Evaluating (Temp: 1.0, Beams: 15): 100%|██████████| 100/100 [00:36<00:00,  2.74it/s]


Resulting Accuracy: 9.00%

Testing parameters: Temperature=1.0, Num Beams=16


Evaluating (Temp: 1.0, Beams: 16): 100%|██████████| 100/100 [00:37<00:00,  2.65it/s]


Resulting Accuracy: 9.00%

Testing parameters: Temperature=1.0, Num Beams=17


Evaluating (Temp: 1.0, Beams: 17): 100%|██████████| 100/100 [00:38<00:00,  2.57it/s]


Resulting Accuracy: 9.00%

Testing parameters: Temperature=1.0, Num Beams=18


Evaluating (Temp: 1.0, Beams: 18): 100%|██████████| 100/100 [00:39<00:00,  2.51it/s]


Resulting Accuracy: 9.00%

Testing parameters: Temperature=1.0, Num Beams=19


Evaluating (Temp: 1.0, Beams: 19): 100%|██████████| 100/100 [00:41<00:00,  2.43it/s]


Resulting Accuracy: 9.00%

Testing parameters: Temperature=1.0, Num Beams=20


Evaluating (Temp: 1.0, Beams: 20): 100%|██████████| 100/100 [00:42<00:00,  2.36it/s]

Resulting Accuracy: 9.00%

--- Tuning Complete ---
Best Parameters: {'temperature': 0.9, 'num_beams': 5}
Highest Accuracy Achieved: 11.00%





In [12]:


print("\n--- Detailed Predictions vs. True Answers ---")
for item, pred in zip(original_dataset, predictions):
    print(f"ID: {item['id']}")
    print(f"  Question: {item['question']}")
    print(f"  True Answer: {item['answer']}")
    print(f"  Generated Answer: {pred}")
    print(f"  Result: {'CORRECT' if item['answer'] == pred else 'INCORRECT'}")
    print("-" * 20)


--- Detailed Predictions vs. True Answers ---
ID: 1
  Question: Ханнык кыыл сылаас сиргэ элбэхтик утуйарый?
  True Answer: Куоска
  Generated Answer: Иммуннайсистиэмэ
  Result: INCORRECT
--------------------
ID: 2
  Question: Былыттан туох түһэрий?
  True Answer: Ардах
  Generated Answer: Джексонвилл
  Result: INCORRECT
--------------------
ID: 3
  Question: Тимир суолга туох айанныырый?
  True Answer: Поезд
  Generated Answer: Лос-Андже
  Result: INCORRECT
--------------------
ID: 4
  Question: Ханнык фрукта маска үүнэрий?
  True Answer: Дьаабылака
  Generated Answer: Ктенофордар
  Result: INCORRECT
--------------------
ID: 5
  Question: Күнүс сырдыгы туох биэрэрий?
  True Answer: Күн
  Generated Answer: Уот
  Result: INCORRECT
--------------------
ID: 6
  Question: Кустар ханна сымыыттыылларый?
  True Answer: Уйа
  Generated Answer: Ктенофордар
  Result: INCORRECT
--------------------
ID: 7
  Question: Ынахтан туох тахсарый?
  True Answer: Үүт
  Generated Answer: Соруйаан
  Result: 

In [24]:
import json
import re
import torch
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer
from collections import defaultdict

# 1. Text Processing Functions
def reconstruct_yakut_text(text):
    """Reconstruct Yakut text with proper word boundaries"""
    # Add spaces around special tokens
    text = re.sub(r'(<\|[^>]+\|>)', r' \1 ', text)

    # Handle common Yakut word boundaries
    yakut_words = ["куоска", "ардах", "сылаас", "кыыл", "элбэх", "утуйарый",
                  "былыт", "сиргэ", "түһэрий", "инчэҕэй", "күн", "аайы"]

    for word in sorted(yakut_words, key=len, reverse=True):
        text = re.sub(re.escape(word), r' \g<0> ', text, flags=re.IGNORECASE)

    # Clean up spaces
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def normalize_answer(text):
    """Normalize text for answer comparison"""
    text = text.lower()
    text = re.sub(r'[.,!?;:]', '', text)
    return text.strip()

# 2. Model Generation Function
def generate_answer(model, tokenizer, context, question, options):
    prompt = (
        f"<|begin_of_text|><sah>Контекст: {context}\n"
        f"Соруйаан: {question}\n"
        f"Таллар:\n"
        f"A) {options[0]}\n"
        f"B) {options[1]}\n"
        f"C) {options[2]}\n"
        f"D) {options[3]}\n"
        f"Эппиэт:"
    )

    inputs = tokenizer(prompt, return_tensors="pt", max_length=1024, truncation=True).to(model.device)

    if 'token_type_ids' in inputs:
        del inputs['token_type_ids']

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=2,
            temperature=0.9,
            num_beams=5,
            early_stopping=True,
            pad_token_id=tokenizer.eos_token_id
        )

    # Extract just the generated answer
    full_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    answer_start = full_text.find("Эппиэт:") + len("Эппиэт:")
    answer_text = full_text[answer_start:].split('\n')[0].strip()

    # Match generated answer to options
    normalized_options = {normalize_answer(opt): opt for opt in options}
    normalized_gen = normalize_answer(answer_text)

    # Find best matching option
    best_match = None
    for norm_opt, orig_opt in normalized_options.items():
        if norm_opt in normalized_gen or normalized_gen in norm_opt:
            best_match = orig_opt
            break

    return best_match if best_match else answer_text

# 3. Evaluation Functions
def evaluate_predictions(dataset, predictions):
    results = {
        'total': len(dataset),
        'correct': 0,
        'accuracy': 0,
        'per_question_type': defaultdict(lambda: {'correct': 0, 'total': 0}),
        'confusion_matrix': defaultdict(lambda: defaultdict(int))
    }

    for item, pred in zip(dataset, predictions):
        question_type = item['question'].split()[0]
        results['per_question_type'][question_type]['total'] += 1

        if pred == item['answer']:
            results['correct'] += 1
            results['per_question_type'][question_type]['correct'] += 1

        results['confusion_matrix'][item['answer']][pred] += 1

    results['accuracy'] = results['correct'] / results['total']

    # Calculate accuracy per question type
    for q_type in results['per_question_type']:
        q_stats = results['per_question_type'][q_type]
        q_stats['accuracy'] = q_stats['correct'] / q_stats['total'] if q_stats['total'] > 0 else 0

    return results

# 4. Main Evaluation Pipeline
def evaluate_model_on_dataset(model, tokenizer, dataset_path):
    # Load dataset
    with open(dataset_path, 'r', encoding='utf-8') as f:
        dataset = json.load(f)

    # Generate predictions
    predictions = []
    for item in tqdm(dataset, desc="Evaluating"):
        try:
            pred = generate_answer(
                model, tokenizer,
                item['context'],
                item['question'],
                item['options']
            )
            predictions.append(pred)
        except Exception as e:
            print(f"Error processing item {item['id']}: {str(e)}")
            predictions.append(None)


    evaluated_pairs = []
    for i, (item, pred) in enumerate(zip(dataset, predictions)):
        if pred is not None:
            evaluated_pairs.append((item, pred))

    evaluated_dataset = [d for d, p in evaluated_pairs]
    evaluated_predictions = [p for d, p in evaluated_pairs]

    evaluation_results = evaluate_predictions(evaluated_dataset, evaluated_predictions)

    return evaluation_results, predictions, dataset

# 5. Run Evaluation
if __name__ == "__main__":
    # Load model
    model_path = "/content/drive/MyDrive/yakut-qa-finetuned"
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForCausalLM.from_pretrained(model_path).to("cuda")

    # Run evaluation
    dataset_path = "/content/drive/MyDrive/Phase4_train_data/synthetic_dataset_yakut.json"
    results, predictions, original_dataset = evaluate_model_on_dataset(model, tokenizer, dataset_path)

    # Print results
    print("\nEvaluation Results:")
    print(f"Total Questions: {results['total']}")
    print(f"Correct Answers: {results['correct']}")
    print(f"Accuracy: {results['accuracy']:.2%}")

    print("\nAccuracy by Question Type:")
    for q_type, stats in results['per_question_type'].items():
        print(f"{q_type}: {stats['accuracy']:.2%} ({stats['correct']}/{stats['total']})")

    # Save detailed results
    output = {
        'model': model_path,
        'dataset': dataset_path,
        'overall_accuracy': results['accuracy'],
        'per_question_type': results['per_question_type'],
        'confusion_matrix': results['confusion_matrix'],
        'predictions': [
            # Use original_dataset here
            {'id': item['id'],
             'question': item['question'],
             'predicted': pred,
             'correct': item['answer']}
            for item, pred in zip(original_dataset, predictions)
        ]
    }

    with open('evaluation_results.json', 'w', encoding='utf-8') as f:
        json.dump(output, f, ensure_ascii=False, indent=2)

    print("\nSample Predictions:")
    # Use original_dataset here as well
    for i, (item, pred) in enumerate(zip(original_dataset[:5], predictions[:5])):
        print(f"\nQuestion {item['id']}: {item['question']}")
        print(f"Options: {', '.join(item['options'])}")
        print(f"Predicted: {pred}")
        print(f"Correct: {item['answer']}")
        print(f"Result: {'✓' if pred == item['answer'] else '✗'}")

Evaluating: 100%|██████████| 100/100 [00:11<00:00,  8.56it/s]


Evaluation Results:
Total Questions: 100
Correct Answers: 13
Accuracy: 13.00%

Accuracy by Question Type:
Ханнык: 15.00% (3/20)
Былыттан: 0.00% (0/1)
Тимир: 0.00% (0/1)
Күнүс: 0.00% (0/1)
Кустар: 0.00% (0/1)
Ынахтан: 0.00% (0/1)
Тигээйилэр: 0.00% (0/1)
Туох: 0.00% (0/9)
Күһүн: 0.00% (0/1)
Төгүрүк: 0.00% (0/1)
Атаххын: 0.00% (0/1)
Муус: 0.00% (0/1)
Үүттэн: 0.00% (0/2)
Туохха: 0.00% (0/2)
Бириэмэни: 100.00% (1/1)
Суолга: 0.00% (0/2)
Суруйарга: 100.00% (1/1)
Халлаан: 0.00% (0/1)
Үрдүк: 0.00% (0/1)
Илиини: 0.00% (0/1)
Түүн: 0.00% (0/1)
Ороҥҥо: 0.00% (0/1)
Туустаах: 0.00% (0/1)
Тыалга: 0.00% (0/1)
Оскуолаҕа: 0.00% (0/1)
Итии: 100.00% (1/1)
Садтарга: 0.00% (0/1)
Эн: 0.00% (0/1)
Атах: 0.00% (0/1)
Куоракка: 0.00% (0/1)
Сылыттахха: 0.00% (0/1)
Дьиэ: 0.00% (0/1)
Төбөҕө: 100.00% (1/1)
Кууруссалар: 0.00% (0/1)
Кыһын: 0.00% (0/1)
Эйигин: 0.00% (0/1)
Минньигэс: 0.00% (0/1)
Аһы: 0.00% (0/1)
Дьон: 0.00% (0/1)
Сытыытык: 0.00% (0/1)
Ардах: 0.00% (0/2)
Умайдаҕына: 0.00% (0/1)
Халлааҥҥа: 0.00% (0/2)
Мас:




In [25]:
print("\n--- Detailed Predictions vs. True Answers ---")
for item, pred in zip(original_dataset, predictions):
    print(f"ID: {item['id']}")
    print(f"  Question: {item['question']}")
    print(f"  True Answer: {item['answer']}")
    print(f"  Generated Answer: {pred}")
    print(f"  Result: {'CORRECT' if item['answer'] == pred else 'INCORRECT'}")
    print("-" * 20)


--- Detailed Predictions vs. True Answers ---
ID: 1
  Question: Ханнык кыыл сылаас сиргэ элбэхтик утуйарый?
  True Answer: Куоска
  Generated Answer: Уэль
  Result: INCORRECT
--------------------
ID: 2
  Question: Былыттан туох түһэрий?
  True Answer: Ардах
  Generated Answer: Соруйа
  Result: INCORRECT
--------------------
ID: 3
  Question: Тимир суолга туох айанныырый?
  True Answer: Поезд
  Generated Answer: Соруйа
  Result: INCORRECT
--------------------
ID: 4
  Question: Ханнык фрукта маска үүнэрий?
  True Answer: Дьаабылака
  Generated Answer: Кте
  Result: INCORRECT
--------------------
ID: 5
  Question: Күнүс сырдыгы туох биэрэрий?
  True Answer: Күн
  Generated Answer: Пантер
  Result: INCORRECT
--------------------
ID: 6
  Question: Кустар ханна сымыыттыылларый?
  True Answer: Уйа
  Generated Answer: Кте
  Result: INCORRECT
--------------------
ID: 7
  Question: Ынахтан туох тахсарый?
  True Answer: Үүт
  Generated Answer: Соруйа
  Result: INCORRECT
--------------------
ID: 

In [6]:
import json
import re
import torch
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer
from collections import defaultdict

# Configuration
MODEL_PATH = "meta-llama/Llama-3.2-1B"
DATASET_PATH = "/content/drive/MyDrive/Phase4_train_data/synthetic_dataset_yakut.json"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
TORCH_DTYPE = torch.float16 if DEVICE == "cuda" else torch.float32
token = "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"

# 1. Text Processing Functions
def normalize_yakut_text(text):
    """Normalize Yakut text for comparison"""
    text = text.lower()
    text = re.sub(r'[.,!?;:]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# 2. Model Generation Function for Multiple Choice
def generate_mc_answer(model, tokenizer, context, question, options):
    options_text = "\n".join([f"{chr(65+i)}) {opt}" for i, opt in enumerate(options)])

    prompt = (
        f"<|begin_of_text|><sah>Контекст: {context}\n"
        f"Соруйаан: {question}\n"
        f"Таллар:\n{options_text}\n"
        f"Эппиэт:"
    )

    inputs = tokenizer(prompt, return_tensors="pt", max_length=2048, truncation=True).to(DEVICE)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=10,  
            temperature=0.3,
            top_p=0.9,
            repetition_penalty=1.1,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id,
            do_sample=True,
        )

    # Extract just the generated answer
    full_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    answer_start = full_text.find("Эппиэт:") + len("ЭппиЭт:")
    answer_text = full_text[answer_start:].split('\n')[0].strip()

    # Match to options (case insensitive)
    normalized_options = {normalize_yakut_text(opt): opt for opt in options}
    normalized_gen = normalize_yakut_text(answer_text)

    # Check for exact option match
    for norm_opt, orig_opt in normalized_options.items():
        if norm_opt == normalized_gen:
            return orig_opt

    # Check for option letter match (A, B, C, D)
    if len(answer_text) == 1 and answer_text.upper() in ['A', 'B', 'C', 'D']:
        idx = ord(answer_text.upper()) - ord('A')
        if idx < len(options):
            return options[idx]

    # Check for partial matches
    for norm_opt, orig_opt in normalized_options.items():
        if norm_opt in normalized_gen or normalized_gen in norm_opt:
            return orig_opt

    return answer_text  # Return raw answer if no match found

# 3. Evaluation Functions
def evaluate_predictions(dataset, predictions):
    results = {
        'total': len(dataset),
        'correct': 0,
        'accuracy': 0,
        'per_question_type': defaultdict(lambda: {'correct': 0, 'total': 0}),
        'confusion_matrix': defaultdict(lambda: defaultdict(int))
    }

    for item, pred in zip(dataset, predictions):
        question_type = item['question'].split()[0]  # First word as question type
        results['per_question_type'][question_type]['total'] += 1

        if normalize_yakut_text(pred) == normalize_yakut_text(item['answer']):
            results['correct'] += 1
            results['per_question_type'][question_type]['correct'] += 1

        # Track confusion between options
        results['confusion_matrix'][item['answer']][pred] += 1

    results['accuracy'] = results['correct'] / results['total']

    # Calculate accuracy per question type
    for q_type in results['per_question_type']:
        q_stats = results['per_question_type'][q_type]
        q_stats['accuracy'] = q_stats['correct'] / q_stats['total'] if q_stats['total'] > 0 else 0

    return results

# 4. Main Evaluation Pipeline
def evaluate_model():
    
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, token=token)
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_PATH,
        token=token,
        torch_dtype=torch.float16
    ).to(DEVICE)

    # Load dataset
    with open(DATASET_PATH, 'r', encoding='utf-8') as f:
        dataset = json.load(f)

    # Generate predictions
    predictions = []
    for item in tqdm(dataset, desc="Evaluating"):
        try:
            pred = generate_mc_answer(
                model, tokenizer,
                item['context'],
                item['question'],
                item['options']
            )
            predictions.append(pred)
        except Exception as e:
            print(f"Error processing item {item['id']}: {str(e)}")
            predictions.append("[ERROR]")

    # Evaluate results
    evaluation_results = evaluate_predictions(dataset, predictions)

    return evaluation_results, predictions, dataset 

# 5. Run and Report Results
if __name__ == "__main__":
    results, predictions, original_dataset = evaluate_model()

    print("\nEvaluation Results:")
    print(f"Model: {MODEL_PATH}")
    print(f"Device: {DEVICE}")
    print(f"Total Questions: {results['total']}")
    print(f"Correct Answers: {results['correct']}")
    print(f"Accuracy: {results['accuracy']:.2%}")

    print("\nAccuracy by Question Type:")
    for q_type, stats in sorted(results['per_question_type'].items()):
        print(f"{q_type}: {stats['accuracy']:.2%} ({stats['correct']}/{stats['total']})")

    # Save detailed results
    output = {
        'model': MODEL_PATH,
        'dataset': DATASET_PATH,
        'config': {
            'device': DEVICE,
            'dtype': str(TORCH_DTYPE)
        },
        'metrics': {
            'accuracy': results['accuracy'],
            'correct': results['correct'],
            'total': results['total']
        },
        'per_question_type': results['per_question_type'],
        'confusion_matrix': results['confusion_matrix'],
        'predictions': [
            {
                'id': item['id'],
                'question': item['question'],
                'context': item['context'],
                'options': item['options'],
                'predicted': pred,
                'true_answer': item['answer'],
                'is_correct': normalize_yakut_text(pred) == normalize_yakut_text(item['answer'])
            }
            for item, pred in zip(original_dataset, predictions) # Use original_dataset
        ]
    }

    with open('llama3_yakut_mc_results.json', 'w', encoding='utf-8') as f:
        json.dump(output, f, ensure_ascii=False, indent=2)

    print("\nSample Predictions:")
    for i, (item, pred) in enumerate(zip(original_dataset[:3], predictions[:3])): # Use original_dataset
        print(f"\nQuestion {item['id']}: {item['question']}")
        print(f"Context: {item['context'][:100]}...")
        print(f"Options: {', '.join(item['options'])}")
        print(f"Predicted: {pred}")
        print(f"Correct: {item['answer']}")
        print(f"Result: {'✓' if normalize_yakut_text(pred) == normalize_yakut_text(item['answer']) else '✗'}")

    print("\nEvaluation complete. Detailed results saved to 'llama3_yakut_mc_results.json'")

tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/843 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

Evaluating:   0%|          | 0/100 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Evaluating:   1%|          | 1/100 [00:01<02:47,  1.69s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Evaluating:   2%|▏         | 2/100 [00:01<01:21,  1.20it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Evaluating:   3%|▎         | 3/100 [00:02<00:53,  1.81it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Evaluating:   4%|▍         | 4/100 [00:02<00:40,  2.37it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Evaluating:   5%|▌         | 5/100 [00:02<00:32,  2.88it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Evaluating:   6%|▌         | 6/100 [00:02<00:28,  3.31it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Evaluating:   7%|▋         | 7/100 [00:03<00:25,  3.65it/s]Setting `pad_token_id` to


Evaluation Results:
Model: meta-llama/Llama-3.2-1B
Device: cuda
Total Questions: 100
Correct Answers: 1
Accuracy: 1.00%

Accuracy by Question Type:
Айанныырга: 0.00% (0/1)
Ардах: 50.00% (1/2)
Ардахха: 0.00% (0/1)
Атах: 0.00% (0/1)
Атаххын: 0.00% (0/1)
Аһы: 0.00% (0/1)
Бириэмэни: 0.00% (0/1)
Былыттан: 0.00% (0/1)
Дьиэ: 0.00% (0/1)
Дьон: 0.00% (0/1)
Илиини: 0.00% (0/1)
Итии: 0.00% (0/1)
Куоракка: 0.00% (0/1)
Куорат: 0.00% (0/1)
Кустар: 0.00% (0/1)
Кууруссалар: 0.00% (0/1)
Кыраасканы: 0.00% (0/1)
Кыһын: 0.00% (0/1)
Күнүс: 0.00% (0/1)
Күһүн: 0.00% (0/1)
Лава: 0.00% (0/1)
Мас: 0.00% (0/1)
Минньигэс: 0.00% (0/1)
Муус: 0.00% (0/1)
Мүөтү: 0.00% (0/1)
Ороҥҥо: 0.00% (0/1)
Оскуола: 0.00% (0/1)
Оскуолаҕа: 0.00% (0/1)
Садтарга: 0.00% (0/1)
Сир: 0.00% (0/1)
Сиртэн: 0.00% (0/1)
Сомуогу: 0.00% (0/1)
Суолга: 0.00% (0/2)
Суруйарга: 0.00% (0/1)
Сурукка: 0.00% (0/1)
Сылыттахха: 0.00% (0/1)
Сытыытык: 0.00% (0/1)
Тигээйилэр: 0.00% (0/1)
Тиис: 0.00% (0/1)
Тиискин: 0.00% (0/1)
Тимир: 0.00% (0/1)
Туох: 0.00% 




In [7]:
print("\n--- Detailed Predictions vs. True Answers ---")
for item, pred in zip(original_dataset, predictions):
    print(f"ID: {item['id']}")
    print(f"  Question: {item['question']}")
    print(f"  True Answer: {item['answer']}")
    print(f"  Generated Answer: {pred}")
    print(f"  Result: {'CORRECT' if item['answer'] == pred else 'INCORRECT'}")
    print("-" * 20)


--- Detailed Predictions vs. True Answers ---
ID: 1
  Question: Ханнык кыыл сылаас сиргэ элбэхтик утуйарый?
  True Answer: Куоска
  Generated Answer: 1. Kuuqsaalár k
  Result: INCORRECT
--------------------
ID: 2
  Question: Былыттан туох түһэрий?
  True Answer: Ардах
  Generated Answer: Хаар
  Result: INCORRECT
--------------------
ID: 3
  Question: Тимир суолга туох айанныырый?
  True Answer: Поезд
  Generated Answer: Аныктыылаах буолу
  Result: INCORRECT
--------------------
ID: 4
  Question: Ханнык фрукта маска үүнэрий?
  True Answer: Дьаабылака
  Generated Answer: Аныг сырыттан аты
  Result: INCORRECT
--------------------
ID: 5
  Question: Күнүс сырдыгы туох биэрэрий?
  True Answer: Күн
  Generated Answer: 1. Kүн (к�
  Result: INCORRECT
--------------------
ID: 6
  Question: Кустар ханна сымыыттыылларый?
  True Answer: Уйа
  Generated Answer: 1. Аныгы кө
  Result: INCORRECT
--------------------
ID: 7
  Question: Ынахтан туох тахсарый?
  True Answer: Үүт
  Generated Answer: Кэрээн