In [13]:
# Step 1: Install necessary libraries
!pip install -q transformers datasets accelerate bitsandbytes torch evaluate rouge_score sentencepiece bert_score

In [25]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig
from datasets import load_dataset
from kaggle_secrets import UserSecretsClient
import pandas as pd
import random
import evaluate # Hugging Face's library for NLP evaluation
import warnings

# Suppress warnings to keep the output clean
warnings.filterwarnings("ignore")

In [None]:
# Step 2: Authenticate with Hugging Face
# This is required to download gated models like Llama 3.
# On success `hf_token` holds the secret; on any failure it is set to None,
# which makes the generation cell skip itself.
try:
    user_secrets = UserSecretsClient()
    hf_token = user_secrets.get_secret("HUGGING_FACE_TOKEN")
except Exception as e:
    print("Could not retrieve Hugging Face token. Please ensure it is stored as a Kaggle secret named 'HUGGING_FACE_TOKEN'.")
    # Surface the underlying cause instead of silently discarding it
    print(f"Reason: {type(e).__name__}: {e}")
    # You can manually paste your token here for local testing if needed:
    # hf_token = "YOUR_HF_TOKEN"
    hf_token = None

In [None]:
# Step 3: Define Model and Dataset Identifiers
model_ids = [
    "alpha-ai/LLAMA3-3B-Medical-COT",
    "vilm/vietcuna-3b-v2"
]
dataset_id = "tmnam20/ViMedAQA"
NUM_SAMPLES_TO_EVALUATE = 1

# Step 4: Load and Prepare the Dataset
# On success `eval_dataset` is a random subset of the train split; on any
# failure it is None, which makes the generation cell skip itself.
try:
    dataset = load_dataset(dataset_id, split="train")
    print(f"Dataset loaded successfully! Total samples: {len(dataset)}")

    # Create a small, random, representative sample for evaluation
    random.seed(42) # for reproducibility
    # Never ask for more rows than exist: random.sample would raise ValueError,
    # which the handler below would misreport as a dataset loading failure.
    sample_size = min(NUM_SAMPLES_TO_EVALUATE, len(dataset))
    random_indices = random.sample(range(len(dataset)), sample_size)
    eval_dataset = dataset.select(random_indices)

    print(f"Created a random evaluation set with {len(eval_dataset)} samples.")
except Exception as e:
    print(f"Failed to load the dataset. Error: {e}")
    eval_dataset = None

In [None]:
# --- Prompt Engineering: Bilingual (English / Vietnamese) Prompt Strategies per Model Family ---

def create_llama3_prompts(sample):
    """
    Build the Llama 3 chat-format prompt variations for one dataset sample.

    Three strategies (strict role-play, direct, chain-of-thought) are each
    rendered once with an English and once with a Vietnamese instruction.

    Args:
        sample: Mapping providing the 'context' and 'question' entries.

    Returns:
        dict mapping prompt name -> prompt string, inserted in the order
        RolePlay_Strict_EN, RolePlay_Strict_VI, Direct_EN, Direct_VI,
        CoT_EN, CoT_VI.

    NOTE(review): every prompt already starts with <|begin_of_text|>; confirm
    the tokenizer is not also prepending its own BOS token.
    """
    context = sample['context']
    question = sample['question']

    # Fixed framing around the user turn; the prompt ends right after the
    # assistant header so that the model continues with its answer.
    user_header = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>"
    assistant_header = "<|eot_id|><|start_header_id|>assistant<|end_header_id|>"

    # (context label, question label) for each instruction language
    field_labels = {
        "EN": ("Context", "Question"),
        "VI": ("Ngữ cảnh", "Câu hỏi"),
    }

    # Prompt name -> instruction sentence; the name suffix selects the language.
    instructions = (
        ("Llama3_RolePlay_Strict_EN",
         "You are a helpful medical assistant. Based *only* on the context provided below, answer the question in Vietnamese."),
        ("Llama3_RolePlay_Strict_VI",
         "Bạn là một trợ lý y tế hữu ích. Dựa *chỉ* vào ngữ cảnh được cung cấp dưới đây, hãy trả lời câu hỏi bằng tiếng Việt."),
        ("Llama3_Direct_EN",
         "Use the following context to answer the question."),
        ("Llama3_Direct_VI",
         "Sử dụng ngữ cảnh sau để trả lời câu hỏi."),
        ("Llama3_CoT_EN",
         "Read the context, think step-by-step, and then answer the user's question based only on the information in the context."),
        ("Llama3_CoT_VI",
         "Hãy đọc ngữ cảnh, suy nghĩ từng bước, và sau đó trả lời câu hỏi của người dùng chỉ dựa vào thông tin trong ngữ cảnh."),
    )

    prompts = {}
    for name, instruction in instructions:
        context_label, question_label = field_labels[name[-2:]]
        # Sections are separated by exactly one blank line.
        prompts[name] = "\n\n".join((
            user_header,
            instruction,
            f"{context_label}: {context}",
            f"{question_label}: {question}",
            assistant_header,
        ))
    return prompts

def create_vietcuna_prompts(sample):
    """
    Build the Vicuna-style (USER / ASSISTANT) prompt variations for one sample.

    Three strategies (direct without explanation, role-play, context and
    question only) are each rendered with a Vietnamese and an English
    instruction.

    Args:
        sample: Mapping providing the 'context' and 'question' entries.

    Returns:
        dict mapping prompt name -> prompt string, inserted in the order
        Direct_NoExplain_VI, Direct_NoExplain_EN, RolePlay_VI, RolePlay_EN,
        Simplified_VI, Simplified_EN.
    """
    context = sample['context']
    question = sample['question']

    # Context/question section shared by all strategies of one language
    bodies = {
        "VI": f"Ngữ cảnh: {context}\n\nCâu hỏi: {question}",
        "EN": f"Context: {context}\n\nQuestion: {question}",
    }

    # Prompt name -> text placed before the context/question section.
    # The "Simplified" strategy has no leading instruction at all.
    variants = (
        ("Vietcuna_Direct_NoExplain_VI",
         "Dựa vào ngữ cảnh sau đây để trả lời câu hỏi. Chỉ trích xuất câu trả lời trực tiếp từ văn bản, không giải thích gì thêm.\n\n"),
        ("Vietcuna_Direct_NoExplain_EN",
         "Based on the following context, answer the question. Only extract the direct answer from the text, do not add any explanation.\n\n"),
        ("Vietcuna_RolePlay_VI",
         "Bạn là một trợ lý y tế hữu ích. Dựa vào thông tin trong ngữ cảnh được cung cấp để trả lời câu hỏi của người dùng.\n\n"),
        ("Vietcuna_RolePlay_EN",
         "You are a helpful medical assistant. Based on the information in the provided context, answer the user's question.\n\n"),
        ("Vietcuna_Simplified_VI", ""),
        ("Vietcuna_Simplified_EN", ""),
    )

    def wrap_user_turn(user_turn):
        # The prompt stops at "ASSISTANT:" so the model continues with the answer.
        return (
            "A chat between a curious user and an artificial intelligence assistant.\n"
            f"USER: {user_turn}\n"
            "ASSISTANT:"
        )

    return {
        name: wrap_user_turn(lead_in + bodies[name[-2:]])
        for name, lead_in in variants
    }

In [None]:
# Step 5: Generate Answers from Each Model
# Results are keyed by "<model_id> | <prompt strategy>" so that every prompt
# variation keeps its own list of answers (one per evaluation sample) instead of
# overwriting the previous strategy of the same model.
all_generated_answers = {}

if eval_dataset and hf_token:
    # Wrap each answer in a list to create the required List[List[str]] structure
    ground_truth_answers = [[sample['answer']] for sample in eval_dataset]
    questions = [sample['question'] for sample in eval_dataset]

    # Loop through each model to generate answers
    for model_id in model_ids:
        print("\n" + "="*50)
        print(f"Loading model: {model_id}")
        print("="*50)

        # Pre-bind the names so the cleanup in `finally` is safe even when loading fails
        model, tokenizer, text_generator = None, None, None

        try:
            # Load the tokenizer and model with 4-bit quantization to save memory
            bnb_config = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_quant_type="nf4",
                bnb_4bit_compute_dtype=torch.bfloat16,
                bnb_4bit_use_double_quant=False,
            )
            tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token)
            model = AutoModelForCausalLM.from_pretrained(
                model_id,
                token=hf_token,
                quantization_config=bnb_config,
                device_map="auto",
                trust_remote_code=True
            )

            # Set up the text generation pipeline
            text_generator = pipeline(
                "text-generation",
                model=model,
                tokenizer=tokenizer,
                torch_dtype=torch.bfloat16,
                device_map="auto",
            )

            # Pick the prompt builder and the marker that precedes the model's answer.
            # Both depend only on the model family, so they are chosen once per model.
            if 'vietcuna' in model_id:
                prompt_function = create_vietcuna_prompts
                answer_start_tag = "ASSISTANT:"
            else:
                prompt_function = create_llama3_prompts
                answer_start_tag = "<|start_header_id|>assistant<|end_header_id|>"

            # Build every prompt variation once per sample; the strategy names are
            # taken from the first sample (all samples yield the same keys).
            prompts_by_sample = [prompt_function(sample) for sample in eval_dataset]
            prompt_variations = list(prompts_by_sample[0])

            for prompt_name in prompt_variations:
                result_key = f"{model_id} | {prompt_name}"
                print(f"\n--- Testing Prompt Strategy: {prompt_name} ---")

                # Collect this strategy's prompt for every sample
                prompts = [sample_prompts[prompt_name] for sample_prompts in prompts_by_sample]

                print(f"Generating answers for {len(prompts)} prompts using {model_id} with '{prompt_name}' strategy...")
                # Sampling is enabled below, so re-seed before each strategy to keep
                # the generated answers (and therefore the scores) reproducible.
                torch.manual_seed(42)
                # Generate answers for the entire batch
                generated_outputs_batch = text_generator(
                    prompts,
                    max_new_tokens=256,
                    do_sample=True,
                    temperature=0.1,
                    top_p=0.9,
                    eos_token_id=tokenizer.eos_token_id,
                    pad_token_id=tokenizer.eos_token_id, # Set pad_token_id to avoid warnings
                )

                # Validate that the number of generated outputs matches the number of prompts
                if len(generated_outputs_batch) != len(prompts):
                    print(f"  [ERROR] Mismatch in generation count for '{result_key}'.")
                    print(f"  Expected {len(prompts)} answers, but got {len(generated_outputs_batch)}.")
                    print("  This prompt strategy will be skipped in the evaluation.")
                    # `continue` only skips this prompt strategy; the remaining
                    # strategies of the same model are still processed.
                    continue

                # Extract the clean answers
                model_answers = []
                for prompt, output in zip(prompts, generated_outputs_batch):
                    generated_text = output[0]['generated_text']
                    if answer_start_tag in generated_text:
                        # Keep only what follows the last answer marker
                        clean_answer = generated_text.split(answer_start_tag)[-1].strip()
                    else:
                        # Fallback: strip the echoed prompt from the output
                        clean_answer = generated_text.replace(prompt, "").strip()
                    model_answers.append(clean_answer)

                all_generated_answers[result_key] = model_answers
                print(f"Successfully generated answers for {result_key}.")

        except Exception as e:
            print(f"An error occurred while processing {model_id}: {e}")
        finally:
            # Drop our references to the model objects before loading the next
            # model, then release cached GPU memory.
            model = tokenizer = text_generator = None
            torch.cuda.empty_cache()

else:
    print("Skipping generation due to issues with the dataset or Hugging Face token.")

In [None]:
# Step 6: Evaluate the Generated Answers
# Scores every entry of `all_generated_answers` against `ground_truth_answers`
# with ROUGE-L, BLEU, METEOR and BERTScore, then shows a ranking table and a few
# example generations.
if all_generated_answers:
    # Load all the metrics we need
    rouge_metric = evaluate.load('rouge')
    bleu_metric = evaluate.load('bleu')
    meteor_metric = evaluate.load('meteor')
    bertscore_metric = evaluate.load('bertscore')

    # ROUGE's default tokenizer keeps only the characters [a-z0-9], which splits
    # Vietnamese words at every accented letter. Tokenize on whitespace instead
    # (lower-cased, to stay case-insensitive like the default).
    def rouge_whitespace_tokenizer(text):
        return text.lower().split()

    NUM_EXAMPLES_TO_SHOW = 3

    evaluation_results = []

    print("\n" + "="*50)
    print("Calculating Evaluation Metrics")
    print("="*50)

    for result_key, predictions in all_generated_answers.items():
        print(f"\n--- Evaluating {result_key} ---")

        # Check for empty predictions to prevent ZeroDivisionError in BLEU.
        # `any()` returns False if all strings in the list are empty.
        if not any(predictions):
            print(f"  WARNING: Model & Prompt Strategy '{result_key}' produced empty answers for all samples. Assigning all metric scores to 0.")
            evaluation_results.append({
                "Model & Prompt Strategy": result_key,
                "ROUGE-L": 0.0,
                "BLEU": 0.0,
                "METEOR": 0.0,
                "BERTScore-F1": 0.0
            })
            # Skip metric computation for this entry and move to the next one
            continue

        # If predictions are valid, compute metrics as normal
        rouge_scores = rouge_metric.compute(
            predictions=predictions,
            references=ground_truth_answers,
            tokenizer=rouge_whitespace_tokenizer,
        )
        bleu_scores = bleu_metric.compute(predictions=predictions, references=ground_truth_answers)
        meteor_scores = meteor_metric.compute(predictions=predictions, references=ground_truth_answers)
        bertscore_scores = bertscore_metric.compute(predictions=predictions, references=ground_truth_answers, lang="vi")

        # BERTScore returns one F1 value per sample; report the mean
        bertscore_f1 = bertscore_scores['f1']
        evaluation_results.append({
            "Model & Prompt Strategy": result_key,
            "ROUGE-L": round(rouge_scores['rougeL'], 4),
            "BLEU": round(bleu_scores['bleu'], 4),
            "METEOR": round(meteor_scores['meteor'], 4),
            "BERTScore-F1": round(sum(bertscore_f1) / len(bertscore_f1), 4)
        })

    # Step 7: Display Results
    results_df = pd.DataFrame(evaluation_results)
    # Sort for better comparison
    results_df = results_df.sort_values(by="BERTScore-F1", ascending=False).reset_index(drop=True)
    print("\n--- Comparative Evaluation Results ---")
    display(results_df)

    # Display a few examples for manual inspection
    print("\n--- Example Generations ---")
    example_df_data = {
        "Question": questions[:NUM_EXAMPLES_TO_SHOW],
        "Ground Truth": ground_truth_answers[:NUM_EXAMPLES_TO_SHOW]
    }
    # Loop variable is deliberately not named `model_id`, so the global used by
    # the generation cell is not rebound here.
    for result_key, answers in all_generated_answers.items():
        # Drop the organisation prefix ("org/") to keep column headers short
        example_df_data[f"Answer: {result_key.split('/')[-1]}"] = answers[:NUM_EXAMPLES_TO_SHOW]

    example_df = pd.DataFrame(example_df_data)
    display(example_df)

else:
    print("\nNo answers were generated. Skipping evaluation.")