In [14]:
NUM_SAMPLES_TO_EVALUATE = 1
USE_BEST_PROMPT_ONLY = True

In [15]:
# Step 1: Install necessary libraries
!pip install -q transformers datasets accelerate bitsandbytes torch evaluate rouge_score sentencepiece bert_score

In [16]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig
from datasets import load_dataset
from kaggle_secrets import UserSecretsClient
import pandas as pd
import random
import evaluate # Hugging Face's library for NLP evaluation
import warnings

# Suppress warnings to keep the output clean
warnings.filterwarnings("ignore")

In [17]:
# Step 2: Authenticate with Hugging Face
# This is required to download gated models like Llama 3
try:
    user_secrets = UserSecretsClient()
    hf_token = user_secrets.get_secret("HUGGING_FACE_TOKEN")
except Exception as e:
    print("Could not retrieve Hugging Face token. Please ensure it is stored as a Kaggle secret named 'HUGGING_FACE_TOKEN'.")
    # You can manually paste your token here for local testing if needed:
    # hf_token = "YOUR_HF_TOKEN"
    hf_token = None

In [18]:
# Step 3: Define Model and Dataset Identifiers
model_ids = [
    "alpha-ai/LLAMA3-3B-Medical-COT",
    "vilm/vietcuna-3b-v2",
    "arcee-ai/Arcee-VyLinh",
]
dataset_id = "tmnam20/ViMedAQA"

# Step 4: Load and Prepare the Dataset
try:
    dataset = load_dataset(dataset_id, split="train")
    print(f"Dataset loaded successfully! Total samples: {len(dataset)}")

    # Create a small, random, representative sample for evaluation
    random.seed(42) # for reproducibility
    random_indices = random.sample(range(len(dataset)), NUM_SAMPLES_TO_EVALUATE)
    eval_dataset = dataset.select(random_indices)

    print(f"Created a random evaluation set with {len(eval_dataset)} samples.")
except Exception as e:
    print(f"Failed to load the dataset. Error: {e}")
    eval_dataset = None

README.md: 0.00B [00:00, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/20.6M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/1.22M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.21M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/39881 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2217 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2215 [00:00<?, ? examples/s]

Dataset loaded successfully! Total samples: 39881
Created a random evaluation set with 1 samples.


In [19]:
# --- UPDATED: Step 1 - Define Bilingual Prompt Engineering Strategies ---
BEST_PROMPTS = {
    "alpha-ai/LLAMA3-3B-Medical-COT": "Llama3_Direct_VI",
    "vilm/vietcuna-3b-v2": "Vietcuna_Direct_NoExplain_VI",
    "arcee-ai/Arcee-VyLinh": "VyLinh_ChatTemplate_VI",
}

def create_llama3_prompts(sample, tokenizer=None):
    """
    Creates a set of prompt variations for Llama 3 / ChatML format,
    with instructions in both English and Vietnamese.
    """
    context = sample['context']
    question = sample['question']

    prompts = {}

    # --- Strategy 1: Original (Role-playing, strict context) ---
    # English Instruction
    prompts["Llama3_RolePlay_Strict_EN"] = f"""<|begin_of_text|><|start_header_id|>user<|end_header_id|>

You are a helpful medical assistant. Based *only* on the context provided below, answer the question in Vietnamese.

Context: {context}

Question: {question}

<|eot_id|><|start_header_id|>assistant<|end_header_id|>"""

    # Vietnamese Instruction
    prompts["Llama3_RolePlay_Strict_VI"] = f"""<|begin_of_text|><|start_header_id|>user<|end_header_id|>

Bạn là một trợ lý y tế hữu ích. Dựa *chỉ* vào ngữ cảnh được cung cấp dưới đây, hãy trả lời câu hỏi bằng tiếng Việt.

Ngữ cảnh: {context}

Câu hỏi: {question}

<|eot_id|><|start_header_id|>assistant<|end_header_id|>"""


    # --- Strategy 2: Direct and Simple ---
    # English Instruction
    prompts["Llama3_Direct_EN"] = f"""<|begin_of_text|><|start_header_id|>user<|end_header_id|>

Use the following context to answer the question.

Context: {context}

Question: {question}

<|eot_id|><|start_header_id|>assistant<|end_header_id|>"""

    # Vietnamese Instruction
    prompts["Llama3_Direct_VI"] = f"""<|begin_of_text|><|start_header_id|>user<|end_header_id|>

Sử dụng ngữ cảnh sau để trả lời câu hỏi.

Ngữ cảnh: {context}

Câu hỏi: {question}

<|eot_id|><|start_header_id|>assistant<|end_header_id|>"""

    # --- Strategy 3: Chain-of-Thought Style ---
    # English Instruction
    prompts["Llama3_CoT_EN"] = f"""<|begin_of_text|><|start_header_id|>user<|end_header_id|>

Read the context, think step-by-step, and then answer the user's question based only on the information in the context.

Context: {context}

Question: {question}

<|eot_id|><|start_header_id|>assistant<|end_header_id|>"""
    
    # Vietnamese Instruction
    prompts["Llama3_CoT_VI"] = f"""<|begin_of_text|><|start_header_id|>user<|end_header_id|>

Hãy đọc ngữ cảnh, suy nghĩ từng bước, và sau đó trả lời câu hỏi của người dùng chỉ dựa vào thông tin trong ngữ cảnh.

Ngữ cảnh: {context}

Câu hỏi: {question}

<|eot_id|><|start_header_id|>assistant<|end_header_id|>"""
    
    return prompts

def create_vietcuna_prompts(sample, tokenizer=None):
    """
    Creates a set of prompt variations for the Vicuna format,
    with instructions in both English and Vietnamese.
    """
    context = sample['context']
    question = sample['question']
    
    prompts = {}

    # --- Strategy 1: Original (Direct, no explanation) ---
    # Vietnamese Instruction (Original)
    instruction_vi_1 = (
        "Dựa vào ngữ cảnh sau đây để trả lời câu hỏi. Chỉ trích xuất câu trả lời trực tiếp từ văn bản, không giải thích gì thêm.\n\n"
        f"Ngữ cảnh: {context}\n\n"
        f"Câu hỏi: {question}"
    )
    prompts["Vietcuna_Direct_NoExplain_VI"] = f"A chat between a curious user and an artificial intelligence assistant.\nUSER: {instruction_vi_1}\nASSISTANT:"

    # English Instruction
    instruction_en_1 = (
        "Based on the following context, answer the question. Only extract the direct answer from the text, do not add any explanation.\n\n"
        f"Context: {context}\n\n"
        f"Question: {question}"
    )
    prompts["Vietcuna_Direct_NoExplain_EN"] = f"A chat between a curious user and an artificial intelligence assistant.\nUSER: {instruction_en_1}\nASSISTANT:"

    # --- Strategy 2: Role-playing ---
    # Vietnamese Instruction
    instruction_vi_2 = (
        "Bạn là một trợ lý y tế hữu ích. Dựa vào thông tin trong ngữ cảnh được cung cấp để trả lời câu hỏi của người dùng.\n\n"
        f"Ngữ cảnh: {context}\n\n"
        f"Câu hỏi: {question}"
    )
    prompts["Vietcuna_RolePlay_VI"] = f"A chat between a curious user and an artificial intelligence assistant.\nUSER: {instruction_vi_2}\nASSISTANT:"

    # English Instruction
    instruction_en_2 = (
        "You are a helpful medical assistant. Based on the information in the provided context, answer the user's question.\n\n"
        f"Context: {context}\n\n"
        f"Question: {question}"
    )
    prompts["Vietcuna_RolePlay_EN"] = f"A chat between a curious user and an artificial intelligence assistant.\nUSER: {instruction_en_2}\nASSISTANT:"

    # --- Strategy 3: Simplified (Context + Question only) ---
    # Vietnamese Instruction
    instruction_vi_3 = (
        f"Ngữ cảnh: {context}\n\n"
        f"Câu hỏi: {question}"
    )
    prompts["Vietcuna_Simplified_VI"] = f"A chat between a curious user and an artificial intelligence assistant.\nUSER: {instruction_vi_3}\nASSISTANT:"

    # English Instruction
    instruction_en_3 = (
        f"Context: {context}\n\n"
        f"Question: {question}"
    )
    prompts["Vietcuna_Simplified_EN"] = f"A chat between a curious user and an artificial intelligence assistant.\nUSER: {instruction_en_3}\nASSISTANT:"
    
    return prompts

def create_vylinh_prompts(sample, tokenizer):
    """
    Creates prompt variations for Arcee-VyLinh using the official tokenizer.apply_chat_template method.
    """
    context = sample['context']
    question = sample['question']
    prompts = {}

    # --- Vietnamese Instruction ---
    # Construct the message list for the chat template
    messages_vi = [
        {"role": "system", "content": "Bạn là một trợ lý y tế hữu ích. Dựa vào ngữ cảnh được cung cấp để trả lời câu hỏi."},
        {"role": "user", "content": f"Ngữ cảnh: {context}\n\nCâu hỏi: {question}"}
    ]
    # Apply the template
    prompts["VyLinh_ChatTemplate_VI"] = tokenizer.apply_chat_template(
        messages_vi,
        tokenize=False,
        add_generation_prompt=True # Crucial for telling the model to start its response
    )

    # --- English Instruction ---
    messages_en = [
        {"role": "system", "content": "You are a helpful medical assistant. Use the provided context to answer the question."},
        {"role": "user", "content": f"Context: {context}\n\nQuestion: {question}"}
    ]
    prompts["VyLinh_ChatTemplate_EN"] = tokenizer.apply_chat_template(
        messages_en,
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompts

In [20]:
# Step 5: Generate Answers from Each Model
all_generated_answers = {}

if eval_dataset and hf_token:
    # Wrap each answer in a list to create the required List[List[str]] structure
    ground_truth_answers = [[sample['answer']] for sample in eval_dataset] 
    questions = [sample['question'] for sample in eval_dataset]
    
    # Loop through each model to generate answers
    for model_id in model_ids:
        print("\n" + "="*50)
        print(f"Loading model: {model_id}")
        print("="*50)

        model, tokenizer, text_generator = None, None, None

        try:
            # Load the tokenizer and model with 4-bit quantization to save memory
            bnb_config = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_quant_type="nf4",
                bnb_4bit_compute_dtype=torch.bfloat16,
                bnb_4bit_use_double_quant=False,
            )
            tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token)
            model = AutoModelForCausalLM.from_pretrained(
                model_id,
                token=hf_token,
                quantization_config=bnb_config, # <-- PASS THE CONFIG OBJECT HERE
                device_map="auto",
                trust_remote_code=True
            )

            # Set up the text generation pipeline
            text_generator = pipeline(
                "text-generation",
                model=model,
                tokenizer=tokenizer,
                torch_dtype=torch.bfloat16,
                device_map="auto",
            )

            # Determine which set of prompt functions to use
            if 'vietcuna' in model_id:
                # This model has no chat template, so we MUST format the string manually.
                prompt_function = create_vietcuna_prompts
                answer_start_tag = "ASSISTANT:"
            elif 'Arcee-VyLinh' in model_id:
                # This model uses the ChatML format, revealed by apply_chat_template.
                prompt_function = create_vylinh_prompts 
                answer_start_tag = "<|im_start|>assistant" # Corrected from previous versions
            # --------------------------
            else: # Default to Llama3
                # This model uses the Llama 3 chat format.
                prompt_function = create_llama3_prompts
                answer_start_tag = "<|start_header_id|>assistant<|end_header_id|>"

            # -Logic to select which prompts to run based on the toggle
            if USE_BEST_PROMPT_ONLY:
                # If the model has a defined best prompt, run only that one
                if model_id in BEST_PROMPTS:
                    prompt_variations = [BEST_PROMPTS[model_id]]
                    print(f"Mode: Best Prompt Only. Running with '{prompt_variations[0]}'")
                else:
                    print(f"Warning: No best prompt defined for {model_id}. Skipping.")
                    continue
            else:
                # Run all defined prompt variations for the model
                prompt_variations = prompt_function(eval_dataset[0], tokenizer).keys() 
                print(f"Mode: Exploration. Running all {len(prompt_variations)} prompt strategies.")

            for prompt_name in prompt_variations:
                print(f"\n--- Testing Prompt Strategy: {prompt_name} ---")
                
                # Generate all prompts for the current strategy
                prompts = [prompt_function(sample, tokenizer)[prompt_name] for sample in eval_dataset]

                print(f"Generating answers for {len(prompts)} prompts using {model_id} with '{prompt_name}' strategy...")
                # Generate answers for the entire batch
                generated_outputs_batch = text_generator(
                    prompts,
                    max_new_tokens=256,
                    do_sample=True,
                    temperature=0.1,
                    top_p=0.9,
                    eos_token_id=tokenizer.eos_token_id,
                    pad_token_id=tokenizer.eos_token_id, # Set pad_token_id to avoid warnings
                )
    
                # Extract the clean answers
                model_answers = []
                
                for i, output in enumerate(generated_outputs_batch):
                    generated_text = output[0]['generated_text']
                    # Use the appropriate start tag for splitting
                    if answer_start_tag in generated_text:
                        clean_answer = generated_text.split(answer_start_tag)[-1].strip()
                    else:
                        # Fallback remains the same
                        clean_answer = generated_text.replace(prompts[i], "").strip()
                    model_answers.append(clean_answer)

                # Create a unique key for each result set
                result_key = f"{model_id} ({prompt_name})"
    
                # Use the unique result_key to store the answers
                all_generated_answers[result_key] = model_answers
                print(f"Successfully generated answers for {result_key}.")

        except Exception as e:
            print(f"An error occurred while processing {model_id}: {e}")
        finally:
            # Check if variables were successfully created before deleting
            if model is not None: del model
            if tokenizer is not None: del tokenizer
            if text_generator is not None: del text_generator
            torch.cuda.empty_cache()

else:
    print("Skipping generation due to issues with the dataset or Hugging Face token.")


Loading model: alpha-ai/LLAMA3-3B-Medical-COT


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/982 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!
Device set to use cuda:1


Mode: Best Prompt Only. Running with 'Llama3_Direct_VI'

--- Testing Prompt Strategy: Llama3_Direct_VI ---
Generating answers for 1 prompts using alpha-ai/LLAMA3-3B-Medical-COT with 'Llama3_Direct_VI' strategy...
Successfully generated answers for alpha-ai/LLAMA3-3B-Medical-COT (Llama3_Direct_VI).

Loading model: vilm/vietcuna-3b-v2


config.json:   0%|          | 0.00/811 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/6.01G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

Device set to use cuda:1


Mode: Best Prompt Only. Running with 'Vietcuna_Direct_NoExplain_VI'

--- Testing Prompt Strategy: Vietcuna_Direct_NoExplain_VI ---
Generating answers for 1 prompts using vilm/vietcuna-3b-v2 with 'Vietcuna_Direct_NoExplain_VI' strategy...
Successfully generated answers for vilm/vietcuna-3b-v2 (Vietcuna_Direct_NoExplain_VI).

Loading model: arcee-ai/Arcee-VyLinh


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/605 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/613 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/731 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/3.70G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Device set to use cuda:0


Mode: Best Prompt Only. Running with 'VyLinh_ChatTemplate_VI'

--- Testing Prompt Strategy: VyLinh_ChatTemplate_VI ---
Generating answers for 1 prompts using arcee-ai/Arcee-VyLinh with 'VyLinh_ChatTemplate_VI' strategy...
Successfully generated answers for arcee-ai/Arcee-VyLinh (VyLinh_ChatTemplate_VI).


In [21]:
# Step 6: Evaluate the Generated Answers
if all_generated_answers:
    # Load all the metrics we need
    rouge_metric = evaluate.load('rouge')
    bleu_metric = evaluate.load('bleu')
    meteor_metric = evaluate.load('meteor')
    bertscore_metric = evaluate.load('bertscore')

    evaluation_results = []

    print("\n" + "="*50)
    print("Calculating Evaluation Metrics")
    print("="*50)

    for result_key, predictions in all_generated_answers.items():
        print(f"\n--- Evaluating {result_key} ---")
    
        # Check for empty predictions to prevent ZeroDivisionError in BLEU ---
        # The `any()` function returns False if all strings in the list are empty.
        if not any(predictions):
            print(f"  WARNING: Model & Prompt Strategy '{result_key}' produced empty answers for all samples. Assigning all metric scores to 0.")
            result_row = {
                "Model & Prompt Strategy": result_key,
                "ROUGE-L": 0.0,
                "BLEU": 0.0,
                "METEOR": 0.0,
                "BERTScore-F1": 0.0
            }
            evaluation_results.append(result_row)
            # Use `continue` to skip the rest of the loop and move to the next model
            continue
    
        # If predictions are valid, compute metrics as normal
        rouge_scores = rouge_metric.compute(predictions=predictions, references=ground_truth_answers)
        bleu_scores = bleu_metric.compute(predictions=predictions, references=ground_truth_answers)
        meteor_scores = meteor_metric.compute(predictions=predictions, references=ground_truth_answers)
        bertscore_scores = bertscore_metric.compute(predictions=predictions, references=ground_truth_answers, lang="vi")
    
        # Store results (this part is the same as before)
        result_row = {
            "Model & Prompt Strategy": result_key,
            "ROUGE-L": round(rouge_scores['rougeL'], 4),
            "BLEU": round(bleu_scores['bleu'], 4),
            "METEOR": round(meteor_scores['meteor'], 4),
            "BERTScore-F1": round(sum(bertscore_scores['f1']) / len(bertscore_scores['f1']), 4)
        }
        evaluation_results.append(result_row)

    # Step 7: Display Results
    results_df = pd.DataFrame(evaluation_results)
    # Sort for better comparison
    results_df = results_df.sort_values(by="BERTScore-F1", ascending=False).reset_index(drop=True)
    print("\n--- Comparative Evaluation Results ---")
    display(results_df)

    # # Example generation
    # print("\n--- Example Generations ---")
    # example_df_data = {
    #     "Question": questions[:3],
    #     "Ground Truth": [gt[0] for gt in ground_truth_answers[:3]] # Unpack the list for cleaner display
    # }
    # # Use the new result_key for column headers
    # for result_key, answers in all_generated_answers.items():
    #     example_df_data[f"Answer: {result_key}"] = answers[:3]

    # example_df = pd.DataFrame(example_df_data)
    # display(example_df)

else:
    print("\nNo answers were generated. Skipping evaluation.")

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /usr/share/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...


Downloading builder script: 0.00B [00:00, ?B/s]


Calculating Evaluation Metrics

--- Evaluating alpha-ai/LLAMA3-3B-Medical-COT (Llama3_Direct_VI) ---


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]


--- Evaluating vilm/vietcuna-3b-v2 (Vietcuna_Direct_NoExplain_VI) ---

--- Evaluating arcee-ai/Arcee-VyLinh (VyLinh_ChatTemplate_VI) ---

--- Comparative Evaluation Results ---


Unnamed: 0,Model & Prompt Strategy,ROUGE-L,BLEU,METEOR,BERTScore-F1
0,alpha-ai/LLAMA3-3B-Medical-COT (Llama3_Direct_VI),1.0,1.0,0.9998,1.0
1,vilm/vietcuna-3b-v2 (Vietcuna_Direct_NoExplain...,0.597,0.3305,0.873,0.892
2,arcee-ai/Arcee-VyLinh (VyLinh_ChatTemplate_VI),0.4255,0.1553,0.7473,0.7747
