In [8]:
!pip install -q transformers datasets accelerate bitsandbytes torch ragas

In [9]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from datasets import load_dataset, Dataset
from kaggle_secrets import UserSecretsClient
import random
import pandas as pd

from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    answer_similarity,
    answer_correctness,
)
from langchain_openai import ChatOpenAI
import os

In [10]:
# Step 2: Authenticate with Hugging Face and OpenAI
# We retrieve the tokens you stored in Kaggle Secrets.

# Hugging Face Token for the generation model
try:
    user_secrets = UserSecretsClient()
    hf_token = user_secrets.get_secret("HUGGING_FACE_TOKEN")
except Exception as e:
    print("Could not retrieve Hugging Face token. Please ensure it is stored as a Kaggle secret named 'HUGGING_FACE_TOKEN'.")
    hf_token = None

# OpenAI API Key for the Ragas evaluation model
try:
    openai_api_key = user_secrets.get_secret("OPENAI_API_KEY")
    os.environ["OPENAI_API_KEY"] = openai_api_key
    # print("OpenAI API Key configured successfully.")
except Exception as e:
    print("Could not retrieve OpenAI API Key. Please ensure it is stored as a Kaggle secret named 'OPENAI_API_KEY'.")
    openai_api_key = None

In [11]:
# Step 3: Define Model and Dataset Identifiers
model_id = "alpha-ai/LLAMA3-3B-Medical-COT"
dataset_id = "tmnam20/ViMedAQA"

# Step 4: Load the Dataset
# We load the ViMedAQA dataset from Hugging Face.
# This dataset contains Vietnamese medical questions and answers.
try:
    dataset = load_dataset(dataset_id, split="train")
    print("Dataset loaded successfully!")
    print("Example from the dataset:")
    print(dataset[0])
except Exception as e:
    print(f"Failed to load the dataset. Error: {e}")
    dataset = None

# Step 5: Load the Model and Tokenizer
# We will load the model in 4-bit precision (quantization) to save memory,
# which is highly recommended for running larger models on Kaggle's GPUs.
if hf_token:
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token)
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            token=hf_token,
            torch_dtype=torch.bfloat16,
            device_map="auto",
            quantization_config=None  # You can add a BitsAndBytesConfig here for quantization if needed
        )
        print("Model and tokenizer loaded successfully!")
    except Exception as e:
        print(f"Failed to load the model or tokenizer. Error: {e}")
        model = None
        tokenizer = None
else:
    print("Hugging Face token not available. Cannot load the model.")
    model = None
    tokenizer = None

Dataset loaded successfully!
Example from the dataset:
{'question_idx': 'drug_6073', 'question': 'Biviantac có thể điều trị trướng bụng, đầy hơi không?', 'answer': 'Có, Biviantac có thể điều trị các tình trạng như trướng bụng, đầy hơi, ợ nóng, ợ hơi hay ợ chua.', 'context': 'Thuốc Biviantac được chỉ định để điều trị các trường hợp do tăng tiết acid quá mức như: - Khó tiêu, nóng rát hay đau vùng thượng vị.\n- Trướng bụng, đầy hơi, ợ nóng, ợ hơi hay ợ chua.\n- Tăng độ acid, đau rát dạ dày.\n- Các rối loạn thường gặp trong những bệnh lý loét dạ dày tá tràng, thực quản.', 'title': 'Chỉ định của thuốc Biviantac', 'keyword': 'Biviantac', 'topic': 2, 'article_url': 'https://youmed.vn/tin-tuc/thuoc-biviantac-thuoc-dung-cho-cac-roi-loan-tieu-hoa/', 'author': 'Dược sĩ Trần Vân Thy', 'author_url': 'https://youmed.vn/tin-tuc/bac-si/duoc-si-tran-van-thy/'}
Model and tokenizer loaded successfully!


In [18]:
# Step 6: Set up a Text Generation Pipeline and Prepare for Evaluation
if model and tokenizer:
    # The pipeline simplifies the process of using the model for text generation.
    text_generator = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        torch_dtype=torch.bfloat16,
        device_map="auto",
    )
    print("Text generation pipeline is ready.")

    # Step 7: Select Samples, Format Prompts, and Collect Data for Ragas
    if dataset:
        num_samples_to_evaluate = 3 # Ragas can be slow, so let's start with a small, representative sample.

        # Generate N random indices to select samples from the dataset
        random_indices = random.sample(range(len(dataset)), num_samples_to_evaluate)
        eval_dataset = dataset.select(random_indices)

        prompts = []
        # These lists will store the data needed for Ragas evaluation
        ids_for_ragas = []
        questions_for_ragas = []
        contexts_for_ragas = []
        ground_truths_for_ragas = []

        for sample in eval_dataset:
            question_id = sample['question_idx']
            question = sample['question']
            context = sample['context']
            original_answer = sample['answer']

            # Store data for Ragas
            ids_for_ragas.append(question_id)
            questions_for_ragas.append(question)
            contexts_for_ragas.append([context]) # Ragas expects context to be a list of strings
            ground_truths_for_ragas.append(original_answer)

            # Format the prompt for the model
            prompt_template = f"""
            <|begin_of_text|><|start_header_id|>user<|end_header_id|>

            You are a helpful medical assistant. Based *only* on the context provided below, answer the question in Vietnamese.

            Context: {context}

            Question: {question}

            <|eot_id|><|start_header_id|>assistant<|end_header_id|>
            """
            prompts.append(prompt_template)

        print(f"\nPreparing to generate answers for {num_samples_to_evaluate} random samples...")

        # Step 8: Generate Answers for the entire batch
        try:
            generated_outputs_batch = text_generator(
                prompts,
                max_new_tokens=256,
                do_sample=True,
                temperature=0.01,
                top_p=0.9,
                eos_token_id=tokenizer.eos_token_id,
                padding=True,
                truncation=True
            )

            # Collect the generated answers for Ragas
            generated_answers_for_ragas = []
            answer_start_tag = "<|start_header_id|>assistant<|end_header_id|>"

            print("\n--- Model Generation Complete. Sample Outputs: ---")
            for i, output in enumerate(generated_outputs_batch):
                generated_text = output[0]['generated_text']
                clean_answer = generated_text.split(answer_start_tag)[-1].strip()
                generated_answers_for_ragas.append(clean_answer)

                # Print a few examples to see the model's performance
                if i < 3: # Print first 3 examples
                    print(f"\n--- Sample {i+1}/{num_samples_to_evaluate} ---")
                    print(f"Sample ID: {ids_for_ragas[i]}") 
                    print(f"Question: {questions_for_ragas[i]}")
                    print(f"Context: {contexts_for_ragas[i]}")
                    print(f"Model Answer: {clean_answer}")
                    print(f"Ground Truth: {ground_truths_for_ragas[i]}")
                    print("-" * 50)

        except Exception as e:
            print(f"An error occurred during text generation: {e}")
else:
    print("\nSkipping text generation due to issues with model/tokenizer loading.")

Device set to use cuda:0


Text generation pipeline is ready.

Preparing to generate answers for 3 random samples...

--- Model Generation Complete. Sample Outputs: ---

--- Sample 1/3 ---
Sample ID: disease_8483
Question: Khi nào thì thai to xảy ra nhiều nhất?
Context: ['- Tiền sử sinh con to: Thai to sẽ có có nguy cơ hơn khi trước đây bạn đã từng có đứa con trước đó với cân nặng lúc sanh trên 4000 gram.\n- Di truyền: Nếu bản thân cha mẹ trước đây cũng từng được chẩn đoán là con to lúc sanh, cũng gây nguy cơ sinh con to sau này.\n- Giới tính: Bé trai thường nặng hơn so với bé gái. Hầu hết các em bé sinh ra với cân nặng lớn đều là bé trai.\n- Tuổi mẹ: Khi mang thai trên 35 tuổi, có thể tăng khả năng sinh con to.\n- Mang thai quá ngày (Thai già tháng): Thai trong bụng mẹ vượt quá ngày dự sanh nhiều ngày có thể dẫn đến thai to hơn. Tuy nhiên, điều này lại ít gặp trên thực tế.']
Model Answer: Khi thai mẹ mang thai quá ngày (thai già tháng), thai to xảy ra nhiều nhất.
Ground Truth: Ít gặp trên thực tế.
-------------

# Debugging a single data point

In [19]:
# # Step 6 & 7: Isolate and Prepare a SINGLE Sample for Debugging

# # ---------------------------------------------------------------------------------
# # USER ACTION: CHANGE THIS ID to the specific sample you want to debug.
# TARGET_SAMPLE_ID = "body-part_1140" 
# # ---------------------------------------------------------------------------------

# if model and tokenizer and dataset:
#     # Find the specific sample in the dataset
#     target_sample = None
#     for sample in dataset:
#         if sample['question_idx'] == TARGET_SAMPLE_ID:
#             target_sample = sample
#             break
            
#     if target_sample:
#         print(f"Found sample with ID: {TARGET_SAMPLE_ID}")
        
#         # Create a new mini-dataset containing only our target sample
#         # Ragas and other functions expect a Dataset object, so we build one.
#         eval_data_dict = {key: [value] for key, value in target_sample.items()}
#         eval_dataset = Dataset.from_dict(eval_data_dict)
        
#         # --- The rest of the pipeline now runs on this single sample ---
        
#         text_generator = pipeline(
#             "text-generation", model=model, tokenizer=tokenizer,
#             torch_dtype=torch.bfloat16, device_map="auto"
#         )
#         print("Text generation pipeline is ready.")

#         # Prepare prompts and data lists (now they will only have one item)
#         prompts, ids_for_ragas, questions_for_ragas, contexts_for_ragas, ground_truths_for_ragas = [], [], [], [], []
#         for sample in eval_dataset:
#             ids_for_ragas.append(sample['question_idx'])
#             questions_for_ragas.append(sample['question'])
#             contexts_for_ragas.append([sample['context']])
#             ground_truths_for_ragas.append(sample['answer'])
#             prompt_template = f"""<|begin_of_text|><|start_header_id|>user<|end_header_id|>

#             Based on the context below, answer the question. 
            
#             **Rules:**
#             1. You MUST extract the answer directly from the context.
#             2. The answer must be the exact, continuous text from the context.
#             3. DO NOT add extra words or form a full sentence.
            
#             **Example:**
#             - Context: "Đến năm 1327, đây là thị trấn lớn thứ ba tại Warwickshire."
#             - Question: "Vào thế kỉ XIV, Birmingham trở thành thị trấn lớn thứ mấy tại Warwickshire?"
#             - Correct Answer: "lớn thứ ba"

#             **Now, perform the task with the following:**
            
#             Context: {sample['context']}
            
#             Question: {sample['question']}

#             <|eot_id|><|start_header_id|>assistant<|end_header_id|>"""
#             prompts.append(prompt_template)

#         print(f"\nPreparing to generate an answer for sample {TARGET_SAMPLE_ID}...")

#         # Step 8: Generate the single answer
#         try:
#             generated_output = text_generator(
#                 # prompts, max_new_tokens=256, do_sample=True, temperature=0.1, top_p=0.9,
#                 prompts, max_new_tokens=256, do_sample=False, temperature=0.0, # Temperature 0.0 for deterministic extraction
#                 eos_token_id=tokenizer.eos_token_id, padding=True, truncation=True
#             )[0] # Get the first and only result

#             generated_answers_for_ragas = []
#             answer_start_tag = "<|start_header_id|>assistant<|end_header_id|>"
#             clean_answer = generated_output[0]['generated_text'].split(answer_start_tag)[-1].strip()
#             generated_answers_for_ragas.append(clean_answer)

#             print("\n--- Model Generation Complete ---")
#             print(f"Sample ID: {ids_for_ragas[0]}")
#             print(f"Question: {questions_for_ragas[0]}")
#             print(f"Model Answer: {clean_answer}")
#             print(f"Ground Truth: {ground_truths_for_ragas[0]}")
            
#         except Exception as e:
#             print(f"An error occurred during text generation: {e}")
            
#     else:
#         print(f"ERROR: Could not find any sample with ID '{TARGET_SAMPLE_ID}' in the dataset.")

# else:
#     print("\nSkipping generation due to issues with model, tokenizer, or dataset loading.")

In [20]:
# Step 9: Prepare the Dataset for Ragas Evaluation
# Ragas expects a Hugging Face Dataset object with specific column names:
# - question: The question asked.
# - contexts: A list of context strings.
# - answer: The answer generated by the model.
# - ground_truth: The reference answer from the original dataset.

if 'generated_answers_for_ragas' in locals():
    # Create a dictionary with the collected data
    ragas_data = {
        "question": questions_for_ragas,
        "contexts": contexts_for_ragas,
        "answer": generated_answers_for_ragas,
        "ground_truth": ground_truths_for_ragas
    }

    # Convert the dictionary to a Hugging Face Dataset
    ragas_dataset = Dataset.from_dict(ragas_data)

    print("Dataset prepared for Ragas evaluation.")
    print(ragas_dataset)

else:
    print("Could not find generated answers. Skipping Ragas evaluation.")
    ragas_dataset = None

Dataset prepared for Ragas evaluation.
Dataset({
    features: ['question', 'contexts', 'answer', 'ground_truth'],
    num_rows: 3
})


In [29]:
# Step 10: Run Ragas with a Robust, Per-Sample, Sequential Evaluation

# Disable debug mode for a cleaner output
import langchain
langchain.debug = False

# Import Python's built-in warnings module
import warnings
import pandas as pd
from tqdm.auto import tqdm # Import tqdm for a progress bar

if 'generated_answers_for_ragas' in locals() and openai_api_key:
    # Prepare the dataset for Ragas
    ragas_data = {
        "question": questions_for_ragas,
        "contexts": contexts_for_ragas,
        "answer": generated_answers_for_ragas,
        "ground_truth": ground_truths_for_ragas
    }
    ragas_dataset = Dataset.from_dict(ragas_data)
    
    print("\nStarting robust, per-sample Ragas evaluation...")
    print("="*50)

    # Configure the judge LLM
    evaluation_llm = ChatOpenAI(model="gpt-4.1-nano")

    # Define the metrics we want to compute
    metrics_to_run = [
        faithfulness,
        answer_relevancy,
        answer_similarity,
        answer_correctness,
    ]
    
    # --- This is the robust loop ---
    all_sample_results = []
    # Loop 1: Iterate through each sample in the dataset
    for sample in tqdm(ragas_dataset, desc="Evaluating Samples"):
        # Create a mini-dataset with just the current sample
        single_sample_dataset = Dataset.from_dict({k: [v] for k, v in sample.items()})
        
        # Dictionary to store all scores for the current sample
        sample_scores = {"question": sample["question"]}
        
        # Loop 2: Evaluate each metric sequentially for the current sample
        for metric in metrics_to_run:
            metric_name = metric.name
            try:
                # Run evaluation for only ONE metric on the ONE sample
                result = evaluate(
                    dataset=single_sample_dataset,
                    metrics=[metric],
                    llm=evaluation_llm
                )
                # Store the successful score
                sample_scores[metric_name] = result[metric_name]
            except Exception as e:
                # If a metric fails, record it as NaN and continue
                print(f"  WARNING: Metric '{metric_name}' failed for question '{sample['question'][:50]}...'. Recording as NaN. Error: {e}")
                sample_scores[metric_name] = float('nan') # Explicitly set NaN on failure
            
        all_sample_results.append(sample_scores)


    print("\n" + "="*50)
    print("Ragas per-sample evaluation complete!")
    print("="*50 + "\n")

    # --- Display the results in a clean DataFrame ---
    # This will now work correctly because all_sample_results is a list of dictionaries,
    # and we have handled any potential failures by explicitly setting NaN.
    results_df = pd.DataFrame(all_sample_results)
    
    # Reorder columns for better readability
    column_order = ['question', 'faithfulness', 'answer_relevancy', 'answer_similarity', 'answer_correctness']
    # Ensure all expected columns exist, adding them with NaN if they are missing
    for col in column_order:
        if col not in results_df.columns:
            results_df[col] = float('nan')
            
    results_df = results_df[column_order]

    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=RuntimeWarning)
        display(results_df)

    # --- Optional: Print the final average scores ---
    print("\n--- Average Ragas Scores ---")
    average_scores = results_df.mean(numeric_only=True)
    print(average_scores)


else:
    print("Skipping Ragas evaluation. Check if generation was successful and if the OpenAI API Key is configured.")


Starting robust, per-sample Ragas evaluation...


Evaluating Samples:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]


Ragas per-sample evaluation complete!



Unnamed: 0,question,faithfulness,answer_relevancy,answer_similarity,answer_correctness
0,Khi nào thì thai to xảy ra nhiều nhất?,[0.0],[0.7350011575223752],[0.7898303282982233],[0.19745758207455583]
1,Viêm thực quản là tình trạng như thế nào?,[0.75],[0.93895725709794],[0.957778699510968],[0.5394446748777421]
2,Thành phần chính của Chronol là gì?,[1.0],[0.8857837882408912],[0.8852616106426497],[0.22131540266066244]



--- Average Ragas Scores ---
Series([], dtype: float64)
