In [None]:
!pip install -q transformers datasets accelerate bitsandbytes torch ragas evaluate rouge_score nltk

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from datasets import load_dataset, Dataset
from kaggle_secrets import UserSecretsClient
import random
import pandas as pd

from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    answer_similarity,
    answer_correctness,
)
from langchain_openai import ChatOpenAI
import os

In [None]:
from evaluate import load
import nltk

# Load the text-overlap metrics. Each metric is loaded independently so that a
# failure in one of them does not discard the other; `rouge` / `meteor` are set
# to None individually when their own load fails.
def load_metric_safely(metric_id, label):
    """Load a Hugging Face `evaluate` metric, returning None instead of raising.

    Args:
        metric_id: Identifier passed to `load` (e.g. 'rouge').
        label: Human-readable metric name used in the progress messages.

    Returns:
        The loaded metric object, or None if loading failed for any reason.
    """
    print(f"Loading {label} metric...")
    try:
        metric = load(metric_id)
    except Exception as e:
        print(f"\n[ERROR] Failed to load Hugging Face 'evaluate' metric '{metric_id}'.")
        print("        Please check your notebook's internet connection and ensure all libraries are installed.")
        print(f"        Underlying error: {e}")
        return None
    print(f"[SUCCESS] {label} metric loaded.")
    return metric


rouge = load_metric_safely('rouge', 'ROUGE')

# METEOR requires the 'wordnet' corpus, so fetch it before loading the metric.
# nltk.download() reports failure through its boolean return value rather than
# by raising, so the result is checked instead of assuming success.
print("\nDownloading 'wordnet' for METEOR...")
try:
    wordnet_downloaded = nltk.download('wordnet')
except Exception as e:
    print(f"[ERROR] NLTK 'wordnet' download raised an error: {e}")
    wordnet_downloaded = False

if wordnet_downloaded:
    print("[SUCCESS] NLTK 'wordnet' downloaded.")
else:
    print("[WARNING] NLTK 'wordnet' could not be downloaded; METEOR may fail to load or compute.")

print()
meteor = load_metric_safely('meteor', 'METEOR')

In [None]:
# Step 2: Authenticate with Hugging Face and OpenAI
# Both credentials are read from Kaggle Secrets; nothing is hard-coded here.

# Create the secrets client once, in its own guarded step, so that a failure
# here is reported as such instead of surfacing later as an unrelated NameError.
try:
    user_secrets = UserSecretsClient()
except Exception as e:
    print(f"Could not create the Kaggle secrets client. Underlying error: {e}")
    user_secrets = None


def read_kaggle_secret(secret_name, description):
    """Read one secret from the Kaggle secrets client.

    Args:
        secret_name: Name of the Kaggle secret to look up.
        description: Human-readable description used in the failure message.

    Returns:
        The secret's value, or None if the client is unavailable or the lookup
        raised. The underlying error is printed to help diagnosis.
    """
    try:
        if user_secrets is None:
            raise RuntimeError("Kaggle secrets client is unavailable")
        return user_secrets.get_secret(secret_name)
    except Exception as e:
        print(f"Could not retrieve {description}. Please ensure it is stored as a Kaggle secret named '{secret_name}'.")
        print(f"Underlying error: {e}")
        return None


# Hugging Face Token for the generation model
hf_token = read_kaggle_secret("HUGGING_FACE_TOKEN", "Hugging Face token")

# OpenAI API Key for the Ragas evaluation model; exported through the
# environment only when it was actually retrieved.
openai_api_key = read_kaggle_secret("OPENAI_API_KEY", "OpenAI API Key")
if openai_api_key is not None:
    os.environ["OPENAI_API_KEY"] = openai_api_key

In [None]:
# Step 3: Hugging Face Hub identifiers for the checkpoint and the benchmark data
model_id = "manhtt-079/vipubmed-deberta-xsmall"
dataset_id = "tmnam20/ViMedAQA"

# Step 4: Fetch the "train" split of ViMedAQA (Vietnamese medical questions and
# answers). `dataset` stays None unless the split loads AND its first record
# can be printed; any failure is reported and leaves `dataset` as None.
dataset = None
try:
    train_split = load_dataset(dataset_id, split="train")
    print("Dataset loaded successfully!")
    print("Example from the dataset:")
    print(train_split[0])
    dataset = train_split
except Exception as load_error:
    print(f"Failed to load the dataset. Error: {load_error}")

# Step 5: Load the Model and Tokenizer
# The checkpoint is loaded with a question-answering head via
# AutoModelForQuestionAnswering. No quantization arguments are passed, so the
# weights load at the library's default precision.
# NOTE(review): hf_token only gates this step; it is not forwarded to
# from_pretrained(). Confirm whether the checkpoint actually needs authentication.
if hf_token:
    try:
        # The notebook's import cell brings in AutoModelForCausalLM but not
        # AutoModelForQuestionAnswering, so import it here. Without this import
        # the resulting NameError is swallowed by the except below and the
        # model is never loaded.
        from transformers import AutoModelForQuestionAnswering

        tokenizer = AutoTokenizer.from_pretrained(model_id)
        model = AutoModelForQuestionAnswering.from_pretrained(model_id)
        print("Extractive QA model and tokenizer loaded successfully!")

    except Exception as e:
        print(f"Failed to load the model or tokenizer. Error: {e}")
        model = None
        tokenizer = None
else:
    print("Hugging Face token not available. Cannot load the model.")
    model = None
    tokenizer = None

In [None]:
# Step 6: Set up the Question Answering Pipeline and Extract Answers
# Builds an extractive QA pipeline, draws a few random samples from the dataset,
# and collects question / context / reference / predicted answer lists that the
# later evaluation cells read.
if model and tokenizer and dataset:
    # Run on the first GPU when one is available; otherwise fall back to the
    # CPU (-1) instead of failing on a hard-coded device index.
    pipeline_device = 0 if torch.cuda.is_available() else -1
    qa_pipeline = pipeline(
        "question-answering",
        model=model,
        tokenizer=tokenizer,
        device=pipeline_device
    )
    print("Question Answering pipeline is ready.")

    # Select samples for evaluation. The count is clamped to the dataset size
    # because random.sample raises ValueError when asked for more items than
    # the population contains.
    num_samples_to_evaluate = min(3, len(dataset))
    random_indices = random.sample(range(len(dataset)), num_samples_to_evaluate)
    eval_dataset = dataset.select(random_indices)

    # How many per-sample previews to print while extracting
    num_examples_to_print = 3

    # These lists will store the data needed for evaluation
    ids_for_ragas = []
    questions_for_ragas = []
    contexts_for_ragas = []
    ground_truths_for_ragas = []
    generated_answers_for_ragas = []  # extracted spans; the name is kept for the downstream cells

    print(f"\nExtracting answers for {num_samples_to_evaluate} random samples...")

    # Loop through each sample and extract the answer
    for i, sample in enumerate(eval_dataset):
        question = sample['question']
        context = sample['context']

        # The QA pipeline takes a question and context directly and returns a
        # dict; its 'answer' and 'score' entries are used below.
        result = qa_pipeline(question=question, context=context)

        # Store the results for our evaluation frameworks
        ids_for_ragas.append(sample['question_idx'])
        questions_for_ragas.append(question)
        contexts_for_ragas.append([context])  # Ragas expects a list of contexts per sample
        ground_truths_for_ragas.append(sample['answer'])
        generated_answers_for_ragas.append(result['answer'])

        # Print a few examples to see the model's performance
        if i < num_examples_to_print:
            print(f"\n--- Sample {i+1}/{num_samples_to_evaluate} ---")
            print(f"Sample ID: {sample['question_idx']}")
            print(f"Question: {question}")
            # print(f"Context: {context}") # Optional: uncomment to see the full context
            print(f"Model Answer (Extracted): {result['answer']}")
            print(f"Confidence Score: {result['score']:.4f}")
            print(f"Ground Truth: {sample['answer']}")
            print("-" * 50)

    print("\n--- Model Extraction Complete ---")

else:
    print("\nSkipping extraction due to issues with model/tokenizer/dataset loading.")

In [None]:
# Step 8.5: Calculate ROUGE and METEOR scores, with pre-flight checks
# Verifies that the prediction/reference lists and both metric objects exist,
# then prints per-sample and aggregate ROUGE-1/2/L and METEOR scores.

def tokenize_vietnamese(text):
    """Lowercase `text` and split it on whitespace.

    Used as a custom ROUGE tokenizer. The default ROUGE tokenizer keeps only
    ASCII letters and digits, which breaks Vietnamese words containing
    diacritics into fragments and distorts the n-gram overlap.

    Args:
        text: The string to tokenize.

    Returns:
        A list of lowercase whitespace-delimited tokens.
    """
    return text.lower().split()


print("\n--- [DEBUG] Pre-calculation Check for ROUGE/METEOR ---")

# --- Debugging Checks ---
# 1. Check if the necessary lists from the extraction step exist
generated_answers_exist = 'generated_answers_for_ragas' in locals()
ground_truths_exist = 'ground_truths_for_ragas' in locals()
print(f"[CHECK] 'generated_answers_for_ragas' list exists: {generated_answers_exist}")
print(f"[CHECK] 'ground_truths_for_ragas' list exists:   {ground_truths_exist}")

# 2. Check if the metric objects were loaded correctly
rouge_loaded = 'rouge' in locals() and rouge is not None
meteor_loaded = 'meteor' in locals() and meteor is not None
print(f"[CHECK] ROUGE metric object loaded:  {rouge_loaded}")
print(f"[CHECK] METEOR metric object loaded: {meteor_loaded}")

# 3. Perform the final check to decide whether to proceed
all_checks_passed = generated_answers_exist and ground_truths_exist and rouge_loaded and meteor_loaded

if all_checks_passed:
    # --- Additional checks on the data itself ---
    predictions = generated_answers_for_ragas
    references = ground_truths_for_ragas

    if len(predictions) > 0 and len(predictions) == len(references):
        print("\n[SUCCESS] All checks passed. Starting metric calculation...")
        print("-" * 50)

        try:
            # Aggregate scores over all samples. ROUGE gets the whitespace
            # tokenizer so Vietnamese words are compared intact.
            rouge_results = rouge.compute(
                predictions=predictions,
                references=references,
                tokenizer=tokenize_vietnamese,
            )
            meteor_results = meteor.compute(predictions=predictions, references=references)

            # --- Display Individual Scores (for detailed analysis) ---
            print("\n--- Individual Sample Scores ---")
            for i in range(len(predictions)):
                single_rouge = rouge.compute(
                    predictions=[predictions[i]],
                    references=[references[i]],
                    tokenizer=tokenize_vietnamese,
                )
                single_meteor = meteor.compute(predictions=[predictions[i]], references=[references[i]])

                print(f"\n--- Sample {i+1} ('{ids_for_ragas[i]}') ---")
                print(f"Model Answer: {predictions[i]}")
                print(f"Ground Truth: {references[i]}")
                print(f"  ROUGE-1: {single_rouge['rouge1']:.4f}")
                print(f"  ROUGE-2: {single_rouge['rouge2']:.4f}")
                print(f"  ROUGE-L: {single_rouge['rougeL']:.4f}")
                print(f"  METEOR:  {single_meteor['meteor']:.4f}")
                print("-" * 30)

            # --- Display Average Scores (for overall benchmark) ---
            print("\n--- Average Scores Across All Samples ---")
            print(f"Average ROUGE-1: {rouge_results['rouge1']:.4f}")
            print(f"Average ROUGE-2: {rouge_results['rouge2']:.4f}")
            print(f"Average ROUGE-L: {rouge_results['rougeL']:.4f}")
            print(f"Average METEOR:  {meteor_results['meteor']:.4f}")

        except Exception as e:
            print(f"\n[ERROR] An unexpected error occurred during metric calculation: {e}")

    else:
        # This handles cases where extraction ran but produced no output
        print("\n[ERROR] Calculation skipped. The prediction/reference lists are empty or have mismatched lengths.")
        print(f"        Number of predictions: {len(predictions)}")
        print(f"        Number of references:  {len(references)}")

else:
    # This block runs if the initial checks fail, providing specific guidance
    print("\n[ERROR] Calculation skipped due to failed checks. Please review the messages above.")
    if not generated_answers_exist or not ground_truths_exist:
        print(" -> FIX: Ensure the previous cell (Step 8: Generate Answers) ran without errors and successfully created the 'generated_answers_for_ragas' and 'ground_truths_for_ragas' lists.")
    if not rouge_loaded or not meteor_loaded:
        print(" -> FIX: Scroll up to the second code cell (where libraries are imported) and check for any error messages when loading the metrics. Ensure your Kaggle notebook has internet enabled.")

# Debugging a single data point

In [None]:
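# NOTE: This entire cell is disabled (every line is commented out). It builds a
# "text-generation" pipeline with a chat-style prompt, whereas the model loaded
# earlier in this notebook is an extractive question-answering model, so it
# would need adapting before being re-enabled.
# # Step 6 & 7: Isolate and Prepare a SINGLE Sample for Debugging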

# # ---------------------------------------------------------------------------------
# # USER ACTION: CHANGE THIS ID to the specific sample you want to debug.
# TARGET_SAMPLE_ID = "body-part_1140" 
# # ---------------------------------------------------------------------------------

# if model and tokenizer and dataset:
#     # Find the specific sample in the dataset
#     target_sample = None
#     for sample in dataset:
#         if sample['question_idx'] == TARGET_SAMPLE_ID:
#             target_sample = sample
#             break
            
#     if target_sample:
#         print(f"Found sample with ID: {TARGET_SAMPLE_ID}")
        
#         # Create a new mini-dataset containing only our target sample
#         # Ragas and other functions expect a Dataset object, so we build one.
#         eval_data_dict = {key: [value] for key, value in target_sample.items()}
#         eval_dataset = Dataset.from_dict(eval_data_dict)
        
#         # --- The rest of the pipeline now runs on this single sample ---
        
#         text_generator = pipeline(
#             "text-generation", model=model, tokenizer=tokenizer,
#             torch_dtype=torch.bfloat16, device_map="auto"
#         )
#         print("Text generation pipeline is ready.")

#         # Prepare prompts and data lists (now they will only have one item)
#         prompts, ids_for_ragas, questions_for_ragas, contexts_for_ragas, ground_truths_for_ragas = [], [], [], [], []
#         for sample in eval_dataset:
#             ids_for_ragas.append(sample['question_idx'])
#             questions_for_ragas.append(sample['question'])
#             contexts_for_ragas.append([sample['context']])
#             ground_truths_for_ragas.append(sample['answer'])
#             prompt_template = f"""<|begin_of_text|><|start_header_id|>user<|end_header_id|>

#             Based on the context below, answer the question. 
            
#             **Rules:**
#             1. You MUST extract the answer directly from the context.
#             2. The answer must be the exact, continuous text from the context.
#             3. DO NOT add extra words or form a full sentence.
            
#             **Example:**
#             - Context: "Đến năm 1327, đây là thị trấn lớn thứ ba tại Warwickshire."
#             - Question: "Vào thế kỉ XIV, Birmingham trở thành thị trấn lớn thứ mấy tại Warwickshire?"
#             - Correct Answer: "lớn thứ ba"

#             **Now, perform the task with the following:**
            
#             Context: {sample['context']}
            
#             Question: {sample['question']}

#             <|eot_id|><|start_header_id|>assistant<|end_header_id|>"""
#             prompts.append(prompt_template)

#         print(f"\nPreparing to generate an answer for sample {TARGET_SAMPLE_ID}...")

#         # Step 8: Generate the single answer
#         try:
#             generated_output = text_generator(
#                 # prompts, max_new_tokens=256, do_sample=True, temperature=0.1, top_p=0.9,
#                 prompts, max_new_tokens=256, do_sample=False, temperature=0.0, # Temperature 0.0 for deterministic extraction
#                 eos_token_id=tokenizer.eos_token_id, padding=True, truncation=True
#             )[0] # Get the first and only result

#             generated_answers_for_ragas = []
#             answer_start_tag = "<|start_header_id|>assistant<|end_header_id|>"
#             clean_answer = generated_output[0]['generated_text'].split(answer_start_tag)[-1].strip()
#             generated_answers_for_ragas.append(clean_answer)

#             print("\n--- Model Generation Complete ---")
#             print(f"Sample ID: {ids_for_ragas[0]}")
#             print(f"Question: {questions_for_ragas[0]}")
#             print(f"Model Answer: {clean_answer}")
#             print(f"Ground Truth: {ground_truths_for_ragas[0]}")
            
#         except Exception as e:
#             print(f"An error occurred during text generation: {e}")
            
#     else:
#         print(f"ERROR: Could not find any sample with ID '{TARGET_SAMPLE_ID}' in the dataset.")

# else:
#     print("\nSkipping generation due to issues with model, tokenizer, or dataset loading.")

In [None]:
# Step 9: Prepare the Dataset for Ragas Evaluation
# Ragas consumes a Hugging Face Dataset with these columns:
#   question      - the question asked
#   contexts      - a list of context strings per sample
#   answer        - the answer produced by the model
#   ground_truth  - the reference answer from the original dataset

if 'generated_answers_for_ragas' not in locals():
    # Nothing was produced by the extraction step, so there is nothing to wrap.
    print("Could not find generated answers. Skipping Ragas evaluation.")
    ragas_dataset = None

else:
    # Gather the collected lists under the column names Ragas expects
    ragas_data = dict(
        question=questions_for_ragas,
        contexts=contexts_for_ragas,
        answer=generated_answers_for_ragas,
        ground_truth=ground_truths_for_ragas,
    )

    # Wrap the columns in a Hugging Face Dataset
    ragas_dataset = Dataset.from_dict(ragas_data)

    print("Dataset prepared for Ragas evaluation.")
    print(ragas_dataset)

In [None]:
# Step 10: Run Ragas with a Robust, Per-Sample, Sequential Evaluation

# Disable debug mode for a cleaner output
import langchain
langchain.debug = False

# Import Python's built-in warnings module
import warnings
import pandas as pd
from tqdm.auto import tqdm # Import tqdm for a progress bar

def extract_sample_score(raw_score):
    """Normalize the score of a one-sample Ragas evaluation to a plain float.

    Indexing a Ragas result by metric name yields either a single aggregate
    number or a sequence of per-sample scores, depending on the installed
    Ragas version. Each evaluation here covers exactly one sample, so the
    first element of a sequence is the score of interest.

    Args:
        raw_score: A number, or a list/tuple of numbers.

    Returns:
        The score as a float; NaN when given an empty sequence.

    Raises:
        TypeError or ValueError if the value cannot be converted to float
        (e.g. None); the caller records such failures as NaN.
    """
    if isinstance(raw_score, (list, tuple)):
        return float(raw_score[0]) if raw_score else float('nan')
    return float(raw_score)


if 'generated_answers_for_ragas' in locals() and openai_api_key:
    # Prepare the dataset for Ragas
    ragas_data = {
        "question": questions_for_ragas,
        "contexts": contexts_for_ragas,
        "answer": generated_answers_for_ragas,
        "ground_truth": ground_truths_for_ragas
    }
    ragas_dataset = Dataset.from_dict(ragas_data)

    print("\nStarting robust, per-sample Ragas evaluation...")
    print("="*50)

    # Configure the judge LLM
    evaluation_llm = ChatOpenAI(model="gpt-4.1-nano")

    # Define the metrics we want to compute
    metrics_to_run = [
        faithfulness,
        answer_relevancy,
        answer_similarity,
        answer_correctness,
    ]

    # Each (sample, metric) pair is evaluated on its own so that one failure
    # costs a single NaN cell instead of aborting the whole run.
    all_sample_results = []
    for sample in tqdm(ragas_dataset, desc="Evaluating Samples"):
        # Create a mini-dataset with just the current sample
        single_sample_dataset = Dataset.from_dict({k: [v] for k, v in sample.items()})

        # Dictionary to store all scores for the current sample
        sample_scores = {"question": sample["question"]}

        for metric in metrics_to_run:
            metric_name = metric.name
            try:
                # Run evaluation for only ONE metric on the ONE sample
                result = evaluate(
                    dataset=single_sample_dataset,
                    metrics=[metric],
                    llm=evaluation_llm
                )
                # Store the score as a plain float regardless of the result's shape
                sample_scores[metric_name] = extract_sample_score(result[metric_name])
            except Exception as e:
                # If a metric fails, record it as NaN and continue
                print(f"  WARNING: Metric '{metric_name}' failed for question '{sample['question'][:50]}...'. Recording as NaN. Error: {e}")
                sample_scores[metric_name] = float('nan')

        all_sample_results.append(sample_scores)

    print("\n" + "="*50)
    print("Ragas per-sample evaluation complete!")
    print("="*50 + "\n")

    # --- Display the results in a clean DataFrame ---
    # Column order follows the metrics that were actually run; reindex also
    # adds any column that is missing (filled with NaN), e.g. when no sample
    # was evaluated at all.
    column_order = ['question'] + [metric.name for metric in metrics_to_run]
    results_df = pd.DataFrame(all_sample_results).reindex(columns=column_order)

    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=RuntimeWarning)
        display(results_df)

    # --- Optional: Print the final average scores ---
    print("\n--- Average Ragas Scores ---")
    average_scores = results_df.mean(numeric_only=True)
    print(average_scores)


else:
    print("Skipping Ragas evaluation. Check if generation was successful and if the OpenAI API Key is configured.")