In [None]:
"""
This script conducts a comparative evaluation of a Retrieval-Augmented Generation 
(RAG) system against a baseline Large Language Model (LLM). It assesses their 
performance on two distinct general knowledge question-answering datasets: 
TriviaQA and TruthfulQA.

The script sends the same subset of questions from each dataset to both the RAG
system and the base LLM, records each system's answer and response time, and
saves the results to two CSV files (one per dataset) for subsequent analysis.
"""

In [2]:
# ==================================================================================================
# --- 1. Import Dependencies ---
# ==================================================================================================
import pandas as pd
import time
import ollama
import logging
import sys
from pathlib import Path
from types import SimpleNamespace
from datasets import load_dataset


# --- Locate Project Directories (relative to the current working directory) ---
# One level above the working directory is added to sys.path below so that the
# `core` package can be imported; two levels above is used as the project root.
project_code_root = Path.cwd().parent.resolve()
project_root = Path.cwd().parent.parent.resolve()

# Register the code directory with the import system once; re-running this
# cell does not add a duplicate entry.
if sys.path.count(str(project_code_root)) == 0:
    sys.path.append(str(project_code_root))
    
from core.rag_setup import setup_chatbot
from core import config

In [3]:
# ==================================================================================================
# --- 2. Logging Configuration ---
# ==================================================================================================
# Configure the root logger: INFO and above, each record prefixed with a
# second-resolution timestamp and its level name in brackets.
logging.basicConfig(
    datefmt='%Y-%m-%d %H:%M:%S',
    format='%(asctime)s - [%(levelname)s] - %(message)s',
    level=logging.INFO,
)


In [4]:
# ==================================================================================================
# --- 3. Core Evaluation Functions ---
# ==================================================================================================

def query_base_llm(question: str, model_name: str) -> str:
    """Ask the baseline model a question directly through Ollama (no retrieval).

    Args:
        question: Text sent as the single user message of the chat.
        model_name: Model identifier passed to ``ollama.chat``.

    Returns:
        The content of the model's reply. If the call raises, the failure is
        logged at ERROR level and the string ``"Error: <exception>"`` is
        returned instead of propagating the exception.
    """
    user_turn = {'role': 'user', 'content': question}
    try:
        # Plain chat request: the question goes in as-is, without added context.
        reply = ollama.chat(model=model_name, messages=[user_turn])
        answer = reply['message']['content']
    except Exception as exc:
        logging.error(f"Error querying base LLM '{model_name}': {exc}")
        answer = f"Error: {exc}"
    return answer

def evaluate_on_triviaqa(rag_chain, base_llm_name: str, dataset, output_path: Path):
    """Compare the RAG chain and the baseline LLM on TriviaQA questions.

    Every item's question is sent to both systems; the answers and the elapsed
    time of each call are collected and written to ``output_path`` as CSV
    (UTF-8 with BOM, no index column).

    Args:
        rag_chain: Object exposing ``invoke(question)``. The result is read with
            ``.get('answer', 'No answer found.')``, so a mapping is expected.
        base_llm_name: Model name forwarded to ``query_base_llm``.
        dataset: Sized iterable of items providing ``item['question']`` and
            ``item['answer']['aliases']`` (an iterable of strings).
        output_path: Destination CSV file; its parent directory must exist.

    Returns:
        None. The results are only written to disk.

    Notes:
        A failure of the RAG call is logged and stored as ``"Error: <exception>"``
        for that question, mirroring how ``query_base_llm`` reports baseline
        failures, so one bad question no longer discards the whole run.
    """
    # Fixed column order; also guarantees a header row when `dataset` is empty.
    columns = [
        "question",
        "ground_truth_answers",
        "rag_answer",
        "base_llm_answer",
        "rag_response_time_seconds",
        "base_llm_response_time_seconds",
    ]
    results = []
    total_questions = len(dataset)
    logging.info(f"--- Starting Evaluation on TriviaQA ({total_questions} questions) ---")

    for i, item in enumerate(dataset):
        question = item['question']
        # TriviaQA provides a list of accepted answers (aliases).
        ground_truth_answers = item['answer']['aliases']

        # --- Query RAG System ---
        # perf_counter() is monotonic, so the measured duration cannot be
        # distorted by system clock adjustments.
        start_time = time.perf_counter()
        try:
            rag_response = rag_chain.invoke(question)
        except Exception as e:
            logging.error(f"Error querying RAG system on TriviaQA question {i + 1}: {e}")
            rag_response = {'answer': f"Error: {e}"}
        rag_time = time.perf_counter() - start_time
        rag_answer = rag_response.get('answer', 'No answer found.')

        # --- Query Base LLM ---
        start_time = time.perf_counter()
        base_llm_answer = query_base_llm(question, base_llm_name)
        base_llm_time = time.perf_counter() - start_time

        results.append({
            "question": question,
            # NOTE(review): aliases that themselves contain ", " cannot be split
            # back apart from this joined string; separator kept for
            # compatibility with existing result files.
            "ground_truth_answers": ", ".join(ground_truth_answers),
            "rag_answer": rag_answer,
            "base_llm_answer": base_llm_answer,
            "rag_response_time_seconds": rag_time,
            "base_llm_response_time_seconds": base_llm_time
        })
        logging.info(f"Processed TriviaQA question {i + 1}/{total_questions}")

    # Build the frame once from the collected records and save it.
    df = pd.DataFrame(results, columns=columns)
    df.to_csv(output_path, index=False, encoding="utf-8-sig")
    logging.info(f"TriviaQA evaluation complete. Results saved to '{output_path}'")

def evaluate_on_truthfulqa(rag_chain, base_llm_name: str, dataset, output_path: Path):
    """Compare the RAG chain and the baseline LLM on TruthfulQA questions.

    Every item's question is sent to both systems; the answers and the elapsed
    time of each call are collected and written to ``output_path`` as CSV
    (UTF-8 with BOM, no index column).

    Args:
        rag_chain: Object exposing ``invoke(question)``. The result is read with
            ``.get('answer', 'No answer found.')``, so a mapping is expected.
        base_llm_name: Model name forwarded to ``query_base_llm``.
        dataset: Sized iterable of items providing ``item['question']``,
            ``item['correct_answers']`` and ``item['incorrect_answers']``
            (iterables of strings).
        output_path: Destination CSV file; its parent directory must exist.

    Returns:
        None. The results are only written to disk.

    Notes:
        A failure of the RAG call is logged and stored as ``"Error: <exception>"``
        for that question, mirroring how ``query_base_llm`` reports baseline
        failures, so one bad question no longer discards the whole run.
    """
    # Fixed column order; also guarantees a header row when `dataset` is empty.
    columns = [
        "question",
        "correct_answers",
        "incorrect_answers",
        "rag_answer",
        "base_llm_answer",
        "rag_response_time_seconds",
        "base_llm_response_time_seconds",
    ]
    results = []
    total_questions = len(dataset)
    logging.info(f"--- Starting Evaluation on TruthfulQA ({total_questions} questions) ---")

    for i, item in enumerate(dataset):
        question = item['question']
        # TruthfulQA provides distinct lists of correct and incorrect answers.
        correct_answers = item['correct_answers']
        incorrect_answers = item['incorrect_answers']

        # --- Query RAG System ---
        # perf_counter() is monotonic, so the measured duration cannot be
        # distorted by system clock adjustments.
        start_time = time.perf_counter()
        try:
            rag_response = rag_chain.invoke(question)
        except Exception as e:
            logging.error(f"Error querying RAG system on TruthfulQA question {i + 1}: {e}")
            rag_response = {'answer': f"Error: {e}"}
        rag_time = time.perf_counter() - start_time
        rag_answer = rag_response.get('answer', 'No answer found.')

        # --- Query Base LLM ---
        start_time = time.perf_counter()
        base_llm_answer = query_base_llm(question, base_llm_name)
        base_llm_time = time.perf_counter() - start_time

        results.append({
            "question": question,
            # NOTE(review): answers that themselves contain ", " cannot be split
            # back apart from these joined strings; separator kept for
            # compatibility with existing result files.
            "correct_answers": ", ".join(correct_answers),
            "incorrect_answers": ", ".join(incorrect_answers),
            "rag_answer": rag_answer,
            "base_llm_answer": base_llm_answer,
            "rag_response_time_seconds": rag_time,
            "base_llm_response_time_seconds": base_llm_time
        })
        logging.info(f"Processed TruthfulQA question {i + 1}/{total_questions}")

    # Build the frame once from the collected records and save it.
    df = pd.DataFrame(results, columns=columns)
    df.to_csv(output_path, index=False, encoding="utf-8-sig")
    logging.info(f"TruthfulQA evaluation complete. Results saved to '{output_path}'")

In [5]:
# ==================================================================================================
# --- 4. Main Execution Block ---
# ==================================================================================================

def main():
    """Run the RAG-vs-baseline evaluation on TriviaQA and TruthfulQA.

    Builds the RAG chain with ``setup_chatbot(config)``, loads the first
    ``DATASET_SUBSET_SIZE`` validation questions of each dataset, and writes one
    CSV per dataset into ``<project_root>/evaluation_results`` (created if
    missing).

    Any exception raised during setup, dataset loading or evaluation is logged
    together with its traceback and is not re-raised.
    """
    # Monotonic clock: the reported duration is unaffected by clock adjustments.
    total_start_time = time.perf_counter()

    # --- General Knowledge Evaluation Settings ---
    # Parameters specific to this evaluation: baseline model, subset size, outputs.
    BASE_MODEL_NAME = config.OLLAMA_BASE_MODEL
    DATASET_SUBSET_SIZE = 100
    OUTPUT_DIR = project_root / "evaluation_results"
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    TRIVIAQA_OUTPUT_FILE = OUTPUT_DIR / "triviaqa_generated_answers.csv"
    TRUTHFULQA_OUTPUT_FILE = OUTPUT_DIR / "truthfulqa_generated_answers.csv"

    try:
        # --- Initialization ---
        # setup_chatbot returns three values; only the first (the chain) is
        # used here, the other two are discarded.
        logging.info("Initializing RAG system for general knowledge evaluation...")
        rag_chain, _, _ = setup_chatbot(config)
        logging.info("RAG system initialization complete.")

        # --- Dataset Loading ---
        # The split expression limits each dataset to its first N validation rows.
        logging.info(f"Loading first {DATASET_SUBSET_SIZE} questions from datasets...")
        trivia_qa_dataset = load_dataset("trivia_qa", "rc.nocontext", split=f"validation[:{DATASET_SUBSET_SIZE}]")
        truthful_qa_dataset = load_dataset("truthful_qa", "generation", split=f"validation[:{DATASET_SUBSET_SIZE}]")
        logging.info("Datasets loaded.")

        # --- Run Evaluations ---
        # The two datasets are evaluated one after the other with the same chain.
        evaluate_on_triviaqa(rag_chain, BASE_MODEL_NAME, trivia_qa_dataset, TRIVIAQA_OUTPUT_FILE)
        evaluate_on_truthfulqa(rag_chain, BASE_MODEL_NAME, truthful_qa_dataset, TRUTHFULQA_OUTPUT_FILE)

        elapsed_seconds = time.perf_counter() - total_start_time
        logging.info(f"All evaluations finished successfully in {elapsed_seconds:.2f} seconds.")

    except Exception as e:
        # logging.exception logs at ERROR level and appends the traceback, so the
        # failing step can be identified from the log alone.
        logging.exception(f"An unexpected error occurred during the evaluation: {e}")

In [None]:
# Start the evaluation only when this code runs as the top-level module,
# not when it is imported from elsewhere.
if __name__ == "__main__":
    main()