In [None]:
"""
This script conducts a comparative evaluation of a Retrieval-Augmented Generation 
(RAG) system against a baseline Large Language Model (LLM). It assesses their 
performance on two distinct general knowledge question-answering datasets: 
TriviaQA and TruthfulQA.

The script queries both the RAG system and the base LLM with a subset of 
questions from each dataset, records their answers and response times, and saves
the comprehensive results into two separate CSV files for subsequent analysis.
"""

In [10]:
# ==================================================================================================
# --- 1. Import Dependencies ---
# ==================================================================================================
import pandas as pd
import time
import ollama
import logging
import sys
from pathlib import Path
from types import SimpleNamespace
from datasets import load_dataset

# --- Define Project Root and Add to Python Path ---
# This allows the script to find and import the 'core' package.
# --- Define Project Root and Add to Python Path ---
project_code_root = Path.cwd().parent.resolve()
project_root = Path.cwd().parent.parent.resolve()

if str(project_code_root) not in sys.path:
        sys.path.append(str(project_code_root))
    
from core.rag_setup import setup_chatbot

In [11]:
print(project_code_root)
print(project_root)

C:\Master's_Thesis\code
C:\Master's_Thesis


In [12]:
# ==================================================================================================
# --- 2. Logging Configuration ---
# ==================================================================================================
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - [%(levelname)s] - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)


In [13]:
# ==================================================================================================
# --- 3. Core Evaluation Functions ---
# ==================================================================================================

def query_base_llm(question: str, model_name: str) -> str:
    """Sends a question directly to a specified baseline LLM via Ollama."""
    try:
        response = ollama.chat(
            model=model_name,
            messages=[{'role': 'user', 'content': question}]
        )
        return response['message']['content']
    except Exception as e:
        logging.error(f"Error querying base LLM '{model_name}': {e}")
        return f"Error: {e}"

def evaluate_on_triviaqa(rag_chain, base_llm_name: str, dataset, output_path: Path):
    """Runs the evaluation loop specifically for the TriviaQA dataset."""
    results = []
    total_questions = len(dataset)
    logging.info(f"--- Starting Evaluation on TriviaQA ({total_questions} questions) ---")

    for i, item in enumerate(dataset):
        question = item['question']
        ground_truth_answers = item['answer']['aliases']

        # --- Query RAG System ---
        start_time = time.time()
        rag_response = rag_chain.invoke(question)
        rag_time = time.time() - start_time
        rag_answer = rag_response.get('answer', 'No answer found.')

        # --- Query Base LLM ---
        start_time = time.time()
        base_llm_answer = query_base_llm(question, base_llm_name)
        base_llm_time = time.time() - start_time
        
        results.append({
            "question": question,
            "ground_truth_answers": ", ".join(ground_truth_answers),
            "rag_answer": rag_answer,
            "base_llm_answer": base_llm_answer,
            "rag_response_time_seconds": rag_time,
            "base_llm_response_time_seconds": base_llm_time
        })
        logging.info(f"Processed TriviaQA question {i + 1}/{total_questions}")

    df = pd.DataFrame(results)
    df.to_csv(output_path, index=False, encoding="utf-8-sig")
    logging.info(f"TriviaQA evaluation complete. Results saved to '{output_path}'")

def evaluate_on_truthfulqa(rag_chain, base_llm_name: str, dataset, output_path: Path):
    """Runs the evaluation loop specifically for the TruthfulQA dataset."""
    results = []
    total_questions = len(dataset)
    logging.info(f"--- Starting Evaluation on TruthfulQA ({total_questions} questions) ---")

    for i, item in enumerate(dataset):
        question = item['question']
        correct_answers = item['correct_answers']
        incorrect_answers = item['incorrect_answers']

        # --- Query RAG System ---
        start_time = time.time()
        rag_response = rag_chain.invoke(question)
        rag_time = time.time() - start_time
        rag_answer = rag_response.get('answer', 'No answer found.')

        # --- Query Base LLM ---
        start_time = time.time()
        base_llm_answer = query_base_llm(question, base_llm_name)
        base_llm_time = time.time() - start_time
        
        results.append({
            "question": question,
            "correct_answers": ", ".join(correct_answers),
            "incorrect_answers": ", ".join(incorrect_answers),
            "rag_answer": rag_answer,
            "base_llm_answer": base_llm_answer,
            "rag_response_time_seconds": rag_time,
            "base_llm_response_time_seconds": base_llm_time
        })
        logging.info(f"Processed TruthfulQA question {i + 1}/{total_questions}")

    df = pd.DataFrame(results)
    df.to_csv(output_path, index=False, encoding="utf-8-sig")
    logging.info(f"TruthfulQA evaluation complete. Results saved to '{output_path}'")

In [14]:
# ==================================================================================================
# --- 4. Main Execution Block ---
# ==================================================================================================

def main():
    """Main function to orchestrate the general knowledge evaluation."""
    total_start_time = time.time()

    # --- Evaluation-Specific Configuration ---
    eval_config = SimpleNamespace(
        BASE_DIR=project_root,
        RAW_DOC_FOLDER=project_root / "data" / "raw_documents",
        PROCESSED_DOC_FOLDER=project_root / "data" / "processed_output",
        PERSIST_DIRECTORY=project_root / "chroma_db",
        PREPROCESSING_SCRIPT_NAME=project_root / "code" / "core" / "pre-processing.py",
        MODEL_NAME="MLME_chatbot",
        EMBEDDING_MODEL="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
        RERANKER_MODEL="mixedbread-ai/mxbai-rerank-xsmall-v1",
        OLLAMA_METADATA_MODEL="gemma3:4b",
        VECTOR_STORE_NAME="rag_store_MLME",
        CHUNK_SIZE=1250,
        OVERLAP_SIZE=250,
        INITIAL_RETRIEVAL_K=6,
        TOP_N_RERANKED=3,
        METADATA_GENERATION_CHAR_LIMIT=40000
    )
    
    # --- General Knowledge Evaluation Settings ---
    BASE_MODEL_NAME = 'gemma3:4b'
    DATASET_SUBSET_SIZE = 100
    OUTPUT_DIR = project_root / "evaluation_results" / "general_knowledge"
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    TRIVIAQA_OUTPUT_FILE = OUTPUT_DIR / "triviaqa_evaluation.csv"
    TRUTHFULQA_OUTPUT_FILE = OUTPUT_DIR / "truthfulqa_evaluation.csv"

    try:
        # --- Initialization ---
        logging.info("Initializing RAG system for general knowledge evaluation...")
        rag_chain, _, _ = setup_chatbot(eval_config)
        logging.info("RAG system initialization complete.")

        # --- Dataset Loading ---
        logging.info(f"Loading first {DATASET_SUBSET_SIZE} questions from datasets...")
        trivia_qa_dataset = load_dataset("trivia_qa", "rc.nocontext", split=f"validation[:{DATASET_SUBSET_SIZE}]", trust_remote_code=True)
        truthful_qa_dataset = load_dataset("truthful_qa", "generation", split=f"validation[:{DATASET_SUBSET_SIZE}]")
        logging.info("Datasets loaded.")

        # --- Run Evaluations ---
        evaluate_on_triviaqa(rag_chain, BASE_MODEL_NAME, trivia_qa_dataset, TRIVIAQA_OUTPUT_FILE)
        evaluate_on_truthfulqa(rag_chain, BASE_MODEL_NAME, truthful_qa_dataset, TRUTHFULQA_OUTPUT_FILE)

        total_end_time = time.time()
        logging.info(f"All evaluations finished successfully in {total_end_time - total_start_time:.2f} seconds.")

    except Exception as e:
        logging.error(f"An unexpected error occurred during the evaluation: {e}")

In [None]:
if __name__ == "__main__":
    main()

2025-08-19 19:50:01 - [INFO] - Initializing RAG system for general knowledge evaluation...
2025-08-19 19:50:01 - [INFO] - --- Starting Full RAG Chatbot System Setup ---
2025-08-19 19:50:01 - [INFO] - Executing external pre-processing script: C:\Master's_Thesis\code\core\pre-processing.py...
2025-08-19 19:50:04 - [INFO] - Pre-processing script completed successfully.
2025-08-19 19:50:04,767 - [INFO] - Successfully connected to Ollama and found model 'gemma3:4b'.
2025-08-19 19:50:04,768 - [INFO] - Output for 'CRISP_Paper.pdf' already exists. Skipping.
2025-08-19 19:50:04,768 - [INFO] - Output for 'Ethics_Discrimination_algo_bias.pdf' already exists. Skipping.
2025-08-19 19:50:04,768 - [INFO] - Output for 'Feedback Loops in Machine Learning_ A Study on the Interplay of.pdf' already exists. Skipping.
2025-08-19 19:50:04,768 - [INFO] - Output for 'lecture_01.pdf' already exists. Skipping.
2025-08-19 19:50:04,768 - [INFO] - Output for 'lecture_02.pdf' already exists. Skipping.
2025-08-19 19:

Resolving data files:   0%|          | 0/26 [00:00<?, ?it/s]

2025-08-19 19:50:29 - [INFO] - Datasets loaded.
2025-08-19 19:50:29 - [INFO] - --- Starting Evaluation on TriviaQA (100 questions) ---


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-08-19 19:50:47 - [INFO] - HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
2025-08-19 19:51:10 - [INFO] - HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
2025-08-19 19:51:10 - [INFO] - Processed TriviaQA question 1/100
2025-08-19 19:51:10 - [INFO] - HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-08-19 19:51:13 - [INFO] - HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
2025-08-19 19:51:16 - [INFO] - HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
2025-08-19 19:51:16 - [INFO] - Processed TriviaQA question 2/100
2025-08-19 19:51:17 - [INFO] - HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-08-19 19:51:20 - [INFO] - HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
2025-08-19 19:51:24 - [INFO] - HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
2025-08-19 19:51:24 - [INFO] - Processed TriviaQA question 3/100
2025-08-19 19:51:24 - [INFO] - HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-08-19 19:51:28 - [INFO] - HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
2025-08-19 19:51:32 - [INFO] - HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
2025-08-19 19:51:32 - [INFO] - Processed TriviaQA question 4/100
2025-08-19 19:51:32 - [INFO] - HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-08-19 19:51:35 - [INFO] - HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
