## Create the dataset of generated RAG answers that is later used for evaluation

In [1]:
# ==================================================================================================
# --- 1. Import Dependencies ---
# ==================================================================================================
import pandas as pd
import json
import time
import sys
import logging
from pathlib import Path
from types import SimpleNamespace

# --- Locate the code root (parent of the working directory) and make it importable ---
project_code_root = Path.cwd().parent.resolve()

# Append only once so that re-running this cell does not grow sys.path.
if sys.path.count(str(project_code_root)) == 0:
    sys.path.append(str(project_code_root))
    
from core.rag_setup import setup_chatbot


In [2]:
# ==================================================================================================
# --- 2. Logging Configuration ---
# ==================================================================================================
# Configure the root logger: INFO and above, rendered as
# "<timestamp> - [<LEVEL>] - <message>" with second-resolution timestamps.
logging.basicConfig(
    format='%(asctime)s - [%(levelname)s] - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    level=logging.INFO,
)

In [3]:
# ==================================================================================================
# --- 3. Core Evaluation Functions ---
# ==================================================================================================

def run_evaluation_pipeline(chain, dataset: pd.DataFrame) -> pd.DataFrame:
    """
    Run every question of the dataset through the RAG chain and collect the results.

    Each row's 'question' and 'expected_answer' cells are expected to hold JSON
    strings whose payload is stored under the key of the same name. A row that
    fails at any stage (parsing, chain invocation, result extraction) still
    produces exactly one output row, flagged with an "ERROR: ..." generated
    answer, so the output stays aligned with the input.

    Args:
        chain: Object exposing ``invoke(question_text)``; the returned value is
            read via ``.get('answer')`` and ``.get('source_documents')``, and
            each source document must provide a ``page_content`` attribute.
        dataset (pd.DataFrame): The DataFrame containing the ground truth
            questions and answers.

    Returns:
        pd.DataFrame: One row per input row with the columns 'question',
        'ground_truth_answer', 'generated_answer', 'retrieved_context' and
        'response_time_seconds' (0 for failed rows). The columns are present
        even when the dataset is empty.
    """
    result_columns = [
        "question",
        "ground_truth_answer",
        "generated_answer",
        "retrieved_context",
        "response_time_seconds",
    ]
    results = []
    total_questions = len(dataset)
    logging.info("Starting evaluation for %d questions...", total_questions)

    # Progress is reported by position (1-based) rather than by index label, so
    # a non-integer or non-contiguous DataFrame index cannot break the loop.
    for position, (index, row) in enumerate(dataset.iterrows(), start=1):
        # Seed with the raw cell values: if parsing fails, the error row then
        # describes THIS row instead of reusing the previous row's values (or
        # hitting an unbound name on the very first row).
        question_text = row.get('question', '')
        ground_truth_answer_text = row.get('expected_answer', '')

        try:
            # The source CSV contains JSON strings; they must be parsed.
            question_text = json.loads(row['question'])['question']
            ground_truth_answer_text = json.loads(row['expected_answer'])['expected_answer']

            # --- Query RAG System and Measure Latency ---
            # perf_counter is monotonic, so the latency cannot be skewed by
            # system clock adjustments.
            start_time = time.perf_counter()
            response = chain.invoke(question_text)
            response_time = time.perf_counter() - start_time

            # --- Extract and Structure Data ---
            generated_answer = response.get('answer', 'Error: No answer found.')
            retrieved_context = " ".join(
                doc.page_content for doc in response.get('source_documents', [])
            )
        except Exception as e:
            logging.error("Failed to process question at index %s: %s", index, e)
            # Append a row indicating the error to maintain dataset alignment.
            results.append({
                "question": question_text,
                "ground_truth_answer": ground_truth_answer_text,
                "generated_answer": f"ERROR: {e}",
                "retrieved_context": "",
                "response_time_seconds": 0
            })
            continue

        # Success bookkeeping lives outside the try block so a failure here can
        # never add a second (error) row for a question already recorded.
        results.append({
            "question": question_text,
            "ground_truth_answer": ground_truth_answer_text,
            "generated_answer": generated_answer,
            "retrieved_context": retrieved_context,
            "response_time_seconds": response_time
        })
        logging.info(
            "Processed question %d/%d in %.2f seconds",
            position, total_questions, response_time,
        )

    return pd.DataFrame(results, columns=result_columns)

In [4]:
def main():
    """
    Drive one complete evaluation run.

    Builds the evaluation configuration, initializes the RAG chain through
    ``setup_chatbot``, loads the ground truth CSV, runs every question through
    ``run_evaluation_pipeline`` and writes the collected results to a CSV file.
    Errors raised during these steps are logged instead of propagated.
    """
    # All paths are derived from the directory two levels above the working directory.
    project_root = Path.cwd().parent.parent.resolve()
    run_started_at = time.time()

    # --- Evaluation-Specific Configuration ---
    # Handed to the unified backend so that evaluation can use its own paths
    # and parameters without any change to the core code.
    data_dir = project_root / "data"
    settings = {
        "BASE_DIR": project_root,
        "RAW_DOC_FOLDER": data_dir / "raw_documents",
        # Could be pointed at a distinct folder for evaluation if needed.
        "PROCESSED_DOC_FOLDER": data_dir / "processed_output",
        # Vector DB location used for this evaluation.
        "PERSIST_DIRECTORY": project_root / "chroma_db",
        "PREPROCESSING_SCRIPT_NAME": project_root / "code" / "core" / "pre-processing.py",
        "MODEL_NAME": "MLME_Chatbot",
        "EMBEDDING_MODEL": "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
        "RERANKER_MODEL": "mixedbread-ai/mxbai-rerank-xsmall-v1",
        "OLLAMA_METADATA_MODEL": "gemma3:4b",
        "VECTOR_STORE_NAME": "rag_store_MLME",
        "CHUNK_SIZE": 1250,
        "OVERLAP_SIZE": 250,
        "INITIAL_RETRIEVAL_K": 6,
        "TOP_N_RERANKED": 3,
    }
    eval_config = SimpleNamespace(**settings)

    # Input and output locations for this evaluation run; the output
    # directory is created up front if it does not exist yet.
    input_csv_path = project_root / "evaluation_data" / "ground_truth_dataset.csv"
    results_dir = project_root / "evaluation_results"
    output_csv_path = results_dir / "rag_evaluation_generated_answers.csv"
    results_dir.mkdir(parents=True, exist_ok=True)

    try:
        # --- Initialization ---
        logging.info("Initializing RAG system with evaluation configuration...")
        # Only the first of the three values returned by setup_chatbot is used here.
        chain, _, _ = setup_chatbot(eval_config)
        logging.info("RAG system initialization complete.")

        # --- Data Loading ---
        logging.info(f"Loading ground truth dataset from: {input_csv_path}")
        qa_dataset = pd.read_csv(input_csv_path)

        # --- Pipeline Execution ---
        evaluation_results_df = run_evaluation_pipeline(chain, qa_dataset)

        # --- Save Results ---
        logging.info(f"Saving evaluation results to: {output_csv_path}")
        evaluation_results_df.to_csv(output_csv_path, index=False, encoding="utf-8-sig")

        elapsed_seconds = time.time() - run_started_at
        logging.info(f"Evaluation script finished successfully in {elapsed_seconds:.2f} seconds.")

    except FileNotFoundError as missing_file_error:
        logging.error(f"File not found: {missing_file_error}. Please ensure all paths are correct.")
    except Exception as unexpected_error:
        logging.error(f"An unexpected error occurred during the evaluation: {unexpected_error}")

In [None]:
# Start the evaluation only when this code runs as the main module
# (script or notebook cell), not when it is imported.
if __name__ == "__main__":
    main()

2025-08-19 19:28:40 - [INFO] - Initializing RAG system with evaluation configuration...
2025-08-19 19:28:40 - [INFO] - --- Starting Full RAG Chatbot System Setup ---
2025-08-19 19:28:40 - [INFO] - Executing external pre-processing script: C:\Master's_Thesis\code\core\pre-processing.py...
2025-08-19 19:33:15 - [INFO] - Pre-processing script completed successfully.
2025-08-19 19:28:43,700 - [INFO] - Successfully connected to Ollama and found model 'gemma3:4b'.
2025-08-19 19:28:43,701 - [INFO] - --- Starting processing for: CRISP_Paper.pdf ---
2025-08-19 19:28:45,254 - [INFO] - Generating LLM-based metadata for CRISP_Paper.pdf...
2025-08-19 19:28:52,014 - [INFO] - HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
2025-08-19 19:28:52,014 - [INFO] - Successfully generated metadata for CRISP_Paper.pdf.
2025-08-19 19:28:52,016 - [INFO] - Split CRISP_Paper.pdf into 34 chunks.
2025-08-19 19:28:52,016 - [INFO] - --- Finished processing for: CRISP_Paper.pdf ---
2025-08-19 19:28

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-08-19 19:33:32 - [INFO] - HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
2025-08-19 19:33:36 - [INFO] - Processed question 1/110 in 6.22 seconds
2025-08-19 19:33:36 - [INFO] - HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-08-19 19:33:41 - [INFO] - HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
2025-08-19 19:33:45 - [INFO] - Processed question 2/110 in 8.95 seconds
2025-08-19 19:33:45 - [INFO] - HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-08-19 19:33:50 - [INFO] - HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
2025-08-19 19:33:56 - [INFO] - Processed question 3/110 in 11.48 seconds
2025-08-19 19:33:57 - [INFO] - HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-08-19 19:34:02 - [INFO] - HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
2025-08-19 19:34:04 - [INFO] - Processed question 4/110 in 8.01 seconds
2025-08-19 19:34:05 - [INFO] - HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-08-19 19:34:09 - [INFO] - HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
2025-08-19 19:34:11 - [INFO] - Processed question 5/110 in 6.62 seconds
2025-08-19 19:34:12 - [INFO] - HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-08-19 19:34:15 - [INFO] - HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
2025-08-19 19:34:20 - [INFO] - Processed question 6/110 in 8.72 seconds
2025-08-19 19:34:21 - [INFO] - HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-08-19 19:34:25 - [INFO] - HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
2025-08-19 19:34:29 - [INFO] - Processed question 7/110 in 9.65 seconds
2025-08-19 19:34:30 - [INFO] - HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-08-19 19:34:34 - [INFO] - HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
2025-08-19 19:34:40 - [INFO] - Processed question 8/110 in 10.62 seconds
2025-08-19 19:34:41 - [INFO] - HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-08-19 19:34:44 - [INFO] - HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
2025-08-19 19:34:51 - [INFO] - Processed question 9/110 in 10.69 seconds
2025-08-19 19:34:52 - [INFO] - HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-08-19 19:34:56 - [INFO] - HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
2025-08-19 19:35:00 - [INFO] - Processed question 10/110 in 9.80 seconds
2025-08-19 19:35:02 - [INFO] - HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
