# 📘 Notebook 03: Evaluate RAGAS Performance

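This notebook evaluates the RAG pipeline with RAGAS metrics (faithfulness, answer relevancy, context precision, context recall, and answer correctness), retrieving contexts from a Qdrant vector store.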

In [1]:
# ✅ 1. Setup & Imports
import os
import json
import sys
from tqdm import tqdm
from qdrant_client import QdrantClient
from qdrant_client.http import models as rest
from ragas.metrics import faithfulness, answer_relevancy, context_precision, context_recall, answer_correctness
from ragas import evaluate
from openai import OpenAI
from sentence_transformers import SentenceTransformer
import pandas as pd

from dotenv import load_dotenv

# Load environment variables from the backend's .env file so the os.getenv
# lookups below can pick up any values defined there.
load_dotenv(dotenv_path="../backend/.env")

# Qdrant / embedding configuration; each value can be overridden via the environment.
COLLECTION_NAME = os.getenv("QDRANT_COLLECTION_NAME", "golf_shot_vectors")
EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL", "thenlper/gte-small")
QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")
# The cluster endpoint is configurable through QDRANT_URL; the previously
# hard-coded endpoint is kept as the default so existing setups keep working.
QDRANT_URL = os.getenv(
    "QDRANT_URL",
    "https://6f592f43-f667-4234-ad3a-4f15ed5882ef.us-west-2-0.aws.cloud.qdrant.io:6333",
)

# Create client for the Qdrant vector store.
client = QdrantClient(
    url=QDRANT_URL,
    api_key=QDRANT_API_KEY
)

# Instantiate the embedding model once; later cells reuse this instance
# instead of constructing a new SentenceTransformer per query.
MODEL_INSTANCE = SentenceTransformer(EMBEDDING_MODEL)

# Add the backend directory to the Python path so later cells can import
# the `tools` and `agents` packages from it.
sys.path.append("../backend")


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# ✅ 2. Load Ground Truth Dataset
# Read the ground-truth entries. The encoding is pinned to UTF-8 so the result
# does not depend on the platform's default text encoding.
with open('../data/raw/golden_shot_dataset.json', encoding='utf-8') as f:
    dataset = json.load(f)

In [3]:
# ✅ 3. Function to Query Qdrant and Retrieve Context


from tools.golf_shot_recommendations_tool import preprocess_query_with_llm
from qdrant_client.http.exceptions import UnexpectedResponse

def get_embedding(text):
    """Encode ``text`` with the shared ``MODEL_INSTANCE`` and return the result.

    When ``EMBEDDING_MODEL`` starts with "intfloat" (treated here as an
    "e5"-type model), the text is prefixed with "query: " before encoding;
    for any other model name the text is encoded unchanged.
    """
    uses_query_prefix = EMBEDDING_MODEL.startswith("intfloat")
    encoder_input = f"query: {text}" if uses_query_prefix else text
    return MODEL_INSTANCE.encode(encoder_input)


def get_contexts(question, top_k=3):
    """Retrieve formatted context strings for ``question`` from Qdrant.

    The question is rewritten by ``preprocess_query_with_llm``, embedded with
    ``get_embedding``, and the resulting vector is used to query the
    ``COLLECTION_NAME`` collection.

    Args:
        question: The user query to retrieve contexts for.
        top_k: Value passed as ``limit`` to the Qdrant query (default 3).

    Returns:
        A list of strings, one per returned point, each formatted as
        ``"Score: <score, 4 decimals> | <payload['text']>"``.

    Raises:
        ValueError: If Qdrant raises ``UnexpectedResponse`` and its message
            contains "Vector dimension error".
        UnexpectedResponse: Any other unexpected Qdrant response is re-raised
            unchanged.
    """
    # Preprocess the query using LLM
    preprocessed_query = preprocess_query_with_llm(question)

    # Get embeddings for the preprocessed query
    query_vector = get_embedding(preprocessed_query)

    # Search Qdrant
    try:
        results = client.query_points(
            collection_name=COLLECTION_NAME,
            query=query_vector,
            limit=top_k,
            with_payload=True
        )
    except UnexpectedResponse as e:
        # Translate the dimension-mismatch case into an actionable error that
        # points at the EMBEDDING_MODEL setting.
        if "Vector dimension error" in str(e):
            raise ValueError(
                f"Vector dimension mismatch! The current embedding model ({EMBEDDING_MODEL}) "
                f"produces vectors of a different dimension than what's expected by the Qdrant collection. "
                f"Please check your EMBEDDING_MODEL environment variable and ensure it matches the model "
                f"used to create the vectors in your Qdrant collection."
            ) from e
        raise  # Re-raise other UnexpectedResponse errors

    # Format the results
    return [
        f"Score: {point.score:.4f} | {point.payload['text']}"
        for point in results.points
    ]


In [4]:
# ✅ 4. Generate Answers Using Your RAG Pipeline
from agents.golf_langgraph import summarize_result

def generate_answer(question, contexts):
    """Produce the pipeline's answer for ``question`` given retrieved contexts.

    The contexts are joined with newlines and handed to ``summarize_result``
    as the ``tool_result`` of a state dict whose ``input`` is the question.

    Returns:
        The ``"final_response"`` entry of the summarizer's result, or ``None``
        if that key is absent.
    """
    summary = summarize_result(
        {
            "input": question,
            "tool_result": '\n'.join(contexts),
        }
    )
    return summary.get("final_response")


[DEBUG] Adding node: search_golfpedia with RunnableLambda(run)
[DEBUG] Adding node: course_insights with RunnableLambda(run)
[DEBUG] Adding node: get_pro_stats with RunnableLambda(run)
[DEBUG] Adding node: get_shot_recommendations with RunnableLambda(run)
[DEBUG] All nodes in workflow: ['router', 'search_golfpedia', 'course_insights', 'get_pro_stats', 'get_shot_recommendations', 'summarize']
[DEBUG] Node 'router' is of type <class 'langgraph.graph.state.StateNodeSpec'> and value: StateNodeSpec(runnable=router(tags=None, recurse=True, explode_args=False, func_accepts_config=False, func_accepts={}), metadata=None, input=<class 'agents.golf_langgraph.AgentState'>, retry_policy=None, ends=())
[DEBUG] Node 'search_golfpedia' is of type <class 'langgraph.graph.state.StateNodeSpec'> and value: StateNodeSpec(runnable=RunnableLambda(run), metadata=None, input=<class 'agents.golf_langgraph.AgentState'>, retry_policy=None, ends=())
[DEBUG] Node 'course_insights' is of type <class 'langgraph.graph

In [5]:
# ✅ 5. Run Evaluation on Dataset
# One record per ground-truth entry: retrieve contexts for the query, generate
# an answer from them, and store both next to the reference answer.
records = []

for entry in tqdm(dataset):
    question, reference = entry['query'], entry['ideal_answer']
    contexts = get_contexts(question)
    response = generate_answer(question, contexts)

    records.append(
        dict(
            user_input=question,
            retrieved_contexts=contexts,
            response=response,
            reference=reference,
        )
    )

  return ChatOpenAI(
100%|██████████| 40/40 [04:55<00:00,  7.39s/it]


In [7]:
# ✅ 6. Evaluate with RAGAS
from ragas.evaluation import EvaluationDataset

# Convert your records to the required schema
# (keeps exactly the four fields handed to RAGAS below).
ragas_records = [
    {
        "user_input": r["user_input"],
        "retrieved_contexts": r["retrieved_contexts"],
        "response": r["response"],
        "reference": r["reference"],
    }
    for r in records
]

# Use a dedicated name instead of rebinding `dataset`: that name holds the
# ground-truth entries loaded in step 2, and overwriting it would break the
# step-5 loop if it were re-run after this cell.
eval_dataset = EvaluationDataset.from_list(ragas_records)

ragas_results = evaluate(
    eval_dataset,
    metrics=[faithfulness, answer_relevancy, context_precision, context_recall, answer_correctness]
)
print(ragas_results)

Evaluating: 100%|██████████| 200/200 [03:18<00:00,  1.01it/s]


{'faithfulness': 0.4304, 'answer_relevancy': 0.0436, 'context_precision': 0.4458, 'context_recall': 0.4125, 'answer_correctness': 0.3816}
