## 1. Create Llama Stack client, list available models and vector databases

In [1]:
from llama_stack_client import LlamaStackClient
# Connect to the Llama Stack server; the later cells reuse this `client`.
client = LlamaStackClient(base_url="http://lsd-llama-milvus-service:8321")

# Show every model registered on the server.
models = client.models.list()
print(f"Models information: {models}\n")

# Identifier of the first model whose type is 'llm'; None when there is no such model.
inference_llm = next(
    (model.identifier for model in models if model.model_type == 'llm'),
    None,
)
print(f"Identifier for Inference model in usage: {inference_llm}\n")

# Check what vector databases exist
print("=== Available Vector Databases ===")
vector_dbs = client.vector_dbs.list()
if not vector_dbs:
    print("No vector databases found!")
else:
    for vdb in vector_dbs:
        # One record per database, followed by a blank separator line.
        print(
            f"- ID: {vdb.identifier}",
            f"  Provider: {vdb.provider_id}",
            f"  Embedding Model: {vdb.embedding_model}",
            "",
            sep="\n",
        )

INFO:httpx:HTTP Request: GET http://lsd-llama-milvus-service:8321/v1/models "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET http://lsd-llama-milvus-service:8321/v1/vector-dbs "HTTP/1.1 200 OK"


Models information: [Model(identifier='vllm', metadata={}, api_model_type='llm', provider_id='vllm-inference', type='model', provider_resource_id='vllm', model_type='llm'), Model(identifier='ibm-granite/granite-embedding-125m-english', metadata={'embedding_dimension': 768.0}, api_model_type='embedding', provider_id='sentence-transformers', type='model', provider_resource_id='ibm-granite/granite-embedding-125m-english', model_type='embedding')]

Identifier for Inference model in usage: vllm

=== Available Vector Databases ===
- ID: my_demo_image_ocr_vector_id
  Provider: milvus
  Embedding Model: ibm-granite/granite-embedding-125m-english

- ID: demo_db
  Provider: milvus
  Embedding Model: ibm-granite/granite-embedding-125m-english

- ID: ocr-vector-db
  Provider: milvus
  Embedding Model: ibm-granite/granite-embedding-125m-english



## 2. Create RAG Agent and prompt the LLM
Prompt the LLM with questions about the ingested documents and check that it returns accurate answers grounded in them.

In [None]:
from llama_stack_client import Agent, AgentEventLogger
import uuid

# Use the LLM discovered in the model-listing cell instead of a hard-coded
# identifier, so the notebook keeps working when the server registers the
# model under a different name.
if inference_llm is None:
    raise RuntimeError(
        "No model with model_type 'llm' is registered on the Llama Stack server."
    )

# Agent restricted to the knowledge_search tool over the "ocr-vector-db" vector database.
rag_agent = Agent(
    client,
    model=inference_llm,
    instructions="You are a helpful assistant. Answer the user's question based only on the provided search results. Respond with 'I don’t know' if the information is outside of the scope of your knowledge and not present in the search results.",
    tools=[
        {
            "name": "builtin::rag/knowledge_search",
            "args": {"vector_db_ids": ["ocr-vector-db"]},
        }
    ],
)


user_prompts = [
    "List RAG key market use cases",
    "Describe the sequence of steps of the Ingestion Flow",
    # Dummy question to show that Agent is allowed to respond only if question relates to the uploaded data
    "What is the state of Irish economy at 2025?",
]

# All prompts share one session; the random suffix gives every run a fresh session name.
session_id = rag_agent.create_session(session_name=f"s{uuid.uuid4().hex}")

for prompt in user_prompts:
    print("prompt>", prompt)
    response = rag_agent.create_turn(
        messages=[{"role": "user", "content": prompt}],
        session_id=session_id,
        stream=True,
    )
    # Print the streamed events (inference tokens, tool calls) as they arrive.
    for log in AgentEventLogger().log(response):
        log.print()

# Fetch the finished session; the evaluation cells read its turns.
session_response = client.agents.session.retrieve(
    session_id=session_id,
    agent_id=rag_agent.agent_id,
)

INFO:httpx:HTTP Request: POST http://lsd-llama-milvus-service:8321/v1/agents "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET http://lsd-llama-milvus-service:8321/v1/tools?toolgroup_id=builtin%3A%3Arag%2Fknowledge_search "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://lsd-llama-milvus-service:8321/v1/agents/a56ac9ff-2d7b-4beb-ae47-a2e5b08b6ae1/session "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://lsd-llama-milvus-service:8321/v1/agents/a56ac9ff-2d7b-4beb-ae47-a2e5b08b6ae1/session/960bb726-d8df-40e6-9c9a-c8244e04268d/turn "HTTP/1.1 200 OK"


prompt> List RAG key market use cases
[33minference> [0m[97m[0m
[32mtool_execution> Tool:knowledge_search Args:{'query': 'RAG key market use cases'}[0m
[32mtool_execution> Tool:knowledge_search Response:[TextContentItem(text='knowledge_search tool found 5 chunks:\nBEGIN of knowledge_search tool results.\n', type='text'), TextContentItem(text="Result 1\nContent: Market Use Cases Key\nRAG is being adopted across various industries for diverse applications; including:\n- Knowledge Question Answering: Providing accurate answers in customer service product manuals or FAQs. using\n- Code Generation: Retrieving relevant code snippets and documentation to assist in code creation.\n- Recommendation Systems: Enhancing recommendations by providing relevant context.\n- Customer Service: Improving support accuracy with access to current product information.\n- Personal Assistants: Enabling more comprehensive and accurate information from Al assistants .\n- Multi-hop Question Answering: Handl

INFO:httpx:HTTP Request: POST http://lsd-llama-milvus-service:8321/v1/agents/a56ac9ff-2d7b-4beb-ae47-a2e5b08b6ae1/session/960bb726-d8df-40e6-9c9a-c8244e04268d/turn "HTTP/1.1 200 OK"


prompt> Describe the sequence of steps of the Ingestion Flow
[33minference> [0m[97m[0m
[32mtool_execution> Tool:knowledge_search Args:{'query': 'RAG Ingestion Flow sequence of steps'}[0m
[32mtool_execution> Tool:knowledge_search Response:[TextContentItem(text='knowledge_search tool found 5 chunks:\nBEGIN of knowledge_search tool results.\n', type='text'), TextContentItem(text="Result 1\nContent: Market Use Cases Key\nRAG is being adopted across various industries for diverse applications; including:\n- Knowledge Question Answering: Providing accurate answers in customer service product manuals or FAQs. using\n- Code Generation: Retrieving relevant code snippets and documentation to assist in code creation.\n- Recommendation Systems: Enhancing recommendations by providing relevant context.\n- Customer Service: Improving support accuracy with access to current product information.\n- Personal Assistants: Enabling more comprehensive and accurate information from Al assistants .\n- 

INFO:httpx:HTTP Request: POST http://lsd-llama-milvus-service:8321/v1/agents/a56ac9ff-2d7b-4beb-ae47-a2e5b08b6ae1/session/960bb726-d8df-40e6-9c9a-c8244e04268d/turn "HTTP/1.1 200 OK"


prompt> What is the state of Irish eceonomy at 2025?
[33minference> [0m[33mI[0m[33m don[0m[33m’t[0m[33m know[0m[33m what[0m[33m the[0m[33m state[0m[33m of[0m[33m the[0m[33m Irish[0m[33m economy[0m[33m is[0m[33m in[0m[33m [0m[33m202[0m[33m5[0m[33m.[0m[97m[0m
[30m[0m

INFO:httpx:HTTP Request: GET http://lsd-llama-milvus-service:8321/v1/agents/a56ac9ff-2d7b-4beb-ae47-a2e5b08b6ae1/session/960bb726-d8df-40e6-9c9a-c8244e04268d "HTTP/1.1 200 OK"


## 3. Preparation for evaluating RAG models using [RAGAS](https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/?h=metrics)

- We will use two key metrics to show the performance of the RAG server:
    1. [Faithfulness](https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/faithfulness/) - measures how factually consistent a response is with the retrieved context. It ranges from 0 to 1, with higher scores indicating better consistency.
    2. [Response Relevancy](https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/answer_relevance/) - measures how relevant a response is to the user input. Higher scores indicate better alignment with the user input, while lower scores are given if the response is incomplete or includes redundant information.

 - Create a `.env.dev` file and paste your API key from [Groq Cloud](https://console.groq.com/home) into it.

In [6]:
import os

from dotenv import load_dotenv

ENV_FILE = ".env.dev"
GROQ_KEY_PLACEHOLDER = "PASTE_YOUR_GROQ_API_KEY"

# Write the template only when the file is missing, so re-running this cell
# never overwrites a key that has already been pasted into it.
if not os.path.exists(ENV_FILE):
    with open(ENV_FILE, "w") as f:
        f.write(f"GROQ_API_KEY={GROQ_KEY_PLACEHOLDER}")

# load env variable
env_file_loaded = load_dotenv(dotenv_path=ENV_FILE, override=True)

# Warn while the key is missing or still the placeholder.
if os.environ.get("GROQ_API_KEY") in (None, "", GROQ_KEY_PLACEHOLDER):
    print(f"GROQ_API_KEY is not set: replace the placeholder in {ENV_FILE} with your key and re-run this cell.")

env_file_loaded

True

In [7]:
import re
from typing import List
from llama_stack_client.types.agents import Turn

# Precompiled once at import time. Captures the text after "Content:" up to
# (but excluding) a following "\nMetadata:" marker, or up to the end of the text.
CONTENT_PATTERN = re.compile(r"Content:\s*(.*?)(?=\nMetadata:|$)", re.DOTALL)


def extract_retrieved_contexts(turn_object: Turn) -> List[str]:
    """
    Collect the retrieved chunks recorded in a turn's tool-execution steps.

    Only steps whose ``step_type`` is ``"tool_execution"`` are inspected, and
    only tool responses whose ``content`` is a non-empty list. Within such a
    list, an item counts when its ``type`` is ``"text"`` and its ``text``
    starts with ``"Result "`` and contains ``"Content:"``.

    Args:
        turn_object: A Turn object from LlamaStack containing steps with tool responses

    Returns:
        The stripped "Content:" section of every matching item, in order of
        appearance, ready to be used as Ragas ``retrieved_contexts``.
    """
    contexts: List[str] = []

    for step in turn_object.steps:
        if step.step_type != "tool_execution":
            continue

        for tool_response in step.tool_responses:
            payload = tool_response.content
            # Skip empty payloads and payloads that are not a list of items.
            if not payload or not isinstance(payload, list):
                continue

            for item in payload:
                if not (hasattr(item, "text") and hasattr(item, "type")):
                    continue
                if item.type != "text" or not item.text:
                    continue
                # Header/footer items do not start with "Result " and are ignored.
                if not item.text.startswith("Result ") or "Content:" not in item.text:
                    continue

                found = CONTENT_PATTERN.search(item.text)
                if found:
                    contexts.append(found.group(1).strip())

    return contexts

In [11]:
from ragas.dataset_schema import EvaluationDataset

# Expected answers, positionally aligned with the first two turns of the session.
references = ['''
1. Knowledge Question Answering: Providing accurate answers in customer service, product manuals, or FAQs.
2. Code Generation: Retrieving relevant code snippets and documentation to assist in code creation.
3. Recommendation Systems: Enhancing recommendations by providing relevant context.
4. Customer Service: Improving support accuracy with access to current product information.
5. Personal Assistants: Enabling more comprehensive and accurate information from AI assistants.
6. Multi-hop Question Answering: Handling complex, multi-step questions through iterative retrieval.
7. Legal Applications: Retrieving legal documents and case law for reliable legal opinions.
8. General Task Assistance: Aiding users in various tasks requiring information access and decision-making.''',
'''
Ingestion Flow:
1. Document Upload (via UI/API)
2. Docling (Chunking + Metadata)
3. Embedding (Granite Embedding via vLLM)
4. Milvus (Vector DB)
'''
]

# Constructing a Ragas EvaluationDataset
# One record per evaluated turn: the question, the agent's answer, the
# expected answer and the chunks returned by the knowledge_search tool.
samples = [
    {
        "user_input": turn.input_messages[0].content,
        "response": turn.output_message.content,
        "reference": reference,
        "retrieved_contexts": extract_retrieved_contexts(turn),
    }
    for turn, reference in zip(session_response.turns[:2], references)
]

ragas_eval_dataset = EvaluationDataset.from_list(samples)
ragas_eval_dataset.to_pandas()

Unnamed: 0,user_input,retrieved_contexts,response,reference
0,List RAG key market use cases,[Market Use Cases Key\nRAG is being adopted ac...,RAG key market use cases include:\n\n* Knowled...,\n1. Knowledge Question Answering: Providing a...
1,Describe the sequence of steps of the Ingestio...,[Market Use Cases Key\nRAG is being adopted ac...,The sequence of steps in the Ingestion Flow of...,\nIngestion Flow:\n1. Document Upload (via UI/...


## 4. Prerequisites for RAG evaluation

In [12]:
from ragas.metrics import (
    Faithfulness, 
    ResponseRelevancy,
) 
from ragas.dataset_schema import SingleTurnSample 
from langchain_groq import ChatGroq
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_huggingface import HuggingFaceEmbeddings

# Groq-hosted chat model used as the judge; temperature is set to 0.
llm = ChatGroq(model="meta-llama/llama-4-maverick-17b-128e-instruct", temperature=0)

# Wrap the Groq LLM for use with Ragas
evaluator_llm = LangchainLLMWrapper(llm)

# Using HuggingFace embeddings as a free alternative
embeddings_model = HuggingFaceEmbeddings(model_name="ibm-granite/granite-embedding-125m-english")
evaluator_embeddings = LangchainEmbeddingsWrapper(embeddings_model)

# The two evaluated samples, in prompt order.
first_sample, second_sample = samples[0], samples[1]

# references for both prompts
reference_for_first_prompt = first_sample["reference"]
reference_for_second_prompt = second_sample["reference"]

# inputs for both prompts
user_input_for_first_prompt = first_sample["user_input"]
user_input_for_second_prompt = second_sample["user_input"]

# responses for both prompts
response_for_first_prompt = first_sample["response"]
response_for_second_prompt = second_sample["response"]

# reference lists for both prompts (one stripped entry per line of the reference text)
reference_list_for_first_prompt = list(map(str.strip, reference_for_first_prompt.strip().split('\n')))
reference_list_for_second_prompt = list(map(str.strip, reference_for_second_prompt.strip().split('\n')))

# Retrieved contexts for both prompts
retrieved_contexts_for_first_prompt = first_sample["retrieved_contexts"]
retrieved_contexts_for_second_prompt = second_sample["retrieved_contexts"]

print(f"Retrieved contexts for the first prompt: {retrieved_contexts_for_first_prompt}\n")
print(f"Retrieved contexts for the second prompt: {retrieved_contexts_for_second_prompt}\n")

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: ibm-granite/granite-embedding-125m-english


Retrieved contexts for the first prompt: ['Market Use Cases Key\nRAG is being adopted across various industries for diverse applications; including:\n- Knowledge Question Answering: Providing accurate answers in customer service product manuals or FAQs. using\n- Code Generation: Retrieving relevant code snippets and documentation to assist in code creation.\n- Recommendation Systems: Enhancing recommendations by providing relevant context.\n- Customer Service: Improving support accuracy with access to current product information.\n- Personal Assistants: Enabling more comprehensive and accurate information from Al assistants .\n- Multi-hop Question Answering: Handling complex; multi-step questions through iterative retrieval.\n- Legal Applications: Retrieving legal documents and case law for reliable legal opinions.\n- General Task Assistance: Aiding users in various tasks requiring information access and decision-making:\nThe rising demand for hyper-personalized content in areas like m

## 5. Evaluate Faithfulness Score for both prompts

In [13]:
first_prompt_turn = SingleTurnSample(
        user_input=user_input_for_first_prompt,
        response=response_for_first_prompt,
        retrieved_contexts=retrieved_contexts_for_first_prompt,
    )
faithfulness_scorer = Faithfulness(llm=evaluator_llm)
faithfulness_score_for_first_prompt = await faithfulness_scorer.single_turn_ascore(first_prompt_turn)
print(f"Faithfulness score for prompt '{user_prompts[0]}': {faithfulness_score_for_first_prompt}")

INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"


Faithfulness score for prompt 'List RAG key market use cases': 1.0


In [14]:
second_prompt_turn = SingleTurnSample(
        user_input=user_input_for_second_prompt,
        response=response_for_second_prompt,
        retrieved_contexts=retrieved_contexts_for_second_prompt,
    )
faithfulness_score_for_second_prompt = await faithfulness_scorer.single_turn_ascore(second_prompt_turn)
print(f"Faithfulness score for prompt '{user_prompts[1]}': {faithfulness_score_for_second_prompt}")

INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
INFO:groq._base_client:Retrying request to /openai/v1/chat/completions in 14.000000 seconds
INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"


Faithfulness score for prompt 'Describe the sequence of steps of the Ingestion Flow': 0.8947368421052632


## 6. Evaluate Response Relevancy for both prompts

In [15]:
first_prompt_turn = SingleTurnSample(
        user_input=user_input_for_first_prompt,
        response=response_for_first_prompt,
        retrieved_contexts=retrieved_contexts_for_first_prompt,
    )
response_relevancy_scorer = ResponseRelevancy(llm=evaluator_llm, embeddings=evaluator_embeddings)
response_relevancy_score_for_first_prompt = await response_relevancy_scorer.single_turn_ascore(first_prompt_turn)
print(f"Response Relevancy score for prompt '{user_prompts[0]}': {response_relevancy_score_for_first_prompt}")

INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"


Response Relevancy score for prompt 'List RAG key market use cases': 0.9634198534980678


In [16]:
second_prompt_turn = SingleTurnSample(
        user_input=user_input_for_second_prompt,
        response=response_for_second_prompt,
        retrieved_contexts=retrieved_contexts_for_second_prompt,
    )
response_relevancy_score_for_second_prompt = await response_relevancy_scorer.single_turn_ascore(second_prompt_turn)
print(f"Response Relevancy score for prompt '{user_prompts[1]}': {response_relevancy_score_for_second_prompt}")

INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"


Response Relevancy score for prompt 'Describe the sequence of steps of the Ingestion Flow': 0.9282959928630402
