## 1. Install the Llama Stack client

In [None]:
# Install Llama Stack into the environment used by this notebook's kernel.
# NOTE(review): the cells below only import `llama_stack_client`; confirm that
# this package pulls it in, or whether installing the client package alone
# would be sufficient.
%pip install llama_stack

## 2. List available models

In [None]:
from llama_stack_client import LlamaStackClient

# Create a client pointed at the Llama Stack service endpoint.
client = LlamaStackClient(
    base_url="http://llama-test-milvus-kserve-service:8321",
)

# Leave the call as the cell's last expression so the notebook displays
# the list of models returned by the server.
client.models.list()

## 3. Register your Milvus vector database with LlamaStack

In [None]:
from llama_stack_client import Agent, AgentEventLogger, LlamaStackClient

# ID under which the vector database is registered; later cells reuse it.
vector_db_id = "my_demo_vector_id"
client = LlamaStackClient(base_url="http://llama-test-milvus-kserve-service:8321")

models = client.models.list()

# Select the first LLM and the first embedding model. Passing a default to
# next() lets us fail with a descriptive error instead of a bare
# StopIteration when the server has no model of the required type.
llm = next((m for m in models if m.model_type == "llm"), None)
if llm is None:
    raise RuntimeError(
        "No model with model_type 'llm' was returned by client.models.list()"
    )
model_id = llm.identifier

em = next((m for m in models if m.model_type == "embedding"), None)
if em is None:
    raise RuntimeError(
        "No model with model_type 'embedding' was returned by client.models.list()"
    )
embedding_model_id = em.identifier
# The vector size is read from the embedding model's metadata.
embedding_dimension = em.metadata["embedding_dimension"]

# Register the vector database with the "milvus" provider. The return value
# is bound to `_` because it is not used afterwards.
_ = client.vector_dbs.register(
    vector_db_id=vector_db_id,
    embedding_model=embedding_model_id,
    embedding_dimension=embedding_dimension,
    provider_id="milvus",
)

# Last expression of the cell: display the registered vector databases.
client.vector_dbs.list()

## 4. Import and run the Kubeflow Pipeline
Import the "[docling_convert_pipeline_compiled.yaml](./docling_convert_pipeline_compiled.yaml)" Kubeflow Pipeline into your pipeline server, then run it to insert your PDF documents into the vector database.

When running the pipeline, you can customize the following parameters:

- `base_url`: Base URL to fetch PDF files from
- `pdf_filenames`: Comma-separated list of PDF filenames to download and convert
- `num_workers`: Number of parallel workers
- `vector_db_id`: Milvus vector database ID
- `service_url`: Milvus service URL
- `embed_model_id`: Embedding model to use
- `max_tokens`: Maximum tokens per chunk
- `use_gpu`: Enable/disable GPU acceleration

Note: The compiled pipeline was generated by running `python docling_convert_pipeline.py`.

## 5. Prompt the LLM
Ask the LLM a question about the documents you inserted and check that its answer draws on their content.

In [None]:
import uuid

from llama_stack_client import Agent, AgentEventLogger

# Build an agent around the selected LLM and give it the built-in RAG
# knowledge-search tool, pointed at the vector database registered earlier.
# `client`, `model_id` and `vector_db_id` come from the registration cell.
rag_agent = Agent(
    client,
    model=model_id,
    instructions="You are a helpful assistant",
    tools=[
        {
            "name": "builtin::rag/knowledge_search",
            "args": {"vector_db_ids": [vector_db_id]},
        },
    ],
)

prompt = "What can you tell me about the birth of word processing?"
print("prompt>", prompt)

# Each run gets its own session, named with a random hex suffix.
session_id = rag_agent.create_session(session_name=f"s{uuid.uuid4().hex}")

# Request a streamed turn so events can be printed as they arrive.
response = rag_agent.create_turn(
    messages=[{"role": "user", "content": prompt}],
    session_id=session_id,
    stream=True,
)

# Print every event the logger produces from the streamed response.
for event in AgentEventLogger().log(response):
    event.print()

Alternatively, query chunks directly from the vector database.

In [None]:
# Query the vector database directly, without going through an agent,
# and print whatever the service returns for the query text.
# `client` and `vector_db_id` come from the registration cell.
query_result = client.vector_io.query(vector_db_id=vector_db_id, query="what do you know about?")
print(query_result)

### Congratulations! You've successfully inserted your PDF documents via a Kubeflow Pipeline, and queried your RAG application using Llama Stack! 🎉🥳