# RAG from scratch

- Adapted from https://github.com/opendatahub-io/llama-stack-demos/
- Requires the ollama template

# Variables

These variables are configured according to the distribution we are connecting to.

In [1]:
import os
from dotenv import load_dotenv

# Load settings from a .env file (when one is found) into the process environment.
load_dotenv()

# URL of the Llama Stack server, e.g. "http://localhost:8321" for a local run.
base_url = os.getenv("BASE_URL")

# Chunk size, in tokens, used when ingesting documents. Environment values are
# always strings, so convert explicitly; fall back to 512 when unset or empty.
chunk_size_in_tokens = int(os.getenv("CHUNK_SIZE_TOKEN") or 512)

# Name of the vector DB collection to register and query.
collection_name = os.getenv("COLLECTION_NAME")

# Identifier of the embedding model, e.g. "all-MiniLM-L6-v2".
embedding_model_name = os.getenv("EMBEDDING_MODEL_NAME")

# Embedding dimension of that model (e.g. 384 for all-MiniLM-L6-v2). There is no
# safe default because it must match the model, so fail with a clear message
# instead of the opaque TypeError that int(None) would raise.
_embedding_size_raw = os.getenv("EMBEDDING_SIZE")
if not _embedding_size_raw:
    raise ValueError(
        "EMBEDDING_SIZE is not set; define it in the environment or in .env "
        "(e.g. 384 for all-MiniLM-L6-v2)"
    )
embedding_size = int(_embedding_size_raw)

# Identifier of the LLM used for inference and by the agent.
model_name = os.getenv("MODEL_NAME")

### Define client and test connection to server

In [2]:
from llama_stack_client import LlamaStackClient

# Client bound to the Llama Stack server configured in `base_url`.
client = LlamaStackClient(base_url=base_url)

In [3]:
# Connection check: list the models registered on the server; the result is shown as the cell output.
client.models.list()

INFO:httpx:HTTP Request: GET http://localhost:8321/v1/models "HTTP/1.1 200 OK"


[Model(identifier='all-MiniLM-L6-v2', metadata={'embedding_dimension': 384.0}, api_model_type='embedding', provider_id='ollama', type='model', provider_resource_id='all-minilm:latest', model_type='embedding'),
 Model(identifier='llama3.2:3b-instruct-fp16', metadata={}, api_model_type='llm', provider_id='ollama', type='model', provider_resource_id='llama3.2:3b-instruct-fp16', model_type='llm')]

### Ingest data to vector db 

In [34]:
from llama_stack_client import RAGDocument

# Tutorial pages to ingest. Other pages from the same folder
# (e.g. "memory_optimizations.rst", "lora_finetune.rst") can be added to this list.
urls = ["chat.rst", "llama3.rst"]

# One RAGDocument per page: the id comes from the page's position in `urls`
# and the content is the raw GitHub URL of the page.
documents = [
    RAGDocument(
        document_id=f"num-{position}",
        content=f"https://raw.githubusercontent.com/pytorch/torchtune/main/docs/source/tutorials/{page}",
        mime_type="text/plain",
        metadata={},
    )
    for position, page in enumerate(urls)
]

In [35]:
# The configured collection name doubles as the vector DB identifier.
vector_db_id = collection_name

# Registration settings: which DB to create and which embedding model/dimension it uses.
registration_args = {
    "vector_db_id": vector_db_id,
    "embedding_model": embedding_model_name,
    "embedding_dimension": embedding_size,
}
# Kept as the last expression so the registration response is displayed.
client.vector_dbs.register(**registration_args)

INFO:httpx:HTTP Request: POST http://localhost:8321/v1/vector-dbs "HTTP/1.1 200 OK"


VectorDBRegisterResponse(embedding_dimension=384, embedding_model='all-MiniLM-L6-v2', identifier='demoInnovate-vectordb', provider_id='faiss', type='vector_db', provider_resource_id='demoInnovate-vectordb', owner={'principal': '', 'attributes': {}})

Ingesting documents into a vector database:

In [11]:
import rich
# Pretty-print the RAGDocument list to inspect it before inserting it into the vector DB.
rich.print(documents)

In [12]:
# Hand the documents to the server-side RAG tool for insertion into our vector DB,
# using the configured chunk size (in tokens).
rag_tool = client.tool_runtime.rag_tool
rag_tool.insert(
    vector_db_id=vector_db_id,
    documents=documents,
    chunk_size_in_tokens=chunk_size_in_tokens,
)

INFO:httpx:HTTP Request: POST http://localhost:8321/v1/tool-runtime/rag-tool/insert "HTTP/1.1 200 OK"


## Testing vector db 

In [None]:
# Question sent to the retrieval queries below and embedded in the final chat prompt.
prompt = "What are the top 5 topics that were explained? Only list succinct bullet points."

### Using the tool_runtime API, which can query multiple collections

In [None]:
# The RAG tool is the higher-level route: it returns a packaged result and
# accepts a list of vector DBs, so a single query can span several of them.
searched_dbs = [vector_db_id]
tool_response = client.tool_runtime.rag_tool.query(
    vector_db_ids=searched_dbs,
    content=prompt,
)
rich.print(tool_response)

INFO:httpx:HTTP Request: POST http://localhost:8321/v1/tool-runtime/rag-tool/query "HTTP/1.1 200 OK"


In [33]:
# Exploratory: show the signature and docstring of rag_tool.query; not required for the RAG flow.
help(client.tool_runtime.rag_tool.query)

Help on method query in module llama_stack_client.resources.tool_runtime.rag_tool:

query(*, content: 'InterleavedContent', vector_db_ids: 'List[str]', query_config: 'QueryConfig | NotGiven' = NOT_GIVEN, extra_headers: 'Headers | None' = None, extra_query: 'Query | None' = None, extra_body: 'Body | None' = None, timeout: 'float | httpx.Timeout | None | NotGiven' = NOT_GIVEN) -> 'QueryResult' method of llama_stack_client.resources.tool_runtime.rag_tool.RagToolResource instance
    Query the RAG system for context; typically invoked by the agent
    
    Args:
      content: A image content item
    
      query_config: Configuration for the RAG query generation.
    
      extra_headers: Send extra headers
    
      extra_query: Add additional query parameters to the request
    
      extra_body: Add additional JSON properties to the request
    
      timeout: Override the client-level default timeout for this request, in seconds



### Query the vector io collection directly

In [18]:
# Lower-level alternative: send the prompt straight to the vector DB,
# bypassing the RAG tool.
db_response = client.vector_io.query(query=prompt, vector_db_id=vector_db_id)
rich.print(db_response)

INFO:httpx:HTTP Request: POST http://localhost:8321/v1/vector-io/query "HTTP/1.1 200 OK"


## Creating the context by joining all the search results

In [20]:
# Concatenate the content of every chunk returned by the direct vector DB query,
# one chunk per line. (`tool_response.content` from the RAG tool query is the
# alternative source of context.)
prompt_context = "\n".join(chunk.content for chunk in db_response.chunks)

In [21]:
# User message: the question followed by the retrieved context.
extended_prompt = f"""
Please answer the given query using the context below.

QUERY:
{prompt}

CONTEXT:
{prompt_context}
"""

# Conversation sent to the model: a system instruction plus the augmented user prompt.
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": extended_prompt},
]
rich.print(messages)

In [23]:
# Ask the model to answer from the assembled messages. The explicit timeout of 600
# replaces the client's default request timeout for this call.
completion_args = dict(messages=messages, model_id=model_name, timeout=600)
response = client.inference.chat_completion(**completion_args)
rich.print(response)

INFO:httpx:HTTP Request: POST http://localhost:8321/v1/inference/chat-completion "HTTP/1.1 200 OK"


### Using Agent 

In [24]:
from llama_stack_client import Agent

# Tool specification: the built-in knowledge-search tool, pointed at our vector DB.
knowledge_search_tool = {
    "name": "builtin::rag/knowledge_search",
    "args": {"vector_db_ids": [vector_db_id]},
}

# Agent that uses the configured model and can call the knowledge-search tool.
rag_agent = Agent(
    client,
    model=model_name,
    instructions="You are a helpful assistant",
    tools=[knowledge_search_tool],
)

INFO:httpx:HTTP Request: POST http://localhost:8321/v1/agents "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET http://localhost:8321/v1/tools?toolgroup_id=builtin%3A%3Arag%2Fknowledge_search "HTTP/1.1 200 OK"


In [25]:
import uuid
from llama_stack_client.lib.agents.event_logger import EventLogger

# Questions sent to the RAG agent, one turn each, all within the same session.
user_prompts = [
    "What is Lora?"
]

# The random UUID suffix gives every run of this cell a distinct session name.
session_id = rag_agent.create_session(f"rag session-{uuid.uuid4()}")

# Use a dedicated loop variable: calling it `prompt` would overwrite the
# notebook-level `prompt` used by the retrieval and prompt-building cells, so
# re-running those cells afterwards would silently use the wrong question.
for user_prompt in user_prompts:
    rich.print(f"User> {user_prompt}")
    response = rag_agent.create_turn(
        messages=[{"role": "user", "content": user_prompt}],
        session_id=session_id,
    )
    # Print each event produced for the turn as the logger yields it.
    for log in EventLogger().log(response):
        log.print()

INFO:httpx:HTTP Request: POST http://localhost:8321/v1/agents/0e7bbf74-c3bc-4f7a-9b35-789eb7d4095c/session "HTTP/1.1 200 OK"


INFO:httpx:HTTP Request: POST http://localhost:8321/v1/agents/0e7bbf74-c3bc-4f7a-9b35-789eb7d4095c/session/606fcb14-b6b8-4c0c-b4cf-ace08c607135/turn "HTTP/1.1 200 OK"


[33minference> [0m[33m[k[0m[33mnowledge[0m[33m_search[0m[33m(query[0m[33m="[0m[33mL[0m[33mora[0m[33m")][0m[97m[0m
[32mtool_execution> Tool:knowledge_search Args:{'query': 'Lora'}[0m
[33minference> [0m[33mHere[0m[33m is[0m[33m a[0m[33m rewritten[0m[33m version[0m[33m of[0m[33m the[0m[33m text[0m[33m in[0m[33m a[0m[33m more[0m[33m readable[0m[33m format[0m[33m:

[0m[33m**[0m[33mWhat[0m[33m is[0m[33m Lo[0m[33mRA[0m[33m?[0m[33m**

[0m[33mLo[0m[33mRA[0m[33m ([0m[33mLow[0m[33m-R[0m[33mank[0m[33m Adapt[0m[33mation[0m[33m)[0m[33m is[0m[33m an[0m[33m adapter[0m[33m-based[0m[33m method[0m[33m for[0m[33m parameter[0m[33m-efficient[0m[33m fin[0m[33met[0m[33muning[0m[33m that[0m[33m adds[0m[33m trainable[0m[33m low[0m[33m-r[0m[33mank[0m[33m decomposition[0m[33m matrices[0m[33m to[0m[33m different[0m[33m layers[0m[33m of[0m[33m a[0m[33m neural[0m[33m network[0m[33m