# RAG from scratch

- Adapted from https://github.com/opendatahub-io/llama-stack-demos/
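- Requires a running Llama Stack server built from the Ollama template (the cells below expect it at `http://localhost:8321`).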

# Variables

In [18]:
# Address of the Llama Stack server that the client below connects to.
base_url = "http://localhost:8321"

# Chunk size, in tokens, intended for document ingestion.
chunk_size_in_tokens = 512


In [1]:
import rich
from llama_stack_client import LlamaStackClient, RAGDocument


In [13]:
# One client handle, pointed at the configured server, reused by the cells below.
client = LlamaStackClient(base_url=base_url)

In [14]:
# Ask the server which models it has registered; the result is this cell's output.
client.models.list()

INFO:httpx:HTTP Request: GET http://localhost:8321/v1/models "HTTP/1.1 200 OK"


[Model(identifier='llama3.2:3b-instruct-fp16', metadata={}, api_model_type='llm', provider_id='ollama', type='model', provider_resource_id='llama3.2:3b-instruct-fp16', model_type='llm'),
 Model(identifier='all-MiniLM-L6-v2', metadata={'embedding_dimension': 384.0}, api_model_type='embedding', provider_id='ollama', type='model', provider_resource_id='all-minilm:latest', model_type='embedding')]

In [15]:
# `RAGDocument` comes from the imports cell at the top of the notebook; it is
# not re-imported here so that all dependencies stay in one place.

# Remote PDFs to ingest. Only the URL is passed along, not the file contents.
pdf_urls = [
    "https://arxiv.org/pdf/2304.08641.pdf",
    "https://www.cs.cmu.edu/~tom/mlbook/NBayesLogReg.pdf",
]

# One RAGDocument per URL: `content` carries the URL itself, and the metadata
# tags each document as "arxiv" or "cmu" depending on the URL.
documents = [
    RAGDocument(
        document_id=f"pdf-{i}",
        content=url,
        mime_type="application/pdf",
        metadata={"source": "arxiv" if "arxiv" in url else "cmu"},
    )
    for i, url in enumerate(pdf_urls)
]


In [24]:
# Each entry pairs a document URL with the MIME type it should be ingested as.
urls = [
    ("https://www.openshift.guide/openshift-guide-screen.pdf", "application/pdf"),
]

# Replaces any earlier `documents` value: one RAGDocument per (url, mime type)
# pair, identified as "num-<position in the list>", with empty metadata.
documents = [
    RAGDocument(
        document_id=f"num-{position}",
        content=link,
        mime_type=link_mime_type,
        metadata={},
    )
    for position, (link, link_mime_type) in enumerate(urls)
]



In [25]:
# Display the document list that will be ingested.
documents

[{'document_id': 'num-0',
  'content': 'https://www.openshift.guide/openshift-guide-screen.pdf',
  'mime_type': 'application/pdf',
  'metadata': {}}]

In [27]:
# Identifier under which the vector database is registered and later queried.
vector_db_id = "demoInnovate-vectordb"

# 384 matches the embedding_dimension the server reported for all-MiniLM-L6-v2
# in the model listing earlier in this notebook.
client.vector_dbs.register(
    embedding_dimension=384,
    embedding_model="all-MiniLM-L6-v2",
    vector_db_id=vector_db_id,
)

INFO:llama_stack_client._base_client:Retrying request to /v1/vector-dbs in 0.453451 seconds
INFO:llama_stack_client._base_client:Retrying request to /v1/vector-dbs in 0.781721 seconds


APIConnectionError: Connection error.

Ingesting documents into a vector database:

In [20]:
# Pretty-print the documents right before they are sent for ingestion.
rich.print(documents)

In [22]:
# Ingest the documents into the registered vector database. The chunk size is
# taken from the notebook-level `chunk_size_in_tokens` setting instead of a
# repeated literal, so changing the setting actually changes ingestion.
client.tool_runtime.rag_tool.insert(
    documents=documents,
    vector_db_id=vector_db_id,
    chunk_size_in_tokens=chunk_size_in_tokens,
)

INFO:httpx:HTTP Request: POST http://localhost:8321/v1/tool-runtime/rag-tool/insert "HTTP/1.1 500 Internal Server Error"
INFO:llama_stack_client._base_client:Retrying request to /v1/tool-runtime/rag-tool/insert in 0.426489 seconds
INFO:httpx:HTTP Request: POST http://localhost:8321/v1/tool-runtime/rag-tool/insert "HTTP/1.1 500 Internal Server Error"
INFO:llama_stack_client._base_client:Retrying request to /v1/tool-runtime/rag-tool/insert in 0.761061 seconds
INFO:httpx:HTTP Request: POST http://localhost:8321/v1/tool-runtime/rag-tool/insert "HTTP/1.1 500 Internal Server Error"


InternalServerError: Error code: 500 - {'detail': 'Internal server error: An unexpected error occurred.'}

In [8]:
# Question used for both retrieval queries and for the final model call.
prompt = (
    "What are the top 5 topics that were explained? Only list succinct bullet points."
)

In [9]:
# The RAG tool is the higher-level interface: it returns packaged results and
# accepts a list of vector databases, so one query can span several of them.
tool_response = client.tool_runtime.rag_tool.query(
    vector_db_ids=[vector_db_id],
    content=prompt,
)
rich.print(tool_response)

INFO:httpx:HTTP Request: POST http://localhost:8321/v1/tool-runtime/rag-tool/query "HTTP/1.1 200 OK"


In [10]:
# Lower-level alternative: send the same question straight to the vector database.
db_response = client.vector_io.query(query=prompt, vector_db_id=vector_db_id)
rich.print(db_response)

INFO:httpx:HTTP Request: POST http://localhost:8321/v1/vector-io/query "HTTP/1.1 200 OK"


In [11]:
# Build the context from the chunks of the direct vector-db query, one chunk
# per line. Alternative kept from the original notebook:
# prompt_context = tool_response.content
prompt_context = "\n".join(chunk.content for chunk in db_response.chunks)

In [12]:
# User turn: the question followed by the retrieved context it should be
# answered from.
extended_prompt = f"""
Please answer the given query using the context below.

QUERY:
{prompt}

CONTEXT:
{prompt_context}
"""

# Conversation sent to the model: a fixed system message, then the augmented
# user prompt.
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": extended_prompt},
]
rich.print(messages)

In [13]:
# Run the augmented conversation through the LLM. `timeout=600` is passed to
# the request (presumably seconds; confirm against the client documentation).
response = client.inference.chat_completion(
    model_id="llama3.2:3b-instruct-fp16",
    messages=messages,
    timeout=600,
)
rich.print(response)

INFO:httpx:HTTP Request: POST http://localhost:8321/v1/inference/chat-completion "HTTP/1.1 200 OK"


In [14]:
# Left as an extra exercise for the reader
from llama_stack_client import Agent, AgentEventLogger

# The agent is given the built-in knowledge-search tool, restricted to the
# vector database registered earlier in this notebook.
rag_agent = Agent(
    client,
    model="llama3.2:3b-instruct-fp16",
    instructions="You are a helpful assistant",
    tools=[
        {
            "name": "builtin::rag/knowledge_search",
            "args": {"vector_db_ids": [vector_db_id]},
        },
    ],
)

INFO:httpx:HTTP Request: POST http://localhost:8321/v1/agents "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET http://localhost:8321/v1/tools?toolgroup_id=builtin%3A%3Arag%2Fknowledge_search "HTTP/1.1 200 OK"


In [15]:
import uuid
from llama_stack_client.lib.agents.event_logger import EventLogger

# Questions to put to the agent, one turn each, all within a single session.
user_prompts = [
    "What is Lora?",
]

# The random UUID gives every run of this cell a distinct session name.
session_id = rag_agent.create_session(f"rag session-{uuid.uuid4()}")

# The loop variable is deliberately not called `prompt`: that name already
# holds the question used by the retrieval and inference cells, and
# overwriting it here would silently change what those cells ask on re-run.
for user_prompt in user_prompts:
    rich.print(f"User> {user_prompt}")
    response = rag_agent.create_turn(
        messages=[{"role": "user", "content": user_prompt}],
        session_id=session_id,
    )
    # Print each event the logger yields for this turn's response.
    for log in EventLogger().log(response):
        log.print()

INFO:httpx:HTTP Request: POST http://localhost:8321/v1/agents/326578e4-7e72-4068-bbd5-1804814f17a3/session "HTTP/1.1 200 OK"


INFO:httpx:HTTP Request: POST http://localhost:8321/v1/agents/326578e4-7e72-4068-bbd5-1804814f17a3/session/f02e6687-c49b-43cc-b489-444f22205294/turn "HTTP/1.1 200 OK"


[33minference> [0m[33m[k[0m[33mnowledge[0m[33m_search[0m[33m(query[0m[33m="[0m[33mL[0m[33mora[0m[33m")][0m[97m[0m
[32mtool_execution> Tool:knowledge_search Args:{'query': 'Lora'}[0m
[32mtool_execution> Tool:knowledge_search Response:[TextContentItem(text='knowledge_search tool found 5 chunks:\nBEGIN of knowledge_search tool results.\n', type='text'), TextContentItem(text="Result 1\nContent: .\n\n*Sounds great! How do I use it?*\n\nYou can finetune using any of our recipes with the ``lora_`` prefix, e.g. :ref:`lora_finetune_single_device<lora_finetune_recipe_label>`. These recipes utilize\nLoRA-enabled model builders, which we support for all our models, and also use the ``lora_`` prefix, e.g.\nthe :func:`torchtune.models.llama3.llama3` model has a corresponding :func:`torchtune.models.llama3.lora_llama3`.\nWe aim to provide a comprehensive set of configurations to allow you to get started with training with LoRA quickly,\njust specify any config with ``_lora`` in