# Agentic RAG

### Download Phi-3-medium

In [None]:
import huggingface_hub as hf_hub
from pathlib import Path

# Hugging Face Hub repository id of the model and the local directory it is
# downloaded into.
llm_model_id = "OpenVINO/Phi-3-medium-4k-instruct-int4-ov"
llm_model_path = "Phi-3-medium-4k-instruct-int4-ov"

# Download only when the local directory does not exist yet, so re-running
# this cell does not trigger another download.
# NOTE(review): a directory that exists but is incomplete (e.g. after an
# interrupted download) also skips the download.
if not Path(llm_model_path).exists():
    hf_hub.snapshot_download(llm_model_id, local_dir=llm_model_path)

### Initialize LLM

In [None]:
from llama_index.llms.openvino import OpenVINOLLM

# OpenVINO runtime properties; passed to the LLM via `model_kwargs` below.
# NOTE(review): presumably tunes for single-request latency, and the empty
# CACHE_DIR presumably disables the compiled-model cache - confirm against the
# OpenVINO property documentation.
ov_config = {
    "PERFORMANCE_HINT": "LATENCY",
    "NUM_STREAMS": "1",
    "CACHE_DIR": "",
}

def messages_to_prompt(messages):
    """Render chat messages as a single tagged prompt string.

    Each message whose ``role`` equals "system", "user" or "assistant" is
    emitted as ``<|role|>content<|end|>``; messages with any other role are
    skipped. An empty system segment is prepended when the result does not
    already begin with one, and a trailing ``<|assistant|>`` tag cues the
    model to generate its reply.
    """
    # Roles are matched with ``==`` (not a dict lookup) so that role objects
    # which merely compare equal to these strings are still recognised.
    known_roles = ("system", "user", "assistant")

    segments = []
    for msg in messages:
        for role in known_roles:
            if msg.role == role:
                segments.append(f"<|{role}|>{msg.content}<|end|>")
                break

    rendered = "".join(segments)

    # Guarantee the prompt opens with a system segment, even if it is blank.
    if not rendered.startswith("<|system|>"):
        rendered = "<|system|><|end|>" + rendered

    # Open the assistant turn for generation.
    return rendered + "<|assistant|>\n"

def completion_to_prompt(completion):
    """Wrap a bare completion request in the chat prompt format.

    The text is placed in a user turn, preceded by an empty system segment
    and followed by an opening assistant tag.
    """
    header = "<|system|><|end|><|user|>"
    footer = "<|end|><|assistant|>\n"
    return header + f"{completion}" + footer

# Build the LLM from the locally downloaded model directory, wiring in the
# OpenVINO runtime config, the sampling settings and the two prompt formatters
# defined above.
# NOTE(review): context_window=3900 presumably leaves headroom below the
# model's 4k context - confirm.
ov_llm = OpenVINOLLM(
    model_id_or_path=llm_model_path,
    context_window=3900,
    max_new_tokens=1024,
    model_kwargs={"ov_config": ov_config},
    generate_kwargs={"temperature": 0.7, "top_k": 50, "top_p": 0.95},
    messages_to_prompt=messages_to_prompt,
    completion_to_prompt=completion_to_prompt,
    device_map="cpu",
)

In [None]:
from transformers import StoppingCriteria, StoppingCriteriaList
import torch

class StopOnTokens(StoppingCriteria):
    """Stopping criterion that fires when the latest token is a stop token.

    Only the first sequence (``input_ids[0]``) is inspected.
    """

    def __init__(self, token_ids):
        # Token ids that should end generation.
        self.token_ids = token_ids

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        """Return True if the last token of the first sequence is a stop id."""
        return any(input_ids[0][-1] == candidate for candidate in self.token_ids)

# `stop_tokens` is rebound step by step: token strings -> token ids (via the
# LLM's tokenizer) -> a one-element list holding the StopOnTokens criterion.
# NOTE(review): `_tokenizer` and `_stopping_criteria` are private attributes
# of the LLM wrapper; this may break if the library changes its internals.
stop_tokens = ["<|endoftext|>"]
stop_tokens = ov_llm._tokenizer.convert_tokens_to_ids(stop_tokens)
stop_tokens = [StopOnTokens(stop_tokens)]
ov_llm._stopping_criteria = StoppingCriteriaList(stop_tokens)

### Export and initialize Embedding model

In [None]:
from llama_index.embeddings.huggingface_openvino import OpenVINOEmbedding

# Load the embedding model from a local directory.
# NOTE(review): no export/download step for this directory is visible here;
# presumably "bge-small-en-v1.5-ov" must already exist on disk - confirm.
embedding_model_path = "bge-small-en-v1.5-ov"
ov_embedding = OpenVINOEmbedding(model_id_or_path=embedding_model_path, device="CPU")

### Build the vector search tool

In [None]:
from llama_index.core.tools import QueryEngineTool, ToolMetadata
from llama_index.readers.file import PyMuPDFReader
from llama_index.core import VectorStoreIndex, Settings

# Register the embedding model and LLM as the global llama_index defaults.
Settings.embed_model = ov_embedding
Settings.llm = ov_llm

# Load the source PDF and build a vector index over its documents.
# NOTE(review): `text_example_en_path` is not defined in the visible part of
# this notebook; it must be set (path to the PDF) before this cell runs.
loader = PyMuPDFReader()
documents = loader.load(file_path=text_example_en_path)
index = VectorStoreIndex.from_documents(documents)

# Expose the index's query engine as an agent tool; the name and description
# are the metadata attached to the tool.
vector_tool = QueryEngineTool(
    index.as_query_engine(streaming=True),
    metadata=ToolMetadata(
        name="vector_search",
        description="Useful for searching for basic facts about Intel Xeon 6 processors",
    ),
)

### Build the calculator tools

In [None]:
from llama_index.core.agent import ReActAgent
from llama_index.core.tools import FunctionTool


def multiply(a: float, b: float) -> float:
    """Multiply two numbers and returns the product"""
    # The docstring above is kept verbatim: it is picked up as the tool
    # description when this function is wrapped by FunctionTool.from_defaults.
    product = a * b
    return product


# Wrap `multiply` as an agent tool.
multiply_tool = FunctionTool.from_defaults(fn=multiply)


def divide(a: float, b: float) -> float:
    """Divide the first number by the second and returns the quotient"""
    # The docstring is picked up as the tool description by
    # FunctionTool.from_defaults, so it must describe division (it previously
    # claimed the function added two numbers). A zero divisor raises
    # ZeroDivisionError, as before.
    return a / b


# Wrap `divide` as an agent tool.
divide_tool = FunctionTool.from_defaults(fn=divide)

### Create an Agent

In [None]:
# Create a ReAct agent that can call the two calculator tools and the vector
# search tool; verbose=True is what produces the step-by-step trace shown in
# the cell outputs.
agent = ReActAgent.from_tools([multiply_tool, divide_tool, vector_tool], llm=ov_llm, verbose=True)

In [None]:
# Reset the agent before starting a new conversation.
# NOTE(review): presumably clears the accumulated chat history - confirm.
agent.reset()

In [None]:
# First question: needs a document lookup plus arithmetic.
response = agent.chat("What's the maximum number of cores in an Intel Xeon 6 processor server with 4 sockets ?")

> Running step 84b4d6ed-32d8-41f8-843a-ef8370bead54. Step input: What's the maximum number of cores in an Intel Xeon 6 processor server with 4 sockets ?
[1;3;34mObservation: Error: Could not parse output. Please follow the thought-action-input format. Try again.
[0m> Running step 61799bc6-f9d2-4d2b-b99a-68cc6182ade2. Step input: None
[1;3;38;5;200mThought: The current language of the user is English. I need to use a tool to help me answer the question.
Action: vector_search
Action Input: {'input': 'maximum number of cores in an Intel Xeon 6 processor server with 4 sockets'}
[0m[1;3;34mObservation: Up to 144 cores per socket in 1- or 2-socket configurations.

To calculate the maximum number of cores in a server with 4 sockets using Intel Xeon 6 processors with E-cores, we can multiply the number of cores per socket by the number of sockets:

144 cores/socket * 4 sockets = 576 cores

Therefore, the maximum number of cores in an Intel Xeon 6 processor server with 4 sockets is 576 cor

In [None]:
# Follow-up question that only makes sense in the context of the previous turn.
# NOTE(review): presumably relies on the agent retaining chat history - confirm.
response = agent.chat("How about increase total number of sockets by 14% ?")

> Running step 60fea1b1-8682-483a-bad4-e4626480d56a. Step input: How about increase total number of sockets by 14% ?
[1;3;38;5;200mThought: (Implicit) I can answer without any more tools!
Answer: To increase the total number of sockets by 14%, we first need to calculate the new number of sockets.

New number of sockets = 4 sockets + (14% of 4 sockets)
New number of sockets = 4 + (0.14 * 4)
New number of sockets = 4 + 0.56
New number of sockets = 4.56

Since we cannot have a fraction of a socket, we will round up to the nearest whole number.

New number of sockets = 5

Now, we need to find the maximum number of cores for the Intel Xeon 6 processor with 5 sockets.

The Intel Xeon 6 processor has 12 cores per socket.

Maximum number of cores = 12 cores/socket * 5 sockets
Maximum number of cores = 60 cores

So, with an increase of 14% in the total number of sockets, the Intel Xeon 6 processor server will have a maximum of 60 cores.
[0m