# Agentic RAG

### Download Phi-3-medium

In [1]:
import huggingface_hub as hf_hub
from pathlib import Path

# Hugging Face Hub repository to fetch (the name suggests an INT4 OpenVINO
# export of Phi-3-medium — confirm on the model card).
llm_model_id = "OpenVINO/Phi-3-medium-4k-instruct-int4-ov"
# Local directory the snapshot is written to; the LLM is loaded from this path later.
llm_model_path = "Phi-3-medium-4k-instruct-int4-ov"

# Skip the download when the target directory is already present.
# NOTE(review): only existence is checked, so a directory left behind by an
# interrupted download is treated as complete — delete it to force a re-fetch.
if not Path(llm_model_path).exists():
    hf_hub.snapshot_download(llm_model_id, local_dir=llm_model_path)

Fetching 16 files:   0%|          | 0/16 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/172 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/921 [00:00<?, ?B/s]

configuration_phi3.py:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

openvino_detokenizer.xml:   0%|          | 0.00/2.61k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.68k [00:00<?, ?B/s]

openvino_detokenizer.bin:   0%|          | 0.00/500k [00:00<?, ?B/s]

openvino_model.xml:   0%|          | 0.00/4.65M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

openvino_tokenizer.xml:   0%|          | 0.00/6.43k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/3.18k [00:00<?, ?B/s]

openvino_model.bin:   0%|          | 0.00/7.41G [00:00<?, ?B/s]

openvino_tokenizer.bin:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

### Initialize LLM

In [2]:
from llama_index.llms.openvino import OpenVINOLLM

# OpenVINO runtime properties; handed to the model via `model_kwargs` when the
# LLM is constructed. All values are strings.
ov_config = dict(
    PERFORMANCE_HINT="LATENCY",
    NUM_STREAMS="1",
    CACHE_DIR="",
)

def messages_to_prompt(messages):
    """Render a sequence of chat messages as one tagged prompt string.

    Each message whose ``role`` equals "system", "user" or "assistant" is
    rendered as ``<|role|>content<|end|>``; messages with any other role are
    skipped. A blank system turn is prepended when the rendered text does not
    already start with one, and an open ``<|assistant|>`` tag followed by a
    newline is appended so the model continues as the assistant.
    """
    known_roles = ("system", "user", "assistant")

    segments = []
    for msg in messages:
        # Match with == rather than a dict/set lookup so that role objects
        # which merely compare equal to these strings are still recognised.
        tag = next((role for role in known_roles if msg.role == role), None)
        if tag is not None:
            segments.append(f"<|{tag}|>{msg.content}<|end|>")
    rendered = "".join(segments)

    # The prompt must open with a system turn; insert an empty one if needed.
    if not rendered.startswith("<|system|>"):
        rendered = "<|system|><|end|>" + rendered

    # Leave the assistant turn open for generation.
    return rendered + "<|assistant|>\n"

def completion_to_prompt(completion):
    """Wrap a bare completion request as a blank system turn, one user turn and an open assistant turn."""
    template = "<|system|><|end|><|user|>{}<|end|><|assistant|>\n"
    return template.format(completion)

# Build the LLM wrapper around the locally downloaded model directory,
# using the two prompt-formatting helpers defined in this cell.
ov_llm = OpenVINOLLM(
    model_id_or_path=llm_model_path,
    # NOTE(review): the model id says "4k" — confirm that context_window
    # together with max_new_tokens stays within the model's real limit.
    context_window=3900,
    max_new_tokens=1024,
    model_kwargs={"ov_config": ov_config},
    # NOTE(review): sampling parameters; Hugging Face-style generate() usually
    # honours these only when do_sample=True — confirm they take effect here.
    generate_kwargs={"temperature": 0.7, "top_k": 50, "top_p": 0.95},
    messages_to_prompt=messages_to_prompt,
    completion_to_prompt=completion_to_prompt,
    # The recorded cell output reports "Compiling the model to GPU ...".
    device_map="gpu",
)


Compiling the model to GPU ...


In [3]:
from transformers import StoppingCriteria, StoppingCriteriaList
import torch

class StopOnTokens(StoppingCriteria):
    """Stopping criterion that fires when the newest token is one of ``token_ids``."""

    def __init__(self, token_ids):
        # Token ids that should terminate generation.
        self.token_ids = token_ids

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        """Return True when the last token of the first sequence equals any stop id.

        Only row 0 of ``input_ids`` is inspected; ``scores`` and ``kwargs`` are unused.
        """
        return any(input_ids[0][-1] == stop_id for stop_id in self.token_ids)

# `stop_tokens` is rebound three times below:
# token strings -> token ids -> list holding one StopOnTokens criterion.
stop_tokens = ["<|endoftext|>"]
stop_tokens = ov_llm._tokenizer.convert_tokens_to_ids(stop_tokens)
stop_tokens = [StopOnTokens(stop_tokens)]
# NOTE(review): this reads and writes underscore-prefixed (private) attributes of
# the LLM wrapper, so it may break if the library renames them.
ov_llm._stopping_criteria = StoppingCriteriaList(stop_tokens)

### Export and initialize the embedding model

In [4]:
from llama_index.embeddings.huggingface_openvino import OpenVINOEmbedding

# Local directory holding the OpenVINO-converted embedding model.
embedding_model_path = "bge-small-en-v1.5-ov"

# Export step: without it a fresh environment has no converted model to load.
# Only run the conversion when the directory is absent so re-runs stay cheap.
# NOTE(review): the source checkpoint "BAAI/bge-small-en-v1.5" is inferred from
# the directory name — confirm it is the intended model.
if not Path(embedding_model_path).exists():
    OpenVINOEmbedding.create_and_save_openvino_model("BAAI/bge-small-en-v1.5", embedding_model_path)

# Load the converted model; the recorded output reports compilation for CPU.
ov_embedding = OpenVINOEmbedding(model_id_or_path=embedding_model_path, device="CPU")

Compiling the model to CPU ...


### Build the vector search tool

In [6]:
from llama_index.core.tools import QueryEngineTool, ToolMetadata
from llama_index.readers.file import PyMuPDFReader
from llama_index.core import VectorStoreIndex, Settings
from llama_index.core.node_parser import SentenceSplitter

# PDF used as the knowledge source. NOTE(review): no visible cell downloads it,
# so it presumably has to exist next to the notebook already — confirm.
text_example_en_path = Path("text_example_en.pdf")
# Register the embedding model and LLM on the global Settings object before the
# index is built, so the index and query engine do not need them passed explicitly.
Settings.embed_model = ov_embedding
Settings.llm = ov_llm
loader = PyMuPDFReader()
documents = loader.load(file_path=text_example_en_path)
# Build the vector index, splitting the documents with a SentenceSplitter
# configured for chunk_size=200 and chunk_overlap=40.
index = VectorStoreIndex.from_documents(
    documents,
    transformations=[SentenceSplitter(chunk_size=200, chunk_overlap=40)],
)

# Expose the index's query engine as a tool named "vector_search"; the
# description string is the text that tells the agent when to use it.
vector_tool = QueryEngineTool(
    index.as_query_engine(streaming=True),
    metadata=ToolMetadata(
        name="vector_search",
        description="Useful for searching for basic facts about Intel Xeon 6 processors",
    ),
)

### Build the calculator tools

In [7]:
from llama_index.core.agent import ReActAgent
from llama_index.core.tools import FunctionTool


def multiply(a: float, b: float) -> float:
    """Multiply two numbers and returns the product"""
    # Docstring kept verbatim: FunctionTool.from_defaults is expected to reuse it
    # as the tool description shown to the agent (confirm against LlamaIndex docs).
    product = a * b
    return product


# Wrap `multiply` as a tool object so it can be passed to the agent.
multiply_tool = FunctionTool.from_defaults(fn=multiply)


def divide(a: float, b: float) -> float:
    """Divide a by b and return the quotient"""
    # The docstring has to describe division accurately: it is expected to be
    # the description the agent reads (via FunctionTool) when picking a tool,
    # so a wrong description misleads tool selection.
    # b == 0 raises ZeroDivisionError (Python's default); it is not caught here.
    return a / b


# Wrap `divide` as a tool object so it can be passed to the agent.
divide_tool = FunctionTool.from_defaults(fn=divide)

### Create an Agent

In [23]:
# ReAct agent given the two arithmetic tools plus the retrieval tool; with
# verbose=True the recorded output shows each Thought / Action / Observation step.
agent = ReActAgent.from_tools([multiply_tool, divide_tool, vector_tool], llm=ov_llm, verbose=True)

In [32]:
# Reset the agent before starting a new conversation
# (presumably clears prior chat state — confirm against the ReActAgent docs).
agent.reset()

In [33]:
# First turn: a factual question about the indexed document; the recorded run
# answers it through the `vector_search` tool.
response = agent.chat("What's the maximum number of cores in an Intel Xeon 6 processor server with 4 sockets ?")

> Running step 05df0d98-ca32-4ac3-9bcb-bf5b1d1847f8. Step input: What's the maximum number of cores in an Intel Xeon 6 processor server with 4 sockets ?
[1;3;38;5;200mThought: The current language of the user is English. I need to use a tool to help me answer the question.
Action: vector_quality
Action Input: {'input': 'Intel Xeon 6 processor server with 4 sockets maximum cores'}
[0m[1;3;34mObservation: Error: No such tool named `vector_quality`.
[0m> Running step 3bf8025d-1bbe-4ca8-843c-00e7babf819c. Step input: None
[1;3;38;5;200mThought: The tool `vector_quality` does not exist. I will use the `vector_search` tool to find information about Intel Xeon 6 processors.
Action: vector_search
Action Input: {'input': 'Intel Xeon 6 processor maximum cores with 4 sockets'}
[0m[1;3;34mObservation: Based on the provided context information, the Intel Xeebon 6 processor with Efficient-cores can have up to 144 cores per socket. However, the information does not specify the maximum number o

In [34]:
# Follow-up turn: the question refers back to the 4 sockets of the previous turn,
# so it relies on the agent retaining chat history (the recorded run multiplies 4 by 1.35).
response = agent.chat("How about increase total number of sockets by 35% ? Go step by step, using a tool to do any math.")

> Running step 276b5e9e-4725-4a3a-abf7-f1ac0a0c4fdb. Step input: How about increase total number of sockets by 35% ? Go step by step, using a tool to do any math.
[1;3;38;5;200mThought: I need to use the multiply tool to calculate the new total number of sockets.
Action: multiply
Action Input: {'a': 4, 'b': 1.35}
[0m[1;3;34mObservation: 5.4
[0m> Running step a2881a50-1ab2-43a9-b265-013d835187bc. Step input: None
[1;3;34mObservation: Error: Could not parse output. Please follow the thought-action-input format. Try again.
[0m> Running step ac386f4b-04a6-4acb-aa19-7d5fa5217468. Step input: None
[1;3;38;5;200mThought: The output from the previous action is not in a valid format. I will round the result to the nearest whole number since the number of sockets must be a whole number. Then, I will use the multiply tool again to calculate the new total number of cores.
Action: multiply
Action Input: {'a': 144, 'b': 5}
[0m[1;3;34mObservation: 720
[0m> Running step 71280572-594d-4002-a6