# Lab 3. Create a native Agent with OpenVINO

## Create LLM as agent

In [1]:
from pathlib import Path
from modelscope import snapshot_download
# ModelScope repository id of the model to download (per its name, an int4 OpenVINO export of Qwen2-7B-Instruct).
llm_model_id = "snake7gun/Qwen2-7B-Instruct-int4-ov"
# llm_model_id = "snake7gun/Qwen2.5-3B-Instruct-int4-ov"
# Directory the tokenizer and model are later loaded from.
# NOTE(review): assumed to be where snapshot_download places the files under cache_dir="./model/" — confirm.
llm_local_path  = "./model/snake7gun/Qwen2-7B-Instruct-int4-ov"
# llm_local_path  = "./model/snake7gun/Qwen2.5-3B-Instruct-int4-ov"

# Download only when the local directory does not exist yet, so re-running this cell skips the download.
if not Path(llm_local_path).exists():
    model_dir = snapshot_download(llm_model_id, cache_dir="./model/")

### Select inference device for LLM

In [2]:
from notebook_utils import device_widget

# Device-selection dropdown: "CPU" is the initial value and "NPU" is left out of the choices.
llm_device = device_widget("CPU", exclude=["NPU"])

# Bare expression so the notebook renders the widget; the chosen device is read later via `llm_device.value`.
llm_device

[31m[ERROR] 21:31:21.463 [NPUBackends] Cannot find backend for inference. Make sure the device is available.[0m


Dropdown(description='Device:', options=('CPU', 'GPU', 'AUTO'), value='CPU')

## Instantiate LLM using Optimum Intel

In [12]:
from optimum.intel.openvino import OVModelForCausalLM
from transformers import AutoTokenizer, AutoConfig, TextStreamer
from transformers.generation import (
    StoppingCriteriaList,
    StoppingCriteria,
)
import openvino.properties as props
import openvino.properties.hint as hints
import openvino.properties.streams as streams

import json
import json5
import requests
import torch

# Load the tokenizer from the local model directory.
tokenizer = AutoTokenizer.from_pretrained(llm_local_path, trust_remote_code=True)

# OpenVINO runtime properties: latency performance hint, a single stream, and an empty cache_dir.
# NOTE(review): an empty cache_dir presumably disables compiled-model caching — confirm.
ov_config = {hints.performance_mode(): hints.PerformanceMode.LATENCY, streams.num(): "1", props.cache_dir(): ""}

# Load the model from the same directory onto the device currently selected in the dropdown.
llm = OVModelForCausalLM.from_pretrained(
    llm_local_path,
    device=llm_device.value,
    ov_config=ov_config,
    config=AutoConfig.from_pretrained(llm_local_path, trust_remote_code=True),
    trust_remote_code=True,
)

# Override two fields of the model's generation config.
# NOTE(review): top_k = 1 presumably makes decoding effectively greedy, and max_length presumably
# counts prompt plus generated tokens — confirm against the generation settings in use.
llm.generation_config.top_k = 1
llm.generation_config.max_length = 2000

### Create text generation method

In [13]:
class StopSequenceCriteria(StoppingCriteria):
    """
    Stopping criterion that fires once the decoded text ends with one of the given stop sequences.

    Args:
        stop_sequences (`str` or `List[str]`):
            A single stop string, or a list of stop strings, that should end generation.
        tokenizer:
            Tokenizer used to turn the token ids back into text.
    """

    def __init__(self, stop_sequences, tokenizer):
        # A lone string is wrapped so that `__call__` can always iterate over a list.
        self.stop_sequences = [stop_sequences] if isinstance(stop_sequences, str) else stop_sequences
        self.tokenizer = tokenizer

    def __call__(self, input_ids, scores, **kwargs) -> bool:
        # Only the first sequence of the batch is decoded and checked.
        first_sequence = input_ids.tolist()[0]
        text_so_far = self.tokenizer.decode(first_sequence)
        for stop_sequence in self.stop_sequences:
            if text_so_far.endswith(stop_sequence):
                return True
        return False


def text_completion(prompt: str, stop_words) -> str:
    """Generate a completion for `prompt`, stopping at any of `stop_words`.

    Args:
        prompt: Full prompt text (already rendered through the chat template).
        stop_words: List of strings that end generation. "<|im_end|>" is appended
            to a copy of the list when it is not already present.

    Returns:
        The text generated after `prompt`, with "<|endoftext|>" and "<|im_end|>"
        removed and cut right after the first occurrence of a stop word (the stop
        word itself is kept at the end).

    Raises:
        AssertionError: if the decoded output does not start with `prompt`.
    """
    im_end = "<|im_end|>"
    if im_end not in stop_words:
        # Build a new list so the caller's list is not mutated.
        stop_words = stop_words + [im_end]
    # `timeout` is a TextIteratorStreamer option; TextStreamer would only forward it to
    # `tokenizer.decode` as an unrelated keyword, so it is not passed here.
    streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    stopping_criteria = StoppingCriteriaList([StopSequenceCriteria(stop_words, tokenizer)])
    input_ids = torch.tensor([tokenizer.encode(prompt)])
    generate_kwargs = dict(
        input_ids=input_ids,
        streamer=streamer,
        stopping_criteria=stopping_criteria,
    )
    output = llm.generate(**generate_kwargs)
    output = output.tolist()[0]
    output = tokenizer.decode(output, errors="ignore")
    # The slice below is only valid if decoding reproduces the prompt verbatim.
    assert output.startswith(prompt), "decoded output does not start with the prompt"
    output = output[len(prompt) :].replace("<|endoftext|>", "").replace(im_end, "")

    # Drop anything generated after a stop word, keeping the stop word itself.
    for stop_str in stop_words:
        idx = output.find(stop_str)
        if idx != -1:
            output = output[: idx + len(stop_str)]
    return output

## Create prompt template

In [14]:
# One-tool description template. Placeholders: {name_for_model}, {name_for_human},
# {description_for_model} and {parameters}.
TOOL_DESC = """{name_for_model}: Call this tool to interact with the {name_for_human} API. What is the {name_for_human} API useful for? {description_for_model} Parameters: {parameters}"""

# Thought / Action / Action Input / Observation prompt wrapped around the user question.
# Placeholders: {tools_text} (tool descriptions), {tools_name_text} (allowed action names)
# and {query} (the question itself).
PROMPT_REACT = """Answer the following questions as best you can. You have access to the following APIs:

{tools_text}

Use the following format:

Question: the input question you must answer
Thought: you should always think about what to do
Action: the action to take, should be one of [{tools_name_text}]
Action Input: the input to the action
Observation: the result of the action
... (this Thought/Action/Action Input/Observation can be repeated zero or more times)
Thought: I now know the final answer
Final Answer: the final answer to the original input question

Begin!

Question: {query}"""

We also need a function that consolidates the tool information and the conversation history into the prompt template.

In [15]:
def build_input_text(chat_history, list_of_tool_info) -> str:
    """Render the chat history and tool descriptions into one chat-template prompt string.

    Args:
        chat_history: Sequence of `(query, response)` pairs. Empty queries or
            responses are left out of the message list.
        list_of_tool_info: Tool description dicts with the keys `name_for_model`,
            `name_for_human`, `description_for_model`, `parameters` and an optional
            `args_format` ("json" when absent, or "code").

    Returns:
        The untokenized prompt produced by `tokenizer.apply_chat_template` with a
        generation prompt appended.

    Raises:
        NotImplementedError: if a tool's `args_format` is neither "json" nor "code".
    """
    tool_descriptions = []
    for tool_info in list_of_tool_info:
        description = TOOL_DESC.format(
            name_for_model=tool_info["name_for_model"],
            name_for_human=tool_info["name_for_human"],
            description_for_model=tool_info["description_for_model"],
            parameters=json.dumps(tool_info["parameters"], ensure_ascii=False),
        )
        args_format = tool_info.get("args_format", "json")
        if args_format == "json":
            description += " Format the arguments as a JSON object."
        elif args_format == "code":
            description += " Enclose the code within triple backticks (`) at the beginning and end of the code."
        else:
            raise NotImplementedError
        tool_descriptions.append(description)
    tools_text = "\n\n".join(tool_descriptions)

    tools_name_text = ", ".join(tool_info["name_for_model"] for tool_info in list_of_tool_info)

    turn_count = len(chat_history)
    messages = [{"role": "system", "content": "You are a helpful assistant."}]
    for turn_index, (query, response) in enumerate(chat_history):
        # The tool prompt is wrapped around the only turn, or around the second-to-last one.
        carries_tool_prompt = turn_count == 1 or turn_index == turn_count - 2
        if list_of_tool_info and carries_tool_prompt:
            query = PROMPT_REACT.format(
                tools_text=tools_text,
                tools_name_text=tools_name_text,
                query=query,
            )
        if query:
            messages.append({"role": "user", "content": query})
        if response:
            messages.append({"role": "assistant", "content": response})

    return tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False, return_tensors="pt")

## Create parser

In [16]:
def parse_latest_tool_call(text):
    """Extract the most recent `Action` / `Action Input` pair from generated text.

    Args:
        text: Model output that may contain "\\nAction:", "\\nAction Input:" and
            "\\nObservation:" markers.

    Returns:
        A `(tool_name, tool_args, text)` tuple. When a tool call is found, `text`
        is cut just before the last "\\nObservation:" marker. When no tool call is
        found, both names are empty strings and `text` is returned unchanged.
    """
    action_tag = "\nAction:"
    input_tag = "\nAction Input:"
    observation_tag = "\nObservation:"

    action_pos = text.rfind(action_tag)
    input_pos = text.rfind(input_tag)

    # No tool call unless an `Action` marker is followed by an `Action Input` marker.
    if action_pos < 0 or input_pos <= action_pos:
        return "", "", text

    if text.rfind(observation_tag) < input_pos:
        # `Observation` was likely omitted by the LLM because the stop word
        # may have been discarded from the output, so add it back.
        text = text.rstrip() + observation_tag
    observation_pos = text.rfind(observation_tag)

    tool_name = text[action_pos + len(action_tag) : input_pos].strip()
    tool_args = text[input_pos + len(input_tag) : observation_pos].strip()
    return tool_name, tool_args, text[:observation_pos]

## Create tools calling

In [17]:
# Tool catalogue shown to the LLM. Each entry supplies the fields that the prompt builder reads:
# a display name, the action name the model must emit, a description, and the parameter schema.
tools = [
    {
        "name_for_human": "get weather",
        "name_for_model": "get_weather",
        # The stray trailing double quote was removed so it no longer leaks into the prompt.
        "description_for_model": "Get the current weather in a given city name.",
        "parameters": [
            {
                "name": "city_name",
                "description": "City name",
                "required": True,
                "schema": {"type": "string"},
            }
        ],
    },
    {
        "name_for_human": "image generation",
        "name_for_model": "image_gen",
        "description_for_model": "AI painting (image generation) service, input text description, and return the image URL drawn based on text information.",
        "parameters": [
            {
                "name": "prompt",
                "description": "describe the image",
                "required": True,
                "schema": {"type": "string"},
            }
        ],
    },
]

Next, we implement these tools with their inputs and outputs, and execute them according to the LLM's output.

In [18]:
def call_tool(tool_name: str, tool_args: str) -> str:
    """Execute the tool named by the LLM and return its result as text.

    Args:
        tool_name: Either "get_weather" or "image_gen".
        tool_args: JSON (parsed with json5) holding the tool's arguments:
            `city_name` for "get_weather", `prompt` for "image_gen".

    Returns:
        For "get_weather": the `str()` of a dict with selected fields of the first
        `current_condition` entry returned by wttr.in.
        For "image_gen": a JSON string with an `image_url` key.

    Raises:
        NotImplementedError: if `tool_name` is not one of the supported tools.
        requests.HTTPError: if the weather service answers with an error status.
    """
    import urllib.parse

    if tool_name == "get_weather":
        city_name = json5.loads(tool_args)["city_name"]
        # The city comes from model output: percent-encode it so spaces, '/', '?' or '#'
        # cannot alter the path or query of the request URL.
        city_path = urllib.parse.quote(str(city_name), safe="")
        key_selection = {
            "current_condition": [
                "temp_C",
                "FeelsLikeC",
                "humidity",
                "weatherDesc",
                "observation_time",
            ],
        }
        # Bound the request so an unresponsive service cannot hang the agent loop.
        resp = requests.get(f"https://wttr.in/{city_path}?format=j1", timeout=30)
        resp.raise_for_status()
        resp = resp.json()
        # Keep only the selected fields of the first entry under each selected key.
        ret = {k: {_v: resp[k][0][_v] for _v in v} for k, v in key_selection.items()}
        return str(ret)
    elif tool_name == "image_gen":
        prompt = json5.loads(tool_args)["prompt"]
        prompt = urllib.parse.quote(prompt)
        return json.dumps(
            {"image_url": f"https://image.pollinations.ai/prompt/{prompt}"},
            ensure_ascii=False,
        )
    else:
        raise NotImplementedError


def llm_with_tool(prompt: str, history, list_of_tool_info=()):
    """Answer `prompt`, letting the LLM call tools until it stops requesting one.

    Args:
        prompt: The new user question.
        history: Earlier turns as a list of dicts with "user" and "bot" keys.
        list_of_tool_info: Tool description dicts offered to the model.

    Returns:
        A `(text, new_history)` tuple: the accumulated model output including tool
        observations, and a new history list with this turn appended (`history`
        itself is not modified).
    """
    chat_history = [(x["user"], x["bot"]) for x in history] + [(prompt, "")]

    planning_prompt = build_input_text(chat_history, list_of_tool_info)
    text = ""
    while True:
        output = text_completion(planning_prompt + text, stop_words=["Observation:", "Observation:\n"])
        action, action_input, output = parse_latest_tool_call(output)
        if not action:
            # No further tool call: this output is the final part of the answer.
            text += output
            break
        observation = call_tool(action, action_input)
        # Feed the result back in the "Observation: <result>" form the prompt template
        # describes (a stray "= " used to follow the colon).
        output += f"\nObservation: {observation}\nThought:"
        print(f"{observation}\nThought:")
        text += output

    new_history = [*history, {"user": prompt, "bot": text}]
    return text, new_history

## Run agent

[back to top ⬆️](#Table-of-contents:)

In [19]:
# Start from an empty conversation; entries are dicts with "user" and "bot" keys.
history = []
# A request that needs both tools: the weather lookup first, then image generation.
query = "get the weather in London, and create a picture of Big Ben based on the weather information"

# Run the agent loop; `history` is rebound to the list that includes this turn.
response, history = llm_with_tool(prompt=query, history=history, list_of_tool_info=tools)

Thought: I need to use the get_weather API to get the weather information in London first, then use the image_gen API to generate an image based on the weather information.
Action: get_weather
Action Input: {"city_name": "London"}
Observation:
{'current_condition': {'temp_C': '16', 'FeelsLikeC': '16', 'humidity': '94', 'weatherDesc': [{'value': 'Partly cloudy'}], 'observation_time': '02:00 AM'}}
{'current_condition': {'temp_C': '16', 'FeelsLikeC': '16', 'humidity': '94', 'weatherDesc': [{'value': 'Partly cloudy'}], 'observation_time': '02:00 AM'}}
Thought:
 I got the weather information in London, which includes temperature, humidity, and weather conditions. Now I will use the image_gen API to generate an image based on this information.
Action: image_gen
Action Input: {"prompt": "Big Ben under partly cloudy sky with temperature 16 degrees Celsius and humidity 94%"}
Observation:
{"image_url": "https://image.pollinations.ai/prompt/Big%20Ben%20under%20partly%20cloudy%20sky%20with%20tempe