# Local ReAct Agent on Intel Lunar Lake
In this notebook we will show how to build a ReAct Agent on your Intel Lunar Lake laptop!

In [None]:
# Uncomment to install dependencies
# ! pip install langchain datasets pandas nltk sentence-transformers langchain-community faiss-cpu
# import nltk

# nltk.download('punkt')
# nltk.download('punkt_tab')

Build a local database to store local files.
The difference here from RAG is that we won't pass every user message to the retriever.
This Retriever will be used as a tool for the agent to access local files.

In [None]:
from datasets import load_dataset
from nltk.tokenize import sent_tokenize
from functools import reduce
import pandas as pd
import os

from langchain_community.embeddings import OpenVINOBgeEmbeddings
from langchain_core.documents.base import Document

# If there is a problem with import with onnx please install onnx version under 1.16.2:
# ! pip install onnx<1.16.2

def articles_to_passages(articles, sent_count_per_passage=3):
    """Split a list of articles into a flat list of passages.

    Every article is tokenized into sentences with `sent_tokenize`; consecutive
    sentences are then grouped `sent_count_per_passage` at a time and the
    sentences of a group are joined with a single space.

    Params:
      articles: iterable of article texts
      sent_count_per_passage: number of sentences per passage (at least 1)
    Returns:
      List of passage strings in article order. An article without any
      sentence contributes no passage.
    Raises:
      ValueError: if `sent_count_per_passage` is smaller than 1
    """
    if sent_count_per_passage < 1:
        raise ValueError("sent_count_per_passage must be at least 1")

    def split_article(text):
        # Deliberately not called `map`, so the builtin stays usable in here.
        sentences = sent_tokenize(text)
        return [
            " ".join(sentences[start:start + sent_count_per_passage])
            for start in range(0, len(sentences), sent_count_per_passage)
        ]

    # Flatten in one pass; chaining `list + list` re-copies the accumulated
    # result for every article.
    return [passage for article in articles for passage in split_article(article)]

# Embedding model used to index and query the local documents.
model_name = "BAAI/bge-small-en-v1.5"
save_name = './bge-small-en-v1.5_openvino'

# Prefer the locally saved copy when a previous run already created it.
saved = os.path.exists(save_name)
if saved:
    load_name = save_name
else:
    load_name = model_name

embedding_function = OpenVINOBgeEmbeddings(model_name_or_path=load_name, model_kwargs={"device": "CPU"})

# First run only: persist the model so later runs can load it from `save_name`.
if not saved:
    embedding_function.save_model(save_name)

def parse_dataset_month(dataset):
    """Turn the sport articles of one dataset split into passages.

    Params:
      dataset: object exposing `.filter(fn)` and column access via
               `["content"]`; rows must provide a "link" and a "content" field
    Returns:
      List of passages built from the unique articles whose link contains
      "sport". Empty when no article matches.
    """
    sports_rows = dataset.filter(lambda row: "sport" in row["link"])
    # dict.fromkeys drops repeated articles while keeping first-seen order and,
    # unlike selecting column 0 of a DataFrame, also works when nothing matched.
    unique_articles = list(dict.fromkeys(sports_rows["content"]))
    # Split documents to passages
    return articles_to_passages(unique_articles)

In [None]:
from langchain_community.vectorstores import FAISS

index_path = "./faiss_index"
if os.path.exists(index_path):
    # The flag below opts in to deserializing the files stored under
    # `index_path`; only load an index that this notebook wrote itself.
    database = FAISS.load_local(index_path, embedding_function, allow_dangerous_deserialization=True)
else:
    # No saved index yet: download the monthly dumps, cut the sport articles
    # into passages, embed them and store the index for the next run.
    all_ds = []
    for month in ["2024-02","2024-03","2024-04","2024-05"]:
        ds = load_dataset('RealTimeData/bbc_news_alltime', month)
        all_ds.append(ds)

    sport_passages = []
    for ds in all_ds:
        sport_passages += parse_dataset_month(ds["train"])
    database = FAISS.from_documents([Document(page_content=doc) for doc in sport_passages], embedding_function)
    database.save_local(index_path)
    # Only the passage count is known here; the previous message also printed
    # it under the label "articles found".
    print(f'Number of embedded passages: {len(sport_passages)}')

Next we will initialize a retriever from the database.
We override the `_get_relevant_documents` method to control the number of documents the retriever returns for each query.

In [None]:
def _get_relevant_documents(self, query, *, run_manager):
    search_kwargs = {k:v for k,v in self.search_kwargs.items()}

    if "top_k" in run_manager.metadata:
        search_kwargs["k"] = run_manager.metadata["top_k"]
    if self.search_type == "similarity":
        docs = self.vectorstore.similarity_search(query, **search_kwargs)
    elif self.search_type == "similarity_score_threshold":
        docs_and_similarities = (
            self.vectorstore.similarity_search_with_relevance_scores(
                query, **search_kwargs
            )
        )
        docs = [doc for doc, _ in docs_and_similarities]
    elif self.search_type == "mmr":
        docs = self.vectorstore.max_marginal_relevance_search(
            query, **search_kwargs
        )
    else:
        raise ValueError(f"search_type of {self.search_type} not allowed.")
    return [d.page_content for d in docs]


# Build a retriever over the database, then replace the method on the
# retriever's class, so every instance of that class uses the version above.
retriever = database.as_retriever()
type(retriever)._get_relevant_documents = _get_relevant_documents

Next we will initialize our LLM which will be the backbone of the agent.

In [None]:
# Uncomment to install dependencies
# ! pip install optimum[openvino,nncf]

In [None]:
import time
import itertools
from threading import Thread
from transformers import (
    TextIteratorStreamer,
    StoppingCriteria,
    StoppingCriteriaList,
    GenerationConfig,
)

class SuffixCriteria(StoppingCriteria):
    """Stopping criterion that fires when every decoded sequence passes a check.

    By default the check is "the decoded text ends with one of `eof_strings`".

    Params:
      start_length: number of leading tokens to skip before decoding
      eof_strings: stop strings used by the default check
      tokenizer: tokenizer providing `batch_decode`
      check_fn: optional predicate on the decoded text replacing the default
    """

    def __init__(self, start_length, eof_strings, tokenizer, check_fn=None):
        self.start_length = start_length
        self.eof_strings = eof_strings
        self.tokenizer = tokenizer
        # Fall back to the suffix test when no custom predicate is supplied.
        self.check_fn = self._ends_with_stop_string if check_fn is None else check_fn

    def _ends_with_stop_string(self, decoded_generation):
        """Default predicate: True if the text ends with any of `eof_strings`."""
        for stop_string in self.eof_strings:
            if decoded_generation.endswith(stop_string):
                return True
        return False

    def __call__(self, input_ids, scores, **kwargs):
        """Returns True if every generated sequence passes `check_fn`"""
        new_token_ids = input_ids[:, self.start_length :]
        verdicts = [self.check_fn(text) for text in self.tokenizer.batch_decode(new_token_ids)]
        return all(verdicts)

Here you can choose how many bits to quantize your model to.

In [None]:
# Weight precision for the LLM loaded in the next cell: 8 requests 8-bit
# loading, 4 applies the 4-bit quantization config defined there.
bits = 8  # 4

In [None]:
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
from optimum.intel import OVModelForCausalLM, OVWeightQuantizationConfig
from functools import wraps
from transformers import AutoTokenizer
import os


# Model to run and the local folder of its exported copy (one folder per bit width).
model_name = "microsoft/Phi-3-mini-128k-instruct"
save_name = model_name.split("/")[-1] + f"_openvino_{bits}bit"
# 4-bit weight quantization settings; only handed to the loader when `bits == 4`.
quantization_config = OVWeightQuantizationConfig(
    bits=4,
    sym=False,
    group_size=128,
    ratio=0.8,
)
device = "gpu"
saved = os.path.exists(save_name)
load_kwargs = {
    "device": device,
    # Export from the original checkpoint only when no saved copy exists yet.
    "export": not saved,
}
if bits == 4:
    load_kwargs["quantization_config"] = quantization_config
elif bits == 8:
    load_kwargs["load_in_8bit"] = True

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Second stop string: a closing code fence, an empty line, then a new fence.
s = """```

```"""
stop_words_list =["Observation:", s]
# start_length=0: the stopping check decodes each sequence from its first token.
stopping_criteria = SuffixCriteria(0, stop_words_list, tokenizer)

ov_llm = HuggingFacePipeline.from_model_id(
    # Load the saved copy when it exists, otherwise the original model id.
    model_id=model_name if not saved else save_name,
    task="text-generation",
    backend="openvino",
    model_kwargs=load_kwargs,
    pipeline_kwargs={
        "stopping_criteria": StoppingCriteriaList([stopping_criteria]),
        # The chat-template markers are passed as end-of-sequence token ids.
        "eos_token_id": tokenizer.convert_tokens_to_ids(["<|endoftext|>", "<|end|>", "<|system|>", "<|user|>", "<|assistant|>"]),
        "pad_token_id": tokenizer.eos_token_id,
        "return_full_text": False,
    }
)

if not saved:
    # For some reason LC passes the model_kwargs to the tokenizer as well and this can cause issues when saving
    for k in load_kwargs:
        ov_llm.pipeline.tokenizer.__dict__['init_kwargs'].pop(k, None)
    ov_llm.pipeline.save_pretrained(save_name)
    

# Keep a handle on the unpatched method. `wraps` stores it as `__wrapped__`, so
# re-running this cell unwraps the previous patch instead of wrapping the
# wrapper (which would pass `pipeline_kwargs` twice and raise a TypeError).
original_generate = getattr(HuggingFacePipeline._generate, "__wrapped__", HuggingFacePipeline._generate)

@wraps(original_generate)
def _generate_with_kwargs(*args, **kwargs):
    """Call the original `_generate`, adding per-run pipeline kwargs.

    The extra kwargs are read from the "pipeline_kwargs" entry of the run
    manager's metadata. A missing run manager or missing metadata yields an
    empty dict, and a `pipeline_kwargs` passed explicitly by the caller wins.
    """
    run_manager = kwargs.get("run_manager")
    metadata = getattr(run_manager, "metadata", None) or {}
    kwargs.setdefault("pipeline_kwargs", metadata.get("pipeline_kwargs", {}))
    return original_generate(*args, **kwargs)

HuggingFacePipeline._generate = _generate_with_kwargs

## React
Next, we will define the system prompt of the agent. 
This will give the LLM the instructions on how to act when receiving a prompt and present the tools that are available for the agent to use.

In [None]:
# Prompt template for the agent. Placeholders filled in at run time:
#   {tools}, {tool_names}  - the tools offered to the agent
#   {chat_history}         - earlier turns of the conversation
#   {input}                - the current user message
#   {agent_scratchpad}     - text appended after the assistant marker
# The <|system|>, <|user|>, <|assistant|> and <|end|> markers are written
# literally into the prompt.
AGENT_PROMPT = """<|system|>You are designed to help with a variety of tasks, from answering questions to providing summaries to other types of analyses.

## Tools

You have access to a wide variety of tools. You are responsible for using the tools in any sequence you deem appropriate to complete the task at hand.
This may require breaking the task into subtasks and using different tools to complete each subtask.

You have access to the following tools:
{tools}


## Output Format

### Tool Format
If you have enough information to answer the question and don't need to use any tool, skip to the Answer Format section.
Please use the following format when using a tool:

```
Thought: I need to use a tool to help me answer the question.
Action: [tool name (one of {tool_names}) if using a tool.]
Action Input: [the input to the tool, in a JSON format representing the kwargs]
Observation:
```

Please ALWAYS start with a Thought.

NEVER surround your response with markdown code markers. You may use code markers within your response if you need to.

Please use a valid JSON format for the Action Input.

You should keep repeating the above format till you have enough information to answer the question without using any more tools. At that point, you MUST respond in the one of the following two answer formats.

### Answer Format

```
Thought: I can answer without using any more tools.
Answer: [your answer here]<|end|>
```

```
Thought: I cannot answer the question with the provided tools.
Answer: [your answer here]<|end|>
```

## Current Conversation

Below is the current conversation consisting of interleaving human and assistant messages.
{chat_history}
<|user|>
Human: {input}<|end|>
<|assistant|>
{agent_scratchpad}
"""

In [None]:
from langchain import PromptTemplate
# Declare every placeholder used by AGENT_PROMPT, including `tool_names`.
mod_readct_temp = PromptTemplate(template=AGENT_PROMPT, input_variables=['agent_scratchpad', 'chat_history', 'input', 'tool_names', 'tools'])

Now we can define the tools that the agent will be able to use

In [None]:
from langchain import hub
from langchain.tools.retriever import create_retriever_tool


# Expose the retriever to the agent as a tool named "search_my_local_files";
# the description tells the model which input format to use.
rag_tool = create_retriever_tool(
    retriever,
    "search_my_local_files",
    """Searches over the user's local documents. Use the following format: {"query": [your input goes here]}""",
)

In [None]:
from langchain_core.prompts import base as prompts_base_imp, BasePromptTemplate
from typing import Dict, Union
from langchain_core.documents.base import Document


def _get_document_info(doc: Union[Document,str], prompt: BasePromptTemplate[str]) -> Dict:
    if type(doc) == str:
        doc_modified = f"""Success! I have found the information that I need:
{doc}
I will now use this information to ansewer the question."""
        base_info = {"page_content": doc_modified}
    else:
        base_info = {"page_content": doc.page_content, **doc.metadata}
    missing_metadata = set(prompt.input_variables).difference(base_info)
    if len(missing_metadata) > 0:
        required_metadata = [
            iv for iv in prompt.input_variables if iv != "page_content"
        ]
        raise ValueError(
            f"Document prompt requires documents to have metadata variables: "
            f"{required_metadata}. Received document with missing metadata: "
            f"{list(missing_metadata)}."
        )
    return {k: base_info[k] for k in prompt.input_variables}
# Replace the helper inside langchain_core.prompts.base with the version above,
# which also accepts plain strings.
prompts_base_imp._get_document_info = _get_document_info

In [None]:
## Bing Search
from langchain.tools.bing_search import BingSearchRun
import os
from langchain_community.utilities import BingSearchAPIWrapper

# Follow these instructions to acquire a subscription key:
# https://learn.microsoft.com/en-us/bing/search-apis/bing-web-search/create-bing-search-service-resource
# setdefault keeps a key that is already exported in the environment instead of
# overwriting it with the placeholder below. Prefer exporting the key over
# pasting it into the notebook.
os.environ.setdefault("BING_SUBSCRIPTION_KEY", "<<<Enter your Bing subscription key here>>>")
os.environ["BING_SEARCH_URL"] = "https://api.bing.microsoft.com/v7.0/search"

class ReactBingSearchAPIWrapper(BingSearchAPIWrapper):
    """Bing wrapper that phrases its search result as an answer to the query."""

    def run(self, query: str) -> str:
        """Run query through BingSearch and parse result."""
        snippets = super().run(query)
        # Drop the '...' sequences from the joined snippets before reporting them.
        cleaned = snippets.replace('...', '')
        return f"The answer to the query '{query}' is:\n{cleaned}\n"
        
# Wrapper instance created with k=2.
bing_search = ReactBingSearchAPIWrapper(k=2)

# Tool handed to the agent under the name "bing_search".
# NOTE(review): this description uses doubled braces while the local-files tool
# uses single ones; in a plain string they stay doubled unless the text is later
# passed through str.format - confirm which form is intended.
bing_search_tool = BingSearchRun(
    api_wrapper=bing_search,
    name="bing_search",
    description="""Searches over the internet with Bing. Use the following format: {{"query": [your input goes here]}}""",
)

In [None]:
# Tools offered to the agent: local document search and Bing web search.
tools_for_agent = [rag_tool, bing_search_tool]

In [None]:
from langchain.agents import AgentExecutor, create_react_agent

# Build the ReAct agent from the LLM, the tools and the custom prompt.
agent = create_react_agent(ov_llm, tools_for_agent, mod_readct_temp)
# return_intermediate_steps=True keeps the intermediate steps in the result;
# get_all_contexts_used reads them to show what the tools returned.
agent_executor = AgentExecutor(agent=agent, tools=tools_for_agent, verbose=True, handle_parsing_errors=False, return_intermediate_steps=True)

In [None]:
from langchain.agents.output_parsers.react_single_input import ReActSingleInputOutputParser, FINAL_ANSWER_ACTION, MISSING_ACTION_AFTER_THOUGHT_ERROR_MESSAGE, MISSING_ACTION_INPUT_AFTER_ACTION_ERROR_MESSAGE
import json
import re
from langchain_core.exceptions import OutputParserException
from langchain_core.agents import AgentAction, AgentFinish
from langchain_core.exceptions import OutputParserException
import logging
from langchain.agents.agent import AgentOutputParser
from langchain.agents.mrkl.prompt import FORMAT_INSTRUCTIONS

from typing import Union

def extract_json_from_string(string):
    """Parse the first JSON object or array embedded in `string`.

    Parsing starts at the first "{" or "[" and stops at the end of that JSON
    value, so text before and after it is ignored.

    Params:
      string: text that may contain a JSON value
    Returns:
      The parsed value, or an empty dict when the text has no "{" or "["
    Raises:
      json.JSONDecodeError: if the text from the first bracket on does not
                            start with a valid JSON value
    """
    start = next((position for position, char in enumerate(string) if char in "{["), None)
    if start is None:
        return {}
    # raw_decode returns the value together with the index where it ended,
    # which replaces the parse / catch / slice / parse-again sequence.
    parsed, _end = json.JSONDecoder().raw_decode(string, start)
    return parsed

# The return annotation is quoted so it is not evaluated when the function is defined.
def parse(self, text: str) -> "Union[AgentAction, AgentFinish]":
    """Turn the raw model output into the agent's next step.

    An "Action: ... Action Input: ..." pair takes priority and yields an
    AgentAction. Otherwise the text after the last answer marker
    (FINAL_ANSWER_ACTION first, then "Answer:") becomes the final output, and
    text without any marker is returned unchanged as the final output.

    Params:
      text: raw text produced by the model
    Returns:
      AgentAction with the tool name and its input (parsed JSON when possible,
      otherwise the cleaned-up string), or AgentFinish with the answer
    """
    regex = (
        r"Action\s*\d*\s*:[\s]*(.*?)[\s]*Action\s*\d*\s*Input\s*\d*\s*:[\s]*(.*)"
    )
    action_match = re.search(regex, text, re.DOTALL)
    if action_match:
        action = action_match.group(1).strip()
        tool_input = action_match.group(2).strip(" ").strip('"')
        # Drop the stop word and undo doubled braces copied from a tool description.
        tool_input = tool_input.replace("Observation:", "")
        tool_input = tool_input.replace("{{", "{").replace("}}", "}")
        try:
            tool_input = extract_json_from_string(tool_input)
        except Exception:
            # Best effort: input that is not valid JSON goes to the tool as a string.
            pass
        return AgentAction(action, tool_input, text)

    # Split on the marker that is actually present, so an "Answer:"-only reply
    # does not keep its leading "Thought: ..." text in the output.
    for marker in (FINAL_ANSWER_ACTION, "Answer:"):
        if marker in text:
            return AgentFinish({"output": text.split(marker)[-1].strip()}, text)
    return AgentFinish({"output": text}, text)

# Replace the parser's `parse` method with the function defined above.
ReActSingleInputOutputParser.parse = parse

In [None]:
# Turn off `stream_runnable` on the agent.
# NOTE(review): presumably so the LLM is called through the patched `_generate`
# and tokens reach the UI via the streamer passed in the pipeline kwargs - confirm.
agent_executor.agent.stream_runnable = False

## Chatbot with ReAct Agent
We are now ready to build our chatbot demo with the agent.
We will use [Gradio](https://www.gradio.app/) to build our demo.

First, we will define our chat memory and modify our template and chain to be able to handle chat memory

In [None]:
# Uncomment to install dependencies
# ! pip install gradio

In [None]:
from transformers import TextIteratorStreamer, AutoTokenizer

class TextStreamerFlagException(Exception):
    """Raised while iterating a streamer when the queued flag marker is reached."""

class FlaggedTextIteratorStreamer(TextIteratorStreamer):
    """Text streamer whose queue can also carry a marker object.

    `put_flag` enqueues the marker; when iteration reaches it,
    TextStreamerFlagException is raised instead of returning a value.
    """

    class Flag:
        """Marker type; one instance is created per streamer."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # The marker is recognised by identity in __next__.
        self.flag = self.Flag()

    def put_flag(self):
        """Put the marker on the text queue."""
        self.text_queue.put(self.flag)

    def __next__(self):
        """Return the next queued value, raising when that value is the marker."""
        item = super().__next__()
        if item is not self.flag:
            return item
        raise TextStreamerFlagException()

In [None]:
from langchain_core.messages.base import BaseMessage
from langchain.memory import ConversationBufferMemory


def parse_chat_history(chat_history):
    """Render stored messages as prompt text.

    Every message becomes one line made of a role prefix, the content and a
    role suffix; "context" and "ai" messages are closed with "<|end|>".

    Params:
      chat_history: iterable of BaseMessage objects of type "human",
                    "context" or "ai"
    Returns:
      The concatenated lines, or an empty string for an empty history
    """
    role_map = {"human": "<|user|> Human:", "ai": "<|assistant|>", "context": ""}
    closed_types = ["context", "ai"]
    pieces = []
    for dialogue_turn in chat_history:
        assert isinstance(dialogue_turn, BaseMessage)
        kind = dialogue_turn.type
        role_prefix = role_map[kind]
        if kind in closed_types:
            role_suffix = "<|end|>"
        else:
            role_suffix = ""
        piece = f"\n{role_prefix} {dialogue_turn.content} {role_suffix}"
        # Drop a single trailing newline, should one ever end the line.
        if piece.endswith("\n"):
            piece = piece[:-1]
        pieces.append(piece)
    return "".join(pieces)

def add_to_memory(memory, question, context, answer):
    """Store one exchange as three messages: human, context and ai.

    Params:
      memory: object whose `chat_memory` provides `add_messages`
      question: the user's message
      context: the context used for the answer
      answer: the assistant's answer
    """
    exchange = (("human", question), ("context", context), ("ai", answer))
    memory.chat_memory.add_messages(
        [BaseMessage(content=content, type=kind) for kind, content in exchange]
    )

def delete_last_message_from_memory(memory):
    """Remove the last three stored messages (one human/context/ai exchange)."""
    stored_messages = memory.chat_memory.messages
    del stored_messages[-3:]

# Conversation memory shared by the demo callbacks; its messages are read when
# building the chat history for each new question.
memory = ConversationBufferMemory(memory_key="chat_history", ai_prefix="Assistant", human_prefix="User")
# Alias for the agent prompt template.
prompt = mod_readct_temp

In [None]:
from langchain_core.runnables import RunnableLambda
from operator import itemgetter

# Chain used by the demo: map the caller's "question"/"chat_history" keys to the
# agent's "input"/"chat_history" inputs (rendering the stored messages as text),
# then run the agent executor and return its result under "answer".
rag_chain_agent = (
    {
        "input": itemgetter("question"),
        "chat_history": itemgetter("chat_history") | RunnableLambda(parse_chat_history),
    }
    # Identity step: passes the mapped inputs through unchanged.
    | RunnableLambda(func=lambda x: x)
    | {
        "answer": agent_executor
    }
)

Next we will write the core generation functions for our demo.

In [None]:
import time
import itertools
from threading import Thread
from transformers import (
    TextIteratorStreamer,
    StoppingCriteria,
    StoppingCriteriaList,
    GenerationConfig,
)
from langchain.schema.runnable import RunnableConfig
from threading import Thread


class ThreadWithResult(Thread):
    """
    Modified Thread class to save the return value of the target function
    Based on https://stackoverflow.com/a/65447493

    An exception raised by the target is stored as well and re-raised to the
    caller of `result`, so a failing target no longer surfaces as an unrelated
    AttributeError.
    """

    def __init__(self, group=None, target=None, name=None, args=(), kwargs=None, *, daemon=None):
        # `None` instead of a shared mutable `{}` default.
        call_kwargs = {} if kwargs is None else kwargs
        self._result = None
        self._error = None

        def function():
            if target is None:
                return
            try:
                self._result = target(*args, **call_kwargs)
            except Exception as error:
                # Kept for the thread that asks for the result.
                self._error = error

        super().__init__(group=group, target=function, name=name, daemon=daemon)

    @property
    def result(self):
        """Wait for the thread to finish, then return the target's value or re-raise its exception."""
        self.join()
        if self._error is not None:
            raise self._error
        return self._result

def is_partial_stop(output, stop_str):
    """
    Check whether the output ends with the beginning of a stop string.

    Params:
      output: current output from the model
      stop_str: a string we want to stop the generation on
    Returns:
      True if a non-empty suffix of the output is a prefix of the stop_str
      (an output ending with the complete stop_str counts as well)
    """
    # Candidate suffix lengths run from 1 up to the shorter of the two strings.
    # Starting at 0 would test `output[-0:]`, which is the entire output.
    longest = min(len(output), len(stop_str))
    return any(stop_str.startswith(output[-size:]) for size in range(1, longest + 1))

def format_context(context):
    """
    Utility function to format retrieved documents inside the chatbot window

    Params:
      context: retrieved documents (list of strings)
    Returns:
      Formatted HTML string with the retrieved documents, or an empty string
      when there are none. The document text is HTML-escaped, since it comes
      from files and web search results.
    """
    import html  # local import: only this function needs it

    if len(context) == 0:
        return ""
    blockquote_style = """font-size: 12px;
background: #e4e4e4;
border-left: 10px solid #ccc; 
margin: 0.5em 30px;
padding: 0.5em 10px;
color: black;"""
    summary_style = """font-weight: bold;
font-size: 14px;
list-style-position: outside;
margin: 0.5em 15px;
padding: 0px 0px 10px 15px;"""
    quotes = []
    for doc in context:
        # One document per block quote, on a single line, with markup neutralised.
        text = html.escape(doc.replace("\n", " "))
        quotes.append(f'<blockquote style="{blockquote_style}"><p>{text}</p></blockquote>')
    header = f'<details style="margin:0px;padding:0px;"><summary style="{summary_style}">Retrieved documents:</summary>'
    return header + "".join(quotes) + "</details>"

def prepare_for_regenerate(history):
    """
    Delete last assistant response from memory in order to regenerate it

    Nothing is deleted when there is no conversation yet or when the last turn
    has no answer; in the latter case the stored messages belong to an earlier
    exchange.

    Params:
      history: conversation history
    Returns:
      Updated history followed by six updates that disable the controls
    """
    if not history:
        history = []
    elif history[-1][1] is not None:
        history[-1][1] = None
        delete_last_message_from_memory(memory)
    return history, *([gr.update(interactive=False)] * 6)

def add_user_text(message, history):
    """
    Append the user's message to the chatbot history

    Params:
      message: current user message
      history: conversation history
    Returns:
      Empty string (clears the message box), the updated history and five
      updates that disable the controls
    """
    # The answer slot stays empty until the model has generated a response.
    new_turn = [message.strip(), None]
    history.append(new_turn)
    locked = gr.update(interactive=False)
    return ("", history) + (locked,) * 5

def reset_chatbot():
    """Clear the conversation memory and return the values that reset the demo."""
    memory.clear()
    idle_status = "Status: Idle"
    return (None, None, idle_status)

def get_all_contexts_used(agent_result):
    """Join the second element of every intermediate step, one per line.

    Params:
      agent_result: mapping with an "intermediate_steps" sequence of pairs
    Returns:
      The string form of each pair's second element, joined with newlines
    """
    observations = []
    for step in agent_result["intermediate_steps"]:
        observations.append(str(step[1]))
    return "\n".join(observations)

def clean_text(text):
    """Remove trailing stop words and all code-fence markers from `text`.

    Stop words from `stop_words_list` are removed as whole suffixes, in list
    order. `str.rstrip` is not suitable here: it treats its argument as a set
    of characters and would also eat the end of ordinary words.

    Params:
      text: generated text
    Returns:
      The text without a trailing stop word and without "```" markers
    """
    cleaned = text
    for stop_word in stop_words_list:
        if stop_word and cleaned.endswith(stop_word):
            cleaned = cleaned[: -len(stop_word)]
    # The fences go last, so the fence-based stop word can still be matched above.
    return cleaned.replace("```", "")

def generate(
    history,
    temperature,
    max_new_tokens,
    top_p,
    repetition_penalty,
    num_retrieved_docs,
):
    """
    Generates the assistant's response given the chatbot history and generation parameters

    The agent chain runs in a background thread while this generator forwards
    the streamed text to the UI. The end-of-run flag is queued even when the
    chain fails, so the streaming loop always terminates; a failure is reported
    in the chat window and the controls are re-enabled.

    Params:
      history: conversation history formatted in pairs of user and assistant messages `[user_message, assistant_message]`
      temperature:  parameter for control the level of creativity in AI-generated text.
                    By adjusting the `temperature`, you can influence the AI model's probability distribution, making the text more focused or diverse.
      max_new_tokens: The maximum number of tokens we allow the model to generate as a response.
      top_p: parameter for control the range of tokens considered by the AI model based on their cumulative probability.
      repetition_penalty: parameter for penalizing tokens based on how frequently they occur in the text.
      num_retrieved_docs: number of documents to retrieve in case of RAG
    Yields:
      Updated history, generation status and six updates for the controls.
    """
    # Nothing to do without a pending (unanswered) user message.
    if len(history) == 0 or history[-1][1] is not None:
        yield history, "Status: Idle", *([gr.update(interactive=True)] * 6)
        return
    prompt_char = '▌'
    history[-1][1] = prompt_char
    yield history, "Status: Generating...", *([gr.update(interactive=False)] * 6)

    start = time.perf_counter()
    user_query = history[-1][0]
    current_chain = rag_chain_agent
    tokenizer = ov_llm.pipeline.tokenizer
    streamer = FlaggedTextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    stop_str = ["```\n\n```"]

    # Prepare input for generate
    generation_config = GenerationConfig(
        max_new_tokens=max_new_tokens,
        do_sample=temperature > 0.0,
        temperature=temperature if temperature > 0.0 else 1.0,
        repetition_penalty=repetition_penalty,
        top_p=top_p,
    )
    generate_kwargs = dict(
        streamer=streamer,
        generation_config=generation_config,
    )
    # The run metadata carries the per-request settings to the retriever
    # ("top_k") and to the LLM ("pipeline_kwargs").
    chain_kwargs = {
        "config": RunnableConfig(metadata={
            "top_k": num_retrieved_docs,
            "pipeline_kwargs": generate_kwargs
        })}

    def target(*args, **kwargs):
        try:
            return current_chain.invoke(*args, **kwargs)
        finally:
            # Always signal the end of the run; otherwise a failing chain would
            # leave the streaming loop below waiting forever.
            streamer.put_flag()
    # Call chain
    t1 = ThreadWithResult(
        target=target,
        args=[{"question": user_query, "chat_history": memory.chat_memory.messages}],
        kwargs=chain_kwargs,
    )
    t1.start()

    # Initialize an empty string to store the generated text.
    partial_text = ""
    try:
        # The agent may call the LLM several times; each call ends one pass over
        # the streamer, and the flag raised from the streamer ends the outer loop.
        while True:
            for new_text in streamer:
                partial_text += new_text
                history[-1][1] = partial_text + prompt_char
                pos = -1
                for s in stop_str:
                    if (pos := partial_text.rfind(s)) != -1:
                        break
                if pos != -1:
                    # A complete stop string was generated: cut it off and stop streaming.
                    partial_text = partial_text[:pos]
                    raise TextStreamerFlagException()
                elif any([is_partial_stop(partial_text, s) for s in stop_str]):
                    # Possibly the start of a stop string: hold back the UI update.
                    continue
                yield history, "Status: Generating...", *([gr.update(interactive=False)] * 6)
            partial_text += "\n"
    except TextStreamerFlagException:
        pass
    history[-1][1] = partial_text
    try:
        chain_out = t1.result
    except Exception as error:
        # The chain failed: show the reason and unlock the controls instead of
        # leaving the demo stuck in the "generating" state.
        history[-1][1] = f"{partial_text}\n\n[Generation failed: {error}]".strip()
        yield history, f"Status: Error ({error})", *([gr.update(interactive=True)] * 6)
        return
    if "intermediate_steps" in chain_out["answer"]:
        current_context = get_all_contexts_used(chain_out["answer"])
        if current_context != "":
            history[-1][1] = partial_text + format_context([current_context])
    else:
        current_context = ""

    add_to_memory(memory, user_query, current_context, partial_text)
    generation_time = time.perf_counter() - start
    yield history, f'Generation time: {generation_time:.2f} sec', *([gr.update(interactive=True)] * 6)

Let's add an option to chat with our own documents by loading them to our database.

In [None]:
# ! pip install pypdf

In [None]:
from pypdf import PdfReader 


# One entry per loaded file: the value returned by `database.add_documents`,
# later passed back to `database.delete`.
added_documents_ids = []


def pdf_to_docs(file_path):
    """Read a PDF and return its text as passage documents.

    Params:
      file_path: path of the PDF file
    Returns:
      List of Document objects, one per passage cut from the page texts
    """
    page_texts = []
    for page in PdfReader(file_path).pages:
        page_texts.append(page.extract_text())
    passages = articles_to_passages(page_texts)
    return [Document(page_content=passage) for passage in passages]


def load_files(files):
    """
    Add the passages of the uploaded PDF files to the database

    Files that yield no passage (for example PDFs without extractable text) are
    skipped, so no empty batch is sent to the vector store.

    Params:
      files: paths of the uploaded files (may be None or empty)
    Yields:
      A status text followed by six updates for the controls; the last update
      refreshes the label of the delete button with the number of loaded files.
    """
    yield (
        'Loading...',
        *([gr.update(interactive=False)] * 6),
    )
    start = time.perf_counter()
    for fp in files or []:
        documents = pdf_to_docs(fp)
        if not documents:
            continue
        added_documents_ids.append(database.add_documents(documents))
    upload_time = time.perf_counter() - start
    yield (
        f'Load time: {upload_time * 1000:.2f}ms',
        *([gr.update(interactive=True)] * 5),
        gr.update(value=f"Delete documents 〈{len(added_documents_ids)}〉", interactive=True),
    )


def delete_documents():
    """
    Remove every document loaded through the demo from the database

    Yields:
      A status text followed by six updates for the controls; the last update
      resets the label of the delete button.
    """
    global added_documents_ids
    locked = [gr.update(interactive=False)] * 6
    yield ('Deleting...', *locked)
    for id_batch in added_documents_ids:
        database.delete(id_batch)
    # Rebind to a fresh list now that every batch has been removed.
    added_documents_ids = []
    unlocked = [gr.update(interactive=True)] * 5
    button_label = f"Delete documents 〈{len(added_documents_ids)}〉"
    yield ('Status: Idle', *unlocked, gr.update(value=button_label, interactive=True))

Now we can build the actual demo using Gradio.
The layout will be simple: a chatbot window followed by a text prompt with controls that let you submit a message, clear the chat and regenerate the last answer. This is pretty standard for a chatbot demo.
We have also added the option to add PDF documents to the database and delete them if required.
You can extend the add documents option to support other formats than PDF.

In [None]:
import gradio as gr

# Close a demo left over from a previous run of this cell, if there is one.
try:
    demo.close()
except NameError:
    # First run: `demo` has not been created yet.
    pass
except Exception as error:
    # Closing is best effort, but a failure is reported instead of being hidden.
    print(f"Could not close the previous demo: {error}")

# Example prompts shown under the chat window, grouped by the kind of request.
# Questions that need no tool:
EXAMPLES_EDUCATION = [
    "Lily drops a rubber ball from the top of a wall. The wall is 2 meters tall. How long will it take for the ball to reach the ground?",
    "Mark has 15 notebooks in his backpack. Each day, he uses 3 notebooks for his classes. After 4 days, how many notebooks will Mark have left in his backpack?",
]
# Questions about the BBC sport articles stored in the local database:
EXAMPLES_BBC = [
    "How many teams will The 2024-25 Champions League feature?",
    "Search in my files how many teams will the Champions League feature this year?",    
]

# Question shown under the "Agent examples" label:
AGENT = [
    "What are the best three locations to visit in Seattle today?"
]

with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown('<h1 style="text-align: center;">Intel Labs Demo: Prompt ReAct with Phi-3 on Intel Lunar Lake iGPU</h1>')
    chatbot = gr.Chatbot(height=800)
    with gr.Row():
        msg = gr.Textbox(placeholder="Enter message here...", show_label=False, autofocus=True, scale=75)
        status = gr.Textbox("Status: Idle", show_label=False, max_lines=1, scale=20, visible=False)
    with gr.Row():
        submit = gr.Button("Submit", variant="primary")
        regenerate = gr.Button("Regenerate")
        clear = gr.Button("Clear")
        # File extensions are given with their leading dot; a bare "pdf" is
        # read as a file category rather than as an extension.
        load = gr.UploadButton("Load Document", file_types=['.pdf'], file_count='multiple')
        # The label is a callable, so it shows the current number of loaded files.
        delete_docs = gr.Button(lambda: f"Delete documents {f'〈{len(added_documents_ids)}〉'}", interactive=True)
    with gr.Accordion("Advanced Options:", open=False):
        with gr.Row():
            with gr.Column():
                temperature = gr.Slider(
                    label="Temperature",
                    value=0.0,
                    minimum=0.0,
                    maximum=1.0,
                    step=0.05,
                    interactive=True,
                )
                max_new_tokens = gr.Slider(
                    label="Max new tokens",
                    value=128,
                    minimum=0,
                    maximum=512,
                    step=32,
                    interactive=True,
                )
            with gr.Column():
                top_p = gr.Slider(
                    label="Top-p (nucleus sampling)",
                    value=1.0,
                    minimum=0.0,
                    maximum=1.0,
                    step=0.05,
                    interactive=True,
                )
                repetition_penalty = gr.Slider(
                    label="Repetition penalty",
                    value=1.0,
                    minimum=1.0,
                    maximum=2.0,
                    step=0.1,
                    interactive=True,
                )
            num_documents = gr.Slider(
                label="Retrieved documents numbers",
                value=1,
                minimum=1,
                maximum=10,
                step=1,
                interactive=True
                )
    gr.Examples(
        AGENT, inputs=msg, label="Agent examples"
    )
    gr.Examples(
        EXAMPLES_EDUCATION, inputs=msg, label="Non-RAG examples"
    )
    gr.Examples(
        EXAMPLES_BBC, inputs=msg, label="RAG with BBC Sports examples"
    )
    # Controls that are locked while a callback is running.
    buttons = [submit, regenerate, clear, load, delete_docs]
    # Sets generate function to be triggered when the user submit a new message
    gr.on(
        triggers=[submit.click, msg.submit],
        fn=add_user_text,
        inputs=[msg, chatbot],
        outputs=[msg, chatbot, *buttons],
        concurrency_limit=1,
        queue=True,
    ).then(
        fn=generate,
        inputs=[chatbot, temperature, max_new_tokens, top_p, repetition_penalty, num_documents],
        outputs=[chatbot, status, msg, *buttons],
        concurrency_limit=1,
        queue=True
    )
    # Regenerate: blank the last answer, then run the same generation step.
    regenerate.click(
        fn=prepare_for_regenerate,
        inputs=chatbot,
        outputs=[chatbot, msg, *buttons],
        concurrency_limit=1,
        queue=True,
    ).then(
        fn=generate,
        inputs=[chatbot, temperature, max_new_tokens, top_p, repetition_penalty, num_documents],
        outputs=[chatbot, status, msg, *buttons],
        concurrency_limit=1,
        queue=True
    )
    clear.click(fn=reset_chatbot, inputs=None, outputs=[chatbot, msg, status], queue=True)
    load.upload(
        fn=load_files,
        inputs=[load],
        outputs=[status, msg, *buttons],
        concurrency_limit=1,
        queue=True,
    )
    delete_docs.click(fn=delete_documents, outputs=[status, msg, *buttons], concurrency_limit=1, queue=True)

In [None]:
# Start from an empty conversation, then launch the demo (inbrowser=True asks
# for a browser tab rather than inline rendering in the notebook).
memory.clear()
demo.launch(inline=False, inbrowser=True)

In [None]:
# demo.close()