# Chat with document using OpenVINO and LangChain

## Prerequisites
[back to top ⬆️](#Table-of-contents:)

Install required dependencies

In [7]:
%pip install -q --extra-index-url https://download.pytorch.org/whl/cpu\
"git+https://github.com/huggingface/optimum-intel.git"\
"gradio"\
"onnx" "chromadb" "sentence_transformers" "langchain" "langchainhub" "transformers>=4.34.0" "unstructured"

Note: you may need to restart the kernel to use updated packages.


## Import

In [1]:
from langchain import hub
from pathlib import Path
from embedding import OpenVINO_Embeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain.document_loaders import UnstructuredMarkdownLoader
from transformers import AutoModel, AutoTokenizer, TextIteratorStreamer
from transformers import AutoTokenizer, pipeline
from langchain.llms import HuggingFacePipeline
from optimum.intel.openvino import OVModelForCausalLM
import openvino as ov
import torch
import numpy as np

2023-12-05 00:08:03.956856: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-12-05 00:08:03.959012: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-12-05 00:08:03.984891: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-05 00:08:03.984911: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-05 00:08:03.984933: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to regi

INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, tensorflow, onnx, openvino


## Convert embedding model

In [2]:
embedding_model_id = 'sentence-transformers/all-mpnet-base-v2'
embedding_model_path = Path("./embedding_model")

In [3]:
model = AutoModel.from_pretrained(embedding_model_id)

input_shape = ov.PartialShape([-1, -1])
dummy_inputs = {"input_ids": torch.ones((1, 10), dtype=torch.long), "attention_mask": torch.ones(
    (1, 10), dtype=torch.long)}
input_info = [("input_ids", input_shape, np.int64),
              ("attention_mask", input_shape, np.int64)]

ov_model = ov.convert_model(model, example_input=dummy_inputs)
ov.save_model(ov_model, embedding_model_path / "openvino_model.xml")

tokenizer = AutoTokenizer.from_pretrained(embedding_model_id)
tokenizer.save_pretrained(embedding_model_path)

('embedding_model/tokenizer_config.json',
 'embedding_model/special_tokens_map.json',
 'embedding_model/vocab.txt',
 'embedding_model/added_tokens.json',
 'embedding_model/tokenizer.json')

## Load embedding model

In [4]:
device = 'CPU'
embedding = OpenVINO_Embeddings.from_model_id(embedding_model_path, model_kwargs={
                                              "device_name": device,  "config": {"PERFORMANCE_HINT": "THROUGHPUT"}})


## Load LLM model

In [5]:
llm_model_id = "meta-llama/Llama-2-7b-chat-hf"
model = OVModelForCausalLM.from_pretrained(model_id=llm_model_id, device=device, export=True)
tokenizer = AutoTokenizer.from_pretrained(llm_model_id)
streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=256, streamer=streamer)
llm = HuggingFacePipeline(pipeline=pipe)

Framework not specified. Using pt to export to ONNX.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Using the export variant default. Available variants are:
    - default: The default ONNX variant.
Using framework PyTorch: 2.1.0+cpu
Overriding 1 configuration item(s)
	- use_cache -> True
  if (input_shape[-1] > 1 or self.sliding_window is not None) and self.is_causal:
  if past_key_values_length > 0:
  if seq_len > self.max_seq_len_cached:
  if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
  if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
  if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
Compiling the model to CPU ...


## Gradio Demo

In [8]:
from threading import Event, Thread
import gradio as gr
import time
from threading import Thread


def loading_md():
    return "Loading..."


def build_chain(pdf_doc):
    loader = UnstructuredMarkdownLoader(pdf_doc.name)
    documents = loader.load()
    text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=100)
    texts = text_splitter.split_documents(documents)
    db = Chroma.from_documents(texts, embedding)
    retriever = db.as_retriever()
    global rag_chain
    prompt = hub.pull("rlm/rag-prompt")
    chain_type_kwargs = {"prompt": prompt}
    rag_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        chain_type_kwargs=chain_type_kwargs,
    )

    return "Ready"


def user(message, history):
    """
    callback function for updating user messages in interface on submit button click

    Params:
      message: current message
      history: conversation history
    Returns:
      None
    """
    # Append the user's message to the conversation history
    return "", history + [(message, None)]


def bot(history):
    stream_complete = Event()

    def infer(question):
        rag_chain.run(question)
        stream_complete.set()

    t1 = Thread(target=infer, args=(history[-1][0],))
    t1.start()
    history[-1][1] = ""
    for chunk in streamer:
        history[-1][1] += chunk
        time.sleep(0.05)
        yield history


if __name__ == "__main__":
    block = gr.Blocks()
    with block as demo:
        gr.Markdown("""<h1><center>Chat with Documents</center></h1>""")
        with gr.Row():
            with gr.Column(scale=1):
                md_doc = gr.File(label="Load a Markdown", file_types=[".md"])
                load_md = gr.Button("Build Retriever")
                langchain_status = gr.Textbox(
                    label="Status", placeholder="", interactive=False
                )
                # model_argument = gr.Accordion("Model Configuration")
                # with model_argument:
                #     top_k = gr.Slider(
                #             label="Top-k",
                #             value=50,
                #             minimum=0.0,
                #             maximum=200,
                #             step=1,
                #             interactive=True,
                #             info="Sample from a shortlist of top-k tokens — 0 to disable and sample from all tokens.",)
                    
                #     top_p = gr.Slider(
                #             label="Top-p (nucleus sampling)",
                #             value=1.0,
                #             minimum=0.0,
                #             maximum=1,
                #             step=0.01,
                #             interactive=True,
                #             info=(
                #                 "Sample from the smallest possible set of tokens whose cumulative probability "
                #                 "exceeds top_p. Set to 1 to disable and sample from all tokens."
                #             ),)

                #     repetition_penalty = gr.Slider(
                #             label="Repetition Penalty",
                #             value=1.1,
                #             minimum=1.0,
                #             maximum=2.0,
                #             step=0.1,
                #             interactive=True,
                #             info="Penalize repetition — 1.0 to disable.",)

                #     temperature = gr.Slider(
                #             label="Temperature",
                #             value=0.1,
                #             minimum=0.0,
                #             maximum=1.0,
                #             step=0.1,
                #             interactive=True,
                #             info="Higher values produce more diverse outputs",)

            with gr.Column(scale=4):
                chatbot = gr.Chatbot(height=500)
                question = gr.Textbox(
                    label="Question", placeholder="Type your question and hit Enter "
                )
                with gr.Row():
                    submit = gr.Button("Submit")
                    clear = gr.Button("Clear")
        load_md.click(
            build_chain, inputs=[md_doc], outputs=[langchain_status], queue=False
        )
        question.submit(
            user, [question, chatbot], [question, chatbot], queue=False
        ).then(bot, chatbot, chatbot, queue=True)
        submit.click(
            user, [question, chatbot], [question, chatbot], queue=False
        ).then(bot, chatbot, chatbot, queue=True)
        clear.click(lambda: None, None, chatbot, queue=False)

    demo.queue().launch(server_name='10.3.233.70', share=False)

ERROR:    [Errno 98] error while attempting to bind on address ('10.3.233.70', 7860): address already in use


Running on local URL:  http://10.3.233.70:7861

To create a public link, set `share=True` in `launch()`.


In [7]:
demo.close()