# Chat with document using OpenVINO and LangChain

## Prerequisites
[back to top ⬆️](#Table-of-contents:)

Install required dependencies

In [7]:
%pip install -q --extra-index-url https://download.pytorch.org/whl/cpu\
"git+https://github.com/huggingface/optimum-intel.git"\
"gradio"\
"onnx" "chromadb" "sentence_transformers" "langchain" "langchainhub" "transformers>=4.34.0" "unstructured"

Note: you may need to restart the kernel to use updated packages.


## Import

In [1]:
from pathlib import Path
from embedding import OpenVINO_Embeddings
from transformers import AutoModel, AutoTokenizer, AutoConfig, TextIteratorStreamer, pipeline
from optimum.intel import OVQuantizer
from optimum.intel.openvino import OVModelForCausalLM
import openvino as ov
import torch
import nncf
import logging
import numpy as np
import shutil
import gc
import ipywidgets as widgets

2023-12-05 07:34:04.793766: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-12-05 07:34:04.795769: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-12-05 07:34:04.822059: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-05 07:34:04.822082: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-05 07:34:04.822105: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to regi

INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, tensorflow, onnx, openvino


## Convert embedding model

In [2]:
SUPPORTED_EMBEDDING_MODELS = {"all-mpnet-base-v2": {"model_id": "sentence-transformers/all-mpnet-base-v2"}}
embedding_model_id = list(SUPPORTED_EMBEDDING_MODELS)

embedding_model_id = widgets.Dropdown(
    options=embedding_model_id,
    value=embedding_model_id[-1],
    description="Embedding Model:",
    disabled=False,
)

embedding_model_id

Dropdown(description='Embedding Model:', options=('all-mpnet-base-v2',), value='all-mpnet-base-v2')

In [3]:
embedding_model_configuration = SUPPORTED_EMBEDDING_MODELS[embedding_model_id.value]
print(f"Selected embedding model {embedding_model_id.value}")

Selected embedding model all-mpnet-base-v2


In [3]:
model = AutoModel.from_pretrained(embedding_model_configuration["model_id"])

input_shape = ov.PartialShape([-1, -1])
dummy_inputs = {"input_ids": torch.ones((1, 10), dtype=torch.long), "attention_mask": torch.ones(
    (1, 10), dtype=torch.long)}
input_info = [("input_ids", input_shape, np.int64),
              ("attention_mask", input_shape, np.int64)]

ov_model = ov.convert_model(model, example_input=dummy_inputs)
ov.save_model(ov_model, Path(embedding_model_id.value)  / "openvino_model.xml")

tokenizer = AutoTokenizer.from_pretrained(embedding_model_configuration["model_id"])
tokenizer.save_pretrained(Path(embedding_model_id.value))

('embedding_model/tokenizer_config.json',
 'embedding_model/special_tokens_map.json',
 'embedding_model/vocab.txt',
 'embedding_model/added_tokens.json',
 'embedding_model/tokenizer.json')

## Convert LLM model

In [4]:
SUPPORTED_LLM_MODELS = {"llama-2-chat-7b": {"model_id": "meta-llama/Llama-2-7b-chat-hf"}}
llm_model_id = list(SUPPORTED_LLM_MODELS)

llm_model_id = widgets.Dropdown(
    options=llm_model_id,
    value=llm_model_id[-1],
    description="LLM Model:",
    disabled=False,
)

llm_model_id

Dropdown(description='LLM Model:', options=('llama-2-chat-7b',), value='llama-2-chat-7b')

In [6]:
llm_model_configuration = SUPPORTED_LLM_MODELS[llm_model_id.value]
print(f"Selected LLM model {llm_model_id.value}")

Selected LLM model llama-2-chat-7b


In [7]:
from IPython.display import display

prepare_int4_model = widgets.Checkbox(
    value=True,
    description="Prepare INT4 model",
    disabled=False,
)
prepare_int8_model = widgets.Checkbox(
    value=False,
    description="Prepare INT8 model",
    disabled=False,
)
prepare_fp16_model = widgets.Checkbox(
    value=False,
    description="Prepare FP16 model",
    disabled=False,
)

display(prepare_int4_model)
display(prepare_int8_model)
display(prepare_fp16_model)

Checkbox(value=True, description='Prepare INT4 model')

Checkbox(value=False, description='Prepare INT8 model')

Checkbox(value=False, description='Prepare FP16 model')

In [9]:
nncf.set_log_level(logging.ERROR)

pt_model_id = llm_model_configuration["model_id"]
model_type = AutoConfig.from_pretrained(pt_model_id, trust_remote_code=True).model_type
fp16_model_dir = Path(llm_model_id.value) / "FP16"
int8_model_dir = Path(llm_model_id.value) / "INT8_compressed_weights"
int4_model_dir = Path(llm_model_id.value) / "INT4_compressed_weights"


def convert_to_fp16():
    if (fp16_model_dir / "openvino_model.xml").exists():
        return
    ov_model = OVModelForCausalLM.from_pretrained(
        pt_model_id, export=True, compile=False
    )
    ov_model.half()
    ov_model.save_pretrained(fp16_model_dir)
    del ov_model
    gc.collect()


def convert_to_int8():
    if (int8_model_dir / "openvino_model.xml").exists():
        return
    int8_model_dir.mkdir(parents=True, exist_ok=True)
    if fp16_model_dir.exists():
        ov_model = OVModelForCausalLM.from_pretrained(fp16_model_dir, compile=False)
    else:
        ov_model = OVModelForCausalLM.from_pretrained(
            pt_model_id, export=True, compile=False
        )
        ov_model.half()
    quantizer = OVQuantizer.from_pretrained(ov_model)
    quantizer.quantize(save_directory=int8_model_dir, weights_only=True)
    del quantizer
    del ov_model
    gc.collect()


def convert_to_int4():
    compression_configs = {
        "llama-2-chat-7b": {
            "mode": nncf.CompressWeightsMode.INT4_SYM,
            "group_size": 128,
            "ratio": 0.8,
        },
        "default": {
            "mode": nncf.CompressWeightsMode.INT4_ASYM,
            "group_size": 128,
            "ratio": 0.8,
        },
    }

    model_compression_params = compression_configs.get(
        llm_model_id.value, compression_configs["default"]
    )
    if (int4_model_dir / "openvino_model.xml").exists():
        return
    int4_model_dir.mkdir(parents=True, exist_ok=True)
    if not fp16_model_dir.exists():
        model = OVModelForCausalLM.from_pretrained(
            pt_model_id, export=True, compile=False
        ).half()
        model.config.save_pretrained(int4_model_dir)
        ov_model = model.model
        del model
        gc.collect()
    else:
        ov_model = ov.Core().read_model(fp16_model_dir / "openvino_model.xml")
        shutil.copy(fp16_model_dir / "config.json", int4_model_dir / "config.json")
    compressed_model = nncf.compress_weights(ov_model, **model_compression_params)
    ov.save_model(compressed_model, int4_model_dir / "openvino_model.xml")
    del ov_model
    del compressed_model
    gc.collect()


if prepare_fp16_model.value:
    convert_to_fp16()
if prepare_int8_model.value:
    convert_to_int8()
if prepare_int4_model.value:
    convert_to_int4()

## Select device for inference and model variant
[back to top ⬆️](#Table-of-contents:)

>**Note**: There may be no speedup for INT4/INT8 compressed models on dGPU.

In [10]:
core = ov.Core()
device = widgets.Dropdown(
    options=core.available_devices + ["AUTO"],
    value="CPU",
    description="Device:",
    disabled=False,
)

device

Dropdown(description='Device:', options=('CPU', 'GPU', 'AUTO'), value='CPU')

## Load embedding model

In [11]:
embedding = OpenVINO_Embeddings.from_model_id(Path(embedding_model_id.value), model_kwargs={
                                              "device_name": device.value,  "config": {"PERFORMANCE_HINT": "THROUGHPUT"}})


## Load LLM model

In [13]:
available_models = []
if int4_model_dir.exists():
    available_models.append("INT4")
if int8_model_dir.exists():
    available_models.append("INT8")
if fp16_model_dir.exists():
    available_models.append("FP16")

model_to_run = widgets.Dropdown(
    options=available_models,
    value=available_models[0],
    description="Model to run:",
    disabled=False,
)

model_to_run

Dropdown(description='Model to run:', options=('INT4', 'FP16'), value='INT4')

In [16]:
from langchain.llms import HuggingFacePipeline

if model_to_run.value == "INT4":
    model_dir = int4_model_dir
elif model_to_run.value == "INT8":
    model_dir = int8_model_dir
else:
    model_dir = fp16_model_dir
print(f"Loading model from {model_dir}")

model_name = llm_model_configuration["model_id"]

ov_model = OVModelForCausalLM.from_pretrained(model_dir, device=device.value)
tokenizer = AutoTokenizer.from_pretrained(model_name)
streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
pipe = pipeline("text-generation", model=ov_model, tokenizer=tokenizer, max_new_tokens=256, streamer=streamer)
llm = HuggingFacePipeline(pipeline=pipe)

Loading model from llama-2-chat-7b/FP16


Compiling the model to CPU ...
Setting OpenVINO CACHE_DIR to llama-2-chat-7b/FP16/model_cache


## Chat with Documents

In [24]:
from langchain import hub
from langchain.vectorstores import Chroma
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain.document_loaders import UnstructuredMarkdownLoader
from threading import Event, Thread
import gradio as gr
import time
from uuid import uuid4


def build_chain(pdf_doc, chunk_size, chunk_overlap):
    loader = UnstructuredMarkdownLoader(pdf_doc.name)
    documents = loader.load()
    text_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    texts = text_splitter.split_documents(documents)
    db = Chroma.from_documents(texts, embedding)
    retriever = db.as_retriever()

    global rag_chain
    prompt = hub.pull("rlm/rag-prompt")
    chain_type_kwargs = {"prompt": prompt}
    rag_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        chain_type_kwargs=chain_type_kwargs,
    )

    return "Ready"


def user(message, history):
    """
    callback function for updating user messages in interface on submit button click

    Params:
      message: current message
      history: conversation history
    Returns:
      None
    """
    # Append the user's message to the conversation history
    return "", history + [(message, None)]


def bot(history):
    stream_complete = Event()

    def infer(question):
        rag_chain.run(question)
        stream_complete.set()

    t1 = Thread(target=infer, args=(history[-1][0],))
    t1.start()
    history[-1][1] = ""
    for chunk in streamer:
        history[-1][1] += chunk
        time.sleep(0.05)
        yield history


def get_uuid():
    """
    universal unique identifier for thread
    """
    return str(uuid4())


with gr.Blocks(
    theme=gr.themes.Soft(),
    css=".disclaimer {font-variant-caps: all-small-caps;}",
) as demo:
    conversation_id = gr.State(get_uuid)
    gr.Markdown(f"""<h1><center>Chat with Documents</center></h1>""")
    with gr.Row():
        with gr.Column(scale=1):
            md_doc = gr.File(label="Load a markdown file", file_types=[".md"])
            load_md = gr.Button("Build Retriever")
            retriever_argument = gr.Accordion("Retriever Configuration")
            with retriever_argument:
                chunk_size = gr.Slider(
                        label="Chunk size",
                        value=500,
                        minimum=100,
                        maximum=2000,
                        step=50,
                        interactive=True,
                        info="Size of chunk",)

                chunk_overlap = gr.Slider(
                        label="Chunk overlap",
                        value=100,
                        minimum=0,
                        maximum=200,
                        step=10,
                        interactive=True,
                        info=(
                            "Overlap between 2 chunks"
                        ),)
            langchain_status = gr.Textbox(
                label="Status", placeholder="", interactive=False
            )
            # model_argument = gr.Accordion("Model Configuration")
            # with model_argument:
            #     top_k = gr.Slider(
            #             label="Top-k",
            #             value=50,
            #             minimum=0.0,
            #             maximum=200,
            #             step=1,
            #             interactive=True,
            #             info="Sample from a shortlist of top-k tokens — 0 to disable and sample from all tokens.",)

            #     top_p = gr.Slider(
            #             label="Top-p (nucleus sampling)",
            #             value=1.0,
            #             minimum=0.0,
            #             maximum=1,
            #             step=0.01,
            #             interactive=True,
            #             info=(
            #                 "Sample from the smallest possible set of tokens whose cumulative probability "
            #                 "exceeds top_p. Set to 1 to disable and sample from all tokens."
            #             ),)

            #     repetition_penalty = gr.Slider(
            #             label="Repetition Penalty",
            #             value=1.1,
            #             minimum=1.0,
            #             maximum=2.0,
            #             step=0.1,
            #             interactive=True,
            #             info="Penalize repetition — 1.0 to disable.",)

            #     temperature = gr.Slider(
            #             label="Temperature",
            #             value=0.1,
            #             minimum=0.0,
            #             maximum=1.0,
            #             step=0.1,
            #             interactive=True,
            #             info="Higher values produce more diverse outputs",)

        with gr.Column(scale=4):
            chatbot = gr.Chatbot(height=500)
            with gr.Row():
                with gr.Column():
                    msg = gr.Textbox(
                        label="Chat Message Box",
                        placeholder="Chat Message Box",
                        show_label=False,
                        container=False,
                    )
                with gr.Column():
                    with gr.Row():
                        submit = gr.Button("Submit")
                        stop = gr.Button("Stop")
                        clear = gr.Button("Clear")
    load_md.click(build_chain, inputs=[md_doc, chunk_size, chunk_overlap], outputs=[langchain_status], queue=False)
    submit_event = msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
        bot, chatbot, chatbot, queue=True
    )
    submit_click_event = submit.click(user, [msg, chatbot], [msg, chatbot], queue=False).then(
        bot, chatbot, chatbot, queue=True
    )
    stop.click(
        fn=None,
        inputs=None,
        outputs=None,
        cancels=[submit_event, submit_click_event],
        queue=False,
    )
    clear.click(lambda: None, None, chatbot, queue=False)

demo.queue(max_size=2)
demo.launch(server_name="10.3.233.70", server_port=7862, share=False)

Running on local URL:  http://10.3.233.70:7862

To create a public link, set `share=True` in `launch()`.




Created a chunk of size 661, which is longer than the specified 500
Created a chunk of size 1558, which is longer than the specified 500
Created a chunk of size 812, which is longer than the specified 500
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  self.request.start_async(inputs, shared_memory=True)
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  self.request.start_async(inputs, shared_memory=True)


In [25]:
demo.close()
del rag_chain

Closing server running on port: 7862
