# setup

In [1]:
#core deep learning framework
from transformers import  AutoTokenizer
from optimum.intel import OVModelForCausalLM ,OVModelForFeatureExtraction

#prompt managme
from faiss import IndexFlatL2
from collections import deque
import numpy as np

#utillty
from os.path import join

INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, onnx, openvino


In [2]:
tokenizer = AutoTokenizer.from_pretrained(join("quantized_model","tokenizer"))
model = OVModelForCausalLM.from_pretrained(join("quantized_model","INT_8"))#,export=True)

Compiling the model to CPU ...
Setting OpenVINO CACHE_DIR to quantized_model/INT_8/model_cache


In [3]:
tokenizer.pad_token = tokenizer.eos_token

In [4]:
#u can use any feature extraction model here
emb_tokenizer = AutoTokenizer.from_pretrained("thenlper/gte-small")
emb_model = OVModelForFeatureExtraction.from_pretrained("thenlper/gte-small",export=True)

Framework not specified. Using pt to export to ONNX.
Using the export variant default. Available variants are:
    - default: The default ONNX variant.
Using framework PyTorch: 2.1.0+cu121
Overriding 1 configuration item(s)
	- use_cache -> False
Compiling the model to CPU ...


# utils

In [5]:
def shape_dict(d):
    return {k:np.array(v).shape for k,v in d.items()}

In [6]:
#used almost everywhere for chatbot apis this is a good thing to know
#I found myself writing this function in every chatbot I ever made
def openai_format(text: str, role ='system'):
    assert role in ('assistant','user','system')
    return {'content':text,'role':role}

# database

In [7]:
# List of product descriptions for the SereniTea chatbot
product_descriptions = [
    """Matcha is a premium green tea powder from Japan. Once prepared, it becomes a vibrant green beverage with a creamy mouthfeel. Its unique flavor is rich and grassy with undertones of umami. Traditionally used in tea ceremonies, Matcha is also a popular ingredient in modern culinary dishes and beverages.""",
    
    """Chamomile tea is a herbal infusion made from dried chamomile flowers and is renowned for its mild and soothing flavor that hints at a light apple sweetness. Widely consumed for its calming effects and its ability to improve sleep quality, Chamomile tea is a nighttime favorite.""",
    
    """Oolong tea, a traditional Chinese tea, is made from leaves of the same plant that gives us green and black tea. Its oxidation process is stopped somewhere between the standards for green and black tea, giving it a complexity of flavor and aroma that can range from bright and floral to rich and savory.""",
    
    """Black tea is known for its strong flavors and is the most oxidized of all tea types. It can range in flavor from sweet and malty to robust and smoky. Regular consumption of black tea has been linked to a variety of health benefits, including improved cholesterol levels and better gut health and immunity."""
    # Additional product descriptions would be appended to the list
]


In [8]:
batch_dict = tokenizer(product_descriptions, max_length=512, padding=True, truncation=True, return_tensors='pt')
shape_dict(batch_dict)

{'input_ids': (4, 79), 'attention_mask': (4, 79)}

In [9]:
outputs=model(**batch_dict)
shape_dict(outputs)

  self.request.start_async(inputs, shared_memory=True)


{'logits': (4, 79, 32000), 'past_key_values': (32, 2, 4, 32, 79, 128)}

In [10]:
def get_last_key(outputs):
    #geting the last key
    emb=np.array(outputs['past_key_values'])[-1][0] #shape = (Batch, 32, Time, 128)
    #combining over attention heads
    emb=emb.swapaxes(1,2) #shape = (Batch, Time, 32, 128) 
    return emb.reshape(emb.shape[:2]+(-1,)) #shape = (Batch, Time, 4096)

emb=get_last_key(outputs)
print(emb.shape)

(4, 79, 4096)


In [11]:
def average_pool(array, attention_mask):
    return (array*attention_mask[:, :, np.newaxis]).sum(1)/attention_mask.sum(1)[:,np.newaxis]

pool=average_pool(emb,batch_dict['attention_mask'].numpy())
pool.shape

(4, 4096)

In [12]:
def get_embedding(outputs,inputs):
    key=get_last_key(outputs)
    return average_pool(key,inputs['attention_mask'].numpy())

embeddings=get_embedding(outputs,batch_dict)
embeddings.shape

(4, 4096)

## embedding

In [13]:
def average_pool(last_hidden_states,attention_mask):
    last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
    return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]

def make_embedding(texts):
    # Tokenize the input texts
    batch_dict = emb_tokenizer(texts, max_length=512, padding=True, truncation=True, return_tensors='pt')

    outputs = emb_model(**batch_dict)
    #print(outputs.keys())
    embeddings = average_pool(outputs.last_hidden_state, batch_dict['attention_mask'])

    return embeddings.cpu().numpy()

In [14]:
embeddings=make_embedding(product_descriptions)
embeddings

array([[-0.4337214 , -0.14420573,  0.3533751 , ..., -0.68104327,
         0.6894532 , -0.17023948],
       [-0.18854867, -0.04905184,  0.3122417 , ...,  0.08611374,
         0.9116094 ,  0.01603312],
       [ 0.05724301, -0.17354068,  0.3363754 , ..., -0.18687753,
         0.90574044,  0.22618802],
       [-0.11154529, -0.17711419,  0.12071578, ..., -0.43974218,
         0.8788762 ,  0.20621175]], dtype=float32)

## faiss

In [15]:
database_index=IndexFlatL2(embeddings.shape[-1])
database_index.add(embeddings)
database_index

<faiss.swigfaiss_avx2.IndexFlatL2; proxy of <Swig Object of type 'faiss::IndexFlatL2 *' at 0x7fcf19e0b660> >

In [16]:
emb=make_embedding(['I am having stomach issues what tea is best for me?'])
ans=database_index.search(emb,1)
ans

(array([[24.36969]], dtype=float32), array([[3]]))

In [17]:
product_descriptions[ans[1][0][0]]

'Black tea is known for its strong flavors and is the most oxidized of all tea types. It can range in flavor from sweet and malty to robust and smoky. Regular consumption of black tea has been linked to a variety of health benefits, including improved cholesterol levels and better gut health and immunity.'

In [18]:
def get_product(text):
    emb=make_embedding([text])
    ans=database_index.search(emb,1)
    return product_descriptions[ans[1][0][0]]

get_product('hey I want to add some tea to my dish')

'Oolong tea, a traditional Chinese tea, is made from leaves of the same plant that gives us green and black tea. Its oxidation process is stopped somewhere between the standards for green and black tea, giving it a complexity of flavor and aroma that can range from bright and floral to rich and savory.'

# chatbot
this is being debugged rn something is very off about it. 
seems like the model has a strong tendency to repeat the prompt
and when we try and corect that by adding a no repeat we see it do small modifications to get away with it

In [19]:
def get_system_prompt(user_text):
    product=get_product(user_text)
    return openai_format(f'''\
you are a sales chatbot in charge of the tea shop 'Moms Homebrew Tea' help line.
you will help users figure out which of our products fits them best. 
based on the curent conversation the system retrived this SPECIFIC product:
"{product}"
tell the user about the parts that are relevent to them and explain that we make the tea in house by specialists.
if the user isnt happy with the tea. or if you think this product isnt the best fit try and get the user to be more specific''')

get_system_prompt('hi')

{'content': 'you are a sales chatbot in charge of the tea shop \'Moms Homebrew Tea\' help line.\nyou will help users figure out which of our products fits them best. \nbased on the curent conversation the system retrived this SPECIFIC product:\n"Matcha is a premium green tea powder from Japan. Once prepared, it becomes a vibrant green beverage with a creamy mouthfeel. Its unique flavor is rich and grassy with undertones of umami. Traditionally used in tea ceremonies, Matcha is also a popular ingredient in modern culinary dishes and beverages."\ntell the user about the parts that are relevent to them and explain that we make the tea in house by specialists.\nif the user isnt happy with the tea. or if you think this product isnt the best fit try and get the user to be more specific',
 'role': 'system'}

In [20]:
# def get_system_prompt(user_text):
#      return openai_format('say AAA to the user regardless of their message')

In [21]:
def respond_to_user(user_text,history):
    #prompt enginerring
    history.append(openai_format(user_text,'user'))
    messages=(get_system_prompt(user_text),)+tuple(history)
    print(messages)

    print([x['role'] for x in messages])
    #infrence
    inputs=tokenizer.apply_chat_template(messages,return_tensors='pt',add_generation_prompt=True)
    output=model.generate(inputs,repetition_penalty=1.2,max_new_tokens=100,min_new_tokens=12,no_repeat_ngram_size=3,
                          num_beams=2,top_k=10,do_sample=False,)
    output=tokenizer.batch_decode(output[:,inputs.shape[-1]:],skip_special_tokens=True)
    #output=tokenizer.batch_decode(output,skip_special_tokens=True)
    #output
    history.append(openai_format(output,'assistant'))
    return output

history=deque(maxlen=3)
#ans=respond_to_user('hey I need some tea to help with my stomach issues do you guys sell any?',history)
ans=respond_to_user('I am having stomach issues what tea is best for me?',history)
ans


No chat template is defined for this tokenizer - using the default template for the LlamaTokenizerFast class. If the default is not appropriate for your model, please set `tokenizer.chat_template` to an appropriate template. See https://huggingface.co/docs/transformers/main/chat_templating for more information.

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


({'content': 'you are a sales chatbot in charge of the tea shop \'Moms Homebrew Tea\' help line.\nyou will help users figure out which of our products fits them best. \nbased on the curent conversation the system retrived this SPECIFIC product:\n"Black tea is known for its strong flavors and is the most oxidized of all tea types. It can range in flavor from sweet and malty to robust and smoky. Regular consumption of black tea has been linked to a variety of health benefits, including improved cholesterol levels and better gut health and immunity."\ntell the user about the parts that are relevent to them and explain that we make the tea in house by specialists.\nif the user isnt happy with the tea. or if you think this product isnt the best fit try and get the user to be more specific', 'role': 'system'}, {'content': 'I am having stomach issues what tea is best for me?', 'role': 'user'})
['system', 'user']


['\n\n[INST] >>SYS<<\n\nYou are a chatbot for a tea shop. You are in charge for the help line of Moms Home Brewed Tea.\nYou will help the user find the right tea for them.\nBased on the current conversation the chatbot has retrieved this specific product: "Black tea has a strong flavor and is known to be the most oxygenated of all teas. It ranges from sweet to malty, to']

In [22]:
history

deque([{'content': 'I am having stomach issues what tea is best for me?',
        'role': 'user'},
       {'content': ['\n\n[INST] >>SYS<<\n\nYou are a chatbot for a tea shop. You are in charge for the help line of Moms Home Brewed Tea.\nYou will help the user find the right tea for them.\nBased on the current conversation the chatbot has retrieved this specific product: "Black tea has a strong flavor and is known to be the most oxygenated of all teas. It ranges from sweet to malty, to'],
        'role': 'assistant'}],
      maxlen=3)

# UI

In [23]:
max_new_tokens = 256

class StopOnTokens(StoppingCriteria):
    def __init__(self, token_ids):
        self.token_ids = token_ids
    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        for stop_id in self.token_ids:
            if input_ids[0][-1] == stop_id:
                return True
        return False

if stop_tokens is not None:
    if isinstance(stop_tokens[0], str):
        stop_tokens = tok.convert_tokens_to_ids(stop_tokens)
        
    stop_tokens = [StopOnTokens(stop_tokens)]

def default_partial_text_processor(partial_text:str, new_text:str):
    """
    helper for updating partially generated answer, used by de
    
    Params:
      partial_text: text buffer for storing previosly generated text
      new_text: text update for the current step
    Returns:
      updated text string
    
    """
    partial_text += new_text
    return partial_text

text_processor = model_configuration.get("partial_text_processor", default_partial_text_processor)


def get_uuid():
    """
    universal unique identifier for thread
    """
    return str(uuid4())

NameError: name 'StoppingCriteria' is not defined

In [24]:
# history=deque(maxlen=3)
#streamer = TextIteratorStreamer(tokenizer, timeout=30.0, skip_prompt=True, skip_special_tokens=True)

In [25]:
def respond_to_user(user_text,history):
    #prompt enginerring
    history.append(openai_format(user_text,'user'))
    messages=(get_system_prompt(user_text),)+tuple(history)

    #infrence
    inputs=tokenizer.apply_chat_template(messages,return_tensors='pt',add_generation_prompt=True)
    output=model.generate(inputs,repetition_penalty=1.2,max_new_tokens=100,min_new_tokens=12,no_repeat_ngram_size=3,
                          num_beams=2,top_k=10,do_sample=False,)
    output=tokenizer.batch_decode(output[:,inputs.shape[-1]:],skip_special_tokens=True)
    #output=tokenizer.batch_decode(output,skip_special_tokens=True)
    #output
    history.append(openai_format(output,'assistant'))
    return output[0]

In [26]:
import gradio as gr


history=deque(maxlen=3)
display_history = []  # Initialize history list

def submit_response(message):
    # Update history with user message
    display_history.append(("user", message))
    
    # Get the bot's response and update 
    print(history)
    bot_response = respond_to_user(message, history)
    display_history.append(("bot", "  ."+bot_response))
    
    # Return the updated history to be displayed in the chat
    return display_history

with gr.Blocks() as demo:
    gr.Markdown("<h1><center>OpenVINO Chatbot</center></h1>")
    chatbot = gr.Chatbot(height=100)
    msg = gr.Textbox(placeholder="Type your message here...", lines=2)
    
    # When the button is clicked, the submit_response function is called and the output updates the chatbot
    submit = gr.Button("Submit").click(submit_response, inputs=msg, outputs=chatbot)
    clear = gr.Button("Clear").click(lambda: chatbot.clear(), inputs=None, outputs=chatbot)

demo.launch()


Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.




The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


deque([], maxlen=3)


  self.request.start_async(inputs, shared_memory=True)


In [None]:
# please run this cell for stopping gradio interface
demo.close()