# Demo of a Tesla customer support assistant chatbot

In [1]:
import os
from typing import Tuple
import torch

# simple hack to support import module from parent directory
import sys
sys.path.append('../')

from rag_llama.core.retrievers import RerankRetriever
from rag_llama.core.generation import Llama, Dialog


Set a proxy for the notebook kernel. This is only needed for my local environment and can be removed.

In [2]:
# os.environ['http_proxy'] = "http://127.0.0.1:1081"
# os.environ['https_proxy'] = "http://127.0.0.1:1081"

## Define system message and input query format templates. 
Note that these templates are for single-turn chat, similar to how search works.

In [3]:
# System prompt for the baseline dialog (no retrieved documents).
# The triple-quoted literal keeps its leading and trailing newline characters.
SYSTEM_MESSAGE_NORAG = """
You are an assistant to a Tesla customer support team. Your job is to answer customer's questions to the best of your ability.
"""

# System prompt for the RAG dialog: the baseline prompt above, followed by extra
# instructions on how to use the reference documents supplied with the question.
SYSTEM_MESSAGE_WITHRAG = SYSTEM_MESSAGE_NORAG + """
 Your role involves leveraging a set of reference documents to ensure accurate responses. 
 While some documents may not directly apply to every question, focus solely on those that seem pertinent. 
 Avoid referencing or citing documents not provided. 
 Craft concise answers, incorporating relevant sections from the provided documents to assist customers effectively.
"""

def get_formatted_input_dialog(query: str, doc_strs: str=None) -> Dialog:
    """Build a single-turn dialog (system message followed by user message) for `query`.

    When `doc_strs` is given and is longer than 10 characters, the user message
    combines the question with the documents and the RAG system message is used.
    Otherwise the plain question is sent together with the no-RAG system message.

    Args:
        query: the user question.
        doc_strs: reference documents already joined into one string, or None.

    Returns:
        A list of two message dicts, each with 'role' and 'content' keys.
    """
    has_documents = doc_strs is not None and len(doc_strs) > 10

    # No usable reference text: fall back to the plain question
    if not has_documents:
        return [
            {'role': 'system', 'content': SYSTEM_MESSAGE_NORAG},
            {'role': 'user', 'content': query},
        ]

    user_content = f"Question:\n{query}\n\n####\n\nDocuments:\n{doc_strs}"
    return [
        {'role': 'system', 'content': SYSTEM_MESSAGE_WITHRAG},
        {'role': 'user', 'content': user_content},
    ]

Create the reranking retrieval instance and the LLaMA 2 chat generator instance.

In [4]:
# Input files: the document embeddings (relative to this notebook) and the
# LLaMA 2 chat checkpoint / tokenizer under the current user's home directory.
doc_embed_file = "../data/Tesla_manual_embeddings.pk"
llama_model_ckpt = os.path.expanduser("~/models/meta_llama2/llama-2-7b-chat/consolidated.pth")
llama_tokenizer_ckpt = os.path.expanduser("~/models/meta_llama2/tokenizer.model")
# Use the GPU when torch reports CUDA is available, otherwise fall back to CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# LLM parameters
max_seq_len = 4096  # passed to Llama.build
max_gen_len = 1024  # passed to generator.chat_completion
temperature = 0.6 # model sampling temperature
top_p = 0.9 # model sampling top P

# RAG specific parameters (both are passed to rerank_retriever.retrieve)
top_k = 50 # select top K items to retrieve during naive retrieval
top_n = 3 # select top N items based on scores from reranking model

In [5]:
# Retriever built from the document embeddings file, placed on the same device as the generator
rerank_retriever = RerankRetriever(embed_file=doc_embed_file, device=device)

# Chat generator built from the checkpoint and tokenizer paths configured earlier.
# NOTE(review): max_batch_size is fixed to 2 here, while run_chat_completions submits
# one dialog per call - presumably 1 would be enough; confirm against Llama.build.
generator = Llama.build(
    ckpt_path=llama_model_ckpt,
    tokenizer_path=llama_tokenizer_ckpt,
    max_seq_len=max_seq_len,
    max_batch_size=2,
    device=device,
)

Loading sentence-transformers/all-MiniLM-L6-v2 model and tokenizer from HuggingFace...
Loading cross-encoder/ms-marco-MiniLM-L-6-v2 model and tokenizer from HuggingFace...
Starting to load tokenizer checkpoint '/home/michael/models/meta_llama2/tokenizer.model' ...
Starting to load model checkpoints '/home/michael/models/meta_llama2/llama-2-7b-chat/consolidated.pth' ...
Model checkpoint loaded in 16.15 seconds


## Main logic for retrieval and LLM generation

In [6]:
def run_chat_completions(query: str) -> Tuple[str, str]:
    """Run chat completion with the same query twice, once without RAG and once with RAG.

    Args:
        query: the user question.

    Returns:
        A pair `(response_without_rag, response_with_rag)`. When reference documents
        were retrieved, their text is appended to the end of the second response.
    """

    retrieved_items = rerank_retriever.retrieve(query, top_k, top_n)

    # Join multiple documents into a single document string
    ref_doc_strs = "\n\n".join(item['formatted_text'] for item in retrieved_items)

    # build two dialogs with the same query, first without using RAG, second using RAG
    dialogs = [get_formatted_input_dialog(query, None), get_formatted_input_dialog(query, ref_doc_strs)]

    results = []
    # Each dialog is submitted as its own batch of one; the original author notes this is
    # much faster because the dialog without RAG is much shorter.
    for dialog in dialogs:
        result = generator.chat_completion(
            [dialog],  # input needs to be a batch of dialogs
            max_gen_len=max_gen_len,
            temperature=temperature,
            top_p=top_p,
        )

        results.extend(result)

    response_without_rag = results[0]['generation']['content']
    response_with_rag = results[1]['generation']['content']

    # add reference documents at the end of response; when retrieval returned nothing
    # there is no text to show, so do not emit an empty "Reference documents" section
    if ref_doc_strs:
        response_with_rag += f"\n\nReference documents:\n{ref_doc_strs}"

    return response_without_rag, response_with_rag


A simple hack to display the chat completions with and without RAG side-by-side for better comparison

In [7]:
from IPython.display import HTML, display

def display_completions_in_two_column(query, response, response_with_rag):
    """
    Display content in two columns side-by-side.

    All three inputs are treated as plain text: they are HTML-escaped before being
    embedded in the markup, so characters such as '<' or '&' in the query, the
    generated answers or the appended reference documents are shown literally
    instead of being interpreted as HTML.

    Parameters:
    query (str): user query, shown above the two columns.
    response (str): response for left column.
    response_with_rag (str): response for right column.
    """
    import html  # standard library; imported locally so this cell stays self-contained

    def to_html(text):
        # Escape first, then convert newlines, so the inserted <br> tags are not escaped
        return html.escape(text).replace('\n', '<br>')

    query = html.escape(query)
    response = to_html(response)
    response_with_rag = to_html(response_with_rag)

    # The outer <div> is closed with </div> so the markup is balanced
    html_content = f'''
    <div>
        <div style="padding: 20px 0; font-weight: bold;">User: {query}</div>
        <div style="display: grid; grid-template-columns: repeat(2, 1fr); gap: 20px;">
            <div>
                <div style="font-weight: bold;">Assistant:</div>
                {response}
            </div>
            <div>
                <div style="font-weight: bold;">Assistant with RAG:</div>
                {response_with_rag}
            </div>
        </div>
    </div>
    '''
    display(HTML(html_content))

In [8]:
def ask_question(query: str):
    """Answer `query` with and without RAG and show both answers side by side."""
    plain_answer, rag_answer = run_chat_completions(query)
    display_completions_in_two_column(query, plain_answer, rag_answer)

## Now we can start asking questions about Tesla cars

In [9]:
# How-to question that names a specific model and model year
ask_question('How to enable Autopilot on Tesla Model S 2018 model?')

In [10]:
# Safety question: when full self-driving should not be used
ask_question('Under what circumstances that I should not use full self-driving on my Tesla car?')

In [11]:
# Question about opening the door when the power is very low
ask_question('How to open the door of a Tesla Model S car when the power is very low?')

In [12]:
# Question about using Autopilot in rain or snow
ask_question('Can I use autopilot in raining or snowing conditions?')

In [13]:
# Troubleshooting question: unresponsive touchscreen
ask_question('What should I do if the touchscreen of my Tesla car is not responding?')

In [14]:
# Maintenance question about regular oil changes
ask_question('Do I need to change oil regularly for my Tesla car?')

In [15]:
# Care question about using a regular car washing service
ask_question('Can I wash my Tesla car using regular car washing service?')