### Local RAG Part 2: Query and Retrevival from Vector Database

- Asking for user input/query
- Vector Serach and retreival of the relevant docs
- Modifications to include the used reference doc file(s) in the final responce
- Puting all together in a Gradio UI

ToDo:
later you can add more models and the option to switch models in the UI!


In [1]:
# imports

import os
import glob
from typing import List
import gradio as gr
from pathlib import Path
from dotenv import load_dotenv
import chardet  # For detecting file encoding

In [5]:
# imports for langchain

from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_anthropic import ChatAnthropic
from langchain_chroma import Chroma
import numpy as np
from sklearn.manifold import TSNE
import plotly.graph_objects as go
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.embeddings import HuggingFaceEmbeddings

In [3]:
# environment

load_dotenv()
os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY', 'your-key-if-not-using-env')
os.environ['ANTHROPIC_API_KEY'] = os.getenv('ANTHROPIC_API_KEY', 'your-key-if-not-using-env')

In [4]:
# price is a factor for our company, so we're going to use a low cost model

OPENAI_MODEL = "gpt-4o-mini" #"gpt-4o"
CLAUDE_MODEL = "claude-3-5-sonnet-20240620"
MODEL = "gpt-4o-mini"

# Want to keep costs ultra-low? Uncomment these lines:
# OPENAI_MODEL = "gpt-4o-mini"
# CLAUDE_MODEL = "claude-3-haiku-20240307"

db_name = "mj_vector_db"

In [6]:
def load_or_check_db(db_name: str):
    """
    Check if a Chroma database exists and load it, or inform if it doesn't exist.
    
    Args:
        db_name (str): Name of the database directory
        
    Returns:
        Chroma: The loaded vector store if it exists, None otherwise
    """
    # Initialize the embedding function
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )
    
    # Check if the database directory exists
    if os.path.exists(db_name):
        try:
            # Try to load the existing database
            vectorstore = Chroma(
                persist_directory=db_name,
                embedding_function=embeddings
            )
            print(f"Successfully loaded database with {vectorstore._collection.count()} documents")
            return vectorstore
        except Exception as e:
            print(f"Error loading database: {str(e)}")
            return None
    else:
        print(f"Database '{db_name}' does not exist.")
        return None



### Loading vector database

In [7]:
db_name = "mj_vector_db"
vectorstore = load_or_check_db(db_name)

if vectorstore is None:
    print("Please create and populate the database first.")

  embeddings = HuggingFaceEmbeddings(


Successfully loaded database with 81 documents


In [8]:
# Get one vector and find how many dimensions it has

collection = vectorstore._collection
sample_embedding = collection.get(limit=1, include=["embeddings"])["embeddings"][0]
dimensions = len(sample_embedding)
print(f"The vectors have {dimensions:,} dimensions")

The vectors have 384 dimensions


### Visualizing the Vector Store

Let's take a minute to look at the documents and their embedding vectors to see what's going on.

In [9]:
# Prework

result = collection.get(include=['embeddings', 'documents', 'metadatas'])
vectors = np.array(result['embeddings'])
documents = result['documents']
doc_types = [metadata['doc_name'] for metadata in result['metadatas']]
# colors = [['blue', 'green', 'red', 'orange'][['products', 'employees', 'contracts', 'company'].index(t)] for t in doc_types]

In [10]:
# We humans find it easier to visalize things in 2D!
# Reduce the dimensionality of the vectors to 2D using t-SNE
# (t-distributed stochastic neighbor embedding)

tsne = TSNE(n_components=2, random_state=42)
reduced_vectors = tsne.fit_transform(vectors)

# Create the 2D scatter plot
fig = go.Figure(data=[go.Scatter(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    mode='markers',
    marker=dict(size=5, opacity=0.8),
    text=[f"Type: {t}<br>Text: {d[:100]}..." for t, d in zip(doc_types, documents)],
    hoverinfo='text'
)])

fig.update_layout(
    title='2D Chroma Vector Store Visualization',
    scene=dict(xaxis_title='x',yaxis_title='y'),
    width=800,
    height=600,
    margin=dict(r=20, b=10, l=10, t=40)
)

fig.show()

In [11]:
os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY', 'your-key-if-not-using-env')
os.environ['ANTHROPIC_API_KEY'] = os.getenv('ANTHROPIC_API_KEY', 'your-key-if-not-using-env')

In [12]:
def select_model(model_name="GPT"):
    # Default llm
    llm = ChatOpenAI(temperature=0.7, model_name=OPENAI_MODEL)
    if model_name=="Claude":
        llm = ChatAnthropic(
            temperature=0.7,
            model=CLAUDE_MODEL,  # or "claude-3-opus-20240229" for the larger model
            anthropic_api_key=os.environ['ANTHROPIC_API_KEY']  # Set this or use environment variable ANTHROPIC_API_KEY
        )
    elif model_name=="GPT":
        llm = ChatOpenAI(temperature=0.7, model_name=OPENAI_MODEL)
    
    return llm

In [13]:
# create a new Chat with OpenAI
llm = ChatOpenAI(temperature=0.7, model_name=MODEL)

# set up the conversation memory for the chat
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)

# the retriever is an abstraction over the VectorStore that will be used during RAG
retriever = vectorstore.as_retriever()

# putting it together: set up the conversation chain with the GPT 4o-mini LLM, the vector store and memory
conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory)


Please see the migration guide at: https://python.langchain.com/docs/versions/migrating_memory/



In [14]:
query = "Can you describe quantization in a few sentences"
result = conversation_chain.invoke({"question":query})
print(result["answer"])

Quantization is a method that reduces the precision of model parameters while maintaining the same number of parameters, which significantly decreases memory consumption. By lowering the precision from 32-bit floating point numbers to as low as 4-bit, the model can fit into limited memory without drastically affecting its performance. This approach allows larger models to be utilized in environments with restricted resources, although there is typically a slight drop in quality.


In [15]:
result

{'question': 'Can you describe quantization in a few sentences',
 'chat_history': [HumanMessage(content='Can you describe quantization in a few sentences', additional_kwargs={}, response_metadata={}),
  AIMessage(content='Quantization is a method that reduces the precision of model parameters while maintaining the same number of parameters, which significantly decreases memory consumption. By lowering the precision from 32-bit floating point numbers to as low as 4-bit, the model can fit into limited memory without drastically affecting its performance. This approach allows larger models to be utilized in environments with restricted resources, although there is typically a slight drop in quality.', additional_kwargs={}, response_metadata={})],
 'answer': 'Quantization is a method that reduces the precision of model parameters while maintaining the same number of parameters, which significantly decreases memory consumption. By lowering the precision from 32-bit floating point numbers to

### Now modifying the code to include the source information of the text data used in responce

In [16]:
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser
from langchain.memory import ConversationBufferMemory
from langchain.prompts import PromptTemplate

In [17]:

# Define your template with explicit mention of metadata
template = """Answer the question based on the following context and include relevant source information:

Context with Sources: {context}

Question: {question}

Please provide your answer along with the sources used:"""

def format_docs_with_metadata(docs):
    """Format documents and their metadata into a readable string"""
    formatted_docs = []
    for doc in docs:
        # Explicitly format metadata to be more readable
        metadata_str = "\n".join([f"{k}: {v}" for k, v in doc.metadata.items()])
        formatted_doc = (
            f"\n---\nContent: {doc.page_content}\n"
            f"Source Information:\n{metadata_str}\n"
            f"Keep all the Source Information and values of these keys:" + "\n".join([f"{k}" for k, v in doc.metadata.items()]) + "\n---"
        )
        formatted_docs.append(formatted_doc)
    return "\n".join(formatted_docs)

prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=template
)

def get_context(input_dict):
    # You might want to increase k for more results
    docs = retriever.get_relevant_documents(input_dict["question"])
    return format_docs_with_metadata(docs)

conversation_chain = (
    {
        "context": lambda x: get_context(x),
        "question": lambda x: x["question"]
    }
    | prompt
    | llm
    | StrOutputParser()
)

# Test the chain
query = "Can you describe quantization and epoch definition in a few sentences"
result = conversation_chain.invoke({"question": query})
print(result)


The method `BaseRetriever.get_relevant_documents` was deprecated in langchain-core 0.1.46 and will be removed in 1.0. Use :meth:`~invoke` instead.



Quantization is a technique used to optimize machine learning models by reducing the precision of the parameters while maintaining the same number of them. For example, converting parameters from 32-bit floating-point numbers to as low as 4-bit can significantly decrease memory consumption without drastically affecting the model's performance, although there might be a slight drop in quality (source: udemy_summarizer, all_summaries.docx).

An epoch in the context of machine learning training refers to one complete pass of the entire dataset through the model. Multiple epochs allow the model to refine its accuracy by making incremental improvements with each iteration of the data, which can enhance overall performance (source: udemy_summarizer, all_summaries.docx). 

Source Information:
- doc_folder: udemy_summarizer
- doc_name: all_summaries.docx
- file_type: .docx
- source: /Users/javadmollakazemi/PycharmProjects/15_llm_engineering/llm_engineering/udemy_summarizer/all_summaries.docx


## Now we will bring this up in Gradio using the Chat interface -

A quick and easy way to prototype a chat with an LLM

In [60]:
# Wrapping in a function - note that history isn't used, as the memory is in the conversation_chain & LangChain handles the history/memory

# def chat(message, history):
    # result = conversation_chain.invoke({"question": message})
    # return result["answer"]

In [18]:
# Wrapping in a function - note that history isn't used, as the memory is in the conversation_chain 
# & LangChain handles the history/memory
def chat(message, history):
    """Chat function for Gradio interface"""
    result = conversation_chain.invoke({"question": message})
    # result is already a string, no need to access result["answer"]
    return result

#### And in Gradio:
# Create the Gradio interface
iface = gr.ChatInterface(
    fn=chat,
    type="messages",
    flagging_mode="manual",
    flagging_options=["Like", "Spam", "Inappropriate", "Other"],
    save_history=True,
    title="RAG-powered Chat Interface",
    description="Ask questions about the documents in the knowledge base."
)

# Launch the interface
iface.launch(inbrowser=True)

* Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [19]:
# Update the chatbot function based on the selected model
# model_dropdown.change(select_model, inputs=model_dropdown, outputs=None)
llm = select_model(model_dropdown)

conversation_chain = (
    {
        "context": lambda x: get_context(x),
        "question": lambda x: x["question"]
    }
    | prompt
    | llm
    | StrOutputParser()
)


NameError: name 'model_dropdown' is not defined

In [110]:

def chat_steram(message, history):
    """Chat function for Gradio interface"""
    result = conversation_chain.invoke({"question": message})
    for i in range(len(result)):
        time.sleep(0.05)
        yield result[: i + 1]

In [111]:
with gr.Blocks() as demo:
    model_dropdown = gr.Dropdown(
        ["GPT", "Claude", "Model 3"], label="Select Model", value="GPT"
    )
    
    # llm = select_model(model_dropdown)
    llm = model_dropdown.change(select_model, inputs=model_dropdown, outputs=None)
    # llm = ChatOpenAI(temperature=0.7, model_name=MODEL)

    # set up the conversation memory for the chat
    memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)

    # the retriever is an abstraction over the VectorStore that will be used during RAG
    retriever = vectorstore.as_retriever()

    # putting it together: set up the conversation chain with the GPT 4o-mini LLM, the vector store and memory
    # conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory)
    conversation_chain = (
    {
            "context": lambda x: get_context(x),
            "question": lambda x: x["question"]
    }
        | prompt
        | llm
        | StrOutputParser()
    )


    chatbot = gr.ChatInterface(
        chat_steram,
        type="messages",
        flagging_mode="manual",
        flagging_options=["Like", "Spam", "Inappropriate", "Other"],
        save_history=True,
    )

    # Update the chatbot function based on the selected model
    # model_dropdown.change(select_model, inputs=model_dropdown, outputs=None)

if __name__ == "__main__":
    demo.launch(inbrowser=True)

* Running on local URL:  http://127.0.0.1:7876

To create a public link, set `share=True` in `launch()`.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Traceback (most recent call last):
  File "/Users/javadmollakazemi/PycharmProjects/15_llm_engineering/llm_engineering/.conda/lib/python3.11/site-packages/gradio/queueing.py", line 625, in process_events
    response = await route_utils.call_process_api(
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/javadmollakazemi/PycharmProjects/15_llm_engineering/llm_engineering/.conda/lib/python3.11/site-packages/gradio/route_utils.py", line 322, in call_process_api
    output = await app.get_blocks().process_api(
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/javadmollakazemi/PycharmProjects/15_llm_engineering/llm_engineering/.conda/lib/python3.11/site-packages/gradio/blocks.py", line 2088, in process_api
    result = await self.call_function(
             ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/javadmollakazemi/PycharmProjects/15_llm_engineering/llm_engineering/.conda/lib/python3.11/site-packages/gradio/blocks.py", line 1647, in call_function
    prediction =

In [None]:
with gr.Blocks() as ui:
    with gr.Row():
        python = gr.Textbox(label="Python code:", lines=10, value=python_hard)
        cpp = gr.Textbox(label="C++ code:", lines=10)
    with gr.Row():
        model = gr.Dropdown(["GPT", "Claude"], label="Select model", value="GPT")
        convert = gr.Button("Convert code")

    convert.click(optimize, inputs=[python, model], outputs=[cpp])

ui.launch(inbrowser=True)

In [None]:
import time
import gradio as gr

def slow_echo(message, history):
    for i in range(len(message)):
        time.sleep(0.05)
        yield "You typed: " + message[: i + 1]

demo = gr.ChatInterface(
    slow_echo,
    type="messages",
    flagging_mode="manual",
    flagging_options=["Like", "Spam", "Inappropriate", "Other"],
    save_history=True,
)

if __name__ == "__main__":
    demo.launch()


In [21]:
import time
import gradio as gr

def slow_echo(message, history):
    for i in range(len(message)):
        time.sleep(0.05)
        yield "You typed: " + message[: i + 1]

def select_model(model_name):
    # This function can be used to select the model based on the dropdown value
    # For now, it just returns the selected model name
    return model_name

with gr.Blocks() as demo:
    model_dropdown = gr.Dropdown(
        ["GPT", "Claude", "Model 3"], label="Select Model", value="GPT"
    )
    chatbot = gr.ChatInterface(
        slow_echo,
        type="messages",
        flagging_mode="manual",
        flagging_options=["Like", "Spam", "Inappropriate", "Other"],
        save_history=True,
    )

    # Update the chatbot function based on the selected model
    model_dropdown.change(select_model, inputs=model_dropdown, outputs=None)

if __name__ == "__main__":
    demo.launch(show_error=True)

* Running on local URL:  http://127.0.0.1:7862

To create a public link, set `share=True` in `launch()`.



A function (select_model) returned too many output values (needed: 0, returned: 1). Ignoring extra values.
    Output components:
        []
    Output values returned:
        ["Claude"]


A function (select_model) returned too many output values (needed: 0, returned: 1). Ignoring extra values.
    Output components:
        []
    Output values returned:
        ["GPT"]



In [20]:
from langchain_anthropic import ChatAnthropic
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
import gradio as gr

# Initialize the LLM
llm = ChatAnthropic(
    temperature=0.7,
    model=CLAUDE_MODEL
)

# Initialize memory
memory = ConversationBufferMemory(
    memory_key="chat_history",
    return_messages=True,
    output_key="answer"  # Specify which output to store in memory
)

retriever = vectorstore.as_retriever()


# Create the conversation chain
conversation_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
    return_source_documents=True,
    verbose=True
)

# Define the chat function for Gradio
def chat(message, history):
    result = conversation_chain({"question": message})
    return result["answer"]

# Create Gradio interface
iface = gr.ChatInterface(
    fn=chat,
    title="RAG-powered Chat Interface",
    description="Ask questions about the documents in the knowledge base."
)

# Launch the interface
iface.launch()


The 'tuples' format for chatbot messages is deprecated and will be removed in a future version of Gradio. Please set type='messages' instead, which uses openai-style 'role' and 'content' keys.



* Running on local URL:  http://127.0.0.1:7861

To create a public link, set `share=True` in `launch()`.


