# **Step 1: Using pysqlite3 instead of sqlite3**

In [21]:
# Make pysqlite3 stand in for the standard-library sqlite3 module.
import sys
import pysqlite3
# Move the already-imported pysqlite3 module to the "sqlite3" key of sys.modules
# (removing its "pysqlite3" entry), so a later `import sqlite3` resolves to pysqlite3.
# NOTE(review): presumably needed because Chroma requires a newer SQLite than the
# system-provided one — confirm against the lab environment.
sys.modules["sqlite3"] = sys.modules.pop("pysqlite3")

# **Step 2: Importing the Necessary Modules**

a) Importing LangChain modules for loading the PDF file, splitting the text, storing the text in embedding form in Chroma DB, building prompt templates,
chaining prompts with LLMChain, and chatting with the model through ChatOpenAI.  
b) Importing the os module for reading environment variables.  
c) Importing openai, the OpenAI client library used for the API key and the embedding model.  
d) Importing dotenv for loading environment variables from a local .env file.  
e) Importing gradio for building a user-friendly interface for sending queries to the chatbot and showing the AI response.  


In [22]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.chat_models import ChatOpenAI
import os
import openai
from dotenv import load_dotenv, find_dotenv
import gradio as gr

# **Step 3: Setting OpenAI API Key**

Loading the OpenAI API key from the environment variables. This key is required to authenticate the requests made to OpenAI for the embedding model and the chat model.  

In [23]:
# Load variables from the local .env file (if one is found) into the process environment.
_ = load_dotenv(find_dotenv())  # Read local .env file
openai.api_key = os.getenv('OPENAI_API_KEY')

# Fail fast with an actionable message: without a key, the embedding and chat
# cells further down cannot authenticate against OpenAI.
if not openai.api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it to a .env file or export it "
        "as an environment variable before running this notebook."
    )

# **Step 4: Loading PDFs Using PyPDFLoader**

Loading the dataset, in this case the Nestle HR Policy document. The PDF is loaded and split into documents, and the first one (page 0) is printed.  

In [24]:
# Point the loader at the Nestle HR policy PDF.
pdf_loader = PyPDFLoader(file_path="the_nestle_hr_policy_pdf_2012.pdf")

# Read the file and split it into LangChain documents.
pdf_pages = pdf_loader.load_and_split()

# Show the first document as a quick sanity check of the load.
print(pdf_pages[0])

page_content='Policy
MandatorySeptember   2012
The Nestlé  
Human Resources Policy' metadata={'source': 'the_nestle_hr_policy_pdf_2012.pdf', 'page': 0}


# **Step 5: Splitting the Documents Using RecursiveCharacterTextSplitter**

Splitting the text into chunks. This helps keep the number of tokens within the maximum length of different transformer models. It also improves performance, since chunks can be processed in parallel.  
Here the chunk size is fixed at 1024 characters with a small overlap between consecutive chunks.  

In [26]:
# Configure the splitter: chunks of up to 1024 characters, with 64 characters
# shared between neighbouring chunks.
doc_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=64,
    chunk_size=1024,
)

# Break the loaded documents into chunks for downstream processing.
split_texts = doc_splitter.split_documents(pdf_pages)

# Report how many chunks were produced.
print(len(split_texts))

20


# **Step 6: Embedding the Documents Using OpenAIEmbeddings and Print the Length of the Embedding**

Using OpenAI Embeddings for the text  

In [27]:
# Create the OpenAI embedding client (also reused when building the vector store).
openai_embed = OpenAIEmbeddings()

# Embed the first chunk only, as a sample.
text = split_texts[0].page_content
openai_embed_result = openai_embed.embed_documents(texts=[text])

# Print the dimensionality of the resulting embedding vector.
print(len(openai_embed_result[0]))

1536


# **Step 7: Creating a Chroma Instance**

In [28]:
# Build the Chroma vector store: every chunk is embedded with the OpenAI
# embedding client and added to the store.
chroma_db = Chroma.from_documents(documents=split_texts, embedding=openai_embed)

# **Step 8: Defining Prompt Template**

Defining a prompt template.  
Here the AI is asked to answer the user's query based on the text supplied to it. 
The text is retrieved from the Chroma vector DB by running a similarity search on the user's query, and is then passed to the AI. This text serves as the context for the AI chatbot.  

In [29]:
# Prompt layout: the retrieved passage goes into {text}, the user's query into {question}.
chat_template = """
You are a HR manager of a company. You will be provided with the text. A question will be asked. 
Your task is to answer the question based on the text provided. 

TEXT: {text}

QUESTION: {question}

ANSWER:
"""

# 'text' and 'question' are the prompt's input variables: 'text' is the content
# retrieved from ChromaDB and 'question' is the query asked by the user.
prompt = PromptTemplate(
    template=chat_template,
    input_variables=["text", "question"],
)

# **Step 9: Initializing the Chatbot using ChatOpenAI**

In [30]:
# Create the chat model client for 'gpt-3.5-turbo', authenticating with the
# API key loaded earlier.
chat_model = ChatOpenAI(
    openai_api_key=openai.api_key,
    model="gpt-3.5-turbo",
)

# **Step 10: Defining the chain with the prompt and the model**

In [31]:
# Tie the prompt template to the chat model so one call fills in the prompt
# and sends it to the model.
scenario_chain = LLMChain(prompt=prompt, llm=chat_model)

# **Step 11: Creating OpenAI Chatbot response**

A function that performs a similarity search on Chroma DB for the user's query, picks the top 2 most similar results, and passes them together with the question to the chain to generate the response. 

In [32]:
def chatbot_response(query: str) -> str:
    """Answer a user question using text retrieved from the Chroma vector store.

    The two chunks most similar to ``query`` are fetched from ``chroma_db``,
    joined into a single context string, and passed together with the question
    to ``scenario_chain``.

    Args:
        query: The question typed by the user.

    Returns:
        The answer produced by the chain, or a short message asking for a
        question when ``query`` is empty, ``None`` or only whitespace.
    """
    # A blank submission has nothing to search for; answer directly instead of
    # spending an embedding call and an LLM call on it.
    if not query or not query.strip():
        return "Please enter a question about the Nestle HR policy."

    # Performing a similarity search in Chroma DB (top 2 matches).
    search_results = chroma_db.similarity_search(query, k=2)
    if search_results:
        relevant_text = " ".join(result.page_content for result in search_results)
    else:
        # Still ask the model, but tell it that nothing relevant was retrieved.
        relevant_text = "No relevant information found."

    # Using the chain to generate an answer based on the retrieved text.
    return scenario_chain.run({"text": relevant_text, "question": query})

# **Step 12: Integrating with Gradio interface**

Integrating with Gradio. This creates a user-friendly interface where the user can type a question in the input field and the AI chatbot shows its response in the output field.  

In [None]:
# Build the Gradio UI: one text box in, one text box out, with
# chatbot_response producing the answer.
iface = gr.Interface(
    title="Nestle HR Policy Chatbot",
    description="Ask questions about Nestle's HR policies and get answers based on the document!",
    fn=chatbot_response,
    inputs="text",
    outputs="text",
)

# Start the app. share=True requests a temporary public, shareable URL, which is
# used here because the project runs in an online lab.
iface.launch(share=True)