In [1]:
import getpass
import os
from io import BytesIO

import gradio as gr
import openai
import requests
from PIL import Image

In [29]:
# Never store a real key in the notebook: read it from the OPENAI_API_KEY
# environment variable, and fall back to a hidden interactive prompt when the
# variable is unset or empty.
openai.api_key = os.environ.get("OPENAI_API_KEY") or getpass.getpass("OpenAI API key: ")

In [4]:
# To read PDF files
from langchain.document_loaders import PyPDFLoader

In [5]:
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
# ChatOpenAI: for using OpenAI's chat models like GPT-3.5 Turbo.
# OpenAIEmbeddings: for generating embeddings from text using OpenAI's models.

In [6]:
from langchain.vectorstores import FAISS
# FAISS: for efficient similarity search and clustering of dense vectors.

In [7]:
# To create divided text chunks using a text splitter
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [8]:
# Location of the HR policy PDF, relative to the notebook's working directory.
pdf_path = "Dataset/the_nestle_hr_policy_pdf_2012.pdf"

# Read the PDF through PyPDFLoader; `documents` is the list the later
# splitting step consumes.
loader = PyPDFLoader(file_path=pdf_path)
documents = loader.load()

# Sanity check: report how many documents the loader produced.
print(f"Number of documents loaded: {len(documents)}")

Number of documents loaded: 8


In [9]:
# Cut the loaded documents into smaller, overlapping chunks
# (chunk_size=1000, chunk_overlap=100) so each piece can be embedded and
# retrieved on its own.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=100,
    chunk_size=1000,
)
texts = text_splitter.split_documents(documents)

# Sanity check: report how many chunks the splitter produced.
print(f"Number of text chunks created: {len(texts)}")

Number of text chunks created: 20


In [10]:
# Embedding client used to vectorise the chunks. No model is passed, so the
# library default applies; the print below records which model that is.
embeddings = OpenAIEmbeddings(
    api_key=openai.api_key,
)

# Echo the embedding model name into the cell output.
print(f"Using OpenAI embedding model: {embeddings.model}")

Using OpenAI embedding model: text-embedding-ada-002


In [11]:
# Embed every chunk and index the vectors in a FAISS store for similarity search.
vectorstore = FAISS.from_documents(
    documents=texts,
    embedding=embeddings,
)

In [21]:
# Chat model that answers the questions: GPT-3.5 Turbo through LangChain's
# ChatOpenAI wrapper, with temperature set to 0.3.
llm = ChatOpenAI(
    api_key=openai.api_key,
    temperature=0.3,
    model="gpt-3.5-turbo",
)

In [22]:
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.schema import messages_from_dict, messages_to_dict

In [23]:
# Conversation memory handed to the QA chain. It is stored under the
# "chat_history" key and configured with return_messages=True.
memory = ConversationBufferMemory(return_messages=True, memory_key="chat_history")

In [24]:
# Wire the pieces together: the FAISS store acts as the retriever, the chat
# model writes the answer, and `memory` carries the conversation between calls.
qa_chain = ConversationalRetrievalChain.from_llm(
    memory=memory,
    retriever=vectorstore.as_retriever(),
    llm=llm,
)

In [25]:
def nestle_chatbot_reply(message, history):
    """Answer one user message and return the updated Gradio chat state.

    Args:
        message: Text typed by the user. ``None``, empty, or whitespace-only
            input is ignored and never sent to the QA chain.
        history: Chat transcript as a list of ``{"role": ..., "content": ...}``
            dicts (the ``gr.Chatbot(type="messages")`` format). ``None`` is
            treated as an empty transcript. The caller's list is not modified.

    Returns:
        A ``(history, "")`` tuple: the transcript to display and an empty
        string that clears the input textbox.
    """
    # Work on a copy so the caller's list is never mutated, and so a missing
    # history (None) behaves like an empty conversation.
    transcript = list(history) if history else []

    # Nothing to ask: keep the transcript as-is and skip the LLM call.
    if not message or not message.strip():
        return transcript, ""

    # Typing "clear" wipes the chain's memory as well as the visible chat.
    if message.strip().lower() == "clear":
        memory.clear()
        return [], ""

    response = qa_chain.invoke({"question": message})
    answer = response["answer"]

    # history format: list of {"role": ..., "content": ...}
    transcript.append({"role": "user", "content": message})
    transcript.append({"role": "assistant", "content": answer})
    return transcript, ""

In [26]:
# Gradio UI
def _reset_conversation():
    """Clear the chain's conversation memory and blank the chat UI.

    Returns:
        ``([], "")`` -- an empty transcript for the Chatbot component and an
        empty string for the input textbox.
    """
    # Without this, the screen would be emptied while the chain kept answering
    # with the supposedly cleared conversation as context.
    memory.clear()
    return [], ""


with gr.Blocks() as demo:
    gr.Markdown("## 🤖 HR Document Chatbot")
    chatbot = gr.Chatbot(type="messages")
    msg = gr.Textbox(placeholder="Ask a question about HR policy...")
    send_btn = gr.Button("Send")
    clear_btn = gr.Button("Clear Chat")

    # Pressing Enter in the textbox and clicking "Send" both run the same handler.
    msg.submit(nestle_chatbot_reply, inputs=[msg, chatbot], outputs=[chatbot, msg])
    send_btn.click(nestle_chatbot_reply, inputs=[msg, chatbot], outputs=[chatbot, msg])
    clear_btn.click(_reset_conversation, None, outputs=[chatbot, msg])

demo.launch()

* Running on local URL:  http://127.0.0.1:7874
* To create a public link, set `share=True` in `launch()`.


