## Nestle HR RAG Chatbot

### Tech stack: Python 3.11+ | langchain 0.3+ | chromadb | OpenAI embeddings | Gradio

## Environment variables required:

OPENAI_API_KEY 

## Action

• Import essential tools and set up OpenAI's API environment.

• Load Nestle's HR policy using PyPDFLoader and split it for easy processing.

• Create vector representations for text chunks using ChromaDB and OpenAI's
embeddings.

• Build a question-answering system using the GPT-3.5 Turbo model to retrieve answers
from text chunks.

• Create a prompt template to guide the chatbot in understanding and responding to
users.

• Use Gradio to build a user-friendly chatbot interface, enabling interaction and
information retrieval.



In [1]:
%pip install --upgrade "langchain>=0.3" langchain-community langchain_openai chromadb openai gradio pypdf dotenv tiktoken


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this is how OPENAI_API_KEY is supplied - confirm
# the key is defined in .env or already exported in the shell.
load_dotenv()

True

In [3]:
import os
from typing import List


# LangChain imports (v0.3+)
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_chroma import Chroma
from langchain_openai.chat_models import ChatOpenAI
from langchain.schema import Document


In [4]:
# -------------------------------
# Config / environment
# -------------------------------
# Path of the HR policy PDF that the main cell passes to load_and_split().
PDF_PATH = "the_nestle_hr_policy_pdf_2012.pdf"
# Default persistence directory used by build_vectorstore().
CHROMA_PERSIST_DIR = "./chroma_store"
# Default Chroma collection name used by build_vectorstore().
CHROMA_COLLECTION_NAME = "nestle_hr"
# Chat model name handed to ChatOpenAI in build_retrieval_qa().
OPENAI_MODEL_NAME = "gpt-3.5-turbo"
# Embedding model name handed to OpenAIEmbeddings in build_vectorstore().
EMBEDDING_MODEL_NAME = "text-embedding-3-small"


In [5]:
# -------------------------------
# 1) Load PDF and split into chunks
# -------------------------------
def load_and_split(pdf_path: str) -> List[Document]:
    """Read a PDF from disk and cut it into overlapping text chunks.

    Args:
        pdf_path: Path of the PDF file handed to ``PyPDFLoader``.

    Returns:
        The documents produced by ``RecursiveCharacterTextSplitter`` using
        a chunk size of 1000, an overlap of 200, and ``len`` as the length
        function.
    """
    loaded_docs = PyPDFLoader(pdf_path).load()

    chunker = RecursiveCharacterTextSplitter(
        chunk_size=1000, chunk_overlap=200, length_function=len,
    )
    return chunker.split_documents(loaded_docs)

In [6]:
# -------------------------------
# 2) Create embeddings and Chroma vectorstore
# -------------------------------
def build_vectorstore(
    docs: List[Document],
    persist_directory: str = CHROMA_PERSIST_DIR,
    collection_name: str = CHROMA_COLLECTION_NAME,
) -> Chroma:
    """Return a Chroma vector store, reusing a persisted one when available.

    Args:
        docs: Chunks to embed; only used when no persisted store is found.
        persist_directory: Directory checked for (and used as) Chroma
            persistence.
        collection_name: Name of the Chroma collection to open or create.

    Returns:
        A ``Chroma`` instance backed by ``persist_directory``.
    """
    embedder = OpenAIEmbeddings(model=EMBEDDING_MODEL_NAME)

    # A non-empty persistence directory is treated as an existing store so
    # the documents are not embedded again.
    store_on_disk = os.path.exists(persist_directory) and os.listdir(persist_directory)

    if not store_on_disk:
        print("No existing Chroma store found. Creating and persisting new vectorstore...")
        return Chroma.from_documents(
            documents=docs,
            embedding=embedder,
            persist_directory=persist_directory,
            collection_name=collection_name,
        )

    print("Found existing Chroma persistence. Loading from disk...")
    return Chroma(
        persist_directory=persist_directory,
        embedding_function=embedder,
        collection_name=collection_name,
    )

In [7]:
# -------------------------------
# 3) Build the QA retrieval chain with a prompt template
# -------------------------------
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

def build_retrieval_qa(vectordb: Chroma):
    """Assemble the retrieval + answer-generation chain for the HR chatbot.

    Args:
        vectordb: Vector store whose retriever supplies the context
            documents (the top 3 matches per query).

    Returns:
        The chain built by ``create_retrieval_chain``. It is invoked with
        ``{"input": <question>}`` and its result is read via the
        ``"answer"`` key by the UI code.
    """
    retriever = vectordb.as_retriever(search_kwargs={"k": 3})
    llm = ChatOpenAI(model=OPENAI_MODEL_NAME)

    # Adjacent string literals are concatenated verbatim, so the explicit
    # blank line keeps the instructions from running straight into the
    # "Context:" label (previously rendered as "...where to look.Context:").
    system_prompt = (
        "You are an expert assistant specialized in answering questions about Nestle's HR policies. "
        "Use ONLY the provided context to answer the question. If the answer cannot be found in the context, say you don't know and suggest where to look."
        "\n\n"
        "Context: {context}"
    )
    prompt = ChatPromptTemplate.from_messages(
        [
            ("system", system_prompt),
            ("human", "{input}"),
        ]
    )

    # "Stuff" chain: retrieved documents are inserted into {context}.
    question_answer_chain = create_stuff_documents_chain(llm, prompt)
    return create_retrieval_chain(retriever, question_answer_chain)

In [8]:
# -------------------------------
# 4) Utility to format source documents for display
# -------------------------------
def format_sources(source_docs: List[Document]) -> str:
    """Render retrieved documents as a readable, numbered source list.

    Each entry has a header line ``[n] <source>`` (plus `` (page: p)`` when
    the metadata carries a ``page`` value) followed on the next line by the
    document text, truncated to 300 characters with a trailing ``...``.
    Entries are separated by a blank line.

    Args:
        source_docs: Documents exposing ``metadata`` (dict or ``None``) and
            ``page_content`` (str).

    Returns:
        The formatted listing, or an empty string when there are no
        documents.
    """
    entries = []
    for index, doc in enumerate(source_docs, 1):
        meta = doc.metadata or {}
        label = meta.get("source", "source")
        # Some loaders include page numbers under the 'page' key; page 0 is
        # valid, so compare against None rather than relying on truthiness.
        page = meta.get("page")
        page_info = f" (page: {page})" if page is not None else ""
        text = doc.page_content
        snippet = f"{text[:300]}..." if len(text) > 300 else text
        # Header and snippet go on separate lines so they do not run together.
        entries.append(f"[{index}] {label}{page_info}\n{snippet}")
    return "\n\n".join(entries)

In [9]:
# Gradio for UI
import gradio as gr

# -------------------------------
# 5) Gradio UI: chat loop (history is displayed in the UI; each question is sent to the chain on its own)
# -------------------------------


def create_gradio_ui(qa_chain):
    """Build the Gradio Blocks app that fronts the retrieval QA chain.

    Args:
        qa_chain: Object with an ``invoke({"input": str})`` method returning
            a mapping that carries the reply under the ``"answer"`` key.

    Returns:
        The ``gr.Blocks`` app; the caller is responsible for launching it.
    """
    fallback_answer = "I'm sorry, I don't have an answer for that."

    with gr.Blocks(title="Nestle HR Chatbot (RAG)") as demo:
        gr.Markdown("# Nestle HR Chatbot")
        gr.Markdown("Ask questions about Nestle's HR policies. The assistant will search the HR PDF and answer using document excerpts.")

        chatbot = gr.Chatbot(label="Nestle HR Assistant", type="messages")
        msg = gr.Textbox(label="Your question", placeholder="Ask about leave policy, benefits, code of conduct...", lines=1)
        clear = gr.Button("Clear")

        def respond(user_message: str, history: list[dict[str, str]]) -> tuple[str, list[dict[str, str]]]:
            """Answer one question; return the cleared textbox value and the new history."""
            # Work on a copy so the component's current value is never
            # mutated in place, and tolerate a missing history.
            updated_history = list(history or [])

            question = (user_message or "").strip()
            if not question:
                # Blank or whitespace-only input: skip the LLM call entirely.
                return "", updated_history

            # Only the current question is sent; earlier turns are not part
            # of the query.
            response = qa_chain.invoke({"input": question})
            # .get() lets the fallback apply when the key is absent too.
            answer = response.get("answer") or fallback_answer

            updated_history.append({"role": "user", "content": question})
            updated_history.append({"role": "assistant", "content": answer})
            return "", updated_history

        # A single handler both updates the chat and clears the input box.
        msg.submit(respond, inputs=[msg, chatbot], outputs=[msg, chatbot])
        clear.click(lambda: [], None, chatbot)

    return demo

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
# -------------------------------
# Main orchestration
# -------------------------------
# Entry point: load the PDF, build or load the vector store, build the QA
# chain, and start the Gradio UI.
if __name__ == "__main__":
    # The PDF is always loaded and split; build_vectorstore only embeds the
    # chunks when no persisted Chroma store is found.
    print("Loading and splitting the PDF...")
    documents = load_and_split(PDF_PATH)
    print(f"Loaded and split into {len(documents)} chunks.")

    print("Building or loading Chroma vectorstore (this may take a while the first time)...")
    vectordb = build_vectorstore(documents)
    print("Vectorstore is ready.")

    qa_chain = build_retrieval_qa(vectordb)
    print("RetrievalQA chain initialized.")

    demo = create_gradio_ui(qa_chain)
    # share=False: no public Gradio link is created.
    # NOTE(review): server_name="0.0.0.0" presumably makes the app reachable
    # from other hosts on the network - confirm this exposure is intended.
    demo.launch(server_name="0.0.0.0", share=False)

Loading and splitting the PDF...
Loaded and split into 20 chunks.
Building or loading Chroma vectorstore (this may take a while the first time)...
No existing Chroma store found. Creating and persisting new vectorstore...
Vectorstore is ready.
RetrievalQA chain initialized.
* Running on local URL:  http://0.0.0.0:7860
* To create a public link, set `share=True` in `launch()`.
