In [2]:

# ------------------ Configuration ------------------
# You can change these defaults as needed.

# Model to use — assignment requests GPT-3.5 Turbo, but you may use a newer model if you wish.
# Keep it configurable:
#OPENAI_CHAT_MODEL = os.getenv("OPENAI_CHAT_MODEL", "gpt-3.5-turbo")

# Chroma persistence dir (so the index can be reused without rebuilding)
CHROMA_DIR = Path(".Dataset/the_nestle_hr_policy_pdf_2012.pdf")
CHROMA_DIR.mkdir(parents=True, exist_ok=True)

# Chunking parameters (tune for your documents)
CHUNK_SIZE = 500
CHUNK_OVERLAP = 50

# Retriever params
SEARCH_K = 1


In [3]:

def build_knowledge_base(pdf_paths, reset_db=False):
    '''
    Create (or refresh) a Chroma vector store from a list of PDF file paths.
    Returns a retriever you can plug into RetrievalQA.
    '''
    if reset_db and CHROMA_DIR.exists():
        shutil.rmtree(CHROMA_DIR)
        CHROMA_DIR.mkdir(parents=True, exist_ok=True)

    # 1) Load PDFs
    docs = []
    for p in pdf_paths:
        loader = PyPDFLoader(str(p))
        docs.extend(loader.load())

    # 2) Split into chunks
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP, separators=['\n\n', '\n', ' ', '']
    )
    chunks = splitter.split_documents(docs)
    #  Visualize
    for i, chunk in enumerate(chunks[:3]):
        print(f"Chunk {i+1}:\n{chunk.page_content}\n{'-'*30}")

    # 3) Embeddings + Chroma
    embeddings = OpenAIEmbeddings()
    vectordb = Chroma(
        collection_name="nestle_hr_collection",
        embedding_function=embeddings,
        persist_directory=str(CHROMA_DIR),
    )
    # Add & persist
    vectordb.add_documents(chunks)
    vectordb.persist()

    # 4) Return retriever
    return vectordb.as_retriever(search_kwargs={"k": SEARCH_K})


In [4]:

SYSTEM_PROMPT = '''You are a helpful assistant for Nestlé HR documents.
Answer strictly and only from the provided context.
If the answer is not in the context, reply: "I could not find that in the HR documents."
Be concise, cite page numbers when available, and include bullet points where helpful.
'''

QA_TEMPLATE = '''
{system_prompt}

Context:
{context}

Question: {question}

Guidelines:
- If multiple policies conflict, state that and summarize both.
- Prefer the latest policy if effective dates are shown.
- Include page numbers from the source when available (e.g., "Policy PDF p. 12").
- Keep answers factual and free of speculation.

Answer:
'''

prompt = PromptTemplate(
    template=QA_TEMPLATE,
    input_variables=["system_prompt", "context", "question"]
)

def make_qa_chain(retriever):
    llm = ChatOpenAI(model=OPENAI_CHAT_MODEL, temperature=0)
    # LangChain RetrievalQA "stuff" chain_type will stuff top-k docs into the prompt
    chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        chain_type_kwargs={"prompt": prompt, "verbose": False},
        return_source_documents=True,
    )
    return chain


In [None]:
"""
Nestle HR Reports Chatbot
-------------------------
End-to-end app:
- Upload PDFs (Nestlé HR policies/reports)
- Build Chroma vector DB with OpenAI embeddings
- Ask questions via a RetrievalQA chain (RAG)
- Gradio UI

Setup:
    pip install -U langchain langchain-community langchain-openai chromadb pypdf gradio python-dotenv

Environment:
    Create .env with:
        OPENAI_API_KEY="sk-..."
    (Optional) tweak:
        OPENAI_CHAT_MODEL="gpt-3.5-turbo"
"""

import os
import shutil
import traceback
from pathlib import Path

from dotenv import load_dotenv

# LangChain core bits
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.vectorstores import Chroma
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA

# UI
import gradio as gr


# ========= 0) Config & Environment =========
load_dotenv()  # loads OPENAI_API_KEY from .env if present
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    raise RuntimeError(
        "OPENAI_API_KEY not set. Create a .env with OPENAI_API_KEY or export it in your shell."
    )

# Model (assignment requests GPT-3.5 Turbo)
OPENAI_CHAT_MODEL = os.getenv("OPENAI_CHAT_MODEL", "gpt-3.5-turbo")

# Chroma persistence
CHROMA_DIR = Path("./chroma_nestle_hr")
CHROMA_DIR.mkdir(parents=True, exist_ok=True)

# Chunking
CHUNK_SIZE = 1200
CHUNK_OVERLAP = 200

# Retrieval
SEARCH_K = 5


# ========= 1) Helpers: load & split PDFs =========
def load_and_split_pdfs(pdf_paths):
    """
    Load PDFs into LangChain Documents, then split into overlapping chunks.
    """
    docs = []
    for p in pdf_paths:
        loader = PyPDFLoader(str(p))
        docs.extend(loader.load())

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
        separators=["\n\n", "\n", " ", ""],
    )
    chunks = splitter.split_documents(docs)
    return chunks


# ========= 2) Build Chroma Vector DB with OpenAI embeddings =========
def build_vector_store(chunks, persist_dir: Path, reset_db: bool = False):
    """
    Create (or open) a Chroma store and add chunk embeddings.
    """
    if reset_db and persist_dir.exists():
        shutil.rmtree(persist_dir)
        persist_dir.mkdir(parents=True, exist_ok=True)

    # OpenAI embeddings (use a cost-efficient default)
    embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

    vectordb = Chroma(
        collection_name="nestle_hr_collection",
        embedding_function=embeddings,
        persist_directory=str(persist_dir),
    )

    # Add documents in small batches (gentler on rate limits)
    batch_size = 64
    for i in range(0, len(chunks), batch_size):
        batch = chunks[i : i + batch_size]
        vectordb.add_documents(batch)

    vectordb.persist()
    return vectordb


# ========= 3) Prompt + RetrievalQA chain =========
SYSTEM_PROMPT = (
    "You are a helpful assistant for Nestlé HR documents. "
    "Answer strictly and only from the provided context. "
    "If the answer is not in the context, reply: 'I could not find that in the HR documents.' "
    "Be concise, cite page numbers when available, and use bullet points if helpful."
)

QA_TEMPLATE = """{system_prompt}

Context:
{context}

Question: {question}

Guidelines:
- If multiple policies conflict, state that and summarize both.
- Prefer the latest policy if effective dates are shown.
- Include page numbers from the source when available (e.g., "Policy PDF p. 12").
- Keep answers factual and free of speculation.

Answer:
"""

prompt = PromptTemplate(
    template=QA_TEMPLATE,
    input_variables=["context", "question"],
    partial_variables={"system_prompt": SYSTEM_PROMPT},
)


def make_qa_chain(vectordb, k=SEARCH_K):
    retriever = vectordb.as_retriever(search_kwargs={"k": k})
    llm = ChatOpenAI(model=OPENAI_CHAT_MODEL, temperature=0)
    chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        chain_type_kwargs={"prompt": prompt},
        return_source_documents=True,
    )
    return chain


# ========= 4) Gradio UI callbacks =========
def _extract_paths(files):
    """
    Convert gr.Files input to a list of Path objects.
    """
    paths = []
    for f in files or []:
        p = getattr(f, "name", None) or getattr(f, "path", None)
        if not p:
            raise ValueError("Could not resolve a filesystem path for an uploaded file.")
        paths.append(Path(p))
    return paths


def build_index_ui(files, reset_db):
    """
    Build the vector index from uploaded PDFs.
    Returns (status_text, qa_chain or None).
    """
    try:
        pdf_paths = _extract_paths(files)
        if not pdf_paths:
            return "Please upload at least one PDF.", None

        chunks = load_and_split_pdfs(pdf_paths)
        if not chunks:
            return "No text extracted from PDFs. Are they scanned images with no OCR?", None

        vectordb = build_vector_store(chunks, CHROMA_DIR, reset_db=reset_db)
        qa_chain = make_qa_chain(vectordb)
        return f" Index built with {len(pdf_paths)} file(s) and {len(chunks)} chunks. You can start asking questions.", qa_chain
    except Exception as e:
        tb = traceback.format_exc(limit=3)
        return f" Failed to build index:\n{e}\n\n{tb}", None


def answer_question(message, history, qa_chain):
    """
    Run RetrievalQA for a user question and format sources.
    """
    if qa_chain is None:
        return "Please build the index first by uploading PDFs and clicking 'Build Index'."

    res = qa_chain({"query": message})
    answer = res["result"]

    # Append sources (filename & page)
    try:
        src_lines = []
        for d in res.get("source_documents", []):
            src = d.metadata.get("source", "PDF")
            page = d.metadata.get("page")
            src_lines.append(f"- {Path(src).name}, page {page + 1 if page is not None else '?'}")
        if src_lines:
            answer += "\n\n**Sources:**\n" + "\n".join(src_lines)
    except Exception:
        pass

    return answer


# ========= 5) Build Gradio app =========
def make_app():
    with gr.Blocks(title="Nestlé HR Chatbot") as demo:
        gr.Markdown("# Nestlé HR Reports Chatbot")
        gr.Markdown("Upload Nestlé HR policy/reports (PDF), build the index, and ask questions.")

        with gr.Row():
            files = gr.Files(file_types=[".pdf"], label="Upload Nestlé HR PDFs")
        with gr.Row():
            reset = gr.Checkbox(
                label="Rebuild index from scratch (clears existing Chroma store)", value=False
            )
            build = gr.Button("Build Index", variant="primary")
        status = gr.Markdown("")

        chatbot = gr.Chatbot(label="Chat with Nestlé HR Documents")
        question = gr.Textbox(label="Ask a question")
        qa_state = gr.State(value=None)  # persist the RetrievalQA chain between events

        def add_user_msg(user_msg, chat_history):
            if not user_msg:
                return "", chat_history
            return "", chat_history + [[user_msg, None]]

        def add_bot_msg(chat_history, qa_chain):
            user_msg = chat_history[-1][0]
            reply = answer_question(user_msg, chat_history, qa_chain)
            chat_history[-1][1] = reply
            return chat_history

        # Build index -> status text + store qa_chain in state
        build.click(build_index_ui, inputs=[files, reset], outputs=[status, qa_state])

        # Chat flow
        question.submit(
            add_user_msg, inputs=[question, chatbot], outputs=[question, chatbot], queue=False
        ).then(
            add_bot_msg, inputs=[chatbot, qa_state], outputs=[chatbot]
        )

    return demo


if __name__ == "__main__":
    app = make_app()
    # Local-only:
    app.launch(server_name="127.0.0.1", server_port=7860, share=False)
    # Or LAN access (other devices on your Wi-Fi):
    # app.launch(server_name="0.0.0.0", server_port=7860, share=False)
    # Or auto-open browser:
    # app.launch(inbrowser=True)
    # Or temporary public link:
    # app.launch(share=True)


  chatbot = gr.Chatbot(label="Chat with Nestlé HR Documents")


* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.


  res = qa_chain({"query": message})


In [8]:
# To run locally, uncomment:
demo.launch(server_name="0.0.0.0", server_port=7860, share=False)

* Running on local URL:  http://0.0.0.0:7860
* To create a public link, set `share=True` in `launch()`.




Chunk 1:
Policy
Mandatory
September  2012
The Nestlé  
Human Resources Policy
------------------------------
Chunk 2:
Policy
Mandatory
September 
 20
12
Issuing departement
Hum
an Resources
Target audience 
All
 employees
Approver
Executive Board, Nestlé S.A.
Repository
All Nestlé Principles and Policies, Standards and  
Guidelines can be found in the Centre online repository at:  
http://intranet.nestle.com/nestledocs
Copyright
 and confidentiality
Al
l rights belong to Nestec Ltd., Vevey, Switzerland.
© 2012, Nestec Ltd.
Design
Nestec Ltd., Corporate Identity & Design,  
Vevey, Switzerland
Production
------------------------------
Chunk 3:
Vevey, Switzerland
Production
brain’print GmbH, Switzerland
Paper
This report is printed on BVS, a paper produced  
from well-managed forests and other controlled sources  
certified by the Forest Stewardship Council (FSC).
------------------------------


  vectordb = Chroma(
  vectordb.persist()
Traceback (most recent call last):
  File "/Users/deepthi/.pyenv/versions/voicebot-env/lib/python3.10/site-packages/gradio/queueing.py", line 626, in process_events
    response = await route_utils.call_process_api(
  File "/Users/deepthi/.pyenv/versions/voicebot-env/lib/python3.10/site-packages/gradio/route_utils.py", line 350, in call_process_api
    output = await app.get_blocks().process_api(
  File "/Users/deepthi/.pyenv/versions/voicebot-env/lib/python3.10/site-packages/gradio/blocks.py", line 2250, in process_api
    result = await self.call_function(
  File "/Users/deepthi/.pyenv/versions/voicebot-env/lib/python3.10/site-packages/gradio/blocks.py", line 1757, in call_function
    prediction = await anyio.to_thread.run_sync(  # type: ignore
  File "/Users/deepthi/.pyenv/versions/voicebot-env/lib/python3.10/site-packages/anyio/to_thread.py", line 56, in run_sync
    return await get_async_backend().run_sync_in_worker_thread(
  File "/User

Chunk 1:
Policy
Mandatory
September  2012
The Nestlé  
Human Resources Policy
------------------------------
Chunk 2:
Policy
Mandatory
September 
 20
12
Issuing departement
Hum
an Resources
Target audience 
All
 employees
Approver
Executive Board, Nestlé S.A.
Repository
All Nestlé Principles and Policies, Standards and  
Guidelines can be found in the Centre online repository at:  
http://intranet.nestle.com/nestledocs
Copyright
 and confidentiality
Al
l rights belong to Nestec Ltd., Vevey, Switzerland.
© 2012, Nestec Ltd.
Design
Nestec Ltd., Corporate Identity & Design,  
Vevey, Switzerland
Production
------------------------------
Chunk 3:
Vevey, Switzerland
Production
brain’print GmbH, Switzerland
Paper
This report is printed on BVS, a paper produced  
from well-managed forests and other controlled sources  
certified by the Forest Stewardship Council (FSC).
------------------------------


Traceback (most recent call last):
  File "/Users/deepthi/.pyenv/versions/voicebot-env/lib/python3.10/site-packages/gradio/queueing.py", line 626, in process_events
    response = await route_utils.call_process_api(
  File "/Users/deepthi/.pyenv/versions/voicebot-env/lib/python3.10/site-packages/gradio/route_utils.py", line 350, in call_process_api
    output = await app.get_blocks().process_api(
  File "/Users/deepthi/.pyenv/versions/voicebot-env/lib/python3.10/site-packages/gradio/blocks.py", line 2250, in process_api
    result = await self.call_function(
  File "/Users/deepthi/.pyenv/versions/voicebot-env/lib/python3.10/site-packages/gradio/blocks.py", line 1757, in call_function
    prediction = await anyio.to_thread.run_sync(  # type: ignore
  File "/Users/deepthi/.pyenv/versions/voicebot-env/lib/python3.10/site-packages/anyio/to_thread.py", line 56, in run_sync
    return await get_async_backend().run_sync_in_worker_thread(
  File "/Users/deepthi/.pyenv/versions/voicebot-env/lib