In [1]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.documents import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
import tiktoken
import statistics
import re


In [2]:
pdf_path = "Company Policies.pdf"

loader = PyPDFLoader(pdf_path)
documents = loader.load()

print(f"Total pages loaded: {len(documents)}")
print(documents[0].page_content[:1000])


Total pages loaded: 34
Company Policies 
Company Name: NovaCart Technologies Pvt. Ltd. 
Corporate Office: [Address to be added] 
Customer Support Email: support@novacart.com 
Customer Support Phone: +91-XXXX-XXXXXX 
Website: www.novacart.com 
Last Updated: January 2026 
Effective Date: January 1, 2026 
 
Table of Contents 
1. Refund Policy 
2. Cancellation Policy 
3. Shipping Policy 
4. Exchange Policy 
5. Product Warranty Policy 
6. Customer Support 
7. Privacy and Data Protection 
8. Terms and Conditions 
9. Dispute Resolution 
10. Policy Limitations and Updates 
 
1. Refund Policy 
1.1 Eligibility for Refunds 
Customers may request a full refund for eligible products within 14 calendar days from the date of 
delivery. The refund window begins on the day the product is delivered to the customer's registered 
address. 
Conditions for Refund Eligibility 
To qualify for a refund, all of the following conditions must be met: 
• Unused Condition: The product must be completely unused, unw

Document Specific Chunking

In [3]:
full_text = "\n".join(doc.page_content for doc in documents)

import re

SECTION_REGEX = re.compile(
    r"\n(?=\d+\.\s+[A-Z][A-Za-z\s]+Policy)"
)

sections = re.split(SECTION_REGEX, full_text)

print(f"Total policy sections found: {len(sections)}")


Total policy sections found: 11


In [4]:
section_docs = []

for section in sections:
    cleaned = section.strip()
    if len(cleaned) > 200:  # ignore junk
        section_docs.append(
            Document(
                page_content=cleaned,
                metadata={"source": "company_policies"}
            )
        )

print(f"Usable policy sections: {len(section_docs)}")


Usable policy sections: 6


In [5]:
encoding = tiktoken.get_encoding("cl100k_base")

def count_tokens(text: str) -> int:
    return len(encoding.encode(text))


In [6]:
section_token_sizes = [count_tokens(doc.page_content) for doc in section_docs]

print("Section token stats:")
print("Min:", min(section_token_sizes))
print("Avg:", round(statistics.mean(section_token_sizes)))
print("Median:", round(statistics.median(section_token_sizes)))
print("75th percentile:", round(statistics.quantiles(section_token_sizes, n=4)[2]))
print("Max:", max(section_token_sizes))


Section token stats:
Min: 78
Avg: 1513
Median: 1496
75th percentile: 2101
Max: 3238


Evaluating Chunk Size for Large Sections

In [7]:
p75 = statistics.quantiles(section_token_sizes, n=4)[2]

CHUNK_SIZE = min(round(p75), 600)
CHUNK_OVERLAP = round(CHUNK_SIZE * 0.2)

CHUNK_SIZE, CHUNK_OVERLAP


(600, 120)

In [8]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,        # 600
    chunk_overlap=CHUNK_OVERLAP,  # 120
    separators=["\n\n", "\n", ".", " ", ""]
)


In [9]:
chunked_docs = []

for doc in section_docs:
    token_count = count_tokens(doc.page_content)

    if token_count <= CHUNK_SIZE:
        # Keep small sections as-is
        chunked_docs.append(doc)
    else:
        # Recursively split large sections
        splits = text_splitter.split_text(doc.page_content)
        for split in splits:
            chunked_docs.append(
                Document(
                    page_content=split,
                    metadata=doc.metadata
                )
            )

print(f"Total chunks created: {len(chunked_docs)}")


Total chunks created: 90


In [10]:
chunk_sizes = [count_tokens(doc.page_content) for doc in chunked_docs]

print("Chunk size stats:")
print("Min:", min(chunk_sizes))
print("Avg:", round(statistics.mean(chunk_sizes)))
print("Max:", max(chunk_sizes))


Chunk size stats:
Min: 46
Avg: 120
Max: 169


Embedding

In [11]:
from langchain_community.embeddings import HuggingFaceEmbeddings

embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    encode_kwargs={"normalize_embeddings": True}  # important for cosine similarity
)


  embedding_model = HuggingFaceEmbeddings(


In [12]:
from langchain_community.vectorstores import FAISS

vectorstore = FAISS.from_documents(
    documents=chunked_docs,
    embedding=embedding_model
)

print("Total vectors stored:", vectorstore.index.ntotal)


Total vectors stored: 90


In [13]:
import os
from dotenv import load_dotenv

load_dotenv()


os.environ["GROQ_API_KEY"] = os.getenv("GROQ_API_KEY")


In [14]:
from langchain_groq import ChatGroq

llm = ChatGroq(
    model="llama-3.3-70b-versatile",
    temperature=0.0  # critical for hallucination control
)


In [36]:
RETRIEVAL_K = 5
RELATIVE_THRESHOLD = 0.6   # keep chunks close to the best one
MIN_SCORE_FLOOR = 0.2      # safety floor to avoid junk retrievals


In [37]:
def retrieve_with_gating(query: str):
    results = vectorstore.similarity_search_with_score(
        query,
        k=RETRIEVAL_K
    )

    if not results:
        return []

    best_score = results[0][1]

    # If even the best result is weak, refuse early
    if best_score < MIN_SCORE_FLOOR:
        return []

    filtered = [
        (doc, score)
        for doc, score in results
        if score >= best_score * RELATIVE_THRESHOLD
    ]

    return filtered


In [38]:
from langchain_core.prompts import PromptTemplate



In [39]:
rag_prompt = PromptTemplate(
    input_variables=["context", "question"],
    template="""
You are a policy question-answering assistant.

Rules (must be followed strictly):
1. Answer using ONLY the information provided in the Context.
2. If the Context contains partial information, answer using only what is stated and clearly
   mention any missing details.
3. If the Context contains no relevant information, say:
   "I cannot answer this question based on the provided documents."
4. Do NOT use prior knowledge.
5. Do NOT guess or infer beyond the text.
6. Be concise, factual, and neutral.

Context:
{context}

Question:
{question}

Answer:
"""
)



In [40]:
REFUSAL = "I cannot answer this question based on the provided documents."

def rag_answer(query: str):
    retrieved = retrieve_with_gating(query)

    if not retrieved:
        return REFUSAL

    context_blocks = []
    for doc, score in retrieved:
        context_blocks.append(
            f"[Relevance Score: {round(score, 2)}]\n{doc.page_content}"
        )

    context = "\n\n---\n\n".join(context_blocks)

    prompt = rag_prompt.format(
        context=context,
        question=query
    )

    response = llm.invoke(prompt)
    return response.content


Easy Questions

In [41]:
print(
    rag_answer("What are the delivery timelines for express and standard shipping in the Asia-Pacific region")
)


The delivery timelines for express and standard shipping in the Asia-Pacific region are: 
• Express: 3-5 business days 
• Standard: 7-10 business days


In [43]:
print(
    rag_answer("Do you offer same-day delivery?")
)

Yes, same-day delivery is available as a pilot program in select metro areas, but it has certain conditions: 
- The order must be placed before 12 PM.
- It is limited to specific product categories.
- Additional charges of ₹150-500 apply.


Some partially answerable

In [44]:
print(
    rag_answer("What factors can delay international shipping orders?")
)

Based on the provided Context, the factors that can delay international shipping orders are:

1. Customs Clearance: International orders (typically 1-5 days)
2. Courier Delays: Third-party logistics partner delays
3. Address Issues: Incomplete or incorrect delivery information
4. High Demand Periods: Sale events, festive seasons
5. COVID-19 or Health Emergencies: Lockdowns, movement restrictions
6. Remote Locations: Accessibility and infrastructure challenges
7. Natural Disasters: Floods, earthquakes, cyclones 

Additionally, customs delays can occur if customers do not provide accurate product information for customs declaration, do not respond promptly to customs queries, or do not pay applicable duties to release shipment.


In [45]:
print(
    rag_answer("What delivery options are available for customers in remote locations?")
)

For customers in remote locations, delivery is available through partnered courier services. However, it may require additional shipping charges (₹50-150) and the delivery timeline is 7-10 business days. Additionally, some products may not be eligible for delivery in these areas.


Some unanswerable

In [None]:
print(
    rag_answer("Which metro cities are included in the same-day delivery pilot program?")
)

I cannot answer this question based on the provided documents. The Context mentions that same-day delivery is available in "select metro areas", but it does not specify which metro cities are included in the pilot program.


In [47]:
print(
    rag_answer("Does NovaCart offer carbon-neutral or eco-friendly shipping options?")
)

I cannot answer this question based on the provided documents.
