In [1]:
import os
from langchain_community.document_loaders import PyPDFLoader
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter

  from .autonotebook import tqdm as notebook_tqdm


In [48]:
# Location of the PDF that the rest of the notebook loads and indexes.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
file_path = "E:/chatbot/Chatbot/dataset/agent_ai.pdf"

In [49]:
# Build a PyPDFLoader for the configured PDF path.
loader = PyPDFLoader(file_path=file_path)

In [50]:
# Load the PDF into LangChain Document objects (previewed further below).
load = loader.load()

In [51]:
# Break the loaded documents into chunks of at most 100 characters, with
# 20 characters of overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=20)
splitted_text = text_splitter.split_documents(load)

In [52]:
# Preview the first chunk (metadata and page_content) as a sanity check.
splitted_text[:1] 

[Document(metadata={'producer': 'Adobe PDF Library 17.0', 'creator': 'Adobe InDesign 19.1 (Windows)', 'creationdate': '2025-02-14T12:31:53+05:00', 'moddate': '2025-02-14T12:33:59+05:00', 'trapped': '/False', 'source': 'E:/chatbot/Chatbot/dataset/agent_ai.pdf', 'total_pages': 93, 'page': 1, 'page_label': '2'}, page_content='2\nMastering AI Agents\nPreface')]

In [53]:
#Embeddings

In [54]:
# Hugging Face model id used to compute the embeddings.
embedding_model = "sentence-transformers/all-MiniLM-L6-v2"

In [55]:
# Embedding wrapper around the model chosen above.
embed = HuggingFaceEmbeddings(model=embedding_model)

In [56]:
# Embed every chunk and store it in the "pdfs" collection of a Chroma
# database persisted under persist_directory.
# NOTE(review): the store is persistent, so re-running this cell presumably
# adds the same chunks again to the existing collection — confirm, and clear
# the directory (or reuse the existing collection) if duplicates matter.
final_embed = Chroma.from_documents(
    documents=splitted_text,
    embedding=embed,
    collection_name="pdfs",
    persist_directory="E:/PDF_ChatBot/chroma_langchain_db"
)

In [57]:
# Number of entries currently held by the vector store.
len(final_embed)

8498

In [58]:
# Fetch the stored embeddings and documents straight from the underlying collection.
# NOTE(review): `_collection` is a private attribute (leading underscore) and may
# change between library versions — confirm whether a public accessor can be used.
collection = final_embed._collection
results = collection.get(include=["embeddings", "documents"])

In [59]:
# How many documents came back from the collection.
len(results['documents'])

8498

In [60]:
# Inspect which fields the collection's get() result contains.
results.keys()

dict_keys(['ids', 'embeddings', 'documents', 'uris', 'included', 'data', 'metadatas'])

#### Reusable functions for later use

In [61]:
def load_pdf(file_path):
    """Load the PDF at ``file_path`` with PyPDFLoader and return the loaded documents."""
    return PyPDFLoader(file_path=file_path).load()

In [62]:
def splitting(load, chunk_size=100, chunk_overlap=20):
    """Split loaded documents into overlapping text chunks.

    Args:
        load: Documents to split (for example the list returned by ``load_pdf``).
        chunk_size: Maximum chunk size handed to ``RecursiveCharacterTextSplitter``.
            Defaults to 100, the value that was previously hard-coded.
        chunk_overlap: Overlap between consecutive chunks handed to the splitter.
            Defaults to 20, the value that was previously hard-coded.

    Returns:
        The chunked documents produced by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(load)

In [63]:
def embeds(load):
    """Embed documents and store them in the persisted "pdfs" Chroma collection.

    Args:
        load: Documents to embed and index — typically the chunks returned by
            ``splitting``.

    Returns:
        The Chroma vector store built from ``load``.
    """
    embedding_model = "sentence-transformers/all-MiniLM-L6-v2"
    embed = HuggingFaceEmbeddings(model=embedding_model)

    # Index the documents passed in by the caller. This used to read the
    # notebook-level ``splitted_text`` variable, which ignored the argument and
    # raised NameError whenever that global had not been defined yet.
    final_embed = Chroma.from_documents(
        documents=load,
        embedding=embed,
        collection_name="pdfs",
        persist_directory="E:/PDF_ChatBot/chroma_langchain_db",
    )
    return final_embed
    
    

In [64]:
def custom_embeds(load):
    """Embed documents with a user-chosen model and store them in the "pdfs" collection.

    The embedding model name is requested interactively via ``input``.

    Args:
        load: Documents to embed and index — typically the chunks returned by
            ``splitting``.

    Returns:
        The Chroma vector store built from ``load``.

    Raises:
        ValueError: If no model name is entered.
    """
    # Surrounding whitespace (e.g. from a pasted value) is never part of a model id.
    embedding_model = input(
        "Please provide the model for embedding which you want to load: "
    ).strip()
    if not embedding_model:
        raise ValueError("An embedding model name is required.")
    embed = HuggingFaceEmbeddings(model=embedding_model)

    # Index the documents passed in by the caller. This used to read the
    # notebook-level ``splitted_text`` variable, which ignored the argument.
    final_embed = Chroma.from_documents(
        documents=load,
        embedding=embed,
        collection_name="pdfs",
        persist_directory="E:/PDF_ChatBot/chroma_langchain_db",
    )
    return final_embed
    
    

In [65]:
from dotenv import load_dotenv
import streamlit as st

In [66]:
# Load variables from a .env file into the environment; the API keys are read
# from os.environ / os.getenv further below.
load_dotenv()

True

In [67]:
# Stop early when the Gemini API key is missing from the environment.
if not os.getenv("GOOGLE_API_KEY"):
    st.error("Err")
    # st.stop is a function and has to be called; the bare name `st.stop`
    # is just an attribute lookup and has no effect.
    st.stop()

In [68]:
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_classic.chains import RetrievalQA
from langchain_core.prompts import PromptTemplate

In [69]:
#Setting up the brain

In [70]:
# Gemini chat model that generates the answers.
# NOTE(review): temperature=1 is fairly high for question answering over a
# document — confirm this is intended.
llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
    temperature=1,
    max_tokens=None,
    timeout=None,
    max_retries=2,
)

In [71]:
#For retrieval

In [72]:
# Retriever over the vector store; search_kwargs k=2 requests two results per query.
# NOTE(review): with 100-character chunks, two results give the model very little
# context — consider a larger k and/or chunk size.
ret = final_embed.as_retriever(search_kwargs = {"k" : 2})

In [73]:
# Display the retriever configuration.
ret

VectorStoreRetriever(tags=['Chroma', 'HuggingFaceEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x0000014692C0AA50>, search_kwargs={'k': 2})

In [74]:
# Connect the LLM (the "brain") to the retriever to generate the responses

In [None]:
# Prompt text for the question-answering chain. The {context} and {question}
# placeholders are the two input variables declared on the PromptTemplate.
# The guideline list previously started at "2."; it is renumbered from 1.
template = """
You are an expert and professional in reading PDFs. 

Guidelines:
1. Keep your answer clear, structured, and easy to read.
2. Use bullet points if listing multiple items.
3. Do not make up information.

Context:
{context}

Question:
{question}

Helpful Answer:

"""

In [87]:
# Wrap the template text so the chain can fill in the retrieved context and the question.
my_prompt = PromptTemplate(
    template=template,
    input_variables=["context", "question"],
)

In [77]:
# Retrieval QA chain backed by the Gemini model: retrieved chunks are "stuffed"
# into the custom prompt, and the source documents are returned with the answer.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=ret,
    return_source_documents=True,
    chain_type_kwargs={"prompt": my_prompt},
)


In [90]:
# Ask a sample question; the chain is invoked with the question under the "query" key.
# NOTE(review): this uses whichever `qa_chain` was defined most recently in the
# running kernel — the execution counts suggest this cell ran after the chain
# was rebuilt further down; confirm which model produced the saved output.
x = qa_chain.invoke({"query" : "what document is this"})

In [91]:
# Show only the generated answer text from the chain's response.
print(x["result"])

Based on the context provided ("complex document hierarchies"), the document in question is likely a **technical or legal document** with nested structures, such as:  
- **Software documentation** (e.g., API guides, technical manuals with layered sections).  
- **Legal contracts** (e.g., statutes, regulations with clauses, subclauses, and annotations).  
- **Academic papers** (e.g., theses with hierarchical sections like chapters, subsections, and appendices).  
- **Project management documents** (e.g., scope statements, SOPs with multi-level workflows).  

### Key Characteristics:  
- **Nested structures**: Sections, subsections, bullet points, or tables of contents.  
- **Formal language**: Precise terminology and standardized formatting.  
- **Purpose**: Clarity in complex information (e.g., compliance, technical specifications).  

---

### Job Portals for Roles Involving Document Hierarchies:  
If you’re seeking jobs related to managing, creating, or analyzing such documents (e.

In [80]:
from langchain_openai import ChatOpenAI

In [81]:
# OpenRouter credential read from the environment; None when the variable is unset.
OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")

In [88]:
# Alternative chat model: ChatOpenAI pointed at the OpenRouter endpoint and
# authenticated with the OpenRouter API key.
llm1 = ChatOpenAI(
    model="alibaba/tongyi-deepresearch-30b-a3b:free",
    openai_api_base="https://openrouter.ai/api/v1",
    openai_api_key=OPENROUTER_API_KEY,
    temperature=0.2,
    max_retries=3,
)

In [89]:
# Rebuild the retrieval QA chain on top of the OpenRouter model. This rebinds
# `qa_chain`, so later invocations use `llm1` rather than the Gemini model.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm1,
    chain_type="stuff",
    retriever=ret,
    return_source_documents=True,
    chain_type_kwargs={"prompt": my_prompt},
)
