In [1]:
import os
os.chdir("../")
%pwd

'c:\\Users\\NIKHIL GUPTA\\Desktop\\projects\\medibot'

In [2]:
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def load_pdf_file(data):
    """Load the PDF files found under a directory.

    Args:
        data: Directory path handed to ``DirectoryLoader``; files are matched
            with the glob ``**/*.pdf`` and read through ``PyPDFLoader``.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns for the matched files.
    """
    return DirectoryLoader(
        data,
        glob="**/*.pdf",
        loader_cls=PyPDFLoader,
    ).load()

In [27]:
# Read every PDF under data/ (relative to the current working directory).
extracted_data = load_pdf_file("data/")

In [30]:
def text_splitter(extracted_data, chunk_size=500, chunk_overlap=60):
    """Split loaded documents into overlapping chunks.

    Args:
        extracted_data: Documents to split; passed unchanged to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: ``chunk_size`` given to the splitter. Defaults to 500,
            the value this notebook has always used.
        chunk_overlap: ``chunk_overlap`` given to the splitter. Defaults to 60.

    Returns:
        The chunks returned by ``split_documents``.
    """
    # Named `splitter` so the local does not shadow this function's own name.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(extracted_data)

In [31]:
# Chunk the loaded documents, then display how many chunks were produced.
text_chunks = text_splitter(extracted_data)
len(text_chunks)

462

In [9]:
from langchain_community.embeddings import HuggingFaceEmbeddings

In [None]:
def download_hugging_face_embeddings():
    """Create the embedding model used throughout the notebook.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        ``sentence-transformers/all-MiniLM-L6-v2`` model.
    """
    # NOTE(review): presumably the model weights are fetched on first use — confirm.
    return HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
    )

In [33]:
# Instantiate the embedding model once; later cells reuse this object.
embeddings = download_hugging_face_embeddings()

In [34]:
# Sanity check: embed one query and display the vector length. The length
# shown here is the value the Pinecone index dimension has to match.
query = "Who is the prime minister of India?"
query_result = embeddings.embed_query(query)
len(query_result)

384

In [None]:
from dotenv import load_dotenv

# Load variables from a local .env file (if present) into the environment.
load_dotenv()
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")

# Fail here with a readable message instead of letting a missing key surface
# later as an unrelated error in the Pinecone or os.environ cells.
if not PINECONE_API_KEY:
    raise RuntimeError(
        "PINECONE_API_KEY is not set; define it in .env or the environment "
        "before running this notebook."
    )


In [None]:
from pinecone import Pinecone, ServerlessSpec

index_name = "medibot"
pc = Pinecone(api_key=PINECONE_API_KEY)

# Create the index only when it does not exist yet, so the cell can be re-run.
if not pc.has_index(index_name):
    serverless_spec = ServerlessSpec(cloud="aws", region="us-east-1")
    pc.create_index(
        name=index_name,
        dimension=384,  # length of the vectors produced by the embedding model
        metric="cosine",
        spec=serverless_spec,
    )

In [None]:
# Writes the key back into the process environment. The value was itself read
# from os.environ, so this is a round trip; os.environ only accepts str values,
# so this line raises TypeError if the key was missing (None).
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY

In [51]:
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")

# os.environ rejects None with an unhelpful TypeError, so check first and
# report the actual problem.
if not GOOGLE_API_KEY:
    raise RuntimeError(
        "GOOGLE_API_KEY is not set; define it in .env or the environment "
        "before running this notebook."
    )
# NOTE(review): presumably read by the Gemini client created later — confirm.
os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY

In [None]:
from langchain_community.vectorstores import Pinecone as PineconeVectorStore

# Embed every chunk and store it in the Pinecone index. The target is the
# `index_name` defined in the index-creation cell ("medibot"); the hard-coded
# "rag" pointed at an index this notebook never creates.
# NOTE(review): langchain_community's Pinecone wrapper is deprecated in favour
# of langchain_pinecone.PineconeVectorStore — consider migrating.
docsearch = PineconeVectorStore.from_documents(
    documents=text_chunks,
    index_name=index_name,
    embedding=embeddings,
)

In [59]:
# Similarity-search retriever that returns the 3 closest chunks per query.
retriever = docsearch.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)

In [60]:
# k is already fixed at 3 through the retriever's search_kwargs, so it is not
# repeated as an extra keyword argument here.
results = retriever.invoke("who is the Governor of jammu and kashmir?")

In [None]:
# Display the retrieved documents (metadata and page_content) for inspection.
results

[Document(metadata={'author': 'hp', 'creationdate': '2025-09-02T16:28:34+05:30', 'creator': 'Microsoft® Word 2019', 'moddate': '2025-09-02T16:28:59+05:30', 'page': 43.0, 'page_label': '44', 'producer': 'Microsoft® Word 2019', 'source': 'data\\The-Hindu-Review-August.pdf', 'total_pages': 44.0}, page_content='62 Sikkim CM – Prem Singh Tamang (Golay); Governor – Lakshman Prasad Acharya \n63 Chhattisgarh CM – Vishnu Deo Sai (2023–); Governor – Biswabhusan Harichandan (not Khambhapati) \n64 Jammu & Kashmir (UT) Lt. Governor – Manoj Sinha (no CM currently) \n65 Madhya Pradesh CM – Mohan Yadav (2023–); Governor – Mangubhai Patel \n66 Tamil Nadu CM – M.K. Stalin; Governor – R. N. Ravi \n67 Maharashtra CM – Eknath Shinde (not Devendra Fadnavis, he is Dy CM); Governor – Ramesh Bais (not \nRadhakrishnan)'),
 Document(metadata={'author': 'hp', 'creationdate': '2025-09-02T16:28:34+05:30', 'creator': 'Microsoft® Word 2019', 'moddate': '2025-09-02T16:28:59+05:30', 'page': 43.0, 'page_label': '44', 'p

In [61]:
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough

# Chat model that generates the final answer in the RAG chain.
llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash", temperature=0.2)

In [62]:
# Rebuilds the same top-3 similarity retriever that was already created
# earlier in the notebook; the chain below uses this binding.
retriever = docsearch.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3}
)

In [63]:
# System message template: role, refusal rule, then the retrieved context
# substituted for the {context} placeholder.
system_prompt = "".join(
    [
        "You are a helpful medical assistant. Use the context below to answer the question accurately. ",
        "If the context does not provide the answer, respond with 'I don't know'.\n\n",
        "Context:\n{context}",
    ]
)

# Two-message chat template: the system instructions (with {context}) followed
# by the user's question in the {input} slot.
prompt = ChatPromptTemplate.from_messages(
    [("system", system_prompt), ("human", "{input}")]
)

In [64]:
def format_docs(docs):
    """Join the text of several documents into one context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` attribute.

    Returns:
        The ``page_content`` values separated by a blank line; an empty
        string when ``docs`` is empty.
    """
    page_texts = [doc.page_content for doc in docs]
    return "\n\n".join(page_texts)

In [65]:
# RAG chain.
# The incoming question fans out two ways: through the retriever and
# format_docs to fill {context}, and unchanged to fill {input}.
rag_inputs = {
    "context": retriever | format_docs,
    "input": RunnablePassthrough(),
}
# The filled prompt is then sent to the chat model.
rag_chain = rag_inputs | prompt | llm

In [68]:
# Ask the chain a question; the raw string is the chain input, and the model
# reply's text (.content) is displayed as the cell result.
query = "who is the Governor of jammu and kashmir?"
response = rag_chain.invoke(query)
response.content

'Manoj Sinha is the Lieutenant Governor of Jammu and Kashmir.'

In [69]:
# Second probe with a medical question.
# NOTE(review): the recorded reply is an apology sentence rather than the
# literal 'I don't know' that the system prompt asks for — tighten the prompt
# if the exact wording matters.
query = "what is acne"
response = rag_chain.invoke(query)
response.content

"I'm sorry, but this document does not contain information about acne."