In [2]:
from langchain.document_loaders import PyPDFLoader , DirectoryLoader 
from langchain.text_splitter import RecursiveCharacterTextSplitter

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import os
os.chdir("D:\Artifitial_intelligence\GEN_AI\Projects\Medical-ChatBot-APP")

In [4]:
%pwd

'D:\\Artifitial_intelligence\\GEN_AI\\Projects\\Medical-ChatBot-APP'

In [5]:
# Extract text from PDF Files

def load_pdf_files (dataPath) : 
    loader = DirectoryLoader(dataPath , glob="*.pdf" , loader_cls=PyPDFLoader)
    docs = loader.load()
    return docs

extractedData = load_pdf_files("data")

In [6]:
len(extractedData)

1428

In [7]:
from typing import List 
from langchain.schema import Document 

def filter_Documents_metadata (docs:List[Document]) -> List[Document]:
    """
    Given a List of Documents objects , 
    return a new list of Document objects containing only 
    "source" in metadata and the original page_content
    """

    filtered_docs = []

    for doc in docs : 
        src = doc.metadata.get("source")
        filtered_docs.append(
            Document(
                page_content = doc.page_content ,
                metadata = {"source" :src}
            )
        )

    return filtered_docs

filtered_docs = filter_Documents_metadata(extractedData)

In [8]:
def text_split (filtered_docs) : 
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size = 1000 , 
        chunk_overlap = 200 , 
    )

    text_chuncks = text_splitter.split_documents(filtered_docs)
    return text_chuncks 

text_chunks = text_split(filtered_docs)
print("Num of Text Chunks : ",len(text_chunks))

Num of Text Chunks :  11093


In [9]:
from langchain.embeddings import HuggingFaceBgeEmbeddings
import torch
def downloading_embeddings() : 
    model_name = "sentence-transformers/all-MiniLM-L6-v2"
    embeddings = HuggingFaceBgeEmbeddings(
        model_name = model_name , 
        model_kwargs = {"device" : "cuda" if torch.cuda.is_available() else "cpu"}
    )
    return embeddings

embeddings = downloading_embeddings()

  embeddings = HuggingFaceBgeEmbeddings(


In [10]:
print(embeddings.embed_query('hello world') )
print(len(embeddings.embed_query('hello world')))

[-0.010300828143954277, 0.18307934701442719, 0.030811281874775887, 0.004452868364751339, -0.027336159721016884, -0.0335625596344471, 0.03763158619403839, -0.03157338127493858, -0.0033909634221345186, -0.008950830437242985, 0.03803609684109688, -0.051291048526763916, 0.00036826470750384033, -0.02372710593044758, 0.09271029382944107, -0.02779584378004074, -0.03515247628092766, -0.0032241714652627707, -0.0768178328871727, -0.057612095028162, 0.07257598638534546, 0.11128553748130798, 0.01605852320790291, 0.01590849459171295, -0.0823269784450531, 0.007007323205471039, 0.029013115912675858, 0.0011386671103537083, 0.11671741306781769, -0.032327406108379364, -0.03227165341377258, -0.0012590623227879405, 0.10591620206832886, 0.023600773885846138, 0.00966486893594265, 0.09834077209234238, 0.04293639212846756, -0.019547687843441963, 0.01926792599260807, -0.06417104601860046, 0.023923387750983238, -0.05287997052073479, -0.026469580829143524, 0.005548726301640272, -0.017025155946612358, -0.03023269

In [11]:
from dotenv import load_dotenv
load_dotenv()

pineConeApi = os.getenv("pineConeApi_1")
openaiApiKey = os.getenv("openApiKey")
base_url = os.getenv("openApiBaseUrl")

In [12]:
os.environ["PINECONE_API_KEY"] = pineConeApi
os.environ["PINECONE_ENV"] = "us-east-1" 
os.environ['OPEN_API_KEY'] = openaiApiKey

In [13]:
from pinecone import Pinecone
from pinecone import ServerlessSpec

pc = Pinecone(api_key=pineConeApi)
index_name = "medical-chatbot"

if not pc.has_index(index_name) : 
    pc.create_index(
        name=index_name , 
        dimension=384 , 
        metric="cosine" , 
        spec=ServerlessSpec(cloud="aws" , region="us-east-1")
    )

index = pc.Index(index_name)

In [14]:
from langchain_pinecone import PineconeVectorStore

docSearch = PineconeVectorStore.from_documents(
    documents=text_chunks , 
    embedding=embeddings , 
    index_name = index_name 
)

KeyboardInterrupt: 

In [None]:
# if you Already Stored the data and want to load it
docSearch = PineconeVectorStore.from_existing_index(
    embedding=embeddings , 
    index_name = index_name 
)

In [17]:
retriever = docSearch.as_retriever(search_type = "similarity" , search_kwargs = {"k" : 5})

In [18]:
from langchain_openai import ChatOpenAI

chatModel = ChatOpenAI (
    model="openai/gpt-oss-20b:free",
    base_url=base_url,
    api_key=openaiApiKey
)

In [19]:
from langchain.chains import create_retrieval_chain 
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

In [20]:
sys_prompt = (
    "You are an Medical Assistant for question-answering tasks."
    "Use the following pieces of retrieved context to answer"
    "the question . If you don't know the answer , Say that you "
    "don't know . Use 3 sentences maximum and keep the answer concise"
    "\n\n"
    "{context}"
)


prompt = ChatPromptTemplate.from_messages(
    [
        ("system" , sys_prompt) , 
        ("human" , "{input}")
    ]
)

In [21]:
question_answer_chain = create_stuff_documents_chain(chatModel, prompt)
rag_chain = create_retrieval_chain(retriever , question_answer_chain)

In [22]:
response = rag_chain.invoke({"input" : "Why my heart is work quickly ?"})
print(response['answer'])

A fast heartbeat (tachycardia) can be caused by anxiety, stimulants (like alcohol or nicotine), or a hyperdynamic state such as anemia, pregnancy, or hyperthyroidism. It may also arise from valve disease (e.g., aortic regurgitation) or suddenâ€‘onset arrhythmias such as supraventricular tachycardia. If it occurs frequently, feels uncomfortable, or you have other symptoms, see a doctor for evaluation.
