In [64]:
import os
import pinecone
from dotenv import load_dotenv
from langchain.vectorstores import Pinecone
from langchain.prompts import ChatPromptTemplate
from langchain.chains import create_retrieval_chain
from langchain_nvidia_ai_endpoints import ChatNVIDIA
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_huggingface.embeddings.huggingface import HuggingFaceEmbeddings

In [52]:
load_dotenv()
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
NVIDIA_API_KEY = os.getenv("NVIDIA_API_KEY")

In [8]:
# Extract data from PDF
def load_pdf(data):
    loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    documents = loader.load()
    return documents

In [10]:
extracted_data = load_pdf("data/")

In [17]:
print(extracted_data[110].page_content)

• headaches
• vaginal infection,itching, or irritation
• increased blood pressure
Women who have any of the following symptoms
should get emergency help right away. These symptoms
may be signs of blood clots:
• sudden changes in vision, speech, breathing, or coordi-
nation
• severe or sudden headache
• coughing up blood
• sudden, severe, or continuing pain in the abdomen or
stomach
• pain in the chest, groin, or leg (especially in the calf)
• weakness, numbness, or pain in an arm or leg
Oral contraceptives may continue to affect the men-
strual cycle for some time after a woman stops taking
them. Women who miss periods for several months after
stopping this medicine should check with their physicians.
Other rare side effects may occur. Anyone who has
unusual symptoms while taking oral contraceptives
should get in touch with her physician.
Interactions
Oral contraceptives may interact with a number of
other medicines. When this happens, the effects of one or
both of the drugs may change

In [18]:
# Create text chunks
def text_split(extracted_data):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    text_chunks = text_splitter.split_documents(extracted_data)
    return text_chunks

In [19]:
text_chunks = text_split(extracted_data)
print(f"Length of chunks : {len(text_chunks)}")

Length of chunks : 8846


In [23]:
print(text_chunks[20].page_content)

meditation to aromatherapy. In addition to full essays on
alternative therapies, the encyclopedia features specific
Alternative treatmentsections for diseases and condi-
tions that may be helped by complementary therapies.
INCLUSION CRITERIA
A preliminary list of diseases, disorders, tests and treat-
ments was compiled from a wide variety of sources,
including professional medical guides and textbooks as
well as consumer guides and encyclopedias. The general


In [27]:
# Download Embedding Model
def download_hugging_face_embedding():
    embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2", show_progress=True)
    return embedding

In [28]:
embeddings = download_hugging_face_embedding()

In [34]:
result = embeddings.embed_query("Parth Dobariya")
print(f"Length : {len(result)}")

Batches: 100%|██████████| 1/1 [00:00<00:00, 36.13it/s]

Length : 384





In [35]:
# Initializing Pinecone
pc = pinecone.Pinecone(api_key=PINECONE_API_KEY)

index_name = "medical-chatbot"

# # Create Embeddings for each of the Text Chunk
docsearch = Pinecone.from_texts([t.page_content for t in text_chunks], embedding=embeddings, index_name=index_name)

Batches: 100%|██████████| 32/32 [00:32<00:00,  1.03s/it]
Batches: 100%|██████████| 32/32 [00:34<00:00,  1.07s/it]
Batches: 100%|██████████| 32/32 [00:32<00:00,  1.02s/it]
Batches: 100%|██████████| 32/32 [00:34<00:00,  1.07s/it]
Batches: 100%|██████████| 32/32 [00:31<00:00,  1.03it/s]
Batches: 100%|██████████| 32/32 [00:30<00:00,  1.07it/s]
Batches: 100%|██████████| 32/32 [00:33<00:00,  1.06s/it]
Batches: 100%|██████████| 32/32 [00:34<00:00,  1.06s/it]
Batches: 100%|██████████| 27/27 [00:20<00:00,  1.32it/s]


In [43]:
# If already have an index, load it
docsearch = Pinecone.from_existing_index(index_name, embeddings)

query = "What is Jaundice?"

docs = docsearch.similarity_search(query, k=2)

print("Result : ", docs)

Batches: 100%|██████████| 1/1 [00:00<00:00, 32.79it/s]


Result :  [Document(metadata={}, page_content='Jaundice —Yellow discoloration of skin and whites\nof the eyes that results from excess bilirubin in the\nbody’s system.\nNecrotizing enterocolitis (NEC) —A condition in\nwhich part of the intestines are destroyed as a result\nof bacterial infection.\nRespiratory distress syndrome (RDS)—Condition in\nwhich a premature infant with immature lungs does\nnot develop surfacant, a protective film that helps\nair sacs in the lungs to stay open. The most common\nproblem seen in premature infants.'), Document(metadata={}, page_content='light to treat his jaundice. (Photograph by Ron Sutherland.\nPhoto Researchers, Inc. Reproduced by permission.)')]


In [None]:
# Not formatted result. So, use LLM model
for i in docs:
    print(i.page_content)
    print("-"*80)

Jaundice —Yellow discoloration of skin and whites
of the eyes that results from excess bilirubin in the
body’s system.
Necrotizing enterocolitis (NEC) —A condition in
which part of the intestines are destroyed as a result
of bacterial infection.
Respiratory distress syndrome (RDS)—Condition in
which a premature infant with immature lungs does
not develop surfacant, a protective film that helps
air sacs in the lungs to stay open. The most common
problem seen in premature infants.
--------------------------------------------------------------------------------
light to treat his jaundice. (Photograph by Ron Sutherland.
Photo Researchers, Inc. Reproduced by permission.)
--------------------------------------------------------------------------------


In [65]:
system_prompt = (
    "Use the following pieces of information to answer to user's question."
    "If you don't know the answer, just say that you don't know, don't try to make up an answer."
    
    "Context: {context}"
)

PROMPT = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}")
    ]
)

In [66]:
llm = ChatNVIDIA(
    model="meta/llama-3.1-8b-instruct",
    api_key = NVIDIA_API_KEY,
    temperature=0.2,
    top_p=0.7,
    max_tokens=1024
)

In [67]:
question_answer_chain = create_stuff_documents_chain(llm=llm, prompt=PROMPT)
qa_chain = create_retrieval_chain(
    retriever=docsearch.as_retriever(search_kwargs={'k': 2}),
    combine_docs_chain=question_answer_chain
)

In [68]:
# user_input = input(f"Input prompt")
result = qa_chain.invoke({"input": query})

Batches: 100%|██████████| 1/1 [00:00<00:00, 65.58it/s]


In [70]:
result["answer"]

"Jaundice is a yellow discoloration of the skin and whites of the eyes that results from excess bilirubin in the body's system."

In [78]:
while True:
    import sys
    user_query = input(f"Input prompt")
    if user_query == "exit" or user_query == "":
        break
    else:
        result = qa_chain.invoke({"input": user_query})
        print("Response : ", result["answer"])