In [9]:
import os

# The notebook sits one level below the project root; relative paths such as
# "data/" are resolved from the root. Only step up when "data/" is not already
# visible, so re-running this cell does not climb one directory higher each time.
if not os.path.isdir("data"):
    os.chdir("../")

In [10]:
!pwd

/Users/ngkuissi/Dev/projects/Medical-Chatbot


In [11]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from pinecone import Pinecone, ServerlessSpec
from dotenv import load_dotenv

In [12]:
# Load environment configuration before any client is built; a later cell
# reads PINECONE_API_KEY from os.environ.
# NOTE(review): presumably the values come from a local .env file — confirm.
load_dotenv()

True

In [13]:
def load_pdf_file(path, glob="medical_book.pdf"):
    """Load PDF content from a directory through ``DirectoryLoader``.

    Args:
        path: Directory that is searched for PDF files.
        glob: File pattern matched inside ``path``. Defaults to
            ``"medical_book.pdf"``, the single book this notebook indexes;
            pass e.g. ``"*.pdf"`` to pick up every PDF in the directory.

    Returns:
        The result of ``DirectoryLoader.load()``: a sequence of documents,
        each exposing ``page_content`` (the notebook indexes into it).
    """
    loader = DirectoryLoader(path, glob=glob, loader_cls=PyPDFLoader)
    return loader.load()

In [14]:
# Load the book from the project's data/ directory (relative to the cwd set earlier).
extracted_data = load_pdf_file("data/")

In [15]:
# Spot-check the extracted text of one document (index 300 is an arbitrary sample).
extracted_data[300].page_content

'right away. Switching to a different antimalarial drug may\nbe an alternative and can allow the side effects to stop.\nAnyone taking antimalarial drugs to prevent malaria\nwho develops a fever or flu-like symptoms while taking\nthe medicine or within two to three months after travel-\ning to an area where malaria is common should call a\nphysician immediately.\nIf the medicine is being taken to treat malaria, and\nsymptoms stay the same or get worse, check with the\nphysician who prescribed the medicine.\nPatients who take this medicine over a long time\nneed to have a physician check them periodically for\nunwanted side effects.\nBabies and children are especially sensitive to the\nantimalarial drug chloroquine. Not only are they more\nlikely to have side effects from the medicine, but they are\nalso at greater risk of being harmed by an overdose. A\nsingle 300-mg tablet could kill a small child. Keep this\nmedicine out of the reach of children. Use safety vials.\nSpecial conditions\

In [16]:
def split_text(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into smaller chunks for embedding.

    Args:
        extracted_data: Documents to split, as returned by ``load_pdf_file``.
        chunk_size: Value passed to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size``. Defaults to 500, the value this notebook uses.
        chunk_overlap: Value passed as ``chunk_overlap``. Defaults to 20.

    Returns:
        The chunks produced by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)

In [17]:
# Chunk the loaded pages and report how many chunks will be embedded.
text_chunks = split_text(extracted_data)
print(f"Length of chunks: {len(text_chunks)}")

Length of chunks: 5860


In [18]:
def downlaod_hugging_face_embeddings(model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """Build the HuggingFace embedding model used to vectorize text.

    The function name is misspelled ("downlaod"); it is kept so existing
    callers keep working, and a correctly spelled alias is defined below.

    Args:
        model_name: Model identifier passed to ``HuggingFaceEmbeddings``.
            The default is the model this notebook uses; its query vectors
            have length 384, which is the dimension the Pinecone index is
            created with. A different model may need a different dimension.

    Returns:
        The constructed ``HuggingFaceEmbeddings`` instance.
    """
    return HuggingFaceEmbeddings(model_name=model_name)


# Correctly spelled alias for new code; the original name remains valid.
download_hugging_face_embeddings = downlaod_hugging_face_embeddings

In [19]:
# Instantiate the embedding model once; it is reused for indexing and for queries.
embedding_model = downlaod_hugging_face_embeddings()

In [20]:
# Sanity check: embed a short string and report the vector length, which has
# to match the dimension the Pinecone index is created with.
query_result = embedding_model.embed_query("Hello world")
print(f"Length {len(query_result)}")

Length 384


In [21]:
# Pinecone client authenticated with the key taken from the environment;
# os.environ[...] raises KeyError when PINECONE_API_KEY is not set.
pc = Pinecone(api_key=os.environ['PINECONE_API_KEY'])
pc

<pinecone.control.pinecone.Pinecone at 0x164983640>

In [22]:
# Name of the Pinecone index used for creation, ingestion and retrieval below.
index_name = "medical-bot"

In [23]:
# Create the index only when it is missing: create_index is rejected when an
# index with this name already exists, which would break re-running the notebook.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,  # must equal the embedding length (384 for all-MiniLM-L6-v2)
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1",
        ),
    )

### Embed each chunk and insert the embedding into the database

In [24]:
from langchain_pinecone import PineconeVectorStore

In [25]:
# Embed every chunk with embedding_model and store the vectors in the index.
# NOTE(review): presumably re-running this cell inserts the chunks again
# (retrieved documents carry UUID ids) — confirm before re-executing.
docsearch = PineconeVectorStore.from_documents(
    embedding=embedding_model,
    index_name=index_name,
    documents=text_chunks,
)

### Load the existing index from Pinecone

In [26]:
from langchain_pinecone import PineconeVectorStore

In [27]:
# Attach to the already-populated index instead of re-ingesting the chunks;
# the same embedding model is supplied for use at query time.
docsearch = PineconeVectorStore.from_existing_index(
    embedding=embedding_model, index_name=index_name
)

In [28]:
# Similarity-search retriever configured with k=3 (three results per query).
retriever = docsearch.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)

In [29]:
# Smoke-test retrieval on its own, before any LLM is involved.
retrived_docs = retriever.invoke("What is Acne?")

In [30]:
# Display the retrieved documents (id, metadata and page_content) for inspection.
retrived_docs

[Document(id='152e56e4-83fd-4c01-b7a1-5bab24f1b172', metadata={'page': 39.0, 'source': 'data/medical_book.pdf'}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 226\nAcne\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 26'),
 Document(id='da44a7f0-8f29-402e-ac12-27d74de237c2', metadata={'page': 38.0, 'source': 'data/medical_book.pdf'}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 2 25\nAcne\nAcne vulgaris affecting a woman’s face. Acne is the general\nname given to a skin disorder in which the sebaceous\nglands become inflamed.(Photograph by Biophoto Associ-\nates, Photo Researchers, Inc. Reproduced by permission.)\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 25'),
 Document(id='54eb28e4-5d1f-4415-803a-c9de8dfd9a6f', metadata={'page': 37.0, 'source': 'data/medical_book.pdf'}, page_content='Acidosis see Respiratory acidosis; Renal\ntubular acidosis; Metabolic acidosis\nAcne\nDefinition\nAcne is a common skin disease characterized by\npimples on the face, chest, and back. It occurs when th

In [31]:
from langchain_openai import OpenAI

# Generation model for the RAG chain: temperature 0.3, max_tokens 500.
# NOTE(review): no API key is passed here, so it presumably comes from the
# environment (e.g. OPENAI_API_KEY) — confirm.
llm = OpenAI(temperature=0.3, max_tokens=500)

In [32]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

In [33]:
# System instructions for the answering model. Adjacent string literals are
# concatenated with no separator, so every sentence except the last carries
# its own trailing space. "{context}" is the placeholder that receives the
# retrieved documents.
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer the question. "
    "If you don't know the answer, say that you don't know. "
    "Keep the answer concise."
    "\n\n"
    "{context}"
)


In [34]:
# Show the assembled template text to check sentence spacing and the placeholder.
print(system_prompt)

You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question.If you don't know the answer, say that you don't know.keep the answer concise.

{context}


In [35]:
# Two-message chat template: the system instructions (with the {context}
# placeholder) followed by the user's question in {input}.
prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("user", "{input}"),
])

In [36]:
# First the chain that feeds documents plus the prompt to the LLM, then the
# retrieval chain that runs the retriever and hands its results to that chain.
question_answer_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)
rag_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=question_answer_chain,
)

In [37]:
# In-domain question: the answer should be grounded in the indexed book.
response = rag_chain.invoke(
    {"input": "what is Acromegaly and gigantism?"}
)
print(response["answer"])



Acromegaly and gigantism are disorders caused by the abnormal release of a chemical from the pituitary gland in the brain, leading to increased growth in bone and soft tissue. This results in a variety of disturbances throughout the body, including unusual height. Acromegaly occurs after bone growth stops, while gigantism occurs during childhood and adolescence. Symptoms of these disorders often occur gradually and can include enlarged hands and feet, facial changes, and other health complications. 


In [38]:
# Out-of-domain question: checks that the chain declines rather than guessing.
response = rag_chain.invoke(
    {"input": "what is stats?"}
)
print(response["answer"])



I'm sorry, I don't have enough information to answer that question. Could you provide more context or clarify what you mean by "stats"?
