In [1]:
import os
# Move from the notebook's folder up to the project root so that relative
# paths such as "data/" resolve. The guard makes the cell safe to re-run:
# without it every execution would climb one more directory level.
# NOTE(review): assumes only the project root contains a "data" directory —
# confirm against the repository layout.
if not os.path.isdir("data"):
    os.chdir("../")

In [2]:
!pwd

/Users/ngkuissi/Dev/projects/Medical-Chatbot


In [3]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from pinecone import Pinecone, ServerlessSpec
from dotenv import load_dotenv

  from tqdm.autonotebook import tqdm


In [4]:
load_dotenv()

True

In [5]:
def load_pdf_file(path, glob_pattern="medical_book.pdf"):
    """Load PDF documents from a directory.

    Args:
        path: Directory to search for PDF files.
        glob_pattern: File pattern handed to ``DirectoryLoader``. Defaults to
            ``"medical_book.pdf"``, the single book this notebook indexes;
            pass e.g. ``"*.pdf"`` to pick up every PDF in ``path``.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns: the loaded documents,
        each exposing ``page_content`` and ``metadata``.
    """
    loader = DirectoryLoader(
        path, glob=glob_pattern, loader_cls=PyPDFLoader
    )
    return loader.load()

In [6]:
extracted_data = load_pdf_file("data/")

In [7]:
extracted_data[300].page_content

'diaphragm is effective only if used during every episode\nof sexual intercourse. The same is true for condoms and\nthe cervical cap. Some methods are automatically work-\ning every day, no matter what. These methods include\nDepo Provera, Norplant, the IUD, and tubal sterilization.\nThere are many different ways to use birth control.\nThey can be divided into several groups:\n• By mouth (oral)—Birth control pills must be taken by\nmouth every day.\n• Injected—Depo Provera is a hormonal medication that\nis given by injection every three months.\n• Implanted—Norplant is a long-acting hormonal form\nof birth control that is implanted under the skin of the\nupper arm.\n• Vaginal—Spermicides and barrier methods work in the\nvagina.\n• Intra-uterine—The IUD is inserted into the uterus.\n• Surgical—Tubal sterilization is a form of surgery. A\ndoctor must perform the procedure in a hospital or sur-\ngical clinic. Many women need general anesthesia.\nThe methods of birth control differ from ea

In [8]:
def split_text(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into smaller chunks for embedding.

    Args:
        extracted_data: Documents to split (as returned by ``load_pdf_file``).
        chunk_size: Target chunk size passed to
            ``RecursiveCharacterTextSplitter``. Defaults to 500.
        chunk_overlap: Overlap between consecutive chunks passed to the
            splitter. Defaults to 20.

    Returns:
        The list of chunked documents produced by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    return text_splitter.split_documents(extracted_data)

In [9]:
text_chunks = split_text(extracted_data)
print("Length of chunks:", len(text_chunks))

Length of chunks: 7023


In [10]:
def downlaod_hugging_face_embeddings(model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """Create the Hugging Face embedding model used for chunks and queries.

    Args:
        model_name: Model identifier passed to ``HuggingFaceEmbeddings``.
            The default model yields 384-dimensional vectors in this
            notebook, which is the dimension the Pinecone index is created
            with; a different model needs an index of matching dimension.

    Returns:
        A ``HuggingFaceEmbeddings`` instance for ``model_name``.
    """
    return HuggingFaceEmbeddings(model_name=model_name)


# Correctly spelled alias. The misspelled name above is kept because existing
# cells call it.
download_hugging_face_embeddings = downlaod_hugging_face_embeddings

In [11]:
embedding_model = downlaod_hugging_face_embeddings()

In [12]:
query_result = embedding_model.embed_query("Hello world")
print("Length", len(query_result))

Length 384


In [13]:
pc = Pinecone(api_key=os.environ['PINECONE_API_KEY'])
pc

<pinecone.control.pinecone.Pinecone at 0x1468e8e80>

In [14]:
index_name = "medical-bot"

In [None]:


# Create the serverless index only if it is not there yet, so that re-running
# this cell (or the whole notebook) does not fail on an existing index.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,  # must equal the embedding length (384 for all-MiniLM-L6-v2)
        metric="cosine",  # similarity metric used when querying the index
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )

In [None]:
index_name

### Embed each chunk and insert the embeddings into the database

In [15]:
from langchain_pinecone import PineconeVectorStore

'medical-bot'

In [17]:
# Embed every chunk with `embedding_model` and store the vectors in the
# Pinecone index named by `index_name`.
# NOTE(review): presumably each run uploads all chunks again — confirm before
# re-executing this cell against a populated index.
docsearch = PineconeVectorStore.from_documents(
    text_chunks,
    embedding=embedding_model,
    index_name=index_name,
)

KeyboardInterrupt: 

### Load the existing index from Pinecone

In [18]:
from langchain_pinecone import PineconeVectorStore

In [19]:
# Attach to the already-populated Pinecone index instead of re-uploading the
# chunks; queries are embedded with the same `embedding_model`.
docsearch = PineconeVectorStore.from_existing_index(
    embedding=embedding_model,
    index_name=index_name,
)

In [20]:
# Retriever over the vector store: similarity search, requesting k=3 results
# per query.
retriever = docsearch.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)

In [33]:
retrived_docs = retriever.invoke("What is Acne")

In [34]:
retrived_docs

[Document(id='684dfa1f-bcf9-4641-b83e-ae0a88440e86', metadata={'page': 298.0, 'source': 'data/medical_book.pdf'}, page_content='Corticosteroids —A group of anti-inflammatory\nsubstances often used to treat skin conditions.\nImmune response—The protective reaction by the\nimmune system against foreign antigens (sub-\nstances that the body perceives as potentially dan-\ngerous). The immune system combats disease by\nneutralizing or destroying antigens.\ncontact dermatitis becomes a chronic and disabling con-\ndition that can have a profound effect on employability\nand quality of life.\nPrevention'),
 Document(id='e9544312-5d27-4b1c-a34f-442f1ddde5d3', metadata={'page': 297.0, 'source': 'data/medical_book.pdf'}, page_content='ics and personal care products; latex items such as gloves\nand condoms; and formaldehyde. Many people find that\nthey are allergic to the nickel in inexpensive jewelry. ACD\nis usually confined to the area of skin that comes in contact\nwith the allergen, typically

In [23]:
from langchain_openai import OpenAI

llm = OpenAI(temperature=0.3, max_tokens=500)

In [24]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

In [25]:
# System message for the question-answering chain. The pieces are adjacent
# string literals that Python concatenates, so every sentence needs its own
# trailing space; without it the sentences run together ("question.If ...").
# "{context}" is left as a template placeholder to be filled in later.
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer the question. "
    "If you don't know the answer, say that you don't know. "
    "keep the answer concise."
    "\n\n"
    "{context}"
)


In [26]:
print(system_prompt)

You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question.If you don't know the answer, say that you don't know.keep the answer concise.

{context}


In [27]:
# Two-message chat template: the system message carries the instructions
# (with its "{context}" placeholder), the user message carries the question
# supplied as "{input}".
prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("user", "{input}"),
])

In [28]:
# Assemble the RAG pipeline in two steps: a documents chain that sends the
# prompt to the LLM, wrapped by a retrieval chain that first runs `retriever`.
# The resulting chain is invoked with {"input": ...} and its reply is read
# from the "answer" key.
question_answer_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)
rag_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=question_answer_chain,
)

In [29]:
response = rag_chain.invoke({"input": "what is Acromegaly and gigantism?"})
print(response["answer"])



Acromegaly and gigantism are both conditions caused by an overproduction of growth hormone, typically due to a tumor on the pituitary gland. Acromegaly occurs in adults and causes abnormal growth of the hands, feet, and facial features, while gigantism occurs in children and causes excessive growth in height. Both conditions can lead to serious health complications if left untreated.


In [30]:
response = rag_chain.invoke({"input": "what is stats?"})
print(response["answer"])



Stats is a shortened term for statistics, which are used by doctors to help predict the future course and outcome of a disease and the likelihood of recovery. It is important to note that while statistics can give some important factors, they should not be the only factor considered in making medical decisions.
