### Setting up working directory

In [2]:
# Prints "OK" — presumably a quick check that the kernel is running
print("OK")

OK


In [3]:
# Show the kernel's working directory; the relative paths used below ("../", 'Data/') depend on it
%pwd

'/Users/munishpatel/AI-Medical-Assistant/research'

In [7]:
import os

# Move up to the project root only while the kernel is still inside research/
# (the directory reported by %pwd). A bare chdir("../") climbs one more level
# every time this cell is re-run, which breaks the relative 'Data/' path.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

### Loading and extracting data from the PDF files

In [4]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [5]:
# Read the source PDF documents from a directory
def load_pdf_file(data):
    """Load the PDF files in a directory with ``PyPDFLoader``.

    Args:
        data: Directory path handed to ``DirectoryLoader``; only entries
            matching the ``*.pdf`` glob are picked up.

    Returns:
        Whatever ``DirectoryLoader.load()`` produces for that directory.
    """
    return DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader).load()


In [8]:
# 'Data/' is relative to the working directory set by the os.chdir cell above
extracted_data = load_pdf_file(data = 'Data/')

In [9]:
# extracted_data
# Report how many documents the loader returned
print("Length of PDF pages:", len(extracted_data))

Length of PDF pages: 4505


### Performing Chunking operation

In [10]:
#Splitting the data into small text chunks
def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into smaller overlapping chunks.

    Args:
        extracted_data: Documents to split (the output of ``load_pdf_file``).
        chunk_size: Value passed to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size``. Defaults to 500, the value previously hard-coded.
        chunk_overlap: Value passed to ``RecursiveCharacterTextSplitter`` as
            ``chunk_overlap``. Defaults to 20, the value previously hard-coded.

    Returns:
        The chunks returned by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)

In [11]:
# Chunk the loaded documents and report how many chunks were produced
text_chunks = text_split(extracted_data)
print("The length of the text chunks is:", len(text_chunks))

The length of the text chunks is: 40000


In [13]:
# text_chunks

### Importing Embedding model

In [14]:
from langchain.embeddings import HuggingFaceEmbeddings

def get_embedding_model():
    """Build the embedding model used for both indexing and querying.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        'sentence-transformers/all-MiniLM-L6-v2' model.
    """
    return HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')

In [15]:
# Instantiate once; the same object is reused for the test query and for the Pinecone vector store
embedding_model = get_embedding_model()

  embedding_model = HuggingFaceEmbeddings(model_name = 'sentence-transformers/all-MiniLM-L6-v2')
  from .autonotebook import tqdm as notebook_tqdm


In [16]:
# Embed a sample string and print the vector length; the Pinecone index below is created with this dimension
query_result = embedding_model.embed_query("Hello world")
print("Length", len(query_result))

Length 384


In [18]:
# query_result

### Connecting Pinecone and OpenAI

In [34]:
# Loading environment variables with python-dotenv before the API keys are read from os.environ
from dotenv import load_dotenv
load_dotenv()

True

In [35]:
# Read the API keys from the process environment (each is None when the variable is not set)
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

In [36]:
# Creating an Index in Pinecone
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
import os

pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = "medicalbot"

# Only create the index when it does not exist yet: calling create_index for an
# existing name fails, which would break this cell on every re-run.
if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        dimension=384,  # must equal the embedding length printed by the "Hello world" check
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )

# Show the index description whether it was just created or already existed
pc.describe_index(index_name)

{
    "name": "medicalbot",
    "metric": "cosine",
    "host": "medicalbot-5dbkhxm.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 384,
    "deletion_protection": "disabled",
    "tags": null
}

In [None]:
# Setting environment variables
import os

# os.environ only accepts strings: assigning None (a key missing from the
# environment/.env file) raises an opaque TypeError, so fail with a clear message.
# When the keys were read from os.environ above, writing them back is a no-op.
for _key_name, _key_value in (
    ("PINECONE_API_KEY", PINECONE_API_KEY),
    ("OPENAI_API_KEY", OPENAI_API_KEY),
):
    if not _key_value:
        raise RuntimeError(
            f"{_key_name} is not set; define it in the environment or in the .env file"
        )
    os.environ[_key_name] = _key_value

In [37]:
# All text chunks are vector embedded and inserted into Pinecone Index
from langchain_pinecone import PineconeVectorStore

# Embeds every chunk with embedding_model and inserts it into the index named by index_name.
# NOTE(review): re-running this cell presumably inserts the chunks again — confirm how IDs are
# assigned, and skip this cell once the index is populated.
docsearch = PineconeVectorStore.from_documents(
    documents=text_chunks,
    index_name=index_name,
    embedding=embedding_model, 
)

In [38]:
# Loading the index data from Pinecone
from langchain_pinecone import PineconeVectorStore

# Rebinds docsearch to the existing index; the same embedding model
# that was used for insertion is passed so queries are embedded the same way
docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embedding_model
)

In [40]:
# Creating a retriever from the document search: similarity search with k=3 results per query
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k": 3})

In [41]:
# Query the retriever directly (no LLM involved) and display the returned documents
retrieved_docs = retriever.invoke("What is the symptom of diabetes?")
retrieved_docs

[Document(id='777c9467-00b6-451d-a6ec-1b59591f33ff', metadata={'creationdate': '2006-10-16T20:19:33+02:00', 'creator': 'Adobe Acrobat 6.0', 'moddate': '2006-10-16T22:03:45+02:00', 'page': 1185.0, 'page_label': '1156', 'producer': 'PDFlib+PDI 6.0.3 (SunOS)', 'source': 'Data/Medical Book.pdf', 'total_pages': 4505.0}, page_content='that a person may not know that he or she has it. Early\nsigns are lethargy, extreme thirst, and frequent urina-\ntion. Other symptoms may include sudden weight loss,\nslow wound healing, urinary tract infections, gum dis-\nease, or blurred vision. It is not unusual for Type II\ndiabetes to be detected while a patient is seeing a doctor\nabout another health concern that is actually being\ncaused by the yet undiagnosed diabetes.\nIndividuals who are at high risk of developing'),
 Document(id='c765a2d9-1305-46f1-8550-8070a9a8e9dc', metadata={'creationdate': '2006-10-16T20:19:33+02:00', 'creator': 'Adobe Acrobat 6.0', 'moddate': '2006-10-16T22:03:45+02:00', 'page

In [42]:
# Creating an OpenAI model instance
# NOTE(review): no API key is passed here — presumably it is read from the OPENAI_API_KEY
# environment variable; confirm
from langchain_openai import OpenAI

llm = OpenAI(temperature=0.4, max_tokens=500)

In [51]:
# Defining the prompt template for the retrieval (RAG) chain
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate


# System instructions: answer only from the retrieved context, otherwise return a fixed refusal.
# The adjacent string literals are concatenated into a single string.
# NOTE(review): "{context}" is presumably filled with the retrieved documents by the
# stuff-documents chain — confirm.
system_prompt = (
    "You are a medical assistant that can ONLY answer questions based on the specific medical document "
    "that has been loaded into your knowledge base. Use ONLY the following pieces of retrieved context "
    "to answer the question. If the question cannot be answered using the specific information found "
    "in the retrieved context, respond with 'I cannot answer this question as this information is not "
    "present in the knowledge base.' Do not use any external knowledge. Keep answers concise "
    "and use three sentences maximum."
    "\n\n"
    "{context}"
)


# "{input}" matches the "input" key passed to rag_chain.invoke(...)
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

In [48]:
# Build the question-answering chain from the LLM and prompt, then wrap it with the retriever.
# The chain keeps the `prompt` object passed here, so re-run this cell after editing the
# prompt cell; otherwise rag_chain keeps answering with the old prompt.
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [49]:
# Ask a question through the full chain and print the "answer" entry of the response
response = rag_chain.invoke({"input": "What is the symptom of diabetes?"})
print(response["answer"])



Symptoms of diabetes include frequent urination, lethargy, excessive thirst, and hunger. These symptoms can persist for long periods of time and may change over time within the same person. They can also be influenced by physical or mental activities, physical or mental state, the amount of time passed since the last meal, the amount and quality of sleep, and exercise patterns.


In [52]:
# Second question, for which the recorded answer is the prompt's refusal message
response = rag_chain.invoke({"input": "What is stats?"})
print(response["answer"])


I cannot answer this question as this information is not present in the loaded medical document.
