In [1]:
import os

# Step up from the notebook's folder to the project root so relative paths
# such as "data" resolve. A bare os.chdir("../") climbs one more level every
# time the cell is re-run; the guard makes it run once per kernel session
# (a kernel restart clears both the flag and the working directory).
if "PROJECT_ROOT" not in globals():
    os.chdir("../")
    PROJECT_ROOT = os.getcwd()

In [2]:
# Display the current working directory to confirm the chdir in the previous cell took effect.
%pwd

'c:\\Users\\odz-2\\Desktop\\learning\\medical-AI-chatbot\\Medical-Chatbot-Example'

In [3]:
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Extract text from PDF files
def load_pdf_files(data):
    """Load the PDF files found under the `data` directory.

    A DirectoryLoader is pointed at `data` with the pattern "*.pdf" and
    PyPDFLoader as the per-file loader class; whatever its `load()` call
    produces is returned unchanged.
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [5]:
# Load the PDFs from the "data" folder; the path is relative to the current working directory.
extracted_data = load_pdf_files("data")

In [6]:
from typing import List
from langchain_core.documents import Document

def filter_to_minimal_docs(docs: List[Document]) -> List[Document]:
    """
    Strip every document down to its text and its origin.

    A fresh Document is built for each input: page_content is copied as-is
    and the metadata is reduced to a single 'source' entry (None when the
    input metadata has no 'source' key). The input list and its documents
    are left unmodified.
    """
    return [
        Document(
            page_content=original.page_content,
            metadata={"source": original.metadata.get("source")},
        )
        for original in docs
    ]

In [7]:
# Keep only the page text and the 'source' metadata entry of each loaded document.
minimal_docs = filter_to_minimal_docs(extracted_data)

In [8]:
# Split the documents into smaller chunks
def text_split(minimal_docs, chunk_size=500, chunk_overlap=20):
    """Split documents into overlapping chunks suitable for embedding.

    Args:
        minimal_docs: Documents to split.
        chunk_size: Upper bound on the length of one chunk. With the
            splitter's default length function this is measured in
            characters, not tokens.
        chunk_overlap: How much trailing text of one chunk is repeated at
            the start of the next, so context is not lost at the boundary
            (same unit as chunk_size).

    Returns:
        The chunked documents produced by
        RecursiveCharacterTextSplitter.split_documents.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(minimal_docs)

In [9]:
# Chunk the trimmed documents and report how many chunks will be embedded.
texts_chunks = text_split(minimal_docs)
print(f"Number of chunks: {len(texts_chunks)}")

Number of chunks: 5859


In [10]:
#Embedding model: HuggingFaceEmbeddings -> SentenceTransformer
from langchain_huggingface import HuggingFaceEmbeddings

def download_embeddings():
    """Instantiate and return HuggingFaceEmbeddings for the "all-MiniLM-L6-v2" model."""
    return HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Shared embedding model: used below both for the sample query and for the Pinecone vector store.
embedding = download_embeddings()


In [11]:
# Sanity check: embed a sample string so the resulting vector can be inspected.
vector = embedding.embed_query("Hello World")


In [12]:
# Print the embedding dimension (384 in the recorded run).
# NOTE(review): presumably this has to equal the dimension the Pinecone index was created with — confirm.
print("Vector length: ", len(vector))

Vector length:  384


In [13]:
# Load variables from a .env file so the os.getenv lookups in the next cell can find the API keys.
# The recorded output of this cell is True.
from dotenv import load_dotenv
import os
load_dotenv()


True

In [14]:
# Groq version API keys
def _require_env(name):
    """Return the value of environment variable `name`.

    Raises:
        RuntimeError: if the variable is unset or empty. Without this check
            a missing key only surfaces later as
            "TypeError: str expected, not NoneType" when it is assigned to
            os.environ, which does not say which key is missing.
    """
    value = os.getenv(name)
    if not value:
        raise RuntimeError(
            f"{name} is not set. Add it to your .env file (or export it) and re-run this cell."
        )
    return value


PINECONE_API_KEY = _require_env("PINECONE_API_KEY")
GROQ_API_KEY = _require_env("GROQ_API_KEY")

# Writing the values back is a no-op when they were read from os.environ;
# kept so the keys are explicitly present in the process environment.
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
os.environ["GROQ_API_KEY"] = GROQ_API_KEY

In [15]:
# Create the Pinecone client used by the index cell below.
from pinecone import Pinecone
# Lower-case alias of the key loaded earlier; only used for the constructor call below.
pinecone_api_key = PINECONE_API_KEY

pc = Pinecone(api_key=pinecone_api_key)

In [17]:
from pinecone import ServerlessSpec

index_name = "medical-chatbot"

# Create the index on first use so the notebook also runs against a Pinecone
# project that does not have it yet; an existing index is left untouched.
if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        dimension=384,  # length of the all-MiniLM-L6-v2 vectors printed above
        metric="cosine",
        # NOTE(review): cloud/region are assumed defaults — adjust to your Pinecone project.
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

index = pc.Index(index_name)

In [18]:
from langchain_pinecone import PineconeVectorStore

# Embed every text chunk with the embedding model and store the vectors in
# the Pinecone index.
#
# Re-running the upsert adds the same chunks again under new ids (the
# retrieval output recorded further down shows one chunk twice with two
# different ids), so only upsert while the index is still empty. To re-index
# after the source data changes, clear the index first.
# NOTE(review): index stats may lag briefly behind a just-finished upsert.
if index.describe_index_stats().total_vector_count == 0:
    docsearch = PineconeVectorStore.from_documents(
        documents=texts_chunks,
        embedding=embedding,
        index_name=index_name,
    )
else:
    # Already populated: attach to the stored vectors instead of uploading again.
    docsearch = PineconeVectorStore.from_existing_index(
        embedding=embedding,
        index_name=index_name,
    )

In [19]:
from langchain_pinecone import PineconeVectorStore
# Attach to the existing Pinecone index by name; no documents are passed to this call.
# The same embedding model is supplied again (presumably so queries are embedded the same way as the stored chunks — confirm).
docsearch = PineconeVectorStore.from_existing_index(
    embedding=embedding,
    index_name=index_name
)

In [20]:
# Create a similarity-search retriever over the vector store, configured with k=3 results per query.
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k":3})

In [21]:
# Smoke-test the retriever with a sample question; the bare name on the last line displays the result.
# NOTE(review): the recorded output contains the same chunk twice under two different ids,
# which suggests the chunks were upserted into the index more than once.
retrieved_docs = retriever.invoke("What is Acne?")
retrieved_docs

[Document(id='59656227-40e7-4e2b-a569-c93199c81c25', metadata={'source': 'data\\Medical_book.pdf'}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 226\nAcne\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 26'),
 Document(id='aa9b5c90-4b5c-4b30-94ac-1c03a23f9845', metadata={'source': 'data\\Medical_book.pdf'}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 226\nAcne\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 26'),
 Document(id='54287b06-9e36-44df-8219-6cca65fb08f0', metadata={'source': 'data\\Medical_book.pdf'}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 2 25\nAcne\nAcne vulgaris affecting a woman’s face. Acne is the general\nname given to a skin disorder in which the sebaceous\nglands become inflamed. (Photograph by Biophoto Associ-\nates, Photo Researchers, Inc. Reproduced by permission.)\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 25')]

In [22]:
from langchain_groq import ChatGroq

# Chat model used to generate the answers.
# No api_key is passed here; presumably ChatGroq picks up GROQ_API_KEY from the environment — confirm.
chatModel = ChatGroq(model="llama-3.1-8b-instant")


In [23]:
from langchain_classic.chains import create_retrieval_chain
from langchain_classic.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

In [24]:
# Prompt template.
# Adjacent string literals are concatenated with nothing in between, so each
# fragment must carry its own trailing space. "{context}" is the placeholder
# the documents chain fills with the retrieved text.
system_prompt = (
    "You are a medical assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)

# Build the chat prompt: the system message carries the instructions plus the {context} slot,
# and the user message carries the question through the {input} slot.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("user", "{input}"),
    ]

)

In [25]:
# Build the answering chain from the chat model and the prompt (the 'stuff' documents chain),
# then combine it with the retriever into the retrieval chain that is invoked below.
question_answer_chain = create_stuff_documents_chain(chatModel, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)


In [27]:
# Ask a question through the chain. The question goes in under the "input" key;
# the response exposes the generated text under "answer" and the retrieved documents under "context".
response = rag_chain.invoke({"input": "What is Acromegaly and gigantism?"})
print(response["answer"])
# Printing the raw context is useful for checking grounding, but the output is long.
print(response["context"])


Acromegaly is a disorder caused by the abnormal release of a chemical from the pituitary gland, leading to increased growth in bone and soft tissue, as well as various body disturbances. This results from an excess of growth hormone production. Gigantism is a similar condition that occurs in children before the bones have stopped growing.
[Document(id='38402036-b092-4d57-9a05-1de5d598e786', metadata={'source': 'data\\Medical_book.pdf'}, page_content='Whitehouse Station, NJ: Merck Research Laboratories,\n1997.\nLarsen, D. E., ed. Mayo Clinic Family Health Book.New York:\nWilliam Morrow and Co., Inc., 1996.\nJohn T. Lohr, PhD\nAcromegaly and gigantism\nDefinition\nAcromegaly is a disorder in which the abnormal\nrelease of a particular chemical from the pituitary gland\nin the brain causes increased growth in bone and soft tis-\nsue, as well as a variety of other disturbances throughout\nthe body. This chemical released from the pituitary gland'), Document(id='408761ef-c24d-4926-9171-797d

In [None]:
# Second sample question; only the answer is printed.
response = rag_chain.invoke({"input": "What is the treatment for Acne?"})
print(response["answer"])
# Uncomment to also inspect the retrieved documents:
#print(response["context"])