In [6]:
import os
os.chdir("../")

In [7]:
%pwd

'c:\\Users\\odz-2\\Desktop\\learning\\medical-AI-chatbot\\Medical-Chatbot-Example'

In [1]:
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
#Extract text from PDF files
def load_pdf_files(data):
    loader = DirectoryLoader(
        data,
        glob="*.pdf",
        loader_cls=PyPDFLoader
    )

    documents = loader.load()
    return documents

In [9]:
extracted_data = load_pdf_files("data")

In [10]:
from typing import List
from langchain_core.documents import Document

def filter_to_minimal_docs(docs: List[Document]) -> List[Document]:
    """
    Given a list of Document objects, return a new list of Document objects
    containing only 'source' in metadata and the original page_content.
    """
    minimal_docs: List[Document] = []
    for doc in docs:
        src = doc.metadata.get("source")
        minimal_docs.append(
            Document(
                page_content = doc.page_content,
                metadata={"source": src}
            )
        )
    return minimal_docs

In [11]:
minimal_docs = filter_to_minimal_docs(extracted_data)

In [12]:
# Split the documents into smaller chunks
def text_split(minimal_docs):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500, #500 tokens equal to one chunk
        chunk_overlap=20, #understand the context with this overlap
    )
    texts_chunks = text_splitter.split_documents(minimal_docs)
    return texts_chunks

In [13]:
texts_chunks = text_split(minimal_docs)
print(f"Number of chunks: {len(texts_chunks)}")

Number of chunks: 5859


In [14]:
#Embedding model: HuggingFaceEmbeddings -> SentenceTransformer
from langchain_huggingface import HuggingFaceEmbeddings

def download_embeddings():

    model_name="all-MiniLM-L6-v2"
    embeddings = HuggingFaceEmbeddings(
        model_name=model_name
    )
    return embeddings

embedding = download_embeddings()


In [15]:
vector = embedding.embed_query("Hello World")


In [16]:
print("Vector length: ", len(vector))

Vector length:  384


In [17]:
from dotenv import load_dotenv #importing the load_dotenv function
import os
load_dotenv()


True

In [18]:
#Groq version API Keys
PINECONE_API_KEY= os.getenv("PINECONE_API_KEY")
GROQ_API_KEY= os.getenv("GROQ_API_KEY")


os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY #saving it as an environment variable
os.environ["GROQ_API_KEY"] = GROQ_API_KEY

In [19]:
from pinecone import Pinecone
pinecone_api_key = PINECONE_API_KEY

pc = Pinecone(api_key=pinecone_api_key)

In [27]:
from pinecone import ServerlessSpec

index_name = "medical-chatbot"

index = pc.Index(index_name)

In [28]:
from langchain_pinecone import PineconeVectorStore

docsearch = PineconeVectorStore.from_documents(
    documents=texts_chunks,
    embedding=embedding,
    index_name=index_name
)

#it will take all the text chunks and will use the embedding model
# to convert them into vector datatbase
# to store them in Pinecone vector database 

In [29]:
from langchain_pinecone import PineconeVectorStore
#Embed each chunk and upsert ther embeddings into your Pinecone index
docsearch = PineconeVectorStore.from_existing_index(
    embedding=embedding,
    index_name=index_name
)

In [30]:
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k":3})
#creting the retriever to bring the top 3 most relevant documents

In [31]:
retrieved_docs = retriever.invoke("What is Acne?")
retrieved_docs
#it brings three most relevant documents

[Document(id='3ec08768-53d3-4bea-872a-763638c50e04', metadata={'source': 'data\\Medical_book.pdf'}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 226\nAcne\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 26'),
 Document(id='532c3e93-aa45-42dd-bf8d-92cfc873577d', metadata={'source': 'data\\Medical_book.pdf'}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 226\nAcne\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 26'),
 Document(id='234e9429-c19c-4988-a280-fc2a5e2ae781', metadata={'source': 'data\\Medical_book.pdf'}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 226\nAcne\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 26')]

In [38]:
from langchain_groq import ChatGroq

chatModel = ChatGroq(model="llama-3.1-8b-instant")


In [39]:
from langchain_classic.chains import create_retrieval_chain
from langchain_classic.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

In [40]:
#prompt template
system_prompt = (
    "You are a medical assitant for question-answering tasks."
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)

#creating the prompt template and giving the system and user roles
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("user", "{input}"),
    ]

)

In [41]:
#creating a chain to combine the documents using 'stuff' method
question_answer_chain = create_stuff_documents_chain(chatModel, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)


In [42]:
#we can ask any kind of question
response = rag_chain.invoke({"input": "What is Acromegaly and gigantism?"})
print(response["answer"])

Acromegaly is a disorder caused by the abnormal release of a chemical from the pituitary gland, leading to increased growth in bone and soft tissue, and various other disturbances throughout the body. This disorder can also result in a condition known as gigantism if it occurs before growth plates close.


In [43]:
response = rag_chain.invoke({"input": "What is the treatment for Acne?"})
print(response["answer"])

The treatment for acne depends upon whether it is mild, moderate, or severe. For mild non-inflammatory acne, treatment consists of topical drugs such as tretinoin, benzoyl peroxide, adapalene, or salicylic acid to reduce comedones. Improvement is usually seen in two to four weeks.
