In [1]:
# Show the kernel's current working directory before it is changed.
%pwd

'd:\\Medical_ChaTBoT\\research'

In [2]:
import os

# Step up one level so that relative paths used later in the notebook
# (e.g. "Data/") resolve. The move is guarded: if a "Data" directory is
# already visible from the current working directory, the cell has been run
# before (or the kernel was started at the right place) and climbing another
# level would break every relative path that follows.
if not os.path.isdir("Data"):
    os.chdir("../")


In [3]:
# Show the working directory again to confirm the directory change took effect.
%pwd

'd:\\Medical_ChaTBoT'

In [4]:
from langchain.document_loaders import PyPDFLoader,DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [5]:
def pdf_loader(data_dir):
    """Load the PDF files found under ``data_dir``.

    A ``DirectoryLoader`` is pointed at ``data_dir`` with the glob ``*.pdf``
    and ``PyPDFLoader`` as the per-file loader class.

    Args:
        data_dir: Directory handed to ``DirectoryLoader``.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns for that directory.
    """
    return DirectoryLoader(
        data_dir,
        glob="*.pdf",
        loader_cls=PyPDFLoader,
    ).load()

In [6]:
# Load the source PDFs from the project's Data/ folder.
extracted_data = pdf_loader(data_dir="Data/")

In [7]:
def text_split(extreacted_text, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into overlapping chunks.

    Args:
        extreacted_text: Documents to split; passed unchanged to
            ``RecursiveCharacterTextSplitter.split_documents``. The
            parameter name is kept as-is so keyword callers keep working.
        chunk_size: ``chunk_size`` given to the splitter. Defaults to 500,
            the value this notebook originally hard-coded.
        chunk_overlap: ``chunk_overlap`` given to the splitter. Defaults to
            20, the value this notebook originally hard-coded.

    Returns:
        The chunks produced by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extreacted_text)

In [8]:

# Split the loaded documents into chunks and report how many were produced.
text_chunks=text_split(extracted_data)
print("Length of Text Chunks", len(text_chunks))

Length of Text Chunks 5860


In [9]:

from langchain.embeddings import HuggingFaceEmbeddings

In [10]:
# Build the Hugging Face embedding model used for indexing and retrieval
def download_hugging_face_embeddings(model_name='sentence-transformers/all-MiniLM-L6-v2'):
    """Return a ``HuggingFaceEmbeddings`` instance for ``model_name``.

    Args:
        model_name: Model identifier passed to ``HuggingFaceEmbeddings``.
            Defaults to the model this notebook was written for.
            NOTE(review): the Pinecone index in this notebook is created
            with ``dimension=384``; if a different model is selected, that
            dimension has to be changed to match the new model's output.

    Returns:
        The constructed ``HuggingFaceEmbeddings`` object.
    """
    return HuggingFaceEmbeddings(model_name=model_name)

In [12]:

# Instantiate the embedding model once; it is reused for indexing and for retrieval below.
embeddings = download_hugging_face_embeddings()

In [14]:
from dotenv import load_dotenv
# Populate the process environment (presumably from the project's .env file —
# confirm) so the API keys read in the next cell are available.
load_dotenv()

True

In [15]:

# Read the service credentials from the environment; each is None when unset.
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

In [18]:
from pinecone import Pinecone, ServerlessSpec

# Pinecone client, authenticated with the API key read from the environment above.
pc = Pinecone(api_key=PINECONE_API_KEY)

In [22]:
index_name = "medibot"

# Create the serverless index only when it does not exist yet, so this cell
# can be re-run (e.g. after a kernel restart) without failing on an index
# that is already there.
if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        dimension=384,  # must equal the output size of the embedding model in use
        metric="cosine",  # similarity metric the index is built with
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1",
        ),
    )

# Show the index description whether it was just created or already existed.
pc.describe_index(index_name)

{
    "name": "medibot",
    "metric": "cosine",
    "host": "medibot-6wptku0.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 384,
    "deletion_protection": "disabled",
    "tags": null
}

In [23]:
# Write the keys back into the process environment; per the original note,
# the vector-store step looks them up there. os.environ only accepts strings,
# so a missing key (None) is reported with an explicit message instead of the
# opaque TypeError the bare assignment would raise.
for _key_name, _key_value in (
    ("PINECONE_API_KEY", PINECONE_API_KEY),
    ("OPENAI_API_KEY", OPENAI_API_KEY),
):
    if _key_value is None:
        raise RuntimeError(
            f"{_key_name} is not set; define it in the environment or the .env file"
        )
    os.environ[_key_name] = _key_value

In [24]:
# Embed each chunk and insert the embeddings into the Pinecone index.
from langchain_pinecone import PineconeVectorStore

# from_documents is called without explicit ids, so running it a second time
# is expected to store every chunk again as new records (and it re-embeds the
# whole corpus). Ingest only while the index is still empty; otherwise just
# attach to what is already stored. To force a fresh ingest, empty or delete
# the index first.
# NOTE(review): index stats may lag briefly behind recent writes — confirm
# this is acceptable for your workflow.
_index_stats = pc.Index(index_name).describe_index_stats()
if _index_stats.total_vector_count == 0:
    docsearch = PineconeVectorStore.from_documents(
        documents=text_chunks,
        index_name=index_name,
        embedding=embeddings,
    )
else:
    docsearch = PineconeVectorStore.from_existing_index(
        index_name=index_name,
        embedding=embeddings,
    )

In [25]:
# Attach to the already-populated index (no documents are passed here);
# this is the vector store the retriever below is built from.
doc_search = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embeddings
)

In [26]:
# Display the vector-store object as a quick check that it was created.
doc_search

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x28f84b56680>

In [27]:
# Similarity-search retriever over the vector store, configured with k=3.
retriver = doc_search.as_retriever(
    search_type="similarity",
    search_kwargs=dict(k=3),
)

In [30]:
# Run the retriever on its own with a sample question, before any LLM is involved.
retrived_docs = retriver.invoke("What is Acne?")

In [31]:
# Inspect the retrieved documents (ids, metadata and page content).
retrived_docs

[Document(id='1051c134-2678-4a3f-83b5-b46eee0d0205', metadata={'creationdate': '2004-12-18T17:00:02-05:00', 'creator': 'PyPDF', 'moddate': '2004-12-18T16:15:31-06:00', 'page': 39.0, 'page_label': '40', 'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'source': 'Data\\Medical_book.pdf', 'total_pages': 637.0}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 226\nAcne\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 26'),
 Document(id='56918c52-328c-47ae-b89f-1c1b9065fe17', metadata={'creationdate': '2004-12-18T17:00:02-05:00', 'creator': 'PyPDF', 'moddate': '2004-12-18T16:15:31-06:00', 'page': 38.0, 'page_label': '39', 'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'source': 'Data\\Medical_book.pdf', 'total_pages': 637.0}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 2 25\nAcne\nAcne vulgaris affecting a woman’s face. Acne is the general\nname given to a skin disorder in which the sebaceous\nglands become inflamed.(Photograph by Biophoto Associ-\nates, Photo Researchers, Inc. Reproduced by permission.)\nGEM -

In [36]:
from langchain_openai import ChatOpenAI
# Chat model that the question-answering chain below is built on.
model = ChatOpenAI(model='gpt-4')



In [34]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# System instructions for the answering model, one sentence per entry.
_instructions = " ".join((
    "You are an assistant for question-answering tasks.",
    "Use the following pieces of retrieved context to answer the question.",
    "If you don't know the answer, say that you don't know.",
    "Use three sentences maximum and keep the answer concise.",
))

# The "{context}" placeholder follows the instructions after a blank line.
sys_promp = _instructions + "\n\n" + "{context}"


# Chat prompt: the system instructions first, then the user's message ("{input}").
_messages = [
    ("system", sys_promp),
    ("human", "{input}"),
]
prompt = ChatPromptTemplate.from_messages(_messages)

In [37]:
# Combine the chat model and the prompt into a documents chain ...
qna_chain = create_stuff_documents_chain(llm=model, prompt=prompt)
# ... then pair that chain with the retriever to form the full RAG chain.
rag_chain = create_retrieval_chain(
    retriever=retriver,
    combine_docs_chain=qna_chain,
)

In [38]:
# Ask a question the indexed book covers and print only the answer text.
response = rag_chain.invoke(dict(input="What is Acne ?"))
print(response["answer"])

Acne is a common skin disease characterized by pimples on the face, chest, and back. It occurs when the skin's pores become clogged with oil, dead skin cells, and bacteria. The medical term for common acne is Acne vulgaris and it is the most widespread skin disease in the United States.


In [39]:
# Ask a question outside the indexed material to see how the chain responds.
response = rag_chain.invoke(dict(input="What is Machine learning ?"))

In [40]:
# Print only the answer text from the most recent chain response.
print(response["answer"])

The provided context does not contain any information on machine learning.
