In [8]:
# Show the kernel's current working directory before changing it.
%pwd


'c:\\Users\\mssah\\OneDrive\\Desktop\\Medical-chatbot-Generative-AI\\research'

In [9]:
import os

# The notebook lives in research/, while the relative paths used later
# (e.g. "Data/") are resolved from the project root. Only step up when the
# kernel is still inside research/, so re-running this cell never climbs
# past the project root.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [10]:
# Confirm the working directory after the chdir above.
%pwd

'c:\\Users\\mssah\\OneDrive\\Desktop\\Medical-chatbot-Generative-AI'

In [11]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [12]:
# Read the PDF files found in a directory

def load_pdf_file(data):
    """Load the PDF files located directly under ``data``.

    Args:
        data: Path of the directory handed to ``DirectoryLoader``; files are
            selected with the ``"*.pdf"`` glob and parsed with ``PyPDFLoader``.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns for that directory.
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [13]:
# Load the PDFs from the Data/ folder (path is relative to the working directory).
extracted_data = load_pdf_file("Data/")

In [15]:
#extracted_data

In [16]:
# Split data into text chunks
def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into smaller chunks.

    Args:
        extracted_data: Documents to split, as accepted by
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: ``chunk_size`` passed to the splitter. Defaults to 500,
            the value this notebook has always used.
        chunk_overlap: ``chunk_overlap`` passed to the splitter. Defaults to 20.

    Returns:
        The chunks produced by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)

In [17]:
# Chunk the loaded documents and report how many chunks were produced.
text_chunks = text_split(extracted_data)
print(f"Length of Text Chunks: {len(text_chunks)}")

Length of Text Chunks: 5859


In [19]:
#text_chunks

In [29]:
from langchain.embeddings import HuggingFaceEmbeddings

In [30]:
# Build the HuggingFace embedding model used throughout the notebook
def download_hugging_face_embeddings():
    """Return a ``HuggingFaceEmbeddings`` instance for all-MiniLM-L6-v2.

    Returns:
        The embeddings object constructed with
        ``model_name='sentence-transformers/all-MiniLM-L6-v2'``.
    """
    return HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

In [31]:
# Create the embedding model once; later cells reuse this object.
embeddings = download_hugging_face_embeddings()

In [32]:
# Sanity check: embed a short string and print the length of the resulting vector.
query_result = embeddings.embed_query("hello world")
print(f"length {len(query_result)}")

length 384


In [33]:
#query_result

In [48]:
from dotenv import load_dotenv
# Load variables from the .env file into the environment; the return value is shown as the cell output.
load_dotenv()

True

In [49]:
# Read both API keys from the environment; each is None when the variable is absent.
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

In [36]:
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
import os

pc = Pinecone(api_key = PINECONE_API_KEY)

index_name = "medicalbot"

# Creating an index that already exists is rejected by Pinecone, so only
# create it when it is missing. This keeps the cell safe to re-run.
if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        dimension=384,  # must equal the embedding vector length (384 for the model used here)
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1",
        ),
    )

# Show the index description as the cell output, whether or not it was just created.
pc.describe_index(index_name)

{
    "name": "medicalbot",
    "metric": "cosine",
    "host": "medicalbot-aus3ome.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 384,
    "deletion_protection": "disabled",
    "tags": null
}

In [50]:
import os
# Re-export the keys through the environment (the vector-store and LLM cells
# below are not given keys explicitly). os.environ only accepts strings, so a
# key that was never loaded is reported by name instead of surfacing as an
# opaque TypeError.
for _key_name, _key_value in (
    ("PINECONE_API_KEY", PINECONE_API_KEY),
    ("OPENAI_API_KEY", OPENAI_API_KEY),
):
    if _key_value is None:
        raise RuntimeError(
            f"{_key_name} is not set; define it in your .env file or environment."
        )
    os.environ[_key_name] = _key_value

In [38]:
# Embed each chunk and upsert the embeddings into your Pinecone index.

from langchain_pinecone import PineconeVectorStore
# Populate the index from the text chunks using the embedding model.
# NOTE(review): re-running this cell presumably writes the same chunks again
# under new ids — confirm before re-executing against a populated index.
docsearch = PineconeVectorStore.from_documents(
    documents=text_chunks, embedding=embeddings, index_name=index_name
)

In [39]:
# Load existing Index

from langchain_pinecone import PineconeVectorStore
# Attach to the already-populated Pinecone index; nothing is upserted in this cell
docsearch = PineconeVectorStore.from_existing_index(embedding=embeddings, index_name=index_name)

In [40]:
# Display the vector store object to confirm it was created.
docsearch

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x17be385a330>

In [41]:
# Wrap the vector store as a retriever: similarity search with k=3.
retriever = docsearch.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)

In [42]:
# Smoke-test the retriever on its own with a sample question.
retrieved_docs = retriever.invoke("what is acne")

In [43]:
# Inspect the retrieved documents (metadata and page content).
retrieved_docs

[Document(id='69205d39-b6b6-4ff0-94db-d85e3828fcb3', metadata={'creationdate': '2004-12-18T17:00:02-05:00', 'creator': 'PyPDF', 'moddate': '2004-12-18T16:15:31-06:00', 'page': 39.0, 'page_label': '40', 'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'source': 'Data\\Medicine_book.pdf', 'total_pages': 637.0}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 226\nAcne\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 26'),
 Document(id='e4171380-3232-492a-9a6c-943f7cdd1164', metadata={'creationdate': '2004-12-18T17:00:02-05:00', 'creator': 'PyPDF', 'moddate': '2004-12-18T16:15:31-06:00', 'page': 37.0, 'page_label': '38', 'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'source': 'Data\\Medicine_book.pdf', 'total_pages': 637.0}, page_content='Acidosis see Respiratory acidosis; Renal\ntubular acidosis; Metabolic acidosis\nAcne\nDefinition\nAcne is a common skin disease characterized by\npimples on the face, chest, and back. It occurs when the\npores of the skin become clogged with oil, dead skin\ncells, and bacte

In [51]:
from langchain_openai import OpenAI
# LLM used to write the answers; no API key is passed here, so it is
# presumably picked up from the OPENAI_API_KEY environment variable.
llm = OpenAI(
    temperature=0.4,
    max_tokens=500,
)


In [52]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# Adjacent string literals are concatenated with nothing in between, so every
# fragment that continues a sentence must end with an explicit space.
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)

# Two-message chat prompt: the system message carries the instructions and the
# {context} placeholder, the human message carries the question via {input}.
prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "{input}"),
])

In [53]:
# First build the chain that feeds the documents and prompt to the LLM...
question_answer_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)
# ...then put the retriever in front of it to form the full RAG chain.
rag_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=question_answer_chain,
)

In [55]:
# Run the full chain on a sample question and print only the generated answer.
response = rag_chain.invoke(
    {"input": "what is acromegaly and gigantism?"}
)
print(response["answer"])



Acromegaly and gigantism are disorders caused by the abnormal release of a chemical from the pituitary gland in the brain, leading to increased growth in bones and soft tissue. This can result in a variety of disturbances throughout the body, including unusual height. Acromegaly is when this abnormality occurs after bone growth has stopped, while gigantism occurs before bone growth stops. It is a rare disorder that is often not diagnosed until middle age due to the gradual onset of symptoms.
