In [2]:
import os
# Move to the project root so relative paths such as 'data/' resolve.
# Guarded so that re-running this cell does not climb one directory higher
# each time: once the 'data' folder is visible we are already at the root.
if not os.path.isdir("data"):
    os.chdir("../")

In [3]:
%pwd

'c:\\Users\\malah\\Medical_chatbot_openAI'

In [6]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter


In [7]:
# Load all PDF files found in a directory
def load_pdf_file(data, glob="*.pdf"):
    """Load PDF files from a directory into LangChain documents.

    Args:
        data: Path of the directory to scan.
        glob: Filename pattern selecting which files to load. Defaults to
            "*.pdf", the pattern that used to be hard-coded here.

    Returns:
        The result of ``DirectoryLoader.load()`` for the matched files,
        each of which is read with ``PyPDFLoader``.
    """
    loader = DirectoryLoader(data, glob=glob, loader_cls=PyPDFLoader)
    return loader.load()

In [8]:
# Read every PDF under the project's data/ folder.
extracted_data = load_pdf_file("data/")

In [10]:
#extracted_data

In [11]:
#split the data into chunks
def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into smaller, overlapping chunks.

    Args:
        extracted_data: Documents to split (as returned by ``load_pdf_file``).
        chunk_size: Value passed to the splitter as ``chunk_size``.
            Defaults to 500, the value that used to be hard-coded.
        chunk_overlap: Value passed to the splitter as ``chunk_overlap``.
            Defaults to 20, the value that used to be hard-coded.

    Returns:
        The chunks produced by
        ``RecursiveCharacterTextSplitter.split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)

In [12]:
# Chunk the loaded documents and report how many chunks were produced.
text_chunks = text_split(extracted_data)
chunk_count = len(text_chunks)
print("length od the chunks", chunk_count)

length od the chunks 7020


In [27]:
from langchain.embeddings import HuggingFaceEmbeddings

In [28]:
# Load the embedding model used to convert the chunks to embeddings


def download_hugging_face_embeddings(model_name='sentence-transformers/all-MiniLM-L6-v2'):
    """Create the Hugging Face embedding model used by the pipeline.

    Args:
        model_name: Name of the model handed to ``HuggingFaceEmbeddings``.
            Defaults to 'sentence-transformers/all-MiniLM-L6-v2', the model
            that used to be hard-coded here.

    Returns:
        The constructed ``HuggingFaceEmbeddings`` instance.
    """
    return HuggingFaceEmbeddings(model_name=model_name)

In [29]:
# Build the embedding model once; the same object is reused below for the
# embed_query sanity check and for both PineconeVectorStore constructors.
embeddings = download_hugging_face_embeddings()

  embeddings=HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')


In [30]:
# Sanity check: embed a short string and report the size of the vector.
query_result = embeddings.embed_query("hello world")
embedding_dim = len(query_result)
print("length", embedding_dim)

length 384


In [50]:
# Load the Pinecone and OpenAI API keys from the environment (.env file)
from dotenv import load_dotenv
load_dotenv()  # make variables from the .env file available via os.environ

# Either value is None when the corresponding variable is not defined.
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

In [32]:
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
import os

pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = "medibot"

# Creating an index that already exists raises an error, so only create it
# when it is missing. This keeps the cell safe to re-run.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        # Must equal the embedding size: the all-MiniLM-L6-v2 vectors
        # produced above have length 384.
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1",
        ),
    )

In [51]:
import os
# Export the keys for libraries that read them from the environment.
# os.environ only accepts strings, so a missing key (None) would fail with
# an opaque TypeError; check first and name the variable that is missing.
for _key_name, _key_value in (
    ("PINECONE_API_KEY", PINECONE_API_KEY),
    ("OPENAI_API_KEY", OPENAI_API_KEY),
):
    if not _key_value:
        raise RuntimeError(
            f"{_key_name} is not set; add it to your .env file or environment"
        )
    os.environ[_key_name] = _key_value

In [35]:
from langchain_pinecone import PineconeVectorStore

# Build the vector store from the chunks, using the embedding model and
# the index named above.
docsearch = PineconeVectorStore.from_documents(
    text_chunks,
    embedding=embeddings,
    index_name=index_name,
)

In [36]:
# Load the existing index instead of building it from the documents again
from langchain_pinecone import PineconeVectorStore
# Attach to the index by name; queries are embedded with the same model.
docsearch = PineconeVectorStore.from_existing_index(
    embedding=embeddings,
    index_name=index_name,
)

In [37]:
docsearch

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x2440f6bfa10>

In [38]:
# Retrieve the 3 most similar chunks per query. The keyword must be spelled
# `search_kwargs`; the earlier `search_kwars` is not a recognised option, so
# the k=3 setting was not applied as intended.
retriever = docsearch.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)
retriever_docs = retriever.invoke("what is Acne?")

In [39]:
retriever_docs

[Document(id='be42b524-961f-464c-a3ab-1fdaa0fc1754', metadata={'page': 37.0, 'source': 'data\\medical_book.pdf'}, page_content='Acidosis seeRespiratory acidosis; Renal\ntubular acidosis; Metabolic acidosis\nAcne\nDefinition\nAcne is a common skin disease characterized by\npimples on the face, chest, and back. It occurs when thepores of the skin become clogged with oil, dead skincells, and bacteria.\nDescription\nAcne vulgaris, the medical term for common acne, is'),
 Document(id='92575203-6941-4b10-8fcb-06f56c11a10a', metadata={'page': 38.0, 'source': 'data\\medical_book.pdf'}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 2 25Acne\nAcne vulgaris affecting a woman’s face. Acne is the general\nname given to a skin disorder in which the sebaceousglands become inflamed. (Photograph by Biophoto Associ-\nates, Photo Researchers, Inc. Reproduced by permission.)GEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 25'),
 Document(id='035314a7-5a92-491d-a967-6c4428563f2f', metadata={'page': 239.0, 'sou

In [52]:
from langchain_openai import OpenAI
# Language model that writes the final answer from the retrieved context.
llm = OpenAI(
    max_tokens=500,
    temperature=0.4,
)

In [53]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate


# System instructions; "{context}" is the placeholder that receives the
# retrieved documents when the chain runs.
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)

# Two-message chat template: the system instructions, then the user's
# question supplied under the "input" key.
prompt_messages = [
    ("system", system_prompt),
    ("human", "{input}"),
]
prompt = ChatPromptTemplate.from_messages(prompt_messages)

In [54]:
# Step 1: chain that places the retrieved documents into the prompt and
# calls the LLM.
question_answer_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)

# Step 2: put the retriever in front of it to form the full RAG pipeline.
rag_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=question_answer_chain,
)

In [55]:
# Ask a question the indexed PDFs should be able to answer.
question = "what is Acromegaly and gigantism?"
response = rag_chain.invoke({"input": question})
print(response["answer"])




Acromegaly and gigantism are medical conditions caused by an excess of growth hormone in the body. Acromegaly occurs in adults and causes enlargement of certain body parts, while gigantism occurs in children and causes excessive growth in height. These conditions are caused by abnormal changes in the body's chemical processes and can result in unusual physical characteristics.


In [56]:
# Ask a second question and print the chain's answer.
question = "What is stats?"
response = rag_chain.invoke({"input": question})
print(response["answer"])



I don't know the answer to that question.
