In [18]:
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.llms import CTransformers


In [8]:
#Extract data from the PDF
def load_pdf(data):
    loader = DirectoryLoader(data,
                    glob="*.pdf",
                    loader_cls=PyPDFLoader)
    
    documents = loader.load()

    return documents

In [None]:
extracted_data = load_pdf("../data/")

In [6]:
extracted_data[0]

Document(page_content='', metadata={'source': '..\\data\\Medical_book.pdf', 'page': 0})

In [11]:
#Create text chunks
def text_split(extracted_data):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size = 500, chunk_overlap = 20)
    text_chunks = text_splitter.split_documents(extracted_data)

    return text_chunks

In [12]:
text_chunks = text_split(extracted_data)
print("length of my chunk:", len(text_chunks))

length of my chunk: 7020


In [14]:
len(text_chunks)

7020

In [7]:
#download embedding model
def download_hugging_face_embeddings():
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    return embeddings

In [9]:
from langchain.embeddings import HuggingFaceEmbeddings

embeddings = download_hugging_face_embeddings()



In [27]:
embeddings

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [28]:
query_result = embeddings.embed_query("Hello world")
print("Length", len(query_result))

Length 384


In [19]:
#Initializing the Pinecone
from sentence_transformers import SentenceTransformer
import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', device=device)
model



SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

In [20]:
text_chunks[0].page_content

'TheGALE\nENCYCLOPEDIA\nofMEDICINE\nSECOND EDITION'

In [21]:
#Creating Embeddings for Each of The Text Chunks & storing
vectors = []
for i, embedding in enumerate(text_chunks):    
    vectors.append({
        "id": f"doc_{i}",
        "values": model.encode(embedding.page_content),
        "metadata": {"text": embedding.page_content}
    })

In [3]:
from pinecone import Pinecone

PINECONE_API_KEY = "f67033c1-66b9-44e0-bb81-edf716168e9e"

pc = Pinecone(api_key=PINECONE_API_KEY)

  from tqdm.autonotebook import tqdm


In [15]:
from langchain.vectorstores import Pinecone

index_name="medical-chatbot"

# connect to index
index = pc.Index(index_name)

text_field = "text"

vectorstore = Pinecone(
    index=index, 
    embedding=embeddings, 
    text_key=text_field)

In [16]:
vectorstore.similarity_search(
    query="What is Rhintis?",  # our search query
    k=3  # return 3 most relevant docs
)

[Document(page_content='GALE ENCYCLOPEDIA OF MEDICINE 2 111Allergic rhinitisGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 111'),
 Document(page_content='GALE ENCYCLOPEDIA OF MEDICINE 2 113Allergic rhinitisGEM - 0001 to 0432 - A  10/22/03 1:42 PM  Page 113'),
 Document(page_content='sensitization to Rh disease.')]

In [None]:
len(vectors)
#index.upsert([{"id":"vec1", "values":vectors[0]['values'], "metadata": vectors[0]['metadata']}])
#vectors[0]['values']

In [74]:
# Upsert documents into the Pinecone index
for vec in vectors:
    index.upsert([vec])

In [20]:
query = "What are Allergies"

# create the query vector
xq = model.encode(query).tolist()

# now query
xc = index.query(vector=xq, top_k=5, include_metadata=True)
for result in xc['matches']:
    print(f"{round(result['score'], 2)}: {result['metadata']['text']}")

0.81: Description
Allergies are among the most common of medical
0.77: An allergy is a type of immune reaction. Normally,
0.75: Allergen —A substance that provokes an allergic
response.
Allergic rhinitis —Inflammation of the mucous
membranes of the nose and eyes in response to anallergen.
Anaphylaxis —Increased sensitivity caused by previ-
0.73: known as allergy, and the offending substanceis called an allergen.
0.72: KEY TERMS
Allergen —A substance that provokes an allergic
response.
Anaphylaxis —Increased sensitivity caused by pre-


In [21]:
prompt_template="""
Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context: {context}
Question: {question}

Only return the helpful answer below and nothing else.
Helpful answer:
"""

In [22]:
PROMPT=PromptTemplate(template=prompt_template, input_variables=["context", "question"])
chain_type_kwargs={"prompt": PROMPT}

In [23]:
llm=CTransformers(model="../model/llama-2-7b-chat.ggmlv3.q8_0.bin",
                  model_type="llama",
                  config={'max_new_tokens':512,
                          'temperature':0.8})

In [24]:
qa=RetrievalQA.from_chain_type(
    llm=llm, 
    chain_type="stuff", 
    retriever=vectorstore.as_retriever(search_kwargs={'k': 2}),
    return_source_documents=True, 
    chain_type_kwargs=chain_type_kwargs)

In [25]:
response = qa({"query": "what are Allergies?"})

  warn_deprecated(


In [28]:
print(response['result'])

Allergies are an abnormal response of the immune system to a typically harmless substance, such as pollen, dust mites, or certain foods. When an allergen enters the body, the immune system mistakenly identifies it as harmful and tries to fight it off, leading to uncomfortable symptoms like sneezing, congestion, runny nose, itchy eyes, and skin rashes.


In [29]:
while True:
    user_input=input(f"Input Prompt:")
    result=qa({"query": user_input})
    print("Response : ", result["result"])

Response :  There is no known "cure" for allergies, but there are various treatments available to help manage symptoms and prevent reactions. These may include medications such as antihistamines, decongestants, and corticosteroids, as well as immunotherapy (allergy shots) and sublingual immunotherapy (SLIT). It is important to work with a healthcare provider to determine the best treatment plan for each individual case.
Response :  Allergies are abnormal reactions of the immune system to substances that normally do not harm a person. These substances, called allergens, can cause an allergic reaction when they enter the body. Allergies can affect different parts of the body, including the nose, eyes, skin, and digestive system. Common allergy symptoms include sneezing, runny nose, itchy eyes, hives, and stomach problems. There are several types of allergies, including seasonal allergies (such as hay fever), food allergies, and insect stings. Allergies can be diagnosed by a doctor throug