## RAG using a pdf book
* see: https://python.langchain.com/docs/use_cases/question_answering/
* using Cohere embeddings
* using a custom prompt

In [1]:
# for pdf post processing
import re

# modified to load from Pdf
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate

# two possible vector store
from langchain.vectorstores import Chroma
from langchain.vectorstores import FAISS

# removed OpenAI, using HF embeddings
from langchain.embeddings import HuggingFaceEmbeddings

from langchain import hub

# removed OpenAI, using OCI GenAI
from oci.config import from_file

# oci_llm is in a local file
from oci_llm import OCIGenAILLM

from langchain.schema.runnable import RunnablePassthrough

# private configs
from config_private import COMPARTMENT_OCID, COHERE_API_KEY

In [2]:
# to enable some debugging
DEBUG = False

#### Template for custom prompt

In [3]:
# this is the template for the prompt
template = """Use the following pieces of context to answer the question at the end. 
If you don't know the answer don't try to make up an answer. 
Use five sentences maximum. 
Always say "Thanks for asking!" at the end of the answer. 
{context}
Question: {question}
Helpful Answer:"""

In [4]:
# functions
def get_answer(rag_chain, question):
    response = rag_chain.invoke(question)

    print(f"Question: {question}")
    print()
    print("The response:")
    print(response)
    print()

In [5]:
# read OCI config to connect to OCI with API key
CONFIG_PROFILE = "DEFAULT"
config = from_file("~/.oci/config", CONFIG_PROFILE)

# OCI GenAI endpoint (for now Chicago)
ENDPOINT = "https://generativeai.aiservice.us-chicago-1.oci.oraclecloud.com"

# check the config to access to api keys
if DEBUG:
    print(config)

#### Loading the document

In [6]:
# BLOG_POST = "https://python.langchain.com/docs/get_started/introduction"
BOOK = "./CurrentEssentialsOfMedicine.pdf"

loader = PyPDFLoader(BOOK)

data = loader.load()

#### Splitting the document in chunks

In [7]:
CHUNK_SIZE = 2000
CHUNK_OVERLAP = 100

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP
)

splits = text_splitter.split_documents(data)

In [8]:
print(f"We have splitted the pdf in {len(splits)} splits...")

We have splitted the pdf in 686 splits...


In [9]:
# some post processing

# replace \n with blank
for split in splits:
    split.page_content = split.page_content.replace("\n", " ")
    split.page_content = re.sub("[^a-zA-Z0-9 \n\.]", " ", split.page_content)

In [10]:
# have a look at a single split
splits[20].page_content

'ReferencePoole Wilson PA  V ok  Z  Kirwan BA  de Brouwer S  Dunselman PH  Lubsen J  ACTION investigators. Clinical course of isolated stable angina due to coronaryheart disease. Eur Heart J 2007 28 1928.  PMID  17562665 '

#### Embeddings and Vectore Store

In [11]:
%%time

EMBED_MODEL_NAME = "BAAI/bge-base-en-v1.5"

model_kwargs = {"device": "cpu"}
encode_kwargs = {"normalize_embeddings": False}


hf = HuggingFaceEmbeddings(
    model_name=EMBED_MODEL_NAME, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
)

# using Chroma or FAISS as Vector store
vectorstore = Chroma.from_documents(documents=splits, embedding=hf)
# vectorstore = FAISS.from_documents(documents=splits, embedding=hf)

retriever = vectorstore.as_retriever()

CPU times: user 2min 43s, sys: 1min 19s, total: 4min 2s
Wall time: 1min 20s


#### Define the prompt structure

In [12]:
rag_prompt_custom = PromptTemplate.from_template(template)

#### Define the LLM: OCI GenAI

In [13]:
# compartment OCID from config_private.py

# using mostly defaults
llm = OCIGenAILLM(
    temperature=1.0,
    max_tokens=1500,
    config=config,
    compartment_id=COMPARTMENT_OCID,
    endpoint=ENDPOINT,
    debug=DEBUG,
)

#### Define the (Lang)Chain

In [14]:
rag_chain = (
    {"context": retriever, "question": RunnablePassthrough()} | rag_prompt_custom | llm
)

#### Process the question

In [33]:
# a list of possible questions
QUESTION1 = "What are the suggested treatments for Botulism? Make a list"
QUESTION2 = "List diagnosis for Botulism. Make a list"
QUESTION3 = "List the antibiotics commonly used for Tubercolosis. Make a list"
QUESTION4 = "List the suggested treatments for Sleep Apnea. Make a list."

In [36]:
%%time

# the question
get_answer(rag_chain, question=QUESTION1)

Question: What are the suggested treatments for Botulism? Make a list

The response:
 Here is a list of suggested treatments for Botulism:

1. Removal of unabsorbed toxin from the gut
2. Specific antitoxin
3. Vigilant support, including attention to respiratory function
4. Penicillin for wound botulism
5. Active immunization preventive
6. Passive immunization with tetanus immune globulin and concurrent active immunization for all suspected cases
7. Chlorpromazine or diazepam for spasms or convulsions
8. Vigorous supportive care with particular attention to the airway and laryngospasm
9. Metronidazole
10. Tetanus immune globulin

CPU times: user 68 ms, sys: 35 ms, total: 103 ms
Wall time: 4.49 s


In [32]:
%%time

# the question
get_answer(rag_chain, question=QUESTION2)

Question: List diagnosis for Botulism. Make a list

The response:
 The diagnosis of botulism often requires both clinical and laboratory testing. Here are some common ways to diagnose botulism:

1. History of clinical symptoms: The classic clinical manifestation of botulism consists of symmetric descending paralysis, which may involve the respiratory muscles. Other symptoms may include difficulty speaking or swallowing, dry mouth, blurred vision, and muscle weakness.

2. Toxin detection: Botulinum toxin can be detected in the blood, urine, or stool of a person with botulism. Laboratories can perform specific tests to identify the presence of the toxin.

3.Immunological tests: Your healthcare provider may check your immune system for signs of a botulism infection. This can involve measuring the levels of antibodies in your blood or urine.

4. Imaging tests: Sometimes, imaging tests such as a CT scan or MRI may be used to diagnose botulism, particularly in cases where the disease affects

In [27]:
%%time

# the question
get_answer(rag_chain, question=QUESTION3)

Question: List the antibiotics commonly used for Tubercolosis. Make a list

The response:
 Thanks for asking! Here are some of the most common antibiotics used to treat tuberculosis:
- Isoniazid
- Ethambutol
- Pyrazinamide
- Streptomycin
- Rifampin
- fluoroquinolones

These antibiotics are typically used in combination to treat tuberculosis. It's important to note that some of these antibiotics may cause side effects, such as nausea, vomiting, and liver damage. It's also important to follow the dosage and duration instructions provided by your doctor to ensure the best possible outcome.
If you have any other questions about tuberculosis or its treatment, please don't hesitate to ask.

CPU times: user 64.7 ms, sys: 35.4 ms, total: 100 ms
Wall time: 5.11 s


In [28]:
%%time

# the question
get_answer(rag_chain, question=QUESTION4)

Question: List the suggested treatments for Sleep Apnea. Make a list.

The response:
 The suggested treatments for Sleep Apnea are:
- Weight loss
- Avoid hypnotic medications
- Nocturnal continuous positive airway pressure (CPAP)
- Supplemental oxygen
- Oral appliances
- Modafinil
- Uvulopalatopharyngoplasty (UPPP)
- Nasal septoplasty
- Tracheostomy

CPU times: user 66.2 ms, sys: 36.1 ms, total: 102 ms
Wall time: 2.55 s


#### Explore the vectore store

In [21]:
# Retrieve relevant splits for any question using similarity search.

# This is simply "top K" retrieval where we select documents based on embedding similarity to the query.

TOP_K = 5

docs = vectorstore.similarity_search(QUESTION3, k=TOP_K)

len(docs)

5

In [22]:
for i, doc in enumerate(docs):
    print(f"chunk n. {i+1}")
    print(doc.page_content)
    print()

chunk n. 1
60 Current Essentials of Medicine 2Pulmonary Tuberculosis  Essentials of Diagnosis  Lassitude  weight loss  fever  cough  night sweats  hemoptysis  Cachexia in many  posttussive apical rales occasionally present  Apical or subapical in ltrates with cavities classic in reactivationtuberculosis  pleural effusion in primary tuberculosis  likewisemid lung in ltration  but any radiographic abnormality possible  Positive skin test to intradermal puri ed protein derivative  PPD    Interferon gamma release assays have good speci city for latenttuberculosis  Mycobacterium tuberculosis by culture of sputum  gastric wash  ing  or pleural biopsy  pleural  uid culture usually sterile  Nucleic acid ampli cation can rapidly distinguish between M. tuberculosis and nontuberculous mycobacterium to guide treat  ment decisions but culture still needed for susceptibility testing  Increasingly encountered antibiotic resistant strains  Granuloma on pleural biopsy in patients with effusions  mesoth