## RAG using a pdf book
* see: https://python.langchain.com/docs/use_cases/question_answering/
* using HF embeddings
* using a custom prompt
* Using a Medicine book as knowledge base

In [1]:
# for pdf post processing
import re

# modified to load from Pdf
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate

# two possible vector store
from langchain.vectorstores import Chroma
from langchain.vectorstores import FAISS

# removed OpenAI, using HF embeddings
from langchain.embeddings import HuggingFaceEmbeddings

from langchain import hub

# removed OpenAI, using OCI GenAI
from oci.config import from_file

# oci_llm is in a local file
from oci_llm import OCIGenAILLM

from langchain.schema.runnable import RunnablePassthrough

# private configs
from config_private import COMPARTMENT_OCID

In [2]:
# to enable some debugging
DEBUG = False

#### Template for custom prompt

In [3]:
# this is the template for the prompt
template = """Use the following pieces of context to answer the question at the end. 
If you don't know the answer don't try to make up an answer. 
Use five sentences maximum. 
Always say "Thanks for asking!" at the end of the answer. 
{context}
Question: {question}
Helpful Answer:"""

In [4]:
# functions
def get_answer(rag_chain, question):
    response = rag_chain.invoke(question)

    print(f"Question: {question}")
    print()
    print("The response:")
    print(response)
    print()

In [5]:
# read OCI config to connect to OCI with API key
CONFIG_PROFILE = "DEFAULT"
config = from_file("~/.oci/config", CONFIG_PROFILE)

# OCI GenAI endpoint (for now Chicago)
ENDPOINT = "https://generativeai.aiservice.us-chicago-1.oci.oraclecloud.com"

# check the config to access to api keys
if DEBUG:
    print(config)

#### Loading the document

In [11]:
# BLOG_POST = "https://python.langchain.com/docs/get_started/introduction"
BOOK = "./CurrentEssentialsOfMedicine.pdf"

loader = PyPDFLoader(BOOK)

data = loader.load()

#### Splitting the document in chunks

In [12]:
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 100

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP
)

splits = text_splitter.split_documents(data)

In [13]:
print(f"We have splitted the pdf in {len(splits)} splits...")

We have splitted the pdf in 1320 splits...


In [14]:
# some post processing

# replace \n with blank
for split in splits:
    split.page_content = split.page_content.replace("\n", " ")
    split.page_content = re.sub("[^a-zA-Z0-9 \n\.]", " ", split.page_content)
    # remove duplicate blank
    split.page_content = " ".join(split.page_content.split())

In [15]:
# have a look at a single split
splits[20].page_content

'Preface The fourth edition of Current Essentials of Medicine originally titled Essentials of Diagnosis Treatment continues a feature introduced inthe second edition a Clinical Pearl for each diagnosis. Pearls are time less. Learners at every level and in many countries remember them ascrucial adjuncts to more detailed information about disorders of everytype. Ideally a Pearl is succinct witty and often colloquial it is statedwith a certitude suggesting 100 accuracy. Of course nothing in med icine is so yet a Pearl such as If you diagnose multiple sclerosis overthe age of fty diagnose something else is easily committed to memory.Thus Pearls should be accepted as offered. Many have been changedsince the previous editions and we urge readers to come up with Pearlsof their own which may prove to be more useful than our own. The fourth edition like its predecessors uses a single page to con'

#### Embeddings and Vectore Store

In [16]:
%%time

EMBED_MODEL_NAME = "BAAI/bge-base-en-v1.5"

model_kwargs = {"device": "cpu"}
encode_kwargs = {"normalize_embeddings": False}


hf = HuggingFaceEmbeddings(
    model_name=EMBED_MODEL_NAME, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
)

# using Chroma or FAISS as Vector store
vectorstore = Chroma.from_documents(documents=splits, embedding=hf)
# vectorstore = FAISS.from_documents(documents=splits, embedding=hf)

# increased num. of docs to 8 (default to 4)
retriever = vectorstore.as_retriever(search_kwargs={"k":8})

CPU times: user 2min 45s, sys: 48.8 s, total: 3min 34s
Wall time: 1min 17s


#### Define the prompt structure

In [17]:
rag_prompt_custom = PromptTemplate.from_template(template)

#### Define the LLM: OCI GenAI

In [18]:
# compartment OCID from config_private.py

# using mostly defaults
llm = OCIGenAILLM(
    temperature=1.0,
    max_tokens=1500,
    config=config,
    compartment_id=COMPARTMENT_OCID,
    endpoint=ENDPOINT,
    debug=DEBUG,
)

#### Define the (Lang)Chain

In [19]:
rag_chain = (
    {"context": retriever, "question": RunnablePassthrough()} | rag_prompt_custom | llm
)

#### Process the question

In [20]:
# a list of possible questions
QUESTION1 = "What are the suggested treatments for Botulism? Make a list"
QUESTION2 = "List diagnosis for Botulism. Make a list"
QUESTION3 = "List the antibiotics commonly used for Tubercolosis. Make a list"
QUESTION4 = "List the suggested treatments for Sleep Apnea. Make a list."

In [22]:
%%time

# the question
get_answer(rag_chain, question=QUESTION1)

Question: What are the suggested treatments for Botulism? Make a list

The response:
 1. Speci c antitoxin
2. Vigilant support including attention to respiratory function
3. Penicillin for wound botulism
4. Regional outbreaks among intravenous drug users suggest black tar heroin is being sold in the area.

CPU times: user 63.5 ms, sys: 28.1 ms, total: 91.6 ms
Wall time: 3.22 s


In [24]:
%%time

# the question
get_answer(rag_chain, question=QUESTION2)

Question: List diagnosis for Botulism. Make a list

The response:
 Some of the possible diagnosis for botulism are:
1. Clostridium botulinum
2. Bulbar poliomyelitis
3. Myasthenia gravis
4. Posterior cerebral circulation ischemia
5. Tick paralysis
6. Guillain Barr syndrome
7. variant
8. Inorganic phosphorus poisoning
9. Pearl Regional outbreaks

CPU times: user 62.1 ms, sys: 26.3 ms, total: 88.4 ms
Wall time: 3.82 s


In [25]:
%%time

# the question
get_answer(rag_chain, question=QUESTION3)

Question: List the antibiotics commonly used for Tubercolosis. Make a list

The response:
 The common antibiotics used for the treatment of Tuberculosis are:
1. Isoniazid
2. Rifampin
3. Pyrazinamide
4. Streptomycin
5. Ethambutol
6. Uroquinolones
7. Thioamides

Thanks for asking!

CPU times: user 65.9 ms, sys: 31.8 ms, total: 97.7 ms
Wall time: 2.91 s


In [26]:
%%time

# the question
get_answer(rag_chain, question=QUESTION4)

Question: List the suggested treatments for Sleep Apnea. Make a list.

The response:
 1. Weight loss
2. Avoidance of hypnotic medications
3. Nocturnal continuous positive airway pressure (CPAP)
4. Oral appliances

CPU times: user 62 ms, sys: 31.9 ms, total: 93.8 ms
Wall time: 1.53 s


#### Explore the vectore store

In [28]:
# Retrieve relevant splits for any question using similarity search.

# This is simply "top K" retrieval where we select documents based on embedding similarity to the query.

TOP_K = 8

docs = vectorstore.similarity_search(QUESTION3, k=TOP_K)

len(docs)

8

In [29]:
for i, doc in enumerate(docs):
    print(f"chunk n. {i+1}")
    print(doc.page_content)
    print()

chunk n. 1
Increasingly encountered antibiotic resistant strains Granuloma on pleural biopsy in patients with effusions mesothe lial cells usually absent from uid Miliary tuberculosis widespread hematogenous spread of organism has diverse clinical presentations including failure to thrive fever ofunknown origin multiorgan system failure ARDS nearly all haveovert pulmonary involvement with numerous small nodules Differential Diagnosis Lung carcinoma fungal infection Bacterial pneumonia or lung abscess other mycobacterial infections Sarcoidosis pneumoconiosis Treatment Combination antituberculous therapy for 6 9 months all regi mens include isoniazid but rifampin ethambutol pyrazinamide and streptomycin all have activity Avoid empiric treatment for community acquired pneumonia with uoroquinolones if M. tuberculosis is suspected as transient use may facilitate development of resistance All cases of suspected M. tuberculosis infection should be reported

chunk n. 2
Carcinoma of the lung Lu