## Import Libaries

In [1]:
import os 
from pprint import pprint


from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.text_splitter import TokenTextSplitter
from langchain.vectorstores import Chroma


In [2]:
data_folder = r'/Users/user/Documents/personal_projects/rag_insurance/data'

pdf_files = os.listdir(data_folder)
pprint(pdf_files)
print('There are {} pdf files in the folder'.format(len(pdf_files)))

['ggp-web-brochure.pdf',
 'supreme-health-standard-plan-brochure.pdf',
 'gels-pdt-pd-gsh-gtc-benefit-and-premium-tables-eng.pdf',
 'gsh-gtc-benefit-schedule-and-premium-rates-01012024.pdf',
 'great-careshield-english-brochure.pdf',
 'direct-great-life-2-brochure.pdf',
 'great-hospital-cash.pdf',
 'gels-pdt-pd-gsh-gtc-brochure-eng.pdf',
 'gels-pdt-pd-gsh-gtc-eng-brochure-ns.pdf',
 'great-protector-active-brochure.pdf',
 'gels-pdt-gpa-brochure.pdf',
 'gsh-gtc-brochure.pdf']
There are 12 pdf files in the folder


### Load a pdf in data folder 
- PyPDFLoader only loads a single PDF

In [3]:
# Load documents
loader = PyPDFLoader('/Users/user/Documents/personal_projects/rag_insurance/data/direct-great-life-2-brochure.pdf')
pages = []
async for page in loader.alazy_load():
    pages.append(page)
    
print('There are {} pages in the document'.format(len(pages)))

There are 3 pages in the document


### Read the metadata and content 

In [4]:
pprint(f"{pages[0].metadata}\n")
print('-------------------------')
pprint(pages[0].page_content)

("{'producer': 'Adobe PDF Library 15.0', 'creator': 'Adobe InDesign 15.1 "
 "(Macintosh)', 'creationdate': '2021-06-30T15:36:59+08:00', 'moddate': "
 '\'2021-06-30T18:07:53+08:00\', \'subject\': "Great Eastern\'s DIRECT - GREAT '
 'Life II is a whole life insurance against Death, Terminal Illness & '
 'Permanent Disability. Visit Great Eastern Singapore today!", \'title\': '
 "'DIRECT Great Life II Brochure | Life Insurance | Great Eastern Singapore', "
 "'trapped': '/false', 'source': "
 "'/Users/user/Documents/personal_projects/rag_insurance/data/direct-great-life-2-brochure.pdf', "
 "'total_pages': 3, 'page': 0, 'page_label': '1'}\n")
-------------------------
('Life Protection\n'
 'DIRECT – GREAT LIFE II\n'
 'Lifetime coverage at affordable premiums')


### Load all the PDF
- PyPDFDirectoryLoader loads all the PDF in a directory

In [5]:
loader = PyPDFDirectoryLoader(data_folder)
docs = loader.load()

### View all the documents in the directory


In [6]:
pprint(docs)

[Document(metadata={'producer': 'Adobe PDF Library 16.0.7', 'creator': 'Adobe InDesign 17.4 (Macintosh)', 'creationdate': '2023-12-23T14:28:21+08:00', 'moddate': '2023-12-28T11:09:05+08:00', 'title': 'GREAT Golden Protector | Personal Accident Insurance | Great Eastern Singapore', 'trapped': '/False', 'source': '/Users/user/Documents/personal_projects/rag_insurance/data/ggp-web-brochure.pdf', 'total_pages': 5, 'page': 0, 'page_label': '1'}, page_content='GREA T Golden \nProtector\nProtect your golden years with financial assurance\nPersonal accident\n25% off\npremiums \non 2nd life \nassured.\nT&Cs apply.'),
 Document(metadata={'producer': 'Adobe PDF Library 16.0.7', 'creator': 'Adobe InDesign 17.4 (Macintosh)', 'creationdate': '2023-12-23T14:28:21+08:00', 'moddate': '2023-12-28T11:09:05+08:00', 'title': 'GREAT Golden Protector | Personal Accident Insurance | Great Eastern Singapore', 'trapped': '/False', 'source': '/Users/user/Documents/personal_projects/rag_insurance/data/ggp-web-bro

### Load all pages of the documents into one list

In [7]:
pages = []
async for page in loader.alazy_load():
    pages.append(page)

### View the first page metadata and page_content

In [8]:
print(f"{pages[10].metadata}\n")
print(pages[10].page_content)

{'producer': 'Acrobat Distiller 11.0 (Macintosh)', 'creator': 'Adobe Graphics Manager', 'creationdate': '2016-12-21T11:43:50+08:00', 'author': 'Mac 22', 'moddate': '2016-12-21T11:43:50+08:00', 'title': 'SupremeHealth Web_E.ai', 'source': '/Users/user/Documents/personal_projects/rag_insurance/data/supreme-health-standard-plan-brochure.pdf', 'total_pages': 7, 'page': 5, 'page_label': '6'}

Annual premium rates include the prevailing rate of GST. The prevailing rate of GST is subject to change.
Age Band
1 - 18
19 - 20
21 - 30
31 - 40
41 - 50
51 - 55
56 - 60
61 - 65
66 - 70
71 - 73
74 - 75
Premiums (S$)
146
163
163
201
215
279
380
501
655
854
854
Total Health Silver Premium Rates (Annual)9
Age Band
1 - 18
19 - 20
21 - 30
31 - 40
41 - 50
51 - 55
56 - 60
61 - 65
66 - 70
71 - 73
74 - 75
Advance (S$)
227.26
327.75
327.75
378
459
614.25
739.51
1,032
1,272.75
1,786.50
1,786.50
Essential (S$)
68.26
103.50
103.50
108.74
121.50
186.75
234.75
332.99
459.76
649.50
649.50
Total Health Plus Premium Rat

### Split the text 
- Here i am splitting the text with chunk_size of 200 and chunk_overlap of 10

In [9]:
# Split documents into chunks
text_splitter = TokenTextSplitter(chunk_size=200, chunk_overlap=10)
chunks = text_splitter.split_documents(pages)


### View all the chunks

In [10]:
for chunk in chunks:
    print(chunk)
    print('-------------------------')

page_content='GREA T Golden 
Protector
Protect your golden years with financial assurance
Personal accident
25% off
premiums 
on 2nd life 
assured.
T&Cs apply.' metadata={'producer': 'Adobe PDF Library 16.0.7', 'creator': 'Adobe InDesign 17.4 (Macintosh)', 'creationdate': '2023-12-23T14:28:21+08:00', 'moddate': '2023-12-28T11:09:05+08:00', 'title': 'GREAT Golden Protector | Personal Accident Insurance | Great Eastern Singapore', 'trapped': '/False', 'source': '/Users/user/Documents/personal_projects/rag_insurance/data/ggp-web-brochure.pdf', 'total_pages': 5, 'page': 0, 'page_label': '1'}
-------------------------
page_content='Up to 300% payout and up to S$200 daily hospital cash benefit
Receive 3 times the payout upon Accidental Death or Permanent Disablement caused by an 
accident on public transportation or a private car‡. In the event of hospitalisation, you will get a 
cash benefit of up to S$200 per day, which also covers hospital stays due to COVID-19, Dengue 
Fever or other Inf

### Load embedding model from huggingface 
- To embed all the chunks 
- Here i am using 'all-MiniLM-L6-v2'

In [11]:
# Create embeddings
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
# Create vector store
vectorstore = Chroma.from_documents(documents=chunks, embedding=embeddings)

In [13]:
retriever = vectorstore.as_retriever(
    search_type="mmr", search_kwargs={"k": 5, "fetch_k": 10}
)
result = retriever.invoke("Terminal Illness")

for content in result :
    print(content)
    print('-------------------------')

    

page_content=' up to age 70 or 85 and benefit from lifetime coverage.
Option to add-on critical illness coverage 
Enhance your coverage with optional DIRECT – GREAT Critical Care 70/ 85 rider 3 and be protected against 30 
critical illnesses.
1 Terminal Illness refers to a conclusive diagnosis of an illness that is expected to result in the death of the Life Assured within 12 months of 
the diagnosis. The terminal illness must be diagnosed by a registered medical practitioner and must be supported by evidence acceptable 
to the Company. Please refer to the policy contract for the full list of exclusions.
2 Total and Permanent Disability (TPD) refers to: 
 (a) The Life Assured, due to accident or sickness, is disabled to such an extent as to be rendered totally unable to engage in any occupation, 
business or activity for income, remuneration or profit; and the disability must continue uninterrupted for at least 6 consecutive months 
from the time' metadata={'creationdate': '2021-06-30T

In [14]:
# Helper function for printing docs
def pretty_print_docs(docs):
    print(
        f"\n{'-' * 100}\n".join(
            [f"Document {i+1}:\n\n" + d.page_content for i, d in enumerate(docs)]
        )
    )

In [16]:
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import CrossEncoderReranker
from langchain_community.cross_encoders import HuggingFaceCrossEncoder

model = HuggingFaceCrossEncoder(model_name="BAAI/bge-reranker-base")
compressor = CrossEncoderReranker(model=model, top_n=5)
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor, base_retriever=retriever
)

compressed_docs = compression_retriever.invoke("What is Terminal Illness")
pretty_print_docs(compressed_docs)

Document 1:

 up to age 70 or 85 and benefit from lifetime coverage.
Option to add-on critical illness coverage 
Enhance your coverage with optional DIRECT – GREAT Critical Care 70/ 85 rider 3 and be protected against 30 
critical illnesses.
1 Terminal Illness refers to a conclusive diagnosis of an illness that is expected to result in the death of the Life Assured within 12 months of 
the diagnosis. The terminal illness must be diagnosed by a registered medical practitioner and must be supported by evidence acceptable 
to the Company. Please refer to the policy contract for the full list of exclusions.
2 Total and Permanent Disability (TPD) refers to: 
 (a) The Life Assured, due to accident or sickness, is disabled to such an extent as to be rendered totally unable to engage in any occupation, 
business or activity for income, remuneration or profit; and the disability must continue uninterrupted for at least 6 consecutive months 
from the time
----------------------------------------

### Basic RAG using LLM (DeepSeek R1)

In [None]:
print(os.environ["HUGGINGFACEHUB_API_TOKEN"])

KeyError: 'HUGGINGFACEHUB_API_TOKEN'

In [18]:
from openai import OpenAI

client = OpenAI(
	base_url="https://router.huggingface.co/hyperbolic",
	api_key=os.environ["HUGGINGFACEHUB_API_TOKEN"] 
)

messages = [
	{
		"role": "user",
		"content": "What is the capital of France?"
	}
]

completion = client.chat.completions.create(
	model="deepseek-ai/DeepSeek-R1", 
	messages=messages, 
	max_tokens=500,
)

print(completion.choices[0].message)

KeyError: 'HUGGINGFACEHUB_API_TOKEN'