#### Install pdfplumber
- https://pypi.org/project/pdfplumber/

In [1]:
# pip install pdfplumber
# conda install conda-forge::pdfplumber

In [2]:
# pip install sentence-transformers

### Import Libraries

In [3]:
import pdfplumber
from operator import itemgetter
import json
from pathlib import Path
import pandas as pd

### Extract Zip

In [4]:
from zipfile import ZipFile

zip_file = "Policy+Documents"

# Unpack every member of "<zip_file>.zip" into a folder named after the archive.
with ZipFile(f"{zip_file}.zip", mode="r") as archive:
    archive.extractall(path=zip_file)

### Explore PDF operations

In [5]:
def sample_pdf_operations():
    """Print the page count, the first page's text and the tables of page index 6.

    The sample document is looked up inside the folder named by the
    module-level ``zip_file`` variable.
    """
    sample_pdf = "HDFC-Life-Group-Poorna-Suraksha-101N137V02-Policy-Document"
    separator = "*"*100

    with pdfplumber.open(f"{zip_file}/{sample_pdf}.pdf") as document:
        # Run every extraction first so nothing is printed if one of them fails.
        first_page_text = document.pages[0].extract_text()
        tables_on_page = document.pages[6].extract_tables()
        page_count = len(document.pages)

        for chunk in (page_count, separator, first_page_text, separator, tables_on_page):
            print(chunk)

sample_pdf_operations()

31
****************************************************************************************************
Part A
<<Date>>
<<Master Policyholder’s Name>>
<< Master Policyholder’s Address>>
<< Master Policyholder’s Contact Number>>
Dear << Master Policyholder’s Name>>,
Sub: Your Policy no. <<>>
We are glad to inform you that your proposal has been accepted and the HDFC Life Group Poorna Suraksha Policy
(“Master Policy”) being this Policy, has been issued. We have made every effort to design your Master Policy
Document in a simple format. We have highlighted items of importance so that you may recognize them easily.
Cancellation in the Free-Look Period:
In case you are not agreeable to any of the provisions stated in the Master Policy, you have the option to return the
Master Policy to us stating the reasons thereof, within 15 days (or 30 days in case the Master Policy has been issued
through distance marketing mode) from the date of receipt of the Master Policy. Provided the Scheme Member(

In [6]:
def sample_pdf_table_text_segration():
    """Demonstrate separating table content from running text on one sample page.

    Opens page index 31 of the Sampoorna-Jeevan policy document (located in the
    folder named by the module-level ``zip_file``), finds the tables on it and
    rebuilds the page as one string in which words are kept as text and every
    table is rendered as a JSON list of rows.

    Returns:
        list[list[str]]: a one-element list wrapping a one-element list that
        holds the reconstructed page text.
    """
    sample_pdf = "HDFC-Life-Sampoorna-Jeevan-101N158V04-Policy-Document (1)"

    def _inside(word, box):
        # Defined locally on purpose: the shared ``check_bboxes`` helper lives in a
        # later cell, so calling it from here fails with NameError on a fresh
        # Restart & Run All.
        return (word['x0'] > box[0] and word['top'] > box[1]
                and word['x1'] < box[2] and word['bottom'] < box[3])

    with pdfplumber.open(f"{zip_file}/{sample_pdf}.pdf") as pdf:
        sample_page = pdf.pages[31]
        found_tables = sample_page.find_tables()
        sample_table_boxes = [table.bbox for table in found_tables]
        # 'top' is kept next to the cells so tables can be clustered with words.
        sample_tables = [{'table': table.extract(), 'top': table.bbox[1]} for table in found_tables]
        sample_non_table_words = [
            word for word in sample_page.extract_words()
            if not any(_inside(word, box) for box in sample_table_boxes)
        ]

        lines = []
        for cluster in pdfplumber.utils.cluster_objects(
                sample_non_table_words + sample_tables, itemgetter('top'), tolerance=5):
            # A cluster may mix words and tables; render each item by its own kind
            # instead of dropping the whole line when the kinds differ.
            lines.append(' '.join(
                item['text'] if 'text' in item else json.dumps(item['table'])
                for item in cluster
            ))

    return [[" ".join(lines)]]


sample_pdf_table_text_segration()

[['Paid-up Addition Factor per Re. 1 Cash Bonus utilized for outstanding Policy Term of 37 to 54 years']]

### List all the files of the directory

In [7]:
import os
os.listdir(zip_file)

['HDFC-Life-Group-Term-Life-Policy.pdf',
 'HDFC-Life-Smart-Pension-Plan-Policy-Document-Online.pdf',
 'HDFC-Life-Sanchay-Plus-Life-Long-Income-Option-101N134V19-Policy-Document.pdf',
 'HDFC-Life-Easy-Health-101N110V03-Policy-Bond-Single-Pay.pdf',
 'HDFC-Life-Group-Poorna-Suraksha-101N137V02-Policy-Document.pdf',
 'HDFC-Surgicare-Plan-101N043V01.pdf',
 'HDFC-Life-Sampoorna-Jeevan-101N158V04-Policy-Document (1).pdf']

## Extract text and tables from the PDFs

In [8]:
# Helper for separating regular text from tables: tells whether a word sits inside a table's bounding box

def check_bboxes(word, table_bbox):
    """Return True when ``word`` lies strictly inside ``table_bbox``.

    ``word`` is a mapping with 'x0', 'top', 'x1' and 'bottom' entries and
    ``table_bbox`` is indexed as (left, top, right, bottom). All comparisons
    are strict, so a word touching an edge counts as outside.
    """
    x0, top, x1, bottom = (word[key] for key in ('x0', 'top', 'x1', 'bottom'))
    starts_past_corner = table_bbox[0] < x0 and table_bbox[1] < top
    return starts_past_corner and x1 < table_bbox[2] and bottom < table_bbox[3]

In [9]:
# Function to extract the text of every page of a PDF file, tables included.
# 1. Open the PDF with pdfplumber and walk its pages, numbering them from 1
# 2. Find the tables on the page and record their bounding boxes
# 3. Extract each table's cells, remembering the table's 'top' coordinate
# 4. Keep only the words that check_bboxes() reports as lying outside every table
# 5. Use the cluster_objects utility to group words and tables by their 'top' coordinate
# 6. Render every cluster as one line: words as plain text, tables as JSON lists of rows
# 7. Append ["Page N", <joined lines>] to 'full_text'
# 8. When all pages have been visited, return the 'full_text' list

def extract_text_from_pdf(pdf_path):
    """Extract per-page text from a PDF, rendering tables as JSON.

    Args:
        pdf_path: path of the PDF, passed straight to ``pdfplumber.open``.

    Returns:
        list[list[str]]: one ``["Page N", page_text]`` pair per page, where
        ``page_text`` joins the page's lines with single spaces.
    """
    full_text = []

    with pdfplumber.open(pdf_path) as pdf:
        for page_number, page in enumerate(pdf.pages, start=1):
            found_tables = page.find_tables()
            table_bboxes = [table.bbox for table in found_tables]
            # 'top' is kept next to the cells so tables can be clustered with words.
            tables = [{'table': table.extract(), 'top': table.bbox[1]} for table in found_tables]
            non_table_words = [
                word for word in page.extract_words()
                if not any(check_bboxes(word, table_bbox) for table_bbox in table_bboxes)
            ]

            lines = []
            for cluster in pdfplumber.utils.cluster_objects(non_table_words + tables, itemgetter('top'), tolerance=5):
                # A cluster may mix words and tables (or hold several tables);
                # render each item by its own kind so nothing is silently dropped.
                lines.append(' '.join(
                    item['text'] if 'text' in item else json.dumps(item['table'])
                    for item in cluster
                ))

            full_text.append([f"Page {page_number}", " ".join(lines)])

    return full_text

In [10]:
sample_pdf = "HDFC-Life-Sampoorna-Jeevan-101N158V04-Policy-Document (1)"
extract_text_from_pdf(f"{zip_file}/{sample_pdf}.pdf")

[['Page 1',
  'PART A: Covering Letter with Policy Schedule __________________ <dd-mm-yyyy> __________________ __________________ __________________ __________________ __________________ Policy Number: __________________ Your <Policy Name> with Policy No. <Policy no.> Dear Mr./Ms.___________________________, We thank you for choosing HDFC Life Insurance as your preferred life insurance solution provider. We are pleased to enclose your Policy Bond, which carries the following details of your HDFC Life Insurance Policy: \uf0fc Policy Schedule : Summary of key features of your HDFC Life Insurance Policy \uf0fc Premium Receipt : Acknowledgement of the first Premium paid by you \uf0fc Terms & Conditions : Detailed terms of your Policy contract with HDFC Life Insurance \uf0fc Service Options : Wide range of Policy servicing options that you can Benefit from We request you to carefully go through the information given in this document. You are also advised to keep the Policy Bond with utmost 

In [11]:
# Define the directory containing the PDF files
pdf_directory = Path(zip_file)

# Initialize an empty list to store the extracted texts and document names
data = []

# Loop through all files in the directory
for pdf_path in pdf_directory.glob("*.pdf"):

    # Process the PDF file
    print(f"...Processing {pdf_path.name}")

    # Call the function to extract the text from the PDF
    extracted_text = extract_text_from_pdf(pdf_path)

    # Convert the extracted list to a PDF, and add a column to store document names
    extracted_text_df = pd.DataFrame(extracted_text, columns=['Page No.', 'Page_Text'])
    extracted_text_df['Document Name'] = pdf_path.name

    # Append the extracted text and document name to the list
    data.append(extracted_text_df)

    # Print a message to indicate progress
    print(f"Finished processing {pdf_path.name}")

# Print a message to indicate all PDFs have been processed
print("All PDFs have been processed.")

...Processing HDFC-Life-Group-Term-Life-Policy.pdf
Finished processing HDFC-Life-Group-Term-Life-Policy.pdf
...Processing HDFC-Life-Smart-Pension-Plan-Policy-Document-Online.pdf
Finished processing HDFC-Life-Smart-Pension-Plan-Policy-Document-Online.pdf
...Processing HDFC-Life-Sanchay-Plus-Life-Long-Income-Option-101N134V19-Policy-Document.pdf
Finished processing HDFC-Life-Sanchay-Plus-Life-Long-Income-Option-101N134V19-Policy-Document.pdf
...Processing HDFC-Life-Easy-Health-101N110V03-Policy-Bond-Single-Pay.pdf
Finished processing HDFC-Life-Easy-Health-101N110V03-Policy-Bond-Single-Pay.pdf
...Processing HDFC-Life-Group-Poorna-Suraksha-101N137V02-Policy-Document.pdf
Finished processing HDFC-Life-Group-Poorna-Suraksha-101N137V02-Policy-Document.pdf
...Processing HDFC-Surgicare-Plan-101N043V01.pdf
Finished processing HDFC-Surgicare-Plan-101N043V01.pdf
...Processing HDFC-Life-Sampoorna-Jeevan-101N158V04-Policy-Document (1).pdf
Finished processing HDFC-Life-Sampoorna-Jeevan-101N158V04-Poli

# Step 1: **Build the Vector store**
- Store Documents to Vector Store using the Embedding Models
- Read all PDF and store in dataframe
- Create a Gemini Model for Text to vector embedding
- Create Gemini Embedding function
- Create ChromaDB and assign Gemini Embedding function

## Store the PDF data to a pandas dataframe

In [12]:
insurance_pdfs_data = pd.concat(data, ignore_index=True)

In [13]:
insurance_pdfs_data.shape

(217, 3)

In [14]:
insurance_pdfs_data.head()

Unnamed: 0,Page No.,Page_Text,Document Name
0,Page 1,HDFC Life Group Term Life OF «OWNERNAME» Based...,HDFC-Life-Group-Term-Life-Policy.pdf
1,Page 2,PART A: Covering Letter with Policy Schedule <...,HDFC-Life-Group-Term-Life-Policy.pdf
2,Page 3,Address : Mobile/Landline Number : A1 Free Loo...,HDFC-Life-Group-Term-Life-Policy.pdf
3,Page 4,"[[""Name of Policy"", ""HDFC Life Group Term Life...",HDFC-Life-Group-Term-Life-Policy.pdf
4,Page 5,"[[""Policy No."", """"], [""Name & Address of Polic...",HDFC-Life-Group-Term-Life-Policy.pdf


In [15]:
insurance_pdfs_data.Page_Text[1]

'PART A: Covering Letter with Policy Schedule <dd-mm-yyyy> __________________ __________________ __________________ __________________ __________________ Your HDFC Life <Policy Name> with Policy No. <Policy no.> Dear Mr./Ms.___________________________, We thank you for choosing HDFC Life Insurance as your preferred life insurance solution provider.. We are pleased to enclose your Policy Bond, which carries the following details of your recently purchased HDFC Life Insurance Policy: \uf0fc Policy Schedule : Summary of key features of your HDFC Life Insurance Policy \uf0fc Premium Receipt : Acknowledgement of the first Premium paid by you \uf0fc Terms & Conditions : Detailed terms of your Policy contract with HDFC Life Insurance \uf0fc Service Options : Wide range of Policy servicing options that you can Benefit from We request you to carefully go through the information given in this document. You are also advised to keep the Policy Bond with utmost care and safety because this document

#### **Add a new column `Text_Length`**
- Number of space-separated words in each page's text

In [16]:
# Length of the text so that we can drop empty pages
insurance_pdfs_data['Text_Length'] = insurance_pdfs_data['Page_Text'].apply(lambda x: len(x.split(' ')))

In [17]:
insurance_pdfs_data['Text_Length']

0       57
1      400
2      618
3       67
4      174
      ... 
212      1
213      1
214      1
215      1
216      1
Name: Text_Length, Length: 217, dtype: int64

#### **Drop rows with fewer than 10 words**

In [18]:
# Keep only pages with at least 10 words. ``.copy()`` makes the result an
# independent frame, so adding columns to it later does not raise pandas'
# SettingWithCopyWarning.
insurance_pdfs_data = insurance_pdfs_data[insurance_pdfs_data["Text_Length"] >= 10].copy()
insurance_pdfs_data.shape

(210, 4)

### Add Metadata to each row
- Metadata -> Filename and pagenumber

In [19]:
# Store the metadata for each page in a separate column
insurance_pdfs_data["Metadata"] = insurance_pdfs_data.apply(lambda x: {"Policy_Name": x["Document Name"][:-4], "Page_No": x["Page No."]}, axis=1)
insurance_pdfs_data.head()

Unnamed: 0,Page No.,Page_Text,Document Name,Text_Length,Metadata
0,Page 1,HDFC Life Group Term Life OF «OWNERNAME» Based...,HDFC-Life-Group-Term-Life-Policy.pdf,57,{'Policy_Name': 'HDFC-Life-Group-Term-Life-Pol...
1,Page 2,PART A: Covering Letter with Policy Schedule <...,HDFC-Life-Group-Term-Life-Policy.pdf,400,{'Policy_Name': 'HDFC-Life-Group-Term-Life-Pol...
2,Page 3,Address : Mobile/Landline Number : A1 Free Loo...,HDFC-Life-Group-Term-Life-Policy.pdf,618,{'Policy_Name': 'HDFC-Life-Group-Term-Life-Pol...
3,Page 4,"[[""Name of Policy"", ""HDFC Life Group Term Life...",HDFC-Life-Group-Term-Life-Policy.pdf,67,{'Policy_Name': 'HDFC-Life-Group-Term-Life-Pol...
4,Page 5,"[[""Policy No."", """"], [""Name & Address of Polic...",HDFC-Life-Group-Term-Life-Policy.pdf,174,{'Policy_Name': 'HDFC-Life-Group-Term-Life-Pol...


## Integrate with Gemini

In [20]:
# Read the Gemini API key. The context manager closes the file handle, and
# strip() removes surrounding whitespace such as a trailing newline so it is
# not sent as part of the key.
with open("API/gemini_key", "r") as key_file:
    gemini_key = key_file.read().strip()

In [21]:
import google.generativeai as genai
genai.configure(api_key=gemini_key)

In [22]:
import chromadb
from chromadb import Documents, EmbeddingFunction, Embeddings
client_db = chromadb.PersistentClient()

In [23]:
class GeminiEmbeddingFunction(EmbeddingFunction):
    """Chroma embedding function that delegates to ``genai.embed_content``."""

    def __call__(self, input: Documents) -> Embeddings:
        """Embed ``input`` and return the 'embedding' entry of the response."""
        response = genai.embed_content(
            model="models/text-embedding-004",
            content=input,
            task_type="retrieval_document",
            title="Custom query",
        )
        return response["embedding"]

In [24]:
def insertToDBPipeline(insurance_pdfs_data):
    """Recreate the "RAG_on_Insurance" collection and load every page into it.

    Args:
        insurance_pdfs_data: DataFrame with a 'Page_Text' column (the documents)
            and a 'Metadata' column (one metadata dict per document).

    Returns:
        The freshly created collection, using ``GeminiEmbeddingFunction`` for
        embeddings. Document ids are the row positions as strings
        ("0", "1", ...).
    """
    collection_name = "RAG_on_Insurance"
    documents_list = insurance_pdfs_data['Page_Text'].tolist()
    metadata_list = insurance_pdfs_data['Metadata'].tolist()

    # Start from an empty collection so a re-run does not collide with ids
    # stored by a previous run.
    try:
        client_db.delete_collection(name=collection_name)
    except Exception:
        # Best effort: presumably the collection simply does not exist yet.
        # ``Exception`` (not a bare except) lets KeyboardInterrupt through.
        pass

    insurance_collection = client_db.get_or_create_collection(
        name=collection_name, embedding_function=GeminiEmbeddingFunction()
    )

    # Nothing to add for an empty frame; return the empty collection as-is.
    if documents_list:
        insurance_collection.add(
            documents=documents_list,
            ids=[str(position) for position in range(len(documents_list))],
            metadatas=metadata_list,
        )

    return insurance_collection
    

In [25]:
insurance_collection = insertToDBPipeline(insurance_pdfs_data=insurance_pdfs_data)

In [26]:
insurance_collection.get(
    ids = ['0','1','2'],
    include = ['embeddings', 'documents', 'metadatas']
)

{'ids': ['0', '1', '2'],
 'embeddings': [[0.0351269468665123,
   0.03581498935818672,
   0.0030233401339501143,
   -0.02669384889304638,
   -0.017311669886112213,
   0.03252292796969414,
   -0.02813078835606575,
   0.024373896420001984,
   0.004699422977864742,
   0.028437305241823196,
   -0.033500198274850845,
   -0.020776502788066864,
   0.07399526238441467,
   0.0063785226084291935,
   -0.009089507162570953,
   -0.025050241500139236,
   -0.030402647331357002,
   0.02444591373205185,
   -0.11843248456716537,
   -0.032780155539512634,
   -0.04079224541783333,
   0.05425979197025299,
   -0.014723795466125011,
   0.007332495413720608,
   0.05335758253931999,
   -0.09024995565414429,
   -0.0024273162707686424,
   -0.020591476932168007,
   -0.023654749616980553,
   -0.10090244561433792,
   0.00770041486248374,
   0.01174316368997097,
   -0.016522834077477455,
   -0.0035519911907613277,
   0.05055362358689308,
   -0.008171522058546543,
   0.0005318635376170278,
   0.02004995569586754,
   0

### Create a cache collection for previously answered queries

In [27]:
def create_cache_collection():
    """Recreate the empty "RAG_on_Insurance_cache" collection and return it.

    Any existing collection with that name is deleted first, so the returned
    collection contains no entries from earlier runs. It uses
    ``GeminiEmbeddingFunction`` for embeddings.
    """
    cache_name = "RAG_on_Insurance_cache"

    try:
        client_db.delete_collection(name=cache_name)
    except Exception:
        # Best effort: presumably the collection simply does not exist yet.
        # ``Exception`` (not a bare except) lets KeyboardInterrupt through.
        pass

    return client_db.get_or_create_collection(
        name=cache_name, embedding_function=GeminiEmbeddingFunction()
    )

In [28]:
insurance_collection_cache = create_cache_collection()

# Step 2: **Cache Search and Rerank**

In [29]:
# client_db.delete_collection(name="RAG_on_Insurance_cache")
# client_db.delete_collection(name="RAG_on_Insurance")

In [47]:
query = "What are the benefits of accidental insurance?"

### Semantic search in vector Database
- Take the query and check whether a similar query is already present in the cache
- A cached query counts as a match only when its `distance` is at most 0.2, to keep the results accurate
- If a match is found, rebuild the results from the cache entry and return them
- If no match is found, search the main collection
- Store those results in the cache collection together with the query
- Return the results

In [31]:
# Result fields that are written to, and read back from, a cache entry.
_CACHED_FIELDS = ("ids", "documents", "distances", "metadatas")


def _flatten_for_cache(query_result):
    """Flatten a query result into {"ids0": "...", "documents0": "...", ...} string pairs."""
    flat = {}
    for field in _CACHED_FIELDS:
        values = query_result.get(field)
        if not values:
            continue
        for position, value in enumerate(values[0]):
            flat[f"{field}{position}"] = str(value)
    return flat


def _results_from_cache(cached):
    """Rebuild the result table from a flattened cache entry, one row per position."""
    import ast  # local import: only needed to parse cached metadata strings

    rows = []
    position = 0
    # Walk positions explicitly; relying on dict iteration order could pair an
    # id with another row's document.
    while f"ids{position}" in cached:
        raw_metadata = cached.get(f"metadatas{position}")
        try:
            metadata = ast.literal_eval(raw_metadata)
        except (ValueError, SyntaxError, TypeError):
            metadata = raw_metadata

        raw_distance = cached.get(f"distances{position}")
        try:
            distance = float(raw_distance)
        except (TypeError, ValueError):
            distance = raw_distance

        rows.append({
            "Metadatas": metadata,
            "Documents": cached.get(f"documents{position}"),
            "Distances": distance,
            "IDs": cached[f"ids{position}"],
        })
        position += 1

    return pd.DataFrame(rows, columns=["Metadatas", "Documents", "Distances", "IDs"])


def get_results(query, threshold=0.2, n_results=10):
    """Return the search results for ``query``, served from the cache when possible.

    The cache collection is searched for the closest previously asked query.
    If its distance is at most ``threshold`` the stored results are rebuilt and
    returned; otherwise the main collection is queried and the outcome is
    written to the cache under the query text.

    Args:
        query: the user question.
        threshold: largest cache distance still treated as the same question.
        n_results: number of documents requested from the main collection.

    Returns:
        pandas.DataFrame with columns 'Metadatas', 'Documents', 'Distances'
        and 'IDs'. Both paths use this column order; on a cache hit the
        distances are converted back to floats and the metadata back to dicts.

    Side effects:
        Prints whether the cache was hit and may add one entry to the cache.
    """
    cache_result = insurance_collection_cache.query(
        query_texts=query,
        n_results=1
    )
    cached_distances = cache_result["distances"][0]

    if cached_distances and cached_distances[0] <= threshold:
        print("Results found in cache")
        return _results_from_cache(cache_result['metadatas'][0][0])

    query_result = insurance_collection.query(
        query_texts=query,
        n_results=n_results
    )

    flattened = _flatten_for_cache(query_result)
    # Skip caching an empty result: there would be no metadata to store.
    if flattened:
        insurance_collection_cache.add(
            documents=[query],
            ids=[query],
            metadatas=[flattened]
        )
    print("Result is not found in cache")

    return pd.DataFrame({
        "Metadatas": query_result["metadatas"][0],
        "Documents": query_result["documents"][0],
        "Distances": query_result["distances"][0],
        "IDs": query_result["ids"][0],
    })

In [48]:
results_df = get_results(query=query)

Result is not found in cache


In [49]:
results_df

Unnamed: 0,Metadatas,Documents,Distances,IDs
0,"{'Page_No': 'Page 15', 'Policy_Name': 'HDFC-Li...","Note: For the purpose of waiting period, Date ...",0.679257,140
1,"{'Page_No': 'Page 7', 'Policy_Name': 'HDFC-Lif...",PART B Important Terms and Definitions B1 Defi...,0.699556,6
2,"{'Page_No': 'Page 11', 'Policy_Name': 'HDFC-Li...",PART C PRODUCT CORE BENEFITS BENEFITS PAYABLE ...,0.69972,10
3,"{'Page_No': 'Page 7', 'Policy_Name': 'HDFC-Lif...",Part C 1. Benefits: (1) Benefits on Death or d...,0.721522,132
4,"{'Page_No': 'Page 17', 'Policy_Name': 'HDFC-Li...","7. Routine eye tests, any Dental Treatment or ...",0.723769,109
5,"{'Page_No': 'Page 21', 'Policy_Name': 'HDFC-Li...",F.26. Issuance of Duplicate Policy / Certifica...,0.737621,20
6,"{'Page_No': 'Page 10', 'Policy_Name': 'HDFC-Li...","[[""Plan option"", ""Benefits covered""], [""A"", ""D...",0.745836,103
7,"{'Page_No': 'Page 11', 'Policy_Name': 'HDFC-Li...",v. In case 100% of the Sum Insured has been us...,0.747574,104
8,"{'Page_No': 'Page 34', 'Policy_Name': 'HDFC-Li...",Paid-up Addition Factor per Re. 1 Cash Bonus u...,0.748789,205
9,"{'Page_No': 'Page 4', 'Policy_Name': 'HDFC-Lif...",Part B Definitions The following capitalized t...,0.762946,129


### Reranking the search results using the `CrossEncoder`
- Use the `CrossEncoder` from `sentence_transformers`
- Rank the results by their cross-encoder scores
- A higher cross-encoder score means the result is more relevant to the query
- The cross encoder takes a list of `[query, response]` pairs and scores how well each response relates to the query

In [34]:
from sentence_transformers import CrossEncoder, util

In [35]:
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')



#### Sample example of Cross Encoder

In [36]:
# Test the cross encoder model

scores = cross_encoder.predict([['Does the insurance cover diabetic patients?', 'The insurance policy covers some pre-existing conditions including diabetes, heart diseases, etc. The policy does not howev'],
                                ['Does the insurance cover diabetic patients?', 'The premium rates for various age groups are given as follows. Age group (<18 years): Premium rate']])

In [37]:
scores

array([  3.8467631, -11.25288  ], dtype=float32)

#### Observations:
- Higher score means better results

### Use CrossEncoder to rerank the output from semantic search

In [50]:
cross_inputs = [[query, response] for response in results_df['Documents']]
cross_rerank_scores = cross_encoder.predict(cross_inputs)
cross_rerank_scores

array([  1.3488362 ,  -5.79852   ,  -1.3450518 ,   0.07420602,
        -5.413535  ,  -9.65161   ,  -8.289799  ,  -6.9327583 ,
       -11.356976  ,  -0.46398807], dtype=float32)

In [51]:
results_df['Reranked_scores'] = cross_rerank_scores

In [52]:
results_df

Unnamed: 0,Metadatas,Documents,Distances,IDs,Reranked_scores
0,"{'Page_No': 'Page 15', 'Policy_Name': 'HDFC-Li...","Note: For the purpose of waiting period, Date ...",0.679257,140,1.348836
1,"{'Page_No': 'Page 7', 'Policy_Name': 'HDFC-Lif...",PART B Important Terms and Definitions B1 Defi...,0.699556,6,-5.79852
2,"{'Page_No': 'Page 11', 'Policy_Name': 'HDFC-Li...",PART C PRODUCT CORE BENEFITS BENEFITS PAYABLE ...,0.69972,10,-1.345052
3,"{'Page_No': 'Page 7', 'Policy_Name': 'HDFC-Lif...",Part C 1. Benefits: (1) Benefits on Death or d...,0.721522,132,0.074206
4,"{'Page_No': 'Page 17', 'Policy_Name': 'HDFC-Li...","7. Routine eye tests, any Dental Treatment or ...",0.723769,109,-5.413535
5,"{'Page_No': 'Page 21', 'Policy_Name': 'HDFC-Li...",F.26. Issuance of Duplicate Policy / Certifica...,0.737621,20,-9.65161
6,"{'Page_No': 'Page 10', 'Policy_Name': 'HDFC-Li...","[[""Plan option"", ""Benefits covered""], [""A"", ""D...",0.745836,103,-8.289799
7,"{'Page_No': 'Page 11', 'Policy_Name': 'HDFC-Li...",v. In case 100% of the Sum Insured has been us...,0.747574,104,-6.932758
8,"{'Page_No': 'Page 34', 'Policy_Name': 'HDFC-Li...",Paid-up Addition Factor per Re. 1 Cash Bonus u...,0.748789,205,-11.356976
9,"{'Page_No': 'Page 4', 'Policy_Name': 'HDFC-Lif...",Part B Definitions The following capitalized t...,0.762946,129,-0.463988


In [53]:
top_3_semantic = results_df.sort_values(by='Distances')
top_3_semantic[:3]

Unnamed: 0,Metadatas,Documents,Distances,IDs,Reranked_scores
0,"{'Page_No': 'Page 15', 'Policy_Name': 'HDFC-Li...","Note: For the purpose of waiting period, Date ...",0.679257,140,1.348836
1,"{'Page_No': 'Page 7', 'Policy_Name': 'HDFC-Lif...",PART B Important Terms and Definitions B1 Defi...,0.699556,6,-5.79852
2,"{'Page_No': 'Page 11', 'Policy_Name': 'HDFC-Li...",PART C PRODUCT CORE BENEFITS BENEFITS PAYABLE ...,0.69972,10,-1.345052


In [54]:
top_3_rerank = results_df.sort_values(by='Reranked_scores', ascending=False)
top_3_rerank[:3]

Unnamed: 0,Metadatas,Documents,Distances,IDs,Reranked_scores
0,"{'Page_No': 'Page 15', 'Policy_Name': 'HDFC-Li...","Note: For the purpose of waiting period, Date ...",0.679257,140,1.348836
3,"{'Page_No': 'Page 7', 'Policy_Name': 'HDFC-Lif...",Part C 1. Benefits: (1) Benefits on Death or d...,0.721522,132,0.074206
9,"{'Page_No': 'Page 4', 'Policy_Name': 'HDFC-Lif...",Part B Definitions The following capitalized t...,0.762946,129,-0.463988


In [55]:
top_3_RAG = top_3_rerank[["Documents", "Metadatas"]][:3]
top_3_RAG

Unnamed: 0,Documents,Metadatas
0,"Note: For the purpose of waiting period, Date ...","{'Page_No': 'Page 15', 'Policy_Name': 'HDFC-Li..."
3,Part C 1. Benefits: (1) Benefits on Death or d...,"{'Page_No': 'Page 7', 'Policy_Name': 'HDFC-Lif..."
9,Part B Definitions The following capitalized t...,"{'Page_No': 'Page 4', 'Policy_Name': 'HDFC-Lif..."


# Step 3: **RAG: Generative Search using the Gemini model**
- Take the input from the Step 2 results
- Write a prompt to generate a human consumable output
- Present the output to the user

In [58]:
# Define the function to generate the response. The prompt passes the user query and the full text of the retrieved results to the model

def generate_response(query, results_df):
    """
    Generate a response using Gemini based on the user query and retrieved information.

    Args:
        query: the user question.
        results_df: DataFrame of retrieved pages; it is serialised in full into
            the prompt (the notebook passes 'Documents' and 'Metadatas' columns).

    Returns:
        list[str]: the text of the first candidate, split into lines.
    """
    # Serialise the rows as JSON records. Interpolating the DataFrame itself
    # would insert its display repr, which shortens long cell values.
    search_results = results_df.to_json(orient="records", force_ascii=False, indent=2)

    messages = [
                {"role": "model", "parts":  ["You are a helpful assistant in the insurance domain who can effectively answer user queries about insurance policies and documents."]},
                {"role": "model", "parts": [f"""You are a helpful assistant in the insurance domain who can effectively answer user queries about insurance policies and documents.
                                                You have a question asked by the user in '{query}' and you have some search results from a corpus of insurance documents, given below as JSON records. Each record is essentially one page of an insurance document that may be relevant to the user query.

                                                The field 'Documents' of each record contains the actual text from the policy document and the field 'Metadatas' contains the policy name and source page. The text inside the document may also contain tables in the format of a list of lists where each of the nested lists indicates a row.

                                                Search results:
                                                {search_results}

                                                Use these search results to answer the query '{query}'. Frame an informative answer and also, use the records to return the relevant policy names and page numbers as citations.

                                                Follow the guidelines below when performing the task.
                                                1. Try to provide relevant/accurate numbers if available.
                                                2. You don’t have to necessarily use all the information in the search results. Only choose information that is relevant.
                                                3. If the document text has tables with relevant information, please reformat the table and return the final information in a tabular format.
                                                4. Use the 'Metadatas' field of the records to retrieve and cite the policy name(s) and page numbers(s) as citation.
                                                5. If you can't provide the complete answer, please also provide any information that will help the user to search specific sections in the relevant cited documents.
                                                6. You are a customer facing assistant, so do not provide any information on internal workings, just answer the query directly.

                                                The generated response should answer the query directly addressing the user and avoiding additional information. If you think that the query is not relevant to the document, reply that the query is irrelevant. Provide the final response as a well-formatted and easily readable text along with the citation. Provide your complete response first with all information, and then provide the citations.
                                                """]},
                 {"role": "user", "parts": [query]}
              ]

    model = genai.GenerativeModel('gemini-1.5-flash')
    gen_config = genai.types.GenerationConfig(candidate_count=1)
    new_response = model.generate_content(messages,
                       generation_config=gen_config)

    return new_response.candidates[0].content.parts[0].text.split('\n')



In [59]:
response = generate_response(query, top_3_RAG)

In [60]:
response

['Accidental insurance can provide financial support to you or your family in case of an unexpected accident.  It can cover medical expenses, disability benefits, and even death benefits.  ',
 '',
 'Here are some of the key benefits:',
 '',
 '* **Medical Expenses:**  Accidental insurance can cover medical expenses incurred due to an accident, including hospitalization, surgery, and rehabilitation.',
 '* **Disability Benefits:** If an accident leaves you disabled, accidental insurance can provide a monthly income to help with your living expenses.',
 '* **Death Benefits:** If the accident results in death, the policy can provide a lump sum payment to your beneficiary, which can be used to cover funeral expenses, outstanding debts, and support your family.',
 '* **Accidental Death and Dismemberment Benefits:** This benefit provides a lump sum payment if you suffer a specific accidental loss like a loss of limb or eyesight.',
 '',
 "It's important to note that the specific benefits and co

# Generate results with working pipeline

In [65]:
def working_pipeline(final_query, n_responses=3):
    """Run retrieval, reranking and generation for one question.

    Args:
        final_query: the user question.
        n_responses: how many independent answers to generate from the same
            top-3 context (defaults to 3, the original behaviour).

    Returns:
        list[list[list[str]]]: one single-element list per generated answer,
        each holding the answer's lines as returned by ``generate_response``.
    """
    results_df = get_results(final_query)

    # Score every retrieved page against *this* question; using the notebook's
    # global ``query`` here would rerank for a different question.
    cross_inputs = [[final_query, document] for document in results_df['Documents']]
    results_df['Reranked_scores'] = cross_encoder.predict(cross_inputs)

    top_3_rerank = results_df.sort_values(by='Reranked_scores', ascending=False)[["Documents", "Metadatas"]][:3]

    return [[generate_response(final_query, top_3_rerank)] for _ in range(n_responses)]
    

In [62]:
final_query = "What are the accidental death benefits?"
result = working_pipeline(final_query)
result

Result is not found in cache


['Accidental death benefits are a type of insurance coverage that pays out a lump sum benefit to your beneficiaries if you die as a result of an accident. The amount of the benefit will depend on the policy and the specifics of your accident.',
 '',
 '**Citations:**',
 '* **HDFC-Life Click 2 Protect Plus Policy**, Page 7',
 '* **HDFC-Life Click 2 Protect Plus Policy**, Page 11 ',
 '']

In [63]:
final_query = "What are the benefits of accidental death?"
result = working_pipeline(final_query)
result

Results found in cache


['Accidental death benefits provide a lump sum payout to your beneficiaries in the event of your death due to an accident.  This benefit can help cover funeral expenses, outstanding debts, or provide financial support to your loved ones. ',
 '',
 '**Citation:** ',
 '',
 '* HDFC-Life Click 2 Protect Plus, Page 7, Part C, Benefits: (1) Benefits on Death or disability. ',
 '* HDFC-Life Click 2 Protect Plus, Page 11, PART C PRODUCT CORE BENEFITS BENEFITS PAYABLE ON DEATH ',
 '']

In [66]:
final_query = "What are the benefits of accidental death in insurance?"
result = working_pipeline(final_query)
result

Results found in cache


[[['Accidental death benefits in insurance policies provide a lump sum payment to your beneficiaries if you die as a result of an accident. This benefit can help your loved ones cover expenses such as funeral costs, outstanding debts, or lost income. ',
   '',
   '**Here are some examples of what an accidental death benefit might cover:**',
   '',
   '| Benefit                                  | Description                                                                                        |',
   '|-----------------------------------------|---------------------------------------------------------------------------------------------------|',
   '| Funeral Expenses                        | Covers the costs of a funeral and burial.                                                                  |',
   '| Outstanding Debts                      | Helps pay off any debts you may have, such as credit card bills, mortgages, or loans.            |',
   '| Lost Income                        