#### Importing required libraries

In [1]:
!pip install -U -q pdfplumber sentence-transformers

In [2]:
import pdfplumber
import pandas as pd
from operator import itemgetter

#### Reading PDF file

In [3]:
import os

# Open the PDF file and print the text
pdf_path = "Principal-Sample-Life-Insurance-Policy.pdf"
print(os.path.abspath(pdf_path))
print("Exists:", os.path.isfile(pdf_path))

C:\Users\Lenovo\Desktop\Mr_HelpMate_AI\Principal-Sample-Life-Insurance-Policy.pdf
Exists: True


In [4]:
with pdfplumber.open(pdf_path) as pdf:

    # Get one of the pages from the PDF and examine it
    single_page = pdf.pages[0]

    # Extract text from the first page
    text = single_page.extract_text()

    # Extract tables from the first page
    tables = single_page.extract_tables()

In [5]:
# Print the extracted text
print(text)

DOROTHEA GLAUSE S655
RHODE ISLAND JOHN DOE 01/01/2014
711 HIGH STREET
GEORGE RI 02903
GROUP POLICY FOR:
RHODE ISLAND JOHN DOE
ALL MEMBERS
Group Member Life Insurance
Print Date: 07/16/2014


In [6]:
# Print the extracted table
print(tables)

[]


In [7]:
# Function to check whether a word is present in a table or not for segregation of regular text and tables

def check_bboxes(word, table_bbox):
    # Check whether word is inside a table bbox.
    l = word['x0'], word['top'], word['x1'], word['bottom']
    r = table_bbox
    return l[0] > r[0] and l[1] > r[1] and l[2] < r[2] and l[3] < r[3]

In [8]:
# Function to extract text from a PDF file.

def extract_text_from_pdf(pdf_path):
    p = 0
    full_text = []


    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_no = f"Page {p+1}"
            text = page.extract_text()

            tables = page.find_tables()
            table_bboxes = [i.bbox for i in tables]
            tables = [{'table': i.extract(), 'top': i.bbox[1]} for i in tables]
            non_table_words = [word for word in page.extract_words() if not any(
                [check_bboxes(word, table_bbox) for table_bbox in table_bboxes])]
            lines = []

            for cluster in pdfplumber.utils.cluster_objects(non_table_words + tables, itemgetter('top'), tolerance=5):

                if 'text' in cluster[0]:
                    try:
                        lines.append(' '.join([i['text'] for i in cluster]))
                    except KeyError:
                        pass

                elif 'table' in cluster[0]:
                    lines.append(json.dumps(cluster[0]['table']))


            full_text.append([page_no, " ".join(lines)])
            p +=1

    return full_text

In [9]:
# Initialize an empty list to store the extracted texts and document names
data = []

# Call the function to extract the text from the PDF
extracted_text = extract_text_from_pdf(pdf_path)

# Convert the extracted list to a PDF, and add a column to store document names
extracted_text_df = pd.DataFrame(extracted_text, columns=['Page No.', 'Page_Text'])

# Append the extracted text and document name to the list
data.append(extracted_text_df)

# Print a message to indicate all PDFs have been processed
print("PDF have been processed.")

PDF have been processed.


In [10]:
data

[   Page No.                                          Page_Text
 0    Page 1  DOROTHEA GLAUSE S655 RHODE ISLAND JOHN DOE 01/...
 1    Page 2                 This page left blank intentionally
 2    Page 3  POLICY RIDER GROUP INSURANCE POLICY NO: S655 C...
 3    Page 4                 This page left blank intentionally
 4    Page 5  PRINCIPAL LIFE INSURANCE COMPANY (called The P...
 ..      ...                                                ...
 59  Page 60  I f a Dependent who was insured dies during th...
 60  Page 61  Section D - Claim Procedures Article 1 - Notic...
 61  Page 62  A claimant may request an appeal of a claim de...
 62  Page 63                 This page left blank intentionally
 63  Page 64  Principal Life Insurance Company Des Moines, I...
 
 [64 rows x 2 columns]]

In [11]:
insurance_pdfs_data = pd.concat(data, ignore_index=True)
insurance_pdfs_data

Unnamed: 0,Page No.,Page_Text
0,Page 1,DOROTHEA GLAUSE S655 RHODE ISLAND JOHN DOE 01/...
1,Page 2,This page left blank intentionally
2,Page 3,POLICY RIDER GROUP INSURANCE POLICY NO: S655 C...
3,Page 4,This page left blank intentionally
4,Page 5,PRINCIPAL LIFE INSURANCE COMPANY (called The P...
...,...,...
59,Page 60,I f a Dependent who was insured dies during th...
60,Page 61,Section D - Claim Procedures Article 1 - Notic...
61,Page 62,A claimant may request an appeal of a claim de...
62,Page 63,This page left blank intentionally


In [12]:
# Store the metadata for each page in a separate column
insurance_pdfs_data['Metadata'] = insurance_pdfs_data.apply(lambda x: {'Page_No.': x['Page No.']}, axis=1)
insurance_pdfs_data

Unnamed: 0,Page No.,Page_Text,Metadata
0,Page 1,DOROTHEA GLAUSE S655 RHODE ISLAND JOHN DOE 01/...,{'Page_No.': 'Page 1'}
1,Page 2,This page left blank intentionally,{'Page_No.': 'Page 2'}
2,Page 3,POLICY RIDER GROUP INSURANCE POLICY NO: S655 C...,{'Page_No.': 'Page 3'}
3,Page 4,This page left blank intentionally,{'Page_No.': 'Page 4'}
4,Page 5,PRINCIPAL LIFE INSURANCE COMPANY (called The P...,{'Page_No.': 'Page 5'}
...,...,...,...
59,Page 60,I f a Dependent who was insured dies during th...,{'Page_No.': 'Page 60'}
60,Page 61,Section D - Claim Procedures Article 1 - Notic...,{'Page_No.': 'Page 61'}
61,Page 62,A claimant may request an appeal of a claim de...,{'Page_No.': 'Page 62'}
62,Page 63,This page left blank intentionally,{'Page_No.': 'Page 63'}


In [13]:
insurance_pdfs_data.Page_Text[0]

'DOROTHEA GLAUSE S655 RHODE ISLAND JOHN DOE 01/01/2014 711 HIGH STREET GEORGE RI 02903 GROUP POLICY FOR: RHODE ISLAND JOHN DOE ALL MEMBERS Group Member Life Insurance Print Date: 07/16/2014'

In [14]:
# Let's also check the length of all the texts as there might be some empty pages or pages with very few words that we can drop
insurance_pdfs_data['Text_Length'] = insurance_pdfs_data['Page_Text'].apply(lambda x: len(x.split(' ')))

In [15]:
insurance_pdfs_data['Text_Length']

0      30
1       5
2     230
3       5
4     110
     ... 
59    285
60    418
61    322
62      5
63      8
Name: Text_Length, Length: 64, dtype: int64

In [16]:
insurance_pdfs_data

Unnamed: 0,Page No.,Page_Text,Metadata,Text_Length
0,Page 1,DOROTHEA GLAUSE S655 RHODE ISLAND JOHN DOE 01/...,{'Page_No.': 'Page 1'},30
1,Page 2,This page left blank intentionally,{'Page_No.': 'Page 2'},5
2,Page 3,POLICY RIDER GROUP INSURANCE POLICY NO: S655 C...,{'Page_No.': 'Page 3'},230
3,Page 4,This page left blank intentionally,{'Page_No.': 'Page 4'},5
4,Page 5,PRINCIPAL LIFE INSURANCE COMPANY (called The P...,{'Page_No.': 'Page 5'},110
...,...,...,...,...
59,Page 60,I f a Dependent who was insured dies during th...,{'Page_No.': 'Page 60'},285
60,Page 61,Section D - Claim Procedures Article 1 - Notic...,{'Page_No.': 'Page 61'},418
61,Page 62,A claimant may request an appeal of a claim de...,{'Page_No.': 'Page 62'},322
62,Page 63,This page left blank intentionally,{'Page_No.': 'Page 63'},5


#### Creating Chunks

In [17]:
# Function to split text into fixed-size chunks
def split_text_into_chunks(text, chunk_size):
    chunks = []
    words = text.split()  # Split the text into words

    current_chunk = []  # Store words for the current chunk
    current_chunk_word_count = 0  # Count of words in the current chunk

    for word in words:
        if current_chunk_word_count + len(word) + 1 <= chunk_size:
            current_chunk.append(word)
            current_chunk_word_count += len(word) + 1
        else:
            chunks.append(' '.join(current_chunk))
            current_chunk = [word]
            current_chunk_word_count = len(word)

    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks

In [18]:
def process_page(page_no):
    page = insurance_pdfs_data[insurance_pdfs_data['Page No.'] == page_no].Page_Text.values[0]
    metadata = insurance_pdfs_data[insurance_pdfs_data['Page No.'] == page_no].Metadata.values[0]

    if page is not None:
        # setting chunk size as 500
        chunk_size = 500
        text_chunks = split_text_into_chunks(page, chunk_size)

        # Creating a DataFrame to store the chunks, page title and page metadata
        data = {'Title': [], 'Chunk Text': [], 'Metadata': []}

        for index, chunk in enumerate(text_chunks):
            data['Title'].append(page_no)
            data['Chunk Text'].append(chunk)
            # adding chunk no as part of metadata
            metadata['Chunk_No.'] = index
            data['Metadata'].append(metadata)

        return pd.DataFrame(data)

In [19]:
page_nos = insurance_pdfs_data["Page No."]

In [20]:
# creating a dataframe after calling process
all_dfs = []
for page_no in page_nos:
    df = process_page(page_no)
    if df is not None:
        all_dfs.append(df)

fixed_chunk_df = pd.concat(all_dfs, ignore_index=True)
fixed_chunk_df

Unnamed: 0,Title,Chunk Text,Metadata
0,Page 1,DOROTHEA GLAUSE S655 RHODE ISLAND JOHN DOE 01/...,"{'Page_No.': 'Page 1', 'Chunk_No.': 0}"
1,Page 2,This page left blank intentionally,"{'Page_No.': 'Page 2', 'Chunk_No.': 0}"
2,Page 3,POLICY RIDER GROUP INSURANCE POLICY NO: S655 C...,"{'Page_No.': 'Page 3', 'Chunk_No.': 2}"
3,Page 3,arrange for third party service providers (i.e...,"{'Page_No.': 'Page 3', 'Chunk_No.': 2}"
4,Page 3,the provision of such goods and/or services no...,"{'Page_No.': 'Page 3', 'Chunk_No.': 2}"
...,...,...,...
226,Page 62,"requested additional information, The Principa...","{'Page_No.': 'Page 62', 'Chunk_No.': 3}"
227,Page 62,may have the Member or Dependent whose loss is...,"{'Page_No.': 'Page 62', 'Chunk_No.': 3}"
228,Page 62,proof of loss has been filed and before the ap...,"{'Page_No.': 'Page 62', 'Chunk_No.': 3}"
229,Page 63,This page left blank intentionally,"{'Page_No.': 'Page 63', 'Chunk_No.': 0}"


#### Generating embeddings

In [21]:
# Install the sentence transformers library
!pip install -q -u sentence-transformers


Usage:   
  pip install [options] <requirement specifier> [package-index-options] ...
  pip install [options] -r <requirements file> [package-index-options] ...
  pip install [options] [-e] <vcs project url> ...
  pip install [options] [-e] <local project path> ...
  pip install [options] <archive url/path> ...

no such option: -u


In [22]:
# From sentence_transformers import SentenceTransformer
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction


In [23]:
# Load the embedding model
embedding_function = SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")


In [24]:
# Function to generate embeddings for text
def generate_embeddings(texts):
    embeddings = embedding_function(texts)
    return embeddings

In [25]:
# Function to generate embedding on dataframe
def generate_embeddings_on_df(df):
  df['Embeddings'] = df['Chunk Text'].apply(lambda x: generate_embeddings([x])[0])

In [26]:
# Create embeddings for 'Chunk Text' column on all three dataframes
generate_embeddings_on_df(fixed_chunk_df)

In [27]:
# print the dataframe
fixed_chunk_df

Unnamed: 0,Title,Chunk Text,Metadata,Embeddings
0,Page 1,DOROTHEA GLAUSE S655 RHODE ISLAND JOHN DOE 01/...,"{'Page_No.': 'Page 1', 'Chunk_No.': 0}","[-0.025921918, 0.047777485, 0.05585774, 0.0423..."
1,Page 2,This page left blank intentionally,"{'Page_No.': 'Page 2', 'Chunk_No.': 0}","[0.029118938, 0.06057411, 0.04641532, 0.037792..."
2,Page 3,POLICY RIDER GROUP INSURANCE POLICY NO: S655 C...,"{'Page_No.': 'Page 3', 'Chunk_No.': 2}","[-0.06453799, 0.043197103, -8.384172e-05, -0.0..."
3,Page 3,arrange for third party service providers (i.e...,"{'Page_No.': 'Page 3', 'Chunk_No.': 2}","[-0.10200984, -0.028467676, -0.02056504, -0.04..."
4,Page 3,the provision of such goods and/or services no...,"{'Page_No.': 'Page 3', 'Chunk_No.': 2}","[-0.09000838, 0.076582074, 0.0049276566, -0.08..."
...,...,...,...,...
226,Page 62,"requested additional information, The Principa...","{'Page_No.': 'Page 62', 'Chunk_No.': 3}","[-0.047671925, 0.11277697, 0.06906492, -0.0508..."
227,Page 62,may have the Member or Dependent whose loss is...,"{'Page_No.': 'Page 62', 'Chunk_No.': 3}","[-0.079419956, 0.14404444, 0.03187609, -0.0655..."
228,Page 62,proof of loss has been filed and before the ap...,"{'Page_No.': 'Page 62', 'Chunk_No.': 3}","[-0.14206006, 0.1236838, 0.120924726, -0.01142..."
229,Page 63,This page left blank intentionally,"{'Page_No.': 'Page 63', 'Chunk_No.': 0}","[0.029118938, 0.06057411, 0.04641532, 0.037792..."


#### Storing embeddings in ChromaDB

In [28]:
# install chromaDB
!pip install chromadb



In [29]:
# Define the path where chroma collections will be stored
chroma_data_path = 'ChromaDB_Data'

In [30]:
import chromadb

# Call PersistentClient()
client = chromadb.PersistentClient(path=chroma_data_path)

In [32]:
# Create a collection to store the embeddings.
client.delete_collection("insurance-collection")

collection = client.create_collection(
    name="insurance-collection",
    embedding_function=embedding_function
)

In [33]:
collection.add(
    embeddings = fixed_chunk_df['Embeddings'].to_list(),
    documents = fixed_chunk_df['Chunk Text'].to_list(),
    metadatas = fixed_chunk_df['Metadata'].to_list(),
    ids = [str(i) for i in range(0, len(fixed_chunk_df['Embeddings']))]
)

In [34]:
# get few of data by ids from collection
collection.get(
    ids = ['0','1','2'],
    include = ['embeddings', 'documents', 'metadatas']
)

{'ids': ['0', '1', '2'],
 'embeddings': array([[-2.59219185e-02,  4.77774851e-02,  5.58577403e-02, ...,
         -4.93265502e-02, -5.85114509e-02,  2.35519167e-02],
        [ 2.91189384e-02,  6.05741106e-02,  4.64153215e-02, ...,
          5.95401786e-02, -2.83837337e-02,  5.31934854e-03],
        [-6.45379871e-02,  4.31971028e-02, -8.38417182e-05, ...,
         -3.78734246e-02,  1.79674458e-02, -7.36602023e-03]]),
 'documents': ['DOROTHEA GLAUSE S655 RHODE ISLAND JOHN DOE 01/01/2014 711 HIGH STREET GEORGE RI 02903 GROUP POLICY FOR: RHODE ISLAND JOHN DOE ALL MEMBERS Group Member Life Insurance Print Date: 07/16/2014',
  'This page left blank intentionally',
  'POLICY RIDER GROUP INSURANCE POLICY NO: S655 COVERAGE: Life EMPLOYER: RHODE ISLAND JOHN DOE Effective on the later of the Date of Issue of this Group Policy or March 1, 2005, the following will apply to your Policy: From time to time The Principal may offer or provide certain employer groups who apply for coverage with The Princi

In [35]:
# create a cache collection
client.delete_collection("insurance-collection-cache")

cache_collection = client.create_collection(
    name="insurance-collection-cache",
    embedding_function=embedding_function
)

In [36]:
# peek few of elements from cache collection
cache_collection.peek()

{'ids': [],
 'embeddings': array([], dtype=float64),
 'documents': [],
 'uris': None,
 'included': ['metadatas', 'documents', 'embeddings'],
 'data': None,
 'metadatas': []}

#### Semantic search with cache

In [37]:
# Read the user query
query = input()

 what is the life insurance coverage for disability


In [38]:
# Searh the Cache collection first
# Query the collection against the user query and return the top 20 results
cache_results = cache_collection.query(
    query_texts=query,
    n_results=1
)

In [39]:
# get result from cache collection
cache_results

{'ids': [[]],
 'embeddings': None,
 'documents': [[]],
 'uris': None,
 'included': ['metadatas', 'documents', 'distances'],
 'data': None,
 'metadatas': [[]],
 'distances': [[]]}

In [40]:
# get result from main collection
results = collection.query(
query_texts=query,
n_results=10
)
print("Result size is : " + str(len(results.items())))
results.items()

Result size is : 8


dict_items([('ids', [['143', '171', '147', '151', '183', '146', '180', '85', '179', '142']]), ('embeddings', None), ('documents', [['Member Life Insurance or Coverage During Disability terminates under this Group Policy. This policy has been updated effective January 1, 2014 PART III - INDIVIDUAL REQUIREMENTS AND RIGHTS GC 6011 Section F - Individual Purchase Rights, Page 1', "Payment of benefits will be subject to the Beneficiary and Facility of Payment provisions of this PART IV, Section A. Article 6 - Member Life Insurance - Coverage During Disability A Member may be eligible to continue his or her Member Life and Member Accidental Death and Dismemberment Insurance and Dependent Life Insurance coverage during the Member's ADL Disability or Total Disability. a. Coverage Qualification To be qualified for Coverage During Disability, a Member must: (1) become ADL", 'any Accelerated Benefit payment as described in PART IV, Section A, Article 7. Article 2 - Dependent Life Insurance a. Ind

In [41]:
# Implementing Cache in Semantic Search

# Set a threshold for cache search
threshold = 0.2

ids = []
documents = []
distances = []
metadatas = []
results_df = pd.DataFrame()


# If the distance is greater than the threshold, then return the results from the main collection.

if cache_results['distances'][0] == [] or cache_results['distances'][0][0] > threshold:
      # Query the collection against the user query and return the top 10 results
      results = collection.query(
      query_texts=query,
      n_results=10
      )

      # Store the query in cache_collection as document w.r.t to ChromaDB so that it can be embedded and searched against later
      # Store retrieved text, ids, distances and metadatas in cache_collection as metadatas, so that they can be fetched easily if a query indeed matches to a query in cache
      Keys = []
      Values = []

      size = len(results.items())

      for key, val in results.items():
        if val is None:
          continue
        for i in range(size):
          Keys.append(str(key)+str(i))
          Values.append(str(val[0][i]))


      cache_collection.add(
          documents= [query],
          ids = [query],  # Or if you want to assign integers as IDs 0,1,2,.., then you can use "len(cache_results['documents'])" as will return the no. of queries currently in the cache and assign the next digit to the new query."
          metadatas = dict(zip(Keys, Values))
      )

      print("Not found in cache. Found in main collection.")

      result_dict = {'Metadatas': results['metadatas'][0], 'Documents': results['documents'][0], 'Distances': results['distances'][0], "IDs":results["ids"][0]}
      results_df = pd.DataFrame.from_dict(result_dict)
      results_df


# If the distance is, however, less than the threshold, you can return the results from cache

elif cache_results['distances'][0][0] <= threshold:
      cache_result_dict = cache_results['metadatas'][0][0]

      # Loop through each inner list and then through the dictionary
      for key, value in cache_result_dict.items():
          if 'ids' in key:
              ids.append(value)
          elif 'documents' in key:
              documents.append(value)
          elif 'distances' in key:
              distances.append(value)
          elif 'metadatas' in key:
              metadatas.append(value)

      print("Found in cache!")

      # Create a DataFrame
      results_df = pd.DataFrame({
        'IDs': ids,
        'Documents': documents,
        'Distances': distances,
        'Metadatas': metadatas
      })


Not found in cache. Found in main collection.


In [42]:
results_df

Unnamed: 0,Metadatas,Documents,Distances,IDs
0,"{'Chunk_No.': 4, 'Page_No.': 'Page 42'}",Member Life Insurance or Coverage During Disab...,0.356206,143
1,"{'Page_No.': 'Page 49', 'Chunk_No.': 4}",Payment of benefits will be subject to the Ben...,0.401376,171
2,"{'Page_No.': 'Page 43', 'Chunk_No.': 4}",any Accelerated Benefit payment as described i...,0.4301,147
3,"{'Chunk_No.': 4, 'Page_No.': 'Page 44'}",Dependent's Life Insurance terminates because ...,0.436482,151
4,"{'Page_No.': 'Page 51', 'Chunk_No.': 3}",disability that: (1) results from willful self...,0.44899,183
5,"{'Page_No.': 'Page 43', 'Chunk_No.': 4}",be the Coverage During Disability benefit in f...,0.454232,146
6,"{'Page_No.': 'Page 50', 'Chunk_No.': 5}",Total Disability began. Failure to give Writte...,0.455117,180
7,"{'Chunk_No.': 4, 'Page_No.': 'Page 28'}","terms of the Prior Policy, to have their premi...",0.456296,85
8,"{'Chunk_No.': 5, 'Page_No.': 'Page 50'}","Disability is in force, The Principal will pay...",0.462541,179
9,"{'Page_No.': 'Page 42', 'Chunk_No.': 4}","Premium Waiver Period as described in PART IV,...",0.466407,142


#### Re-ranking with cross encoder

In [44]:
from sentence_transformers import CrossEncoder

In [45]:
# Initialise the cross encoder model
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

In [46]:
# Input (query, response) pairs for each of the top 20 responses received from the semantic search to the cross encoder
# Generate the cross_encoder scores for these pairs

cross_inputs = [[query, response] for response in results_df['Documents']]
cross_rerank_scores = cross_encoder.predict(cross_inputs)

In [47]:
# pritn the cross rerank scores
cross_rerank_scores

array([ 3.053526  ,  4.810115  ,  0.5387703 ,  2.9317625 , -0.9036653 ,
        1.4724652 ,  0.26232353, -1.3536788 ,  2.446186  , -2.660254  ],
      dtype=float32)

In [48]:
results_df['Reranked_scores'] = cross_rerank_scores

In [49]:
results_df

Unnamed: 0,Metadatas,Documents,Distances,IDs,Reranked_scores
0,"{'Chunk_No.': 4, 'Page_No.': 'Page 42'}",Member Life Insurance or Coverage During Disab...,0.356206,143,3.053526
1,"{'Page_No.': 'Page 49', 'Chunk_No.': 4}",Payment of benefits will be subject to the Ben...,0.401376,171,4.810115
2,"{'Page_No.': 'Page 43', 'Chunk_No.': 4}",any Accelerated Benefit payment as described i...,0.4301,147,0.53877
3,"{'Chunk_No.': 4, 'Page_No.': 'Page 44'}",Dependent's Life Insurance terminates because ...,0.436482,151,2.931762
4,"{'Page_No.': 'Page 51', 'Chunk_No.': 3}",disability that: (1) results from willful self...,0.44899,183,-0.903665
5,"{'Page_No.': 'Page 43', 'Chunk_No.': 4}",be the Coverage During Disability benefit in f...,0.454232,146,1.472465
6,"{'Page_No.': 'Page 50', 'Chunk_No.': 5}",Total Disability began. Failure to give Writte...,0.455117,180,0.262324
7,"{'Chunk_No.': 4, 'Page_No.': 'Page 28'}","terms of the Prior Policy, to have their premi...",0.456296,85,-1.353679
8,"{'Chunk_No.': 5, 'Page_No.': 'Page 50'}","Disability is in force, The Principal will pay...",0.462541,179,2.446186
9,"{'Page_No.': 'Page 42', 'Chunk_No.': 4}","Premium Waiver Period as described in PART IV,...",0.466407,142,-2.660254


In [50]:
# Return the top 5 results from semantic search

top_3_semantic = results_df.sort_values(by='Distances')
top_3_semantic[:5]

Unnamed: 0,Metadatas,Documents,Distances,IDs,Reranked_scores
0,"{'Chunk_No.': 4, 'Page_No.': 'Page 42'}",Member Life Insurance or Coverage During Disab...,0.356206,143,3.053526
1,"{'Page_No.': 'Page 49', 'Chunk_No.': 4}",Payment of benefits will be subject to the Ben...,0.401376,171,4.810115
2,"{'Page_No.': 'Page 43', 'Chunk_No.': 4}",any Accelerated Benefit payment as described i...,0.4301,147,0.53877
3,"{'Chunk_No.': 4, 'Page_No.': 'Page 44'}",Dependent's Life Insurance terminates because ...,0.436482,151,2.931762
4,"{'Page_No.': 'Page 51', 'Chunk_No.': 3}",disability that: (1) results from willful self...,0.44899,183,-0.903665


In [51]:
# Return the top 5 results after reranking

top_3_rerank = results_df.sort_values(by='Reranked_scores', ascending=False)
top_3_rerank[:5]

Unnamed: 0,Metadatas,Documents,Distances,IDs,Reranked_scores
1,"{'Page_No.': 'Page 49', 'Chunk_No.': 4}",Payment of benefits will be subject to the Ben...,0.401376,171,4.810115
0,"{'Chunk_No.': 4, 'Page_No.': 'Page 42'}",Member Life Insurance or Coverage During Disab...,0.356206,143,3.053526
3,"{'Chunk_No.': 4, 'Page_No.': 'Page 44'}",Dependent's Life Insurance terminates because ...,0.436482,151,2.931762
8,"{'Chunk_No.': 5, 'Page_No.': 'Page 50'}","Disability is in force, The Principal will pay...",0.462541,179,2.446186
5,"{'Page_No.': 'Page 43', 'Chunk_No.': 4}",be the Coverage During Disability benefit in f...,0.454232,146,1.472465


In [52]:
top_3_RAG = top_3_rerank[["Documents", "Metadatas"]][:3]
top_3_RAG

Unnamed: 0,Documents,Metadatas
1,Payment of benefits will be subject to the Ben...,"{'Page_No.': 'Page 49', 'Chunk_No.': 4}"
0,Member Life Insurance or Coverage During Disab...,"{'Chunk_No.': 4, 'Page_No.': 'Page 42'}"
3,Dependent's Life Insurance terminates because ...,"{'Chunk_No.': 4, 'Page_No.': 'Page 44'}"


#### Retrieval Augmented Generation (RAG)

In [None]:
# Set the API key
!pip install openai
from openai import OpenAI

client = OpenAI(
    api_key="****************",
    base_url="https://api.perplexity.ai"
)



#### Testing queries

In [56]:
# Define the function to generate the response

def generate_response(query, results_df):
    """
    Generate a response using GPT-3.5's ChatCompletion based on the user query and retrieved information.
    """
    messages = [
                {"role": "system", "content":  "You are a helpful assistant in the insurance domain who can effectively answer user queries about insurance policies and documents."},
                {"role": "user", "content": f"""You are a helpful assistant in the insurance domain who can effectively answer user queries about insurance policies and documents.
                                                You have a question asked by the user in '{query}' and you have some search results from a corpus of insurance documents in the dataframe '{top_3_RAG}'. These search results are essentially one page of an insurance document that may be relevant to the user query.

                                                The column 'documents' inside this dataframe contains the actual text from the policy document and the column 'metadata' contains the policy name and source page. The text inside the document may also contain tables in the format of a list of lists where each of the nested lists indicates a row.

                                                Use the documents in '{top_3_RAG}' to answer the query '{query}'. Frame an informative answer and also, use the dataframe to return the relevant policy names and page numbers as citations.

                                                Follow the guidelines below when performing the task.
                                                1. Try to provide relevant/accurate numbers if available.
                                                2. You don’t have to necessarily use all the information in the dataframe. Only choose information that is relevant.
                                                3. If the document text has tables with relevant information, please reformat the table and return the final information in a tabular in format.
                                                3. Use the Metadatas columns in the dataframe to retrieve and cite the policy name(s) and page numbers(s) as citation.
                                                4. If you can't provide the complete answer, please also provide any information that will help the user to search specific sections in the relevant cited documents.
                                                5. You are a customer facing assistant, so do not provide any information on internal workings, just answer the query directly.

                                                The generated response should answer the query directly addressing the user and avoiding additional information. If you think that the query is not relevant to the document, reply that the query is irrelevant. Provide the final response as a well-formatted and easily readable text along with the citation. Provide your complete response first with all information, and then provide the citations.
                                                """},
              ]

    response = client.chat.completions.create(
        model="sonar-pro",
        messages=messages,
        temperature=0,
    )

    return response.choices[0].message.content.split('\n')

In [57]:
def search(query):

  # Set a threshold for cache search
  threshold = 0.2

  ids = []
  documents = []
  distances = []
  metadatas = []
  results_df = pd.DataFrame()

  # try to find from cache
  cache_results = cache_collection.query(
      query_texts=query,
      n_results=1
  )

  # If the distance is greater than the threshold, then return the results from the main collection.
  if cache_results['distances'][0] == [] or cache_results['distances'][0][0] > threshold:
        # Query the collection against the user query and return the top 10 results
        results = collection.query(
        query_texts=query,
        n_results=10
        )

        # Store the query in cache_collection as document w.r.t to ChromaDB so that it can be embedded and searched against later
        # Store retrieved text, ids, distances and metadatas in cache_collection as metadatas, so that they can be fetched easily if a query indeed matches to a query in cache
        Keys = []
        Values = []

        size = len(results.items())

        for key, val in results.items():
          if val is None:
            continue
          for i in range(size):
            Keys.append(str(key)+str(i))
            Values.append(str(val[0][i]))


        cache_collection.add(
            documents= [query],
            ids = [query],  # Or if you want to assign integers as IDs 0,1,2,.., then you can use "len(cache_results['documents'])" as will return the no. of queries currently in the cache and assign the next digit to the new query."
            metadatas = dict(zip(Keys, Values))
        )

        #print("Not found in cache. Found in main collection.")

        result_dict = {'Metadatas': results['metadatas'][0], 'Documents': results['documents'][0], 'Distances': results['distances'][0], "IDs":results["ids"][0]}
        results_df = pd.DataFrame.from_dict(result_dict)
        return results_df


  # If the distance is, however, less than the threshold, you can return the results from cache

  elif cache_results['distances'][0][0] <= threshold:
        cache_result_dict = cache_results['metadatas'][0][0]

        # Loop through each inner list and then through the dictionary
        for key, value in cache_result_dict.items():
            if 'ids' in key:
                ids.append(value)
            elif 'documents' in key:
                documents.append(value)
            elif 'distances' in key:
                distances.append(value)
            elif 'metadatas' in key:
                metadatas.append(value)

        #print("Found in cache!")

        # Create a DataFrame
        return pd.DataFrame({
          'IDs': ids,
          'Documents': documents,
          'Distances': distances,
          'Metadatas': metadatas
        })

In [58]:
def apply_cross_encoder(query, df):
  cross_inputs = [[query, response] for response in df['Documents']]
  cross_rerank_scores = cross_encoder.predict(cross_inputs)
  df['Reranked_scores'] = cross_rerank_scores
  return df

In [59]:
def get_topn(n, df):
  top_3_rerank = df.sort_values(by='Reranked_scores', ascending=False)
  return top_3_rerank[["Documents", "Metadatas"]][:n]

Query 1

In [60]:
query = 'what if i fail to pay premium?'
df = search(query)
df = apply_cross_encoder(query, df)
df = get_topn(3, df)
response = generate_response(query, df)
print("\n".join(response))

If you fail to pay your insurance premium, your coverage may be at risk. Most policies provide a **grace period**—typically around 30 days—during which you can make the overdue payment and keep your coverage active. If the premium remains unpaid after this grace period, your policy may lapse or terminate, and you will lose your insurance benefits. Payment of benefits is subject to the terms and conditions outlined in your policy, and coverage will not continue if premiums are not paid within the required timeframe. If your dependent’s life insurance terminates due to non-payment, coverage for dependents will also end. To avoid losing coverage, review your policy for specific grace period details and payment requirements[1][3].

**Citations:**
- Policy Document, Page 49, Chunk 4: Payment of benefits will be subject to the Benefit provisions and all other terms and conditions of the policy, including timely payment of premiums[1].
- Policy Document, Page 44, Chunk 4: Dependent's Life Ins

Query 2

In [None]:
query = 'what is the proof needed for showcasing ADL disability?'
df = search(query)
df = apply_cross_encoder(query, df)
df = get_topn(3, df)
response = generate_response(query, df)
print("\n".join(response))

Query 3

In [None]:
query = 'does this cover death due to not wearing seat belt?'
df = search(query)
df = apply_cross_encoder(query, df)
df = get_topn(3, df)
response = generate_response(query, df)
print("\n".join(response))