In [1]:
from llama_index import SimpleDirectoryReader
from llama_index.schema import Document
from llama_index.langchain_helpers.text_splitter import SentenceSplitter

In [2]:
import re

# Pre-compile regex patterns
PAGE_NUM_PATTERN = re.compile(r"Page \d+ of \d+")
HYPHEN_WORD_PATTERN = re.compile(r"(\w+)-(\w+)")
PERIOD_COLON_PATTERN = re.compile(r"([.:])([^ \n])")
CLOSE_PAREN_PATTERN = re.compile(r"(\))(?=[^\s])")
OPEN_PAREN_PATTERN = re.compile(r"(?<=[^\s])(\()")
WHITESPACE_PATTERN = re.compile(r"[ \t]+")


def remove_page_numbers(text):
    return re.sub(r"Page \d+ of \d+", "", text)


def remove_multiple_newlines(page_md):
    page_md = re.sub(r"\n\s*\n", "\n\n", page_md)
    return page_md


def fix_page(text):
    """
    Fixes common text problems in the given text.

    Args:
        text (str): The text to fix.

    Returns:
        str: The fixed text.
    """
    watermarks = [
        "SAMPLE",
        "HO 00 03 10 00 Copyright, Insurance Services Office, Inc., 1999",
        "HOMEOWNERS 3 – SPECIAL FORM",
        "HOMEOWNERS",
        "HO 00 03 10 00",
        "Copyright, Insurance Services Office, Inc., 1999",
    ]

    for watermark in watermarks:
        text = text.replace(watermark, "")

    text = remove_page_numbers(text)

    text = re.sub(HYPHEN_WORD_PATTERN, r"\1\2", text)
    text = re.sub(PERIOD_COLON_PATTERN, r"\1 \2", text)
    text = re.sub(CLOSE_PAREN_PATTERN, r"\1 ", text)
    text = re.sub(OPEN_PAREN_PATTERN, r" \1", text)
    text = re.sub(WHITESPACE_PATTERN, " ", text).strip()

    return text

In [3]:
documents_ho3 = SimpleDirectoryReader(input_files=['./data//HO3_sample.pdf']).load_data()

In [4]:
len(documents_ho3)

22

In [5]:
print(documents_ho3[0].text)

HOMEOWNERS
HO 00 03 10 00
HO 00 03 10 00 Copyright, Insurance Services Office, Inc., 1999 Page 1 of 22HOMEOWNERS 3 – SPECIAL FORM
AGREEMENT
We will provide the insurance described in this policy
in return for the premium and compliance with allapplicable provisions of this policy.
DEFINITIONS
A.In this policy, "you" and "your" refer to the "named
insured" shown in the Declarations and the spouseif a resident of the same household. "We", "us"and "our" refer to the Company providing this in-surance.
B.In addition, certain words and phrases are definedas follows:
1."Aircraft Liability", "Hovercraft Liability", "Motor
Vehicle Liability" and "Watercraft Liability",subject to the provisions in b. below, mean the
following:
a.Liability for "bodily injury" or "property dam-age" arising out of the:
(1)Ownership of such vehicle or craft by an"insured";
(2)Maintenance, occupancy, operation,use, loading or unloading of such vehi-cle or craft by any person;
(3)Entrustment of such vehicle or craft b

In [6]:
for i, _ in enumerate(documents_ho3):
    documents_ho3[i].text = fix_page(documents_ho3[i].text)

In [7]:
print(documents_ho3[0].metadata)

{'page_label': '1', 'file_name': 'HO3_sample.pdf'}


In [8]:
long_string = "".join(documents_ho3[i].text for i in range(len(documents_ho3)))
print(long_string[:2000])

AGREEMENT
We will provide the insurance described in this policy
in return for the premium and compliance with allapplicable provisions of this policy.
DEFINITIONS
A. In this policy, "you" and "your" refer to the "named
insured" shown in the Declarations and the spouseif a resident of the same household. "We", "us"and "our" refer to the Company providing this insurance.
B. In addition, certain words and phrases are definedas follows:
1. "Aircraft Liability", "Hovercraft Liability", "Motor
Vehicle Liability" and "Watercraft Liability",subject to the provisions in b. below, mean the
following:
a. Liability for "bodily injury" or "property damage" arising out of the:
(1) Ownership of such vehicle or craft by an"insured";
(2) Maintenance, occupancy, operation,use, loading or unloading of such vehicle or craft by any person;
(3) Entrustment of such vehicle or craft byan "insured" to any person;
(4) Failure to supervise or negligent supervision of any person involving such vehicle or craft b

In [9]:
ho3_document = [Document(text=long_string,
                        id_="HO3_sample.pdf",
                        metadata={"Document Name": "HO3 Sample Policy", 
                                  "Category": "Homeowner's Insurance"})]

In [10]:
ho3_document[0].get_content()

'AGREEMENT\nWe will provide the insurance described in this policy\nin return for the premium and compliance with allapplicable provisions of this policy.\nDEFINITIONS\nA. In this policy, "you" and "your" refer to the "named\ninsured" shown in the Declarations and the spouseif a resident of the same household. "We", "us"and "our" refer to the Company providing this insurance.\nB. In addition, certain words and phrases are definedas follows:\n1. "Aircraft Liability", "Hovercraft Liability", "Motor\nVehicle Liability" and "Watercraft Liability",subject to the provisions in b. below, mean the\nfollowing:\na. Liability for "bodily injury" or "property damage" arising out of the:\n(1) Ownership of such vehicle or craft by an"insured";\n(2) Maintenance, occupancy, operation,use, loading or unloading of such vehicle or craft by any person;\n(3) Entrustment of such vehicle or craft byan "insured" to any person;\n(4) Failure to supervise or negligent supervision of any person involving such veh

In [11]:
from llama_index.node_parser import SimpleNodeParser
from llama_index.node_parser.extractors import (
    MetadataExtractor,
    SummaryExtractor,
    QuestionsAnsweredExtractor,
    KeywordExtractor,
    MetadataFeatureExtractor,
)

text_splitter = SentenceSplitter(
  chunk_size=512,
  chunk_overlap=5,
)


In [12]:
from llama_index.node_parser import SimpleNodeParser
from llama_index.node_parser.extractors import (
    MetadataExtractor,
    SummaryExtractor,
    KeywordExtractor,
)


metadata_extractor = MetadataExtractor(
    extractors=[
        SummaryExtractor(summaries=["self"]),
        KeywordExtractor(keywords=5),
    ],
)

node_parser = SimpleNodeParser(
    text_splitter=text_splitter,
    metadata_extractor=metadata_extractor,
)

In [13]:
ho3_nodes = node_parser.get_nodes_from_documents(ho3_document)

In [14]:
len(ho3_nodes)

39

In [15]:
def identify_original_documents(documents, chunks):
    """
    This function identifies the original document(s) each chunk came from.
    Args:
    documents: A list of original documents.
    chunks: A list of text chunks.
    Returns:
    chunks: The provided chunks with appended page info.
    """
    # Combine all documents into a single string and track their ranges.
    long_string = ""
    pages_dict = {}
    for i, doc in enumerate(documents):
        start = len(long_string)
        long_string += doc.text
        end = len(long_string)
        pages_dict[i] = (start, end)

    # Identify the original document(s) for each chunk.
    for i, chunk in enumerate(chunks):
        chunk_start = long_string.find(chunk.text)
        chunk_end = chunk_start + len(chunk.text)
        pages = []
        for doc_number, (doc_start, doc_end) in pages_dict.items():
            if (chunk_start >= doc_start and chunk_start < doc_end) or (chunk_end > doc_start and chunk_end <= doc_end):
                pages.append(doc_number + 1)
        # Update existing metadata dictionary
        chunk.metadata.update({'pages': ', '.join(map(str, pages))})

    return chunks


In [16]:
# Running the function
# replace "documents_ho3" and "chunks_ho3" with your actual variables
test = identify_original_documents(documents_ho3, ho3_nodes)

In [17]:
from llama_index.schema import MetadataMode

print(
    "LLM sees:\n",
    test[0].get_content(metadata_mode=MetadataMode.LLM),
)

LLM sees:
 [Excerpt from document]
Document Name: HO3 Sample Policy
Category: Homeowner's Insurance
section_summary: 
This section outlines the agreement between the company providing the insurance and the named insured, as well as defining certain words and phrases used in the policy. The agreement states that the company will provide the insurance in return for the premium and compliance with the policy. The definitions include Aircraft Liability, Hovercraft Liability, Motor Vehicle Liability, and Watercraft Liability, as well as Bodily Injury, Business, and Motor Vehicle.
excerpt_keywords:  Insurance, Aircraft Liability, Hovercraft Liability, Motor Vehicle Liability, Watercraft Liability
pages: 1
Excerpt:
-----
AGREEMENT
We will provide the insurance described in this policy
in return for the premium and compliance with allapplicable provisions of this policy.
DEFINITIONS
A. In this policy, "you" and "your" refer to the "named
insured" shown in the Declarations and the spouseif a re

In [32]:
print(
    "Embeddings model sees:\n",
    test[0].get_content(metadata_mode=MetadataMode.EMBED)),

Embeddings model sees:
 [Excerpt from document]
Document Name: HO3 Sample Policy
Category: Homeowner's Insurance
section_summary: 
This section outlines the agreement between the company providing the insurance and the named insured, as well as defining certain words and phrases used in the policy. The agreement states that the company will provide the insurance in return for the premium and compliance with the policy. The definitions include Aircraft Liability, Hovercraft Liability, Motor Vehicle Liability, and Watercraft Liability, as well as Bodily Injury, Business, and Motor Vehicle.
excerpt_keywords:  Insurance, Aircraft Liability, Hovercraft Liability, Motor Vehicle Liability, Watercraft Liability
pages: 1
Excerpt:
-----
AGREEMENT
We will provide the insurance described in this policy
in return for the premium and compliance with allapplicable provisions of this policy.
DEFINITIONS
A. In this policy, "you" and "your" refer to the "named
insured" shown in the Declarations and the 

(None,)

In [18]:
print(test[3].get_metadata_str())

Document Name: HO3 Sample Policy
Category: Homeowner's Insurance
section_summary: 
This section outlines the coverage provided by the policy for the dwelling, other structures, and personal property. It specifies the limits of liability for each coverage, and outlines what is and is not covered. It also outlines the deductible provision that applies to all losses.
excerpt_keywords: 
Dwelling, Other Structures, Residence Premises, Deductible, Personal Property
pages: 2, 3


In [19]:
print(test[3].get_content())

[Excerpt from document]

Excerpt:
-----
The one family dwelling where you reside;
b. The two, three or four family dwelling whereyou reside in at least one of the family units;or
c. That part of any other building where youreside;
and which is shown as the "residence premises" in the Declarations.
"Residence premises" also includes other
structures and grounds at that location.DEDUCTIBLE
Unless otherwise noted in this policy, the following
deductible provision applies:
Subject to the policy limits that apply, we will pay only
that part of the total of all loss payable under Section I
that exceeds the deductible amount shown in theDeclarations.
SECTION I – PROPERTY COVERAGES
A. Coverage A – Dwelling
1. We cover:
a. The dwelling on the "residence premises"
shown in the Declarations, including structures attached to the dwelling; and
b. Materials and supplies located on or next tothe "residence premises" used to construct,alter or repair the dwelling or other structures on the "residence 

In [20]:
len(test), len(ho3_nodes)

(39, 39)

In [21]:
print(test[13].get_content())

[Excerpt from document]

Excerpt:
-----
We do not cover:
(1) The loss in value to any covered building or other structure due to the requirements of any ordinance or law; or
(2) The costs to comply with any ordinanceor law which requires any "insured" orothers to test for, monitor, clean up, remove, contain, treat, detoxify or neutralize, or in any way respond to, or assess the effects of, pollutants in or onany covered building or other structure.
Pollutants means any solid, liquid,gaseous or thermal irritant or contaminant, including smoke, vapor, soot,fumes, acids, alkalis, chemicals andwaste. Waste includes materials to berecycled, reconditioned or reclaimed.
This coverage is additional insurance.
12. Grave Markers
We will pay up to $5,000 for grave markers, including mausoleums, on or away from the"residence premises" for loss caused by a PerilInsured Against under Coverage C.
This coverage does not increase the limits ofliability that apply to the damaged coveredproperty.
SECTION

In [22]:
print(test[13].get_metadata_str())

Document Name: HO3 Sample Policy
Category: Homeowner's Insurance
section_summary: 
This section outlines the coverage provided by the insurance policy for dwelling and other structures, including grave markers. It states that the policy covers direct physical loss to property, but excludes losses caused by collapse, freezing of plumbing, heating, air conditioning, or automatic fire protective sprinkler systems, or household appliances, and losses due to excluded perils. It also states that the policy will pay up to $5,000 for grave markers.
excerpt_keywords:  insurance, ordinance, law, pollutants, grave markers
pages: 8, 9


In [23]:
print(documents_ho3[8].get_content())

(b) Shut off the water supply and drain
all systems and appliances of water.
However, if the building is protected by
an automatic fire protective sprinklersystem, you must use reasonable careto continue the water supply and maintain heat in the building for coverage toapply.
For purposes of this provision a plumb-
ing system or household appliance doesnot include a sump, sump pump or related equipment or a roof drain, gutter,downspout or similar fixtures or equipment;
(2) Freezing, thawing, pressure or weight ofwater or ice, whether driven by wind ornot, to a:
(a) Fence, pavement, patio or swimmingpool;
(b) Footing, foundation, bulkhead, wall,or any other structure or device thatsupports all or part of a building, orother structure;
(c) Retaining wall or bulkhead that doesnot support all or part of a building orother structure; or
(d) Pier, wharf or dock;
(3) Theft in or to a dwelling under construction, or of materials and supplies for usein the construction until the dwelling isfini

In [24]:
print(documents_ho3[9].get_content())

(ii) Plumbing, heating, air conditioning or
automatic fire protective sprinkler sys-
tem or household appliance on the"residence premises". This includes thecost to tear out and replace any part of abuilding, or other structure, on the "residence premises", but only when necessary to repair the system or appliance. However, such tear out and replacement coverage only applies to otherstructures if the water or steam causesactual damage to a building on the"residence premises".
We do not cover loss to the system or appliance from which this water or steam escaped.
For purposes of this provision, a plumbing
system or household appliance does notinclude a sump, sump pump or relatedequipment or a roof drain, gutter, downspout or similar fixtures or equipment.
Section I – Exclusion A. 3. Water Damage,
Paragraphs a. and c. that apply to surface
water and water below the surface of theground do not apply to loss by water coveredunder c. (5) and (6) above.
Under 2. b. and c. above, any ensuing 

In [25]:
from llama_index import VectorStoreIndex, StorageContext

storage_context = StorageContext.from_defaults()

index = VectorStoreIndex(
    ho3_nodes,
    storage_context=storage_context)

In [26]:
index.set_index_id = "ho3_sample_metadatas"
storage_context.persist(persist_dir="./ho3_metadatas_index")

In [27]:
qe = index.as_query_engine()

In [28]:
res = qe.query("What are the conditions for water damage to be covered by the policy?")

In [29]:
from IPython.display import Markdown

In [30]:
Markdown(f'{res}')


Water damage is covered by the policy if it is caused by an accidental discharge or overflow of water or steam from within a storm drain, plumbing, heating, air conditioning or automatic fire protective sprinkler system or household appliance on the "residence premises". Additionally, any ensuing loss to property described in Coverages A and B not precluded by any other provision in the policy is covered.

In [31]:
print(res.source_nodes[0].node.get_metadata_str())

Document Name: HO3 Sample Policy
Category: Homeowner's Insurance
section_summary: 
This section outlines the exclusions from coverage under the policy, including earth movement, water damage, power failure, neglect, and war. It also explains the circumstances under which direct loss by fire, explosion, or theft resulting from water damage is covered.
excerpt_keywords: 
Insurance, Exclusion, Earthquake, Flood, Power Failure
pages: 11, 12
