In [1]:
import os
from dotenv import load_dotenv
load_dotenv()
api_key = os.getenv('OPENAI_API_KEY')

In [2]:
import logging
import sys
from IPython.display import Markdown
from pathlib import Path
import PyPDF2

logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

In [3]:
import re
import pypdf
import spacy

def pdf_to_pages(file):
    "extract text (pages) from pdf file"
    pdf = pypdf.PdfReader(file)
    pages = [page.extract_text() for page in pdf.pages]
    return pages

# helpers
def remove_page_numbers(text):
    return re.sub(r'Page \d+ of \d+', '', text)


def find_eos_spacy(text):
    nlp = spacy.load('en_core_web_lg')
    doc = nlp(text)
    return [sent.end_char for sent in doc.sents]

def fix_text_problems(text):
    # Strip footer and watermark
    watermarks = [
        "SAMPLE", 
        "HO 00 03 10 00 Copyright, Insurance Services Office, Inc., 1999", 
        "HOMEOWNERS 3 – SPECIAL FORM", 
        "HOMEOWNERS", 
        "HO 00 03 10 00", 
        "Copyright, Insurance Services Office, Inc., 1999",
    ]

    for watermark in watermarks:
        text = text.replace(watermark, '')

    text = remove_page_numbers(text)

    # Merge hyphenated words
    text = re.sub(r'(\w+)-(\w+)', r'\1\2', text)
    # Add a space after a period or a colon if it's not there yet
    text = re.sub(r'([.:])([^ \n])', r'\1 \2', text)
    # Add a space after a closing parenthesis if it's not there yet
    text = re.sub(r'(\))(?=[^\s])', r'\1 ', text)
    # Add a space before an opening parenthesis if it's not there yet
    text = re.sub(r'(?<=[^\s])(\()', r' \1', text)
    # Normalize whitespaces, but keep newlines
    text = re.sub(r'[ \t]+', ' ', text).strip()

    return text


In [4]:
def split_pages_into_fragments(pages, frag_size):
	"split pages (list of texts) into smaller fragments (list of texts)"
	num_pages = len(pages)
	page_offset = [0]
	for p, page in enumerate(pages):
		page_offset += [page_offset[-1]+len(page)+1]

	if frag_size:
		text = ' '.join(pages)
		return text_to_fragments(text, frag_size, page_offset, num_pages)
	else:
		return pages

def text_to_fragments(text, size, page_offset, num_pages):
	"split single text into smaller fragments (list of texts)"
	if size and len(text)>size:
		out = []
		pos = 0
		page = 1
		p_off = page_offset.copy()[1:]
		eos = find_eos_spacy(text)
		if len(text) not in eos:
			eos += [len(text)+1]
		for i in range(len(eos)):
			if eos[i]-pos>size:
				text_fragment = f'PAGE {page} of {num_pages}: \n\n'+text[pos:eos[i]]
				out += [text_fragment]
				pos = eos[i]
				if eos[i]>p_off[0]:
					page += 1
					del p_off[0]
		text_fragment = f'PAGE {page} of {num_pages}: \n\n'+text[pos:eos[i]]
		out += [text_fragment]
		out = [x for x in out if x]
		return out
	else:
		return [text]

In [5]:
def build_document_dict(filepath, form_name, form_id, frag_size=0):

	pages = pdf_to_pages(filepath)
 
	for i in range(len(pages)):
		pages[i] = fix_text_problems(pages[i])
	texts = split_pages_into_fragments(pages, frag_size)
 
	out = {}

	out['frag_size'] = frag_size
	out['n_pages']   = len(pages)
	out['texts']     = texts
	out['pages']     = pages
	out['policy_name']  = form_name
	out['policy_id'] = form_id

	return out

In [6]:
index_dict = build_document_dict(
    filepath="./data/policy_docs/HO3_sample.pdf", 
    form_name="HO3 Policy",
    form_id="HO 00 03 10 00",
    frag_size=1000)

In [9]:
index_dict['n_pages']

22

In [7]:
print(index_dict['texts'][15])

PAGE 5 of 22: 

The $1,000 limit is the most we will pay inany one loss regardless of the number offallen trees. No more than $500 of this limitwill be paid for the removal of any one tree.
This coverage is additional insurance.
2. Reasonable Repairs
a. We will pay the reasonable cost incurred by
you for the necessary measures takensolely to protect covered property that isdamaged by a Peril Insured Against fromfurther damage. b. If the measures taken involve repair to
other damaged property, we will only pay ifthat property is covered under this policyand the damage is caused by a Peril Insured Against. This coverage does not:
(1) Increase the limit of liability that appliesto the covered property; or
(2) Relieve you of your duties, in case of aloss to covered property, described inB. 4. under Section I – Conditions.
3. Trees, Shrubs And Other Plants
We cover trees, shrubs, plants or lawns, on the"residence premises", for loss caused by thefollowing Perils Insured Against:
a. Fire or 

In [46]:
print(index_dict['pages'][0])

AGREEMENT
We will provide the insurance described in this policy
in return for the premium and compliance with allapplicable provisions of this policy.
DEFINITIONS
A. In this policy, "you" and "your" refer to the "named
insured" shown in the Declarations and the spouseif a resident of the same household. "We", "us"and "our" refer to the Company providing this insurance.
B. In addition, certain words and phrases are definedas follows:
1. "Aircraft Liability", "Hovercraft Liability", "Motor
Vehicle Liability" and "Watercraft Liability",subject to the provisions in b. below, mean the
following:
a. Liability for "bodily injury" or "property damage" arising out of the:
(1) Ownership of such vehicle or craft by an"insured";
(2) Maintenance, occupancy, operation,use, loading or unloading of such vehicle or craft by any person;
(3) Entrustment of such vehicle or craft byan "insured" to any person;
(4) Failure to supervise or negligent supervision of any person involving such vehicle or craft b

In [49]:
from llama_index import Document, VectorStoreIndex
from llama_index.data_structs.node import Node

def create_index_from_dict(document_dict):
    # Create a list of complete pages and custom parsed chunks
    page_list = document_dict['pages']
    node_list = document_dict['texts']
    # Create llama_index Node objects
    nodes = [Node(t) for t in node_list]
    # Add metadata to each node
    for i in range(len(node_list)):
        nodes[i].extra_info = {
            'Node Number': i+1,
            'Policy: ': index_dict['policy_name'],
        }
    # Create index for page level documents
    documents = [Document(t) for t in page_list]
    # Add metadate to pages
    for i in range(len(page_list)):
        documents[i].extra_info = {
            'Page: ': i+1,
            'Policy: ': index_dict['policy_name'],
        }
    # Build index structure
    policy_index = VectorStoreIndex(nodes)
    
    for doc in documents:
        policy_index.insert(doc)
        
    return policy_index

In [50]:
ho3_policy_index = create_index_from_dict(index_dict)

DEBUG:openai:message='Request to OpenAI API' method=post path=https://api.openai.com/v1/embeddings
message='Request to OpenAI API' method=post path=https://api.openai.com/v1/embeddings
message='Request to OpenAI API' method=post path=https://api.openai.com/v1/embeddings
DEBUG:openai:api_version=None data='{"input": ["Node Number: 1 Policy: : HO3 Policy  PAGE 1 of 22:   AGREEMENT We will provide the insurance described in this policy in return for the premium and compliance with allapplicable provisions of this policy. DEFINITIONS A. In this policy, \\"you\\" and \\"your\\" refer to the \\"named insured\\" shown in the Declarations and the spouseif a resident of the same household. \\"We\\", \\"us\\"and \\"our\\" refer to the Company providing this insurance. B. In addition, certain words and phrases are definedas follows: 1. \\"Aircraft Liability\\", \\"Hovercraft Liability\\", \\"Motor Vehicle Liability\\" and \\"Watercraft Liability\\",subject to the provisions in b. below, mean the 

DEBUG:urllib3.connectionpool:https://api.openai.com:443 "POST /v1/embeddings HTTP/1.1" 200 None
https://api.openai.com:443 "POST /v1/embeddings HTTP/1.1" 200 None
https://api.openai.com:443 "POST /v1/embeddings HTTP/1.1" 200 None
DEBUG:openai:message='OpenAI API response' path=https://api.openai.com/v1/embeddings processing_ms=49 request_id=ebeff45a68ca4058c920614cf863a3f7 response_code=200
message='OpenAI API response' path=https://api.openai.com/v1/embeddings processing_ms=49 request_id=ebeff45a68ca4058c920614cf863a3f7 response_code=200
message='OpenAI API response' path=https://api.openai.com/v1/embeddings processing_ms=49 request_id=ebeff45a68ca4058c920614cf863a3f7 response_code=200
DEBUG:openai:message='Request to OpenAI API' method=post path=https://api.openai.com/v1/embeddings
message='Request to OpenAI API' method=post path=https://api.openai.com/v1/embeddings
message='Request to OpenAI API' method=post path=https://api.openai.com/v1/embeddings
DEBUG:openai:api_version=None dat

In [60]:
ho3_policy_index.index_id

'ho3_policy_document_index'

In [59]:
ho3_policy_index.set_index_id("ho3_policy_document_index")

In [61]:
ho3_policy_index.storage_context.persist(persist_dir="./ho3_custom_parse_index")

DEBUG:fsspec.local:open file: c:/Users/pdoub/Desktop/python_projects/liberty-gpt/ho3_custom_parse_index/docstore.json
open file: c:/Users/pdoub/Desktop/python_projects/liberty-gpt/ho3_custom_parse_index/docstore.json
open file: c:/Users/pdoub/Desktop/python_projects/liberty-gpt/ho3_custom_parse_index/docstore.json
DEBUG:fsspec.local:open file: c:/Users/pdoub/Desktop/python_projects/liberty-gpt/ho3_custom_parse_index/index_store.json
open file: c:/Users/pdoub/Desktop/python_projects/liberty-gpt/ho3_custom_parse_index/index_store.json
open file: c:/Users/pdoub/Desktop/python_projects/liberty-gpt/ho3_custom_parse_index/index_store.json
DEBUG:fsspec.local:open file: c:/Users/pdoub/Desktop/python_projects/liberty-gpt/ho3_custom_parse_index/vector_store.json
open file: c:/Users/pdoub/Desktop/python_projects/liberty-gpt/ho3_custom_parse_index/vector_store.json
open file: c:/Users/pdoub/Desktop/python_projects/liberty-gpt/ho3_custom_parse_index/vector_store.json
DEBUG:fsspec.local:open file: c

In [15]:
page_list = index_dict['pages']
node_list = index_dict['texts']

In [16]:
len(page_list), len(node_list)

(22, 69)

In [17]:
nodes = [Node(t) for t in node_list]

In [18]:
for i in range(len(node_list)):
    nodes[i].extra_info = {
        'Policy': index_dict['policy_type'],
    }

In [20]:
documents = [Document(t) for t in page_list]

In [21]:
for i in range(len(page_list)):
    documents[i].extra_info = {
        'Page': i+1,
        'Policy': index_dict['policy_type'],
    }

In [26]:
print(nodes[0].text)

PAGE 1 of 22: 

AGREEMENT
We will provide the insurance described in this policy
in return for the premium and compliance with allapplicable provisions of this policy.
DEFINITIONS
A. In this policy, "you" and "your" refer to the "named
insured" shown in the Declarations and the spouseif a resident of the same household. "We", "us"and "our" refer to the Company providing this insurance.
B. In addition, certain words and phrases are definedas follows:
1. "Aircraft Liability", "Hovercraft Liability", "Motor
Vehicle Liability" and "Watercraft Liability",subject to the provisions in b. below, mean the
following:
a. Liability for "bodily injury" or "property damage" arising out of the:
(1) Ownership of such vehicle or craft by an"insured";
(2) Maintenance, occupancy, operation,use, loading or unloading of such vehicle or craft by any person;
(3) Entrustment of such vehicle or craft byan "insured" to any person;
(4) Failure to supervise or negligent supervision of any person involving such ve

In [23]:
print(f"{documents[3].extra_info}")

{'Page': 4, 'Policy': 'HOMEOWNERS 3 - SPECIAL FORM'}


In [27]:
print(nodes[1].text)

PAGE 1 of 22: 

b. For the purpose of this definition:
(1) Aircraft means any contrivance used ordesigned for flight except model orhobby aircraft not used or designed tocarry people or cargo;
(2) Hovercraft means a selfpropelled motorized ground effect vehicle and includes, but is not limited to, flarecraftand air cushion vehicles;
(3) Watercraft means a craft principallydesigned to be propelled on or in waterby wind, engine power or electric motor;and
(4) Motor vehicle means a "motor vehicle"as defined in 7. below. 2. "Bodily injury" means bodily harm, sickness ordisease, including required care, loss of services and death that results.
3. "Business" means:
a. A trade, profession or occupation engagedin on a fulltime, parttime or occasional basis; or
b. Any other activity engaged in for money orother compensation, except the following:
(1) One or more activities, not described in (2) through (4) below, for which no "in-
sured" receives more than $2,000 intotal compensation for the 12

In [28]:
from llama_index import VectorStoreIndex

ho3_index = VectorStoreIndex(nodes)

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /gpt2/resolve/main/tokenizer_config.json HTTP/1.1" 404 0
https://huggingface.co:443 "HEAD /gpt2/resolve/main/tokenizer_config.json HTTP/1.1" 404 0
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /gpt2/resolve/main/vocab.json HTTP/1.1" 200 0
https://huggingface.co:443 "HEAD /gpt2/resolve/main/vocab.json HTTP/1.1" 200 0
DEBUG:openai:message='Request to OpenAI API' method=post path=https://api.openai.com/v1/embeddings
message='Request to OpenAI API' method=post path=https://api.openai.com/v1/embeddings
DEBUG:openai:api_version=None data='{"input": ["Node: 1 of 69 Policy: HOMEOWNERS 3 - SPECIAL FORM  PAGE 1 of 22:   AGREEMENT We will provide the insurance described in this policy in return for the premium and compliance with allapplicable provisions of this policy. DEFINITIONS A. 

In [29]:
for doc in documents:
    ho3_index.insert(doc)

DEBUG:llama_index.node_parser.node_utils:> Adding chunk: AGREEMENT
We will provide the insurance describ...
> Adding chunk: AGREEMENT
We will provide the insurance describ...
DEBUG:openai:message='Request to OpenAI API' method=post path=https://api.openai.com/v1/embeddings
message='Request to OpenAI API' method=post path=https://api.openai.com/v1/embeddings
DEBUG:openai:api_version=None data='{"input": ["Page: 1 Policy: HOMEOWNERS 3 - SPECIAL FORM  AGREEMENT We will provide the insurance described in this policy in return for the premium and compliance with allapplicable provisions of this policy. DEFINITIONS A. In this policy, \\"you\\" and \\"your\\" refer to the \\"named insured\\" shown in the Declarations and the spouseif a resident of the same household. \\"We\\", \\"us\\"and \\"our\\" refer to the Company providing this insurance. B. In addition, certain words and phrases are definedas follows: 1. \\"Aircraft Liability\\", \\"Hovercraft Liability\\", \\"Motor Vehicle Liability\\

In [30]:
from llama_index import LLMPredictor, VectorStoreIndex, ServiceContext

llm = get_llm(model_temperature=0)

llm_predictor = LLMPredictor(llm=llm)

service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor)

In [31]:
from llama_index import set_global_service_context
set_global_service_context(service_context)

In [32]:
ho3_index.storage_context.persist(persist_dir="./storage_v2/")

DEBUG:fsspec.local:open file: c:/Users/pdoub/Desktop/python_projects/liberty-gpt/storage_v2/docstore.json
open file: c:/Users/pdoub/Desktop/python_projects/liberty-gpt/storage_v2/docstore.json
DEBUG:fsspec.local:open file: c:/Users/pdoub/Desktop/python_projects/liberty-gpt/storage_v2/index_store.json
open file: c:/Users/pdoub/Desktop/python_projects/liberty-gpt/storage_v2/index_store.json
DEBUG:fsspec.local:open file: c:/Users/pdoub/Desktop/python_projects/liberty-gpt/storage_v2/vector_store.json
open file: c:/Users/pdoub/Desktop/python_projects/liberty-gpt/storage_v2/vector_store.json
DEBUG:fsspec.local:open file: c:/Users/pdoub/Desktop/python_projects/liberty-gpt/storage_v2/graph_store.json
open file: c:/Users/pdoub/Desktop/python_projects/liberty-gpt/storage_v2/graph_store.json


In [26]:
from llama_index import StorageContext, load_index_from_storage

storage_context = StorageContext.from_defaults(persist_dir="./storage_v2")

ho3_index_ = load_index_from_storage(storage_context)

DEBUG:llama_index.storage.kvstore.simple_kvstore:Loading llama_index.storage.kvstore.simple_kvstore from ./storage_v2\docstore.json.
Loading llama_index.storage.kvstore.simple_kvstore from ./storage_v2\docstore.json.
DEBUG:fsspec.local:open file: c:/Users/pdoub/Desktop/python_projects/liberty-gpt/storage_v2/docstore.json
open file: c:/Users/pdoub/Desktop/python_projects/liberty-gpt/storage_v2/docstore.json
DEBUG:llama_index.storage.kvstore.simple_kvstore:Loading llama_index.storage.kvstore.simple_kvstore from ./storage_v2\index_store.json.
Loading llama_index.storage.kvstore.simple_kvstore from ./storage_v2\index_store.json.
DEBUG:fsspec.local:open file: c:/Users/pdoub/Desktop/python_projects/liberty-gpt/storage_v2/index_store.json
open file: c:/Users/pdoub/Desktop/python_projects/liberty-gpt/storage_v2/index_store.json
DEBUG:llama_index.vector_stores.simple:Loading llama_index.vector_stores.simple from ./storage_v2\vector_store.json.
Loading llama_index.vector_stores.simple from ./sto

In [27]:
ho3_index_.index_id

'e6fae720-b9d6-4ba6-9ed4-8cc552ddf986'

In [4]:
query_engine = ho3_index_.as_query_engine(
    similarity_top_k=3,
    response_mode="tree_summarize",
)
response = query_engine.query(
    "What are the conditions for water damage to be covered under the policy?",
)

Token indices sequence length is longer than the specified maximum sequence length for this model (1468 > 1024). Running this sequence through the model will result in indexing errors


In [9]:
Markdown(f"{response}")


Under the policy, water damage is covered if it is caused by any of the following perils: Fire or Lightning, Windstorm or Hail, Explosion, Riot or Civil Commotion, Aircraft, Vehicles, Smoke, Vandalism or Malicious Mischief, Theft, Falling Objects, and Weight of Ice, Snow or Sleet. Additionally, water damage resulting from an accidental discharge or overflow of water or steam from within a storm drain, water, steam, or sewer pipe off the "residence premises" or from a plumbing, heating, air conditioning, or automatic fire protective sprinkler system or household appliance on the "residence premises" is also covered.

In [21]:
response.source_nodes[0].node.text

'PAGE 10 of 22: \n\nWe do not cover loss to the system or appliance from which this water or steam escaped.\nFor purposes of this provision, a plumbing\nsystem or household appliance does notinclude a sump, sump pump or relatedequipment or a roof drain, gutter, downspout or similar fixtures or equipment.\nSection I – Exclusion A. 3. Water Damage,\nParagraphs a. and c. that apply to surface\nwater and water below the surface of theground do not apply to loss by water coveredunder c. (5) and (6) above.\nUnder 2. b. and c. above, any ensuing loss to\nproperty described in Coverages A and B not\nprecluded by any other provision in this policyis covered.\nB. Coverage C – Personal Property\nWe insure for direct physical loss to the propertydescribed in Coverage C caused by any of the\nfollowing perils unless the loss is excluded in Section I – Exclusions.\n1. Fire Or Lightning\n2. Windstorm Or Hail\nThis peril includes loss to watercraft of all types\nand their trailers, furnishings, equipme

In [25]:
Markdown(f"{response.source_nodes[0].node.text}")

PAGE 10 of 22: 

We do not cover loss to the system or appliance from which this water or steam escaped.
For purposes of this provision, a plumbing
system or household appliance does notinclude a sump, sump pump or relatedequipment or a roof drain, gutter, downspout or similar fixtures or equipment.
Section I – Exclusion A. 3. Water Damage,
Paragraphs a. and c. that apply to surface
water and water below the surface of theground do not apply to loss by water coveredunder c. (5) and (6) above.
Under 2. b. and c. above, any ensuing loss to
property described in Coverages A and B not
precluded by any other provision in this policyis covered.
B. Coverage C – Personal Property
We insure for direct physical loss to the propertydescribed in Coverage C caused by any of the
following perils unless the loss is excluded in Section I – Exclusions.
1. Fire Or Lightning
2. Windstorm Or Hail
This peril includes loss to watercraft of all types
and their trailers, furnishings, equipment, and
outboard engines or motors, only while inside afully enclosed building.


In [10]:
Markdown(f"{response.source_nodes[0].node.get_text()}")

Node: 28 of 69
Policy: HOMEOWNERS 3 - SPECIAL FORM

PAGE 10 of 22: 

We do not cover loss to the system or appliance from which this water or steam escaped.
For purposes of this provision, a plumbing
system or household appliance does notinclude a sump, sump pump or relatedequipment or a roof drain, gutter, downspout or similar fixtures or equipment.
Section I – Exclusion A. 3. Water Damage,
Paragraphs a. and c. that apply to surface
water and water below the surface of theground do not apply to loss by water coveredunder c. (5) and (6) above.
Under 2. b. and c. above, any ensuing loss to
property described in Coverages A and B not
precluded by any other provision in this policyis covered.
B. Coverage C – Personal Property
We insure for direct physical loss to the propertydescribed in Coverage C caused by any of the
following perils unless the loss is excluded in Section I – Exclusions.
1. Fire Or Lightning
2. Windstorm Or Hail
This peril includes loss to watercraft of all types
and their trailers, furnishings, equipment, and
outboard engines or motors, only while inside afully enclosed building.
