In [3]:
from llama_index import SimpleDirectoryReader

In [4]:
documents_ho3 = SimpleDirectoryReader(input_files=['./data/policy_docs/HO3_sample.pdf']).load_data()

In [6]:
documents_ho3[0].text

'HOMEOWNERS\nHO 00 03 10 00\nHO 00 03 10 00 Copyright, Insurance Services Office, Inc., 1999 Page 1 of 22HOMEOWNERS 3 – SPECIAL FORM\nAGREEMENT\nWe will provide the insurance described in this policy\nin return for the premium and compliance with allapplicable provisions of this policy.\nDEFINITIONS\nA.In this policy, "you" and "your" refer to the "named\ninsured" shown in the Declarations and the spouseif a resident of the same household. "We", "us"and "our" refer to the Company providing this in-surance.\nB.In addition, certain words and phrases are definedas follows:\n1."Aircraft Liability", "Hovercraft Liability", "Motor\nVehicle Liability" and "Watercraft Liability",subject to the provisions in b. below, mean the\nfollowing:\na.Liability for "bodily injury" or "property dam-age" arising out of the:\n(1)Ownership of such vehicle or craft by an"insured";\n(2)Maintenance, occupancy, operation,use, loading or unloading of such vehi-cle or craft by any person;\n(3)Entrustment of such v

In [7]:
long_string = "".join(documents_ho3[i].text for i in range(len(documents_ho3)))

In [8]:
print(long_string)

HOMEOWNERS
HO 00 03 10 00
HO 00 03 10 00 Copyright, Insurance Services Office, Inc., 1999 Page 1 of 22HOMEOWNERS 3 – SPECIAL FORM
AGREEMENT
We will provide the insurance described in this policy
in return for the premium and compliance with allapplicable provisions of this policy.
DEFINITIONS
A.In this policy, "you" and "your" refer to the "named
insured" shown in the Declarations and the spouseif a resident of the same household. "We", "us"and "our" refer to the Company providing this in-surance.
B.In addition, certain words and phrases are definedas follows:
1."Aircraft Liability", "Hovercraft Liability", "Motor
Vehicle Liability" and "Watercraft Liability",subject to the provisions in b. below, mean the
following:
a.Liability for "bodily injury" or "property dam-age" arising out of the:
(1)Ownership of such vehicle or craft by an"insured";
(2)Maintenance, occupancy, operation,use, loading or unloading of such vehi-cle or craft by any person;
(3)Entrustment of such vehicle or craft b

In [24]:
import re
from collections import deque

class Node:
    def __init__(self, text):
        self.text = text
        self.children = []

    def add_child(self, node):
        self.children.append(node)

    def get_text(self):
        return self.text

    def get_children(self):
        return self.children

# Regular expressions to match section headers
regex_patterns = [
    (re.compile(r'^(?P<title>[A-Z\s\d–]+)$', re.MULTILINE), Node),
    (re.compile(r'^(?P<section>[A-Z]\.)', re.MULTILINE), Node),
    (re.compile(r'^(?P<subsection>\d+\.)', re.MULTILINE), Node),
    (re.compile(r'^(?P<subpoint>[a-z]\.)', re.MULTILINE), Node),
    (re.compile(r'^(?P<point>\(\d+\))', re.MULTILINE), Node),
    (re.compile(r'^(?P<subpoint>\([a-z]\))', re.MULTILINE), Node),
]

def build_tree(text):
    lines = text.split('\n')
    root = Node("")
    stack = [root]

    for line in lines:
        for pattern, NodeClass in regex_patterns:
            match = pattern.match(line)
            if match:
                node = NodeClass(line)
                stack[-1].add_child(node)
                stack.append(node)
                break
        else:
            if stack:
                stack[-1].text += '\n' + line

    return root

def extract_sections(root, char_limit):
    stack = deque([(root, '')])
    while stack:
        node, path = stack.pop()
        new_path = path + '\n' + node.get_text() if path else node.get_text()
        if len(new_path) > char_limit:
            return path
        if node.get_children():
            stack.extend((child, new_path) for child in node.get_children())
        else:
            return new_path

In [25]:
def extract_sections_complete(root, char_limit, start=0):
    stack = deque([(root, '', 0, start)])
    while stack:
        node, path, length, counter = stack.pop()
        new_text = '\n' + node.get_text() if path else node.get_text()
        new_length = length + len(new_text)
        if new_length > char_limit:
            return path, counter
        new_path = path + new_text
        counter += 1
        if node.get_children():
            stack.extend((child, new_path, new_length, counter) for child in node.get_children())
        else:
            return new_path, counter

def split_into_chunks(root, char_limit):
    chunks = []
    start = 0
    while True:
        chunk, start = extract_sections_complete(root, char_limit, start)
        chunks.append(chunk)
        if start >= len(root.get_children()):
            break
    return chunks


In [26]:
# Build the tree from the document text
tree = build_tree(long_string)

# Split the tree into chunks
chunks = split_into_chunks(tree, 1024)


In [27]:
print(chunks[0])

HOMEOWNERS
HO 00 03 10 00
HO 00 03 10 00 Copyright, Insurance Services Office, Inc., 1999 Page 1 of 22HOMEOWNERS 3 – SPECIAL FORM
AGREEMENT
We will provide the insurance described in this policy
in return for the premium and compliance with allapplicable provisions of this policy.
DEFINITIONS
A.In this policy, "you" and "your" refer to the "named
insured" shown in the Declarations and the spouseif a resident of the same household. "We", "us"and "our" refer to the Company providing this in-surance.
B.In addition, certain words and phrases are definedas follows:
1."Aircraft Liability", "Hovercraft Liability", "Motor
Vehicle Liability" and "Watercraft Liability",subject to the provisions in b. below, mean the
following:
a.Liability for "bodily injury" or "property dam-age" arising out of the:
(1)Ownership of such vehicle or craft by an"insured";
(2)Maintenance, occupancy, operation,use, loading or unloading of such vehi-cle or craft by any person;


In [19]:
class Node:
    def __init__(self, text):
        self.text = text
        self.children = []
        self.visited = False

    def add_child(self, node):
        self.children.append(node)

    def get_text(self):
        return self.text

    def get_children(self):
        return self.children

    def is_visited(self):
        return self.visited

    def mark_visited(self):
        self.visited = True

def extract_sections_complete(root, char_limit):
    stack = deque([(root, '', 0)])
    chunk = ''
    while stack:
        node, path, length = stack.pop()
        if not node.is_visited():
            new_text = '\n' + node.get_text() if path else node.get_text()
            new_length = length + len(new_text)
            if new_length > char_limit:
                return chunk
            new_path = path + new_text
            node.mark_visited()
            if node.get_children():
                stack.extend((child, new_path, new_length) for child in node.get_children())
            else:
                chunk = new_path
    return chunk

def split_into_chunks(root, char_limit):
    chunks = []
    while True:
        chunk = extract_sections_complete(root, char_limit)
        if chunk:
            chunks.append(chunk)
        else:
            break
    return chunks


In [28]:
# Build the tree from the document text
# tree = build_tree(long_string)

# Split the tree into chunks
chunks = split_into_chunks(tree, 1024)

In [31]:
chunks[1]

IndexError: list index out of range