In [1]:
import pdfplumber
import re
import json
from utils import is_toc_page

In [None]:
path = ''

In [60]:
raw_text = ''
with pdfplumber.open(path) as pdf:
    for i, page in enumerate(pdf.pages[:97]):
        text = page.extract_text()
        if not is_toc_page(text):
            raw_text += f"\n<page>{i}</page>\n" + text

In [None]:
print(raw_text)

In [108]:
def extract_clauses_from_text(text):
    # Patterns
    page_number_pattern = re.compile(r"<page>(\d+)</page>")
    title_pattern = re.compile(r"^(\d{1,2})\.\s+(.+)$")
    subtitle_pattern = re.compile(r"^(\d{1,2}(?:\.\d{1,2})?)\s+([A-Z][^\n]+)$")
    subclause_pattern = re.compile(r"^\s*\(([a-zA-Z]|[ivxlcdm]+|\d+)\)\s+(.*?)(?=\n\s*\([a-zA-Zivxlcdm\d]+\)\s+|\Z)", re.IGNORECASE)
    definition_pattern = re.compile(r'^"([^"]+)"\s+means:?\s*$')

    current_page_number = 0
    current_title = None
    current_definition = None
    subclauses = []
    definitions = []
    subtitles = []
    clauses = [{
                "number": "",
                "title": "Cover page and Borrowers",
                "full_title": "",
                "text": "",
                "subtitles": [{
                    'number': "",
                    'subtitle':"no_subtitle",
                    'full_subtitle': "",
                    'subclauses': subclauses,
                    'text': '',
                    'definitions' : {},
                    "page_number": current_page_number,
                }],
                "page_number": current_page_number,
            }]

    # Split text into lines
    lines = text.split("\n")

    for line in lines:
        line = line.strip()
        if not line:
            continue

        # Matches
        page_number_match = page_number_pattern.match(line)
        title_match = title_pattern.match(line)
        subtitle_match = subtitle_pattern.match(line)
        subclause_match = subclause_pattern.match(line)
        definition_match = definition_pattern.match(line)


        if page_number_match:
            current_page_number = page_number_match.group(1)
            continue

        if title_match:
            subtitles = []
            subclauses = []
            current_definition = None
            current_title = {
                "number": title_match.group(1),
                "title": title_match.group(2).strip(),
                "full_title": line,
                "text": "",
                "subtitles": subtitles,
                "page_number": current_page_number,
            }
            clauses.append(current_title)

        elif subtitle_match and current_title:
            subclauses = []
            current_definition = None
            subtitles.append({
                "number": subtitle_match.group(1),
                "subtitle": subtitle_match.group(2),
                "full_subtitle": line,
                "subclauses": subclauses,
                "text": "",
                "definitions" : {},
                "page_number": current_page_number,
            })
            clauses[-1]['subtitles'] = subtitles

        elif subclause_match and clauses[-1]['subtitles']:
            subclause = {
                "label": subclause_match.group(1),
                "text": subclause_match.group(2).strip(),
                "full_text": line,
                "page_number": current_page_number
            }

            if current_definition:
                subclause["rank"] = len(definitions)
                definitions.append(subclause)
                clauses[-1]['subtitles'][-1]['definitions'][current_definition] = definitions
            else:
                subclause["rank"] = len(subclauses)
                subclauses.append(subclause)
                clauses[-1]['subtitles'][-1]['subclauses'] = subclauses

        elif definition_match and clauses[-1]['subtitles']:
            current_definition = line
            definitions = []
            clauses[-1]['subtitles'][-1]['definitions'][current_definition] = []

        elif subclauses:
            clauses[-1]['subtitles'][-1]['subclauses'][-1]['text'] += "\n" + line

        elif definitions and clauses[-1]['subtitles'] and current_definition:
            clauses[-1]['subtitles'][-1]['definitions'][current_definition][-1]['text'] += "\n" + line

        elif subtitles:
            clauses[-1]['subtitles'][-1]['text'] += "\n" + line
        else:
            clauses[-1]['text'] += "\n" + line

    return clauses

In [109]:
structured_data = {"clauses": extract_clauses_from_text(raw_text)}
output_json_path = "output_clauses_v2.json"
with open(output_json_path, "w", encoding="utf-8") as f:
    json.dump(structured_data, f, indent=2, ensure_ascii=False)

    print(f"✅ JSON saved to: {output_json_path}")



✅ JSON saved to: output_clauses_v2.json


In [110]:
# Load json file

with open(output_json_path, 'r') as f:
    data = json.load(f)

In [111]:
for index, clause in enumerate(data['clauses']):
    print(index, clause.keys())

0 dict_keys(['number', 'title', 'full_title', 'text', 'subtitles', 'page_number'])
1 dict_keys(['number', 'title', 'full_title', 'text', 'subtitles', 'page_number'])
2 dict_keys(['number', 'title', 'full_title', 'text', 'subtitles', 'page_number'])
3 dict_keys(['number', 'title', 'full_title', 'text', 'subtitles', 'page_number'])
4 dict_keys(['number', 'title', 'full_title', 'text', 'subtitles', 'page_number'])
5 dict_keys(['number', 'title', 'full_title', 'text', 'subtitles', 'page_number'])
6 dict_keys(['number', 'title', 'full_title', 'text', 'subtitles', 'page_number'])
7 dict_keys(['number', 'title', 'full_title', 'text', 'subtitles', 'page_number'])
8 dict_keys(['number', 'title', 'full_title', 'text', 'subtitles', 'page_number'])
9 dict_keys(['number', 'title', 'full_title', 'text', 'subtitles', 'page_number'])
10 dict_keys(['number', 'title', 'full_title', 'text', 'subtitles', 'page_number'])
11 dict_keys(['number', 'title', 'full_title', 'text', 'subtitles', 'page_number'])
12

In [112]:
for subtitle in data['clauses'][31]['subtitles']:
    print(subtitle.keys())

dict_keys(['number', 'subtitle', 'full_subtitle', 'subclauses', 'text', 'definitions', 'page_number'])
dict_keys(['number', 'subtitle', 'full_subtitle', 'subclauses', 'text', 'definitions', 'page_number'])
dict_keys(['number', 'subtitle', 'full_subtitle', 'subclauses', 'text', 'definitions', 'page_number'])
dict_keys(['number', 'subtitle', 'full_subtitle', 'subclauses', 'text', 'definitions', 'page_number'])
dict_keys(['number', 'subtitle', 'full_subtitle', 'subclauses', 'text', 'definitions', 'page_number'])
dict_keys(['number', 'subtitle', 'full_subtitle', 'subclauses', 'text', 'definitions', 'page_number'])
dict_keys(['number', 'subtitle', 'full_subtitle', 'subclauses', 'text', 'definitions', 'page_number'])


In [113]:
# Store data into NEO4J database
from utils import get_database_driver

driver = get_database_driver()

In [114]:
delete_query = "MATCH (n) DETACH DELETE n"
driver.execute_query(delete_query)

EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x3933a9990>, keys=[])

In [115]:
def create_graph(tx, clause):
    tx.run("""
        MERGE (t:Title {label: $full_title})
        SET t.number = $clause_number
        SET t.title = $clause_title
        SET t.text = $text
        SET t.page_number = $page_number

        WITH t
        UNWIND $subtitles AS subtitle
            MERGE (s:Subtitle {label: subtitle.full_subtitle})
            SET s.number = subtitle.number
            SET s.subtitle = subtitle.subtitle
            SET s.text = subtitle.text
            SET s.page_number = subtitle.page_number
            MERGE (t)-[:HAS_SUBTITLE]->(s)

            WITH s, subtitle
            UNWIND subtitle.subclauses AS subclause
                MERGE (sc:Subclause {label: subclause.label, text: subclause.text})
                SET sc.text = subclause.text
                SET sc.full_text = subclause.full_text
                SET sc.page_number = subclause.page_number
                SET sc.rank = subclause.rank
                MERGE (s)-[:HAS_SUBCLAUSE]->(sc)

            WITH s, subtitle
                FOREACH (def_key IN keys(subtitle.definitions) |
                    FOREACH (item IN subtitle.definitions[def_key] |
                        MERGE (d:Definition {term: def_key, label: item.label})
                        SET d.text = item.text
                        SET d.full_text = item.full_text
                        SET d.page_number = item.page_number
                        SET d.rank = item.rank
                        MERGE (s)-[:HAS_DEFINITION]->(d)
            )
            )
    """, clause_title = clause["title"],
         full_title = clause["full_title"],
         text = clause["text"],
         page_number = clause["page_number"],
         clause_number = clause["number"],
         subtitles = clause["subtitles"])

# Load into Neo4j
with driver.session() as session:
    for clause in data["clauses"]:
        session.write_transaction(create_graph, clause)

driver.close()

  session.write_transaction(create_graph, clause)


In [None]:
# Create FULLTEXT INDEXES

"""CREATE FULLTEXT INDEX title_subtitle_labels IF NOT EXISTS
FOR (n:Title|Subtitle)
ON EACH [n.label, n.text]"""

#KEYWORD SEARCH
"""CALL db.index.fulltext.queryNodes("title_subtitle_labels", "Confidential, co") YIELD node, score
RETURN node.label, node.text, score """