In [None]:
import pdfplumber
import re
import json
from utils import is_toc_page
from models.title import Title
from models.subtitle import Subtitle
from models.subclause import Subclause

In [None]:
%load_ext autoreload

In [None]:
%autoreload 2

In [None]:
# Path of the PDF to parse. Left as an empty placeholder here; it must be set
# before the extraction cell that calls pdfplumber.open(path) is run.
path = ''

In [None]:
# Only the first MAX_PAGES pages of the PDF are parsed (original cut-off kept).
MAX_PAGES = 97

# Build the tagged text: every non-TOC page contributes a "<page>i</page>"
# marker line (i = zero-based page index) followed by the page text. The marker
# is what the clause parser later uses to track page numbers.
page_chunks = []
with pdfplumber.open(path) as pdf:
    for i, page in enumerate(pdf.pages[:MAX_PAGES]):
        # NOTE(review): extract_text() may return None for a page without a
        # text layer on some pdfplumber versions; treat that as empty text
        # instead of failing on the string concatenation below.
        text = page.extract_text() or ""
        if not is_toc_page(text):
            page_chunks.append(f"\n<page>{i}</page>\n" + text)

# Join once instead of growing a string inside the loop.
raw_text = "".join(page_chunks)

In [None]:
# Dump the complete tagged text for manual inspection (the output can be very long).
print(raw_text)

In [None]:
def extract_clauses_from_text(text):
    """Parse page-tagged contract text into a list of nested clause dicts.

    NOTE: a later cell of this notebook defines a function with the same name
    that builds ``Title``/``Subtitle``/``Subclause`` objects instead of dicts;
    whichever of the two cells ran last is the definition in effect.

    The text is processed line by line. Each stripped, non-empty line is
    classified, in priority order, as:

    1. a ``<page>N</page>`` marker: updates the current page number;
    2. a title (``"12. NAME"``): starts a new clause;
    3. a subtitle (``"12.3 Name"``): starts a new subtitle, but only once a
       numbered title has been seen;
    4. a labelled item (``"(a) ..."`` or ``"1.2.3 ..."``): stored under the
       open definition if there is one, otherwise under the current subtitle;
    5. a definition header (``'"Term" means'``): opens a definition;
    6. anything else: continuation text appended to the innermost open item
       (definition item, then subclause, then subtitle, then title).

    Args:
        text: Text in which pages are introduced by ``<page>N</page>`` lines.

    Returns:
        A list of clause dicts (``number``, ``title``, ``full_title``,
        ``text``, ``subtitles``, ``page_number``). The first entry is a
        synthetic "Cover page and Borrowers" clause that collects everything
        before the first numbered title. Page numbers are ints.
    """
    # Patterns
    page_number_pattern = re.compile(r"<page>(\d+)</page>")
    title_pattern = re.compile(r"^(\d{1,2})\.\s+(.+)$")
    subtitle_pattern = re.compile(r"^(\d{1,2}(?:\.\d{1,2})?)\s+([A-Z][^\n]+)$")
    subclause_pattern = re.compile(
        r"""^\s*(\([a-zA-Zivxlcdm\d]+\)|\d+\.\d+(?:\.\d+)*)\s+(.*?)(?=\n\s*(\([a-zA-Zivxlcdm\d]+\)|\d+\.\d+(?:\.\d+)*)\s+|\Z)""",
        re.IGNORECASE,
    )
    definition_pattern = re.compile(r'^"([^"]+)"\s+means:?\s*$')

    current_page_number = 0
    current_title = None        # only used as a "numbered title seen" flag
    current_definition = None   # full header line of the open definition
    subclauses = []
    definitions = []
    subtitles = []
    clauses = [{
                "number": "",
                "title": "Cover page and Borrowers",
                "full_title": "",
                "text": "",
                "subtitles": [{
                    'number': "",
                    'subtitle':"no_subtitle",
                    'full_subtitle': "",
                    'subclauses': subclauses,
                    'text': '',
                    'definitions' : {},
                    "page_number": current_page_number,
                }],
                "page_number": current_page_number,
            }]

    for line in text.split("\n"):
        line = line.strip()
        if not line:
            continue

        page_number_match = page_number_pattern.match(line)
        if page_number_match:
            # Keep page numbers numeric so they sort correctly and share the
            # type of the initial value 0 used for the cover page.
            current_page_number = int(page_number_match.group(1))
            continue

        title_match = title_pattern.match(line)
        subtitle_match = subtitle_pattern.match(line)
        subclause_match = subclause_pattern.match(line)
        definition_match = definition_pattern.match(line)

        if title_match:
            # A new clause closes every open subtitle/subclause/definition.
            subtitles = []
            subclauses = []
            definitions = []
            current_definition = None
            current_title = {
                "number": title_match.group(1),
                "title": title_match.group(2).strip(),
                "full_title": line,
                "text": "",
                "subtitles": subtitles,
                "page_number": current_page_number,
            }
            clauses.append(current_title)

        elif subtitle_match and current_title:
            subclauses = []
            definitions = []
            current_definition = None
            subtitles.append({
                "number": subtitle_match.group(1),
                "subtitle": subtitle_match.group(2),
                "full_subtitle": line,
                "subclauses": subclauses,
                "text": "",
                "definitions" : {},
                "page_number": current_page_number,
            })
            clauses[-1]['subtitles'] = subtitles

        elif subclause_match and clauses[-1]['subtitles']:
            subclause = {
                "label": subclause_match.group(1),
                "text": subclause_match.group(2).strip(),
                "full_text": line,
                "page_number": current_page_number
            }

            if current_definition:
                # Labelled items following a definition header belong to it.
                subclause["rank"] = len(definitions)
                definitions.append(subclause)
                clauses[-1]['subtitles'][-1]['definitions'][current_definition] = definitions
            else:
                subclause["rank"] = len(subclauses)
                subclauses.append(subclause)
                clauses[-1]['subtitles'][-1]['subclauses'] = subclauses

        elif definition_match and clauses[-1]['subtitles']:
            current_definition = line
            definitions = []
            clauses[-1]['subtitles'][-1]['definitions'][current_definition] = []

        # Continuation lines. The open definition item is checked before the
        # plain subclauses: otherwise wrapped text of a definition item would
        # be glued onto the last subclause that preceded the definition.
        elif definitions and clauses[-1]['subtitles'] and current_definition:
            clauses[-1]['subtitles'][-1]['definitions'][current_definition][-1]['text'] += "\n" + line

        elif subclauses:
            clauses[-1]['subtitles'][-1]['subclauses'][-1]['text'] += "\n" + line

        elif subtitles:
            clauses[-1]['subtitles'][-1]['text'] += "\n" + line
        else:
            clauses[-1]['text'] += "\n" + line

    return clauses

In [None]:
def extract_clauses_from_text(text):
    """Parse page-tagged contract text into ``Title`` objects.

    This definition replaces the dict-based function of the same name defined
    in an earlier cell; it builds ``Title`` / ``Subtitle`` / ``Subclause``
    instances instead of plain dicts.

    The text is processed line by line. Each stripped, non-empty line is
    classified, in priority order, as:

    1. a ``<page>N</page>`` marker: updates the current page number;
    2. a title (``"12. NAME"``): starts a new ``Title``;
    3. a subtitle (``"12.3 Name"``): starts a new ``Subtitle``, but only once
       a numbered title has been seen;
    4. a labelled item (``"(a) ..."`` or ``"1.2.3 ..."``): stored under the
       open definition if there is one, otherwise under the current subtitle;
    5. a definition header (``'"Term" means'``): opens a definition;
    6. anything else: continuation text appended to the innermost open item
       (definition item, then subclause, then subtitle, then title).

    Args:
        text: Text in which pages are introduced by ``<page>N</page>`` lines.

    Returns:
        A list of ``Title`` objects. The first one is a synthetic
        "Cover page and Borrowers" title that collects everything before the
        first numbered title. Page numbers are ints.
    """
    # Patterns
    page_number_pattern = re.compile(r"<page>(\d+)</page>")
    title_pattern = re.compile(r"^(\d{1,2})\.\s+(.+)$")
    subtitle_pattern = re.compile(r"^(\d{1,2}(?:\.\d{1,2})?)\s+([A-Z][^\n]+)$")
    subclause_pattern = re.compile(
        r"""^\s*(\([a-zA-Zivxlcdm\d]+\)|\d+\.\d+(?:\.\d+)*)\s+(.*?)(?=\n\s*(\([a-zA-Zivxlcdm\d]+\)|\d+\.\d+(?:\.\d+)*)\s+|\Z)""",
        re.IGNORECASE,
    )
    definition_pattern = re.compile(r'^"([^"]+)"\s+means:?\s*$')

    current_page_number = 0
    current_title = None        # only used as a "numbered title seen" flag
    current_definition = None   # full header line of the open definition
    subclauses = []
    definitions = []
    subtitles = []
    clauses = [
        Title(
            title="Cover page and Borrowers",
            page_number=current_page_number,
            subtitles=[
                Subtitle(
                    subtitle="no_subtitle",
                    page_number=current_page_number,
                    subclauses=subclauses
                )
            ]
            )
        ]

    for line in text.split("\n"):
        line = line.strip()
        if not line:
            continue

        page_number_match = page_number_pattern.match(line)
        if page_number_match:
            # Keep page numbers numeric so they sort correctly and share the
            # type of the initial value 0 used for the cover page.
            current_page_number = int(page_number_match.group(1))
            continue

        title_match = title_pattern.match(line)
        subtitle_match = subtitle_pattern.match(line)
        subclause_match = subclause_pattern.match(line)
        definition_match = definition_pattern.match(line)

        if title_match:
            # A new title closes every open subtitle/subclause/definition.
            subtitles = []
            subclauses = []
            definitions = []
            current_definition = None
            current_title = Title(
                title=line,
                subtitles=subtitles,
                page_number=current_page_number
            )
            clauses.append(current_title)

        elif subtitle_match and current_title:
            subclauses = []
            definitions = []
            current_definition = None
            subtitles.append(Subtitle(
                subtitle=line,
                subclauses=subclauses,
                page_number=current_page_number
            ))
            clauses[-1].subtitles = subtitles

        elif subclause_match and clauses[-1].subtitles:
            subclause = Subclause(
                label=subclause_match.group(1),
                text=line,
                page_number=current_page_number,
            )

            if current_definition:
                # Labelled items following a definition header belong to it.
                subclause.rank = len(definitions)
                definitions.append(subclause)
                clauses[-1].subtitles[-1].definitions[current_definition] = definitions
            else:
                subclause.rank = len(subclauses)
                subclauses.append(subclause)
                clauses[-1].subtitles[-1].subclauses = subclauses

        elif definition_match and clauses[-1].subtitles:
            current_definition = line
            definitions = []
            clauses[-1].subtitles[-1].definitions[current_definition] = []

        # Continuation lines. The open definition item is checked before the
        # plain subclauses: otherwise wrapped text of a definition item would
        # be glued onto the last subclause that preceded the definition.
        elif definitions and clauses[-1].subtitles and current_definition:
            clauses[-1].subtitles[-1].definitions[current_definition][-1].text += "\n" + line

        elif subclauses:
            clauses[-1].subtitles[-1].subclauses[-1].text += "\n" + line

        elif subtitles:
            clauses[-1].subtitles[-1].text += "\n" + line
        else:
            clauses[-1].text += "\n" + line

    return clauses

In [None]:
from dataclasses import asdict

# Destination of the serialized clause tree; a later cell reads this file back.
output_json_path = "output_clauses_v2.json"

# asdict() turns each parsed clause object into plain dicts/lists for JSON.
structured_data = {"clauses": list(map(asdict, extract_clauses_from_text(raw_text)))}

with open(output_json_path, "w", encoding="utf-8") as f:
    # ensure_ascii=False keeps non-ASCII characters readable in the file.
    f.write(json.dumps(structured_data, indent=2, ensure_ascii=False))

    print(f"✅ JSON saved to: {output_json_path}")



In [None]:
# Load json file

# The file was written as UTF-8 with ensure_ascii=False, so it must be read
# back as UTF-8 too; the platform default encoding may differ and mis-decode
# (or fail on) non-ASCII characters.
with open(output_json_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

In [None]:
# Print every top-level clause position together with the keys of its dict.
for index, clause in enumerate(data['clauses']):
    print(index, clause.keys())

In [None]:
# Print the keys of each subtitle dict of one clause.
# NOTE(review): index 31 is hard-coded; this raises IndexError when fewer than
# 32 clauses were parsed.
for subtitle in data['clauses'][31]['subtitles']:
    print(subtitle.keys())

In [None]:
# Store data into NEO4J database
from utils import get_database_driver

# Driver object used by all following cells (execute_query / session).
driver = get_database_driver()

In [None]:
# WARNING: destructive. Matches every node in the database and deletes it
# together with its relationships, so the graph is rebuilt from scratch below.
delete_query = "MATCH (n) DETACH DELETE n"
driver.execute_query(delete_query)

In [None]:
def create_graph(tx, clause):
    """Upsert one clause with its subtitles, subclauses and definitions.

    Runs a single Cypher statement that:

    * MERGEs the ``Title`` node (keyed by ``title``) and sets its text/page;
    * for each subtitle, MERGEs a ``Subtitle`` node (keyed by ``subtitle``)
      and links it with ``HAS_SUBTITLE``;
    * for each subclause, MERGEs a ``Subclause`` node (keyed by label + text)
      and links it with ``HAS_SUBCLAUSE``;
    * for each item of each definition, MERGEs a ``Definition`` node (keyed by
      term + label) and links it with ``HAS_DEFINITION``.

    Subclauses and definitions are written with FOREACH rather than UNWIND.
    UNWIND over an empty list produces no rows, which would stop the rest of
    the statement for that subtitle, so definitions of a subtitle without
    subclauses would never be stored.

    Args:
        tx: Object exposing ``run(query, **parameters)`` (a Neo4j transaction).
        clause: Mapping with the keys ``title``, ``text``, ``page_number`` and
            ``subtitles``; each subtitle is a mapping with ``subtitle``,
            ``text``, ``page_number``, ``subclauses`` and ``definitions``.

    Returns:
        None.
    """
    tx.run("""
        MERGE (t:Title {title: $clause_title})
        SET t.text = $text,
            t.page_number = $page_number

        WITH t
        UNWIND $subtitles AS subtitle
        MERGE (s:Subtitle {subtitle: subtitle.subtitle})
        SET s.text = subtitle.text,
            s.page_number = subtitle.page_number
        MERGE (t)-[:HAS_SUBTITLE]->(s)

        FOREACH (subclause IN coalesce(subtitle.subclauses, []) |
            MERGE (sc:Subclause {label: subclause.label, text: subclause.text})
            SET sc.page_number = subclause.page_number,
                sc.rank = subclause.rank
            MERGE (s)-[:HAS_SUBCLAUSE]->(sc)
        )

        FOREACH (def_key IN keys(subtitle.definitions) |
            FOREACH (item IN subtitle.definitions[def_key] |
                MERGE (d:Definition {term: def_key, label: item.label})
                SET d.text = item.text,
                    d.page_number = item.page_number,
                    d.rank = item.rank
                MERGE (s)-[:HAS_DEFINITION]->(d)
            )
        )
    """, clause_title = clause["title"],
         text = clause["text"],
         page_number = clause["page_number"],
         subtitles = clause["subtitles"])

# Load into Neo4j
# Each clause is written through its own execute_write call, which invokes
# create_graph(tx, clause).
with driver.session() as session:
    for clause in data["clauses"]:
        session.execute_write(create_graph, clause)


In [None]:
# Create FULLTEXT INDEXES
# Reference snippets only: these bare strings are never sent to the database.
# The property names match what create_graph writes on Title / Subtitle nodes
# (title, subtitle, text); neither node type has a `label` property.

"""CREATE FULLTEXT INDEX title_subtitle_labels IF NOT EXISTS
FOR (n:Title|Subtitle)
ON EACH [n.title, n.subtitle, n.text]"""

#KEYWORD SEARCH
"""CALL db.index.fulltext.queryNodes("title_subtitle_labels", "Confidential, co") YIELD node, score
RETURN coalesce(node.title, node.subtitle) AS label, node.text, score """

In [None]:
# Sanity check after loading: count all nodes currently in the database.
query = """
    MATCH (n)
    RETURN count(n) as number_of_nodes
"""

result = driver.execute_query(query)

for record in result.records:
    print(record["number_of_nodes"])


In [None]:
# Fetch the Title node whose `title` property is exactly "31. CONFIDENTIALITY".
query = """
    MATCH (title: Title {title: "31. CONFIDENTIALITY"})
    RETURN title
"""

result = driver.execute_query(query)

for record in result.records:
    print(record["title"])


In [None]:
# List the subtitles of every title mentioning the GDPR.
# Title nodes carry their heading in `title` and Subtitle nodes in `subtitle`
# (see create_graph). Neither has a `label` property, so filtering or returning
# `label` always yields nothing.
query = """
    MATCH (title: Title)-[:HAS_SUBTITLE]->(subtitle: Subtitle)
    WHERE title.title contains 'GENERAL DATA PROTECTION REGULATION'
    RETURN title.title as tl, subtitle.subtitle as sl ORDER BY tl, sl
"""

result = driver.execute_query(query)

for record in result.records:
    print(record)

In [None]:
# Walk everything reachable from the cover-page title (subtitles, subclauses,
# definitions) and print it ordered by page number, then rank.
query = """
MATCH (start:Title)
WHERE start.title CONTAINS "Cover page and Borrowers"
CALL apoc.path.spanningTree(
  start,
  {
    relationshipFilter: "HAS_SUBTITLE>|HAS_SUBCLAUSE>|HAS_DEFINITION>",
    labelFilter: "+Subtitle|+Subclause|+Definition",
    maxLevel: 10
  }
)
YIELD path
WITH path, nodes(path) AS nds
UNWIND nds AS node
RETURN DISTINCT node
ORDER BY
  coalesce(node.page_number, 0) ASC,
  coalesce(node.rank, 0) ASC

"""

result = driver.execute_query(query)

# Explicit label -> model mapping instead of eval() on a string that comes from
# the database: nothing is executed dynamically, and labels without a model
# class in this notebook (e.g. Definition) no longer raise NameError.
NODE_MODELS = {"Title": Title, "Subtitle": Subtitle, "Subclause": Subclause}

for record in result.records:
    node = record["node"]

    node_type = list(node.labels)[0]
    model = NODE_MODELS.get(node_type)
    if model is None:
        # No model class available for this label: show the raw node.
        print(node)
        continue

    tt = model.from_node(node)
    print(tt.describe())


In [None]:
from utils import load_yaml_file

In [None]:
# Load the datapoint definition; its ["search"]["titles"] entries drive the
# title searches at the end of the notebook.
# NOTE(review): absolute, machine-specific path; it will not resolve on
# another machine.
datapoint = load_yaml_file(file_path="/Users/ngam/dev/text-clustering/datapoints/BorrowerJurisdiction.yaml")
datapoint

In [None]:
# Full-text index over the `title` and `text` properties of Title nodes.
# IF NOT EXISTS makes re-running this cell a no-op once the index exists.
title_index_query = """
CREATE FULLTEXT INDEX title_index IF NOT EXISTS
FOR (t:Title)
ON EACH [t.title, t.text]
"""
driver.execute_query(title_index_query)

In [None]:
# Full-text index over the `subtitle` and `text` properties of Subtitle nodes.
# IF NOT EXISTS makes re-running this cell a no-op once the index exists.
subtitle_index_query = """
CREATE FULLTEXT INDEX subtitle_index IF NOT EXISTS
FOR (s:Subtitle)
ON EACH [s.subtitle, s.text]
"""
driver.execute_query(subtitle_index_query)

In [None]:
# Full-text index over the `text` property of Subclause nodes.
# The label must be spelled exactly as create_graph writes it (`Subclause`):
# labels are case-sensitive, so an index on `SubClause` would cover no nodes.
# NOTE(review): if an index named subclause_index was already created with the
# old label, IF NOT EXISTS keeps it; drop it first so this definition applies.
subclause_index_query = """
CREATE FULLTEXT INDEX subclause_index IF NOT EXISTS
FOR (sc:Subclause)
ON EACH [sc.text]
"""
driver.execute_query(subclause_index_query)

In [None]:
def search_by_title(title):
    """Print every node reachable from the titles whose heading contains ``title``.

    Finds the ``Title`` nodes whose ``title`` property contains the given text,
    expands each one through HAS_SUBTITLE / HAS_SUBCLAUSE / HAS_DEFINITION
    with ``apoc.path.spanningTree`` and prints the description of every
    distinct node, ordered by page number and then rank.

    Args:
        title: Text to look for inside ``Title.title``. It is sent as a query
            parameter, never pasted into the Cypher text, so quotes or Cypher
            syntax inside it cannot break or alter the query.

    Returns:
        None. Results are printed.
    """
    query = """
        MATCH (start:Title)
        WHERE start.title CONTAINS $title
        CALL apoc.path.spanningTree(
        start,
        {
            relationshipFilter: "HAS_SUBTITLE>|HAS_SUBCLAUSE>|HAS_DEFINITION>",
            labelFilter: "+Subtitle|+Subclause|+Definition",
            maxLevel: 10
        }
        )
        YIELD path
        WITH path, nodes(path) AS nds
        UNWIND nds AS node
        RETURN DISTINCT node
        ORDER BY
        coalesce(node.page_number, 0) ASC,
        coalesce(node.rank, 0) ASC
    """
    result = driver.execute_query(query, parameters_={"title": title})

    # The result mixes Title, Subtitle, Subclause and Definition nodes, so each
    # one is built with the model class matching its own label.
    node_models = {"Title": Title, "Subtitle": Subtitle, "Subclause": Subclause}
    for record in result.records:
        node = record["node"]
        model = node_models.get(list(node.labels)[0])
        if model is None:
            # No model class available for this label: show the raw node.
            print(node)
            continue
        print(model.from_node(node).describe())

In [None]:
def extract_datapoint(datapoint):
    """Run ``search_by_title`` for each title listed in the datapoint.

    Args:
        datapoint: Mapping whose ``["search"]["titles"]`` entry is an iterable
            of title strings.
    """
    search_section = datapoint["search"]
    for wanted_title in search_section["titles"]:
        search_by_title(wanted_title)

# Run the title searches configured in the datapoint loaded earlier.
extract_datapoint(datapoint)