In [6]:
# Load dotenv
from dotenv import load_dotenv
import os

load_dotenv()

LEGISLATION_URL_PREFIX = os.getenv('LEGISLATION_URL_PREFIX')
LEGISLATION_URI_LIST_FILE = os.getenv('LEGISLATION_URI_LIST_FILE')
JSON_OUTPUT_DIR = os.getenv('JSON_OUTPUT_DIR', 'json_out')
NEO4J_URI = os.getenv('NEO4J_URI')
NEO4J_USER = os.getenv('NEO4J_USER')
NEO4J_PASSWORD = os.getenv('NEO4J_PASSWORD')
NEO4J_DATABASE = os.getenv('NEO4J_DATABASE', 'neo4j')

In [7]:
# Initialize pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType, StructType, StructField, DoubleType
from pyspark.sql.window import Window

# Initialize Spark with Neo4j Connector
neo4j_maven_pkg = "org.neo4j:neo4j-connector-apache-spark_2.12:5.3.10_for_spark_3"
spark = (
    SparkSession.builder.appName("PSC_Loader_Spark")
    .config("spark.jars.packages", neo4j_maven_pkg)
    .config("spark.driver.memory", "8g")
    .config("neo4j.url", NEO4J_URI)
    .config("neo4j.authentication.basic.user", NEO4J_USER)
    .config("neo4j.authentication.basic.password", NEO4J_PASSWORD)
    .config("neo4j.database", NEO4J_DATABASE)
    .getOrCreate()
)

spark.conf.set("spark.sql.legacy.timeParserPolicy", "LEGACY")

# Check Spark and Connector versions
print(f"Spark version: {spark.version}")
print(f"Scala version: {spark.sparkContext.version.split('.')[1]}")
print(f"Neo4j Connector version: {neo4j_maven_pkg.split(':')[2]}")

Spark version: 3.5.1
Scala version: 5
Neo4j Connector version: 5.3.10_for_spark_3


In [8]:

from pyspark.sql.functions import col, explode_outer, concat, lit, coalesce, md5, to_date, regexp_replace

class LegislationGraphLoader:
    def __init__(self, uri, user, password, json_output_dir):
        self.uri = uri
        self.user = user
        self.password = password
        self.json_output_dir = json_output_dir

    def _write_legislation_nodes(self, raw_df):
        print("Writing Legislation Nodes...")
        select_exprs = [
            col("legislation_url").alias("uri"),
            col("identifier.title").alias("title"),
            col("identifier.description").alias("description")
        ]
        
        if "identifier" in raw_df.columns and "modified" in raw_df.schema["identifier"].dataType.fieldNames():
            select_exprs.append(to_date(col("identifier.modified"), "yyyy-MM-dd").alias("modified_date"))
        else:
            select_exprs.append(lit(None).cast("date").alias("modified_date"))
            
        if "identifier" in raw_df.columns and "valid_date" in raw_df.schema["identifier"].dataType.fieldNames():
            select_exprs.append(to_date(col("identifier.valid_date"), "yyyy-MM-dd").alias("valid_date"))
        else:
            select_exprs.append(lit(None).cast("date").alias("valid_date"))

        if "metadata" in raw_df.columns and "enactment_date" in raw_df.schema["metadata"].dataType.fieldNames():
            select_exprs.append(to_date(col("metadata.enactment_date"), "yyyy-MM-dd").alias("enactment_date"))
        else:
            select_exprs.append(lit(None).cast("date").alias("enactment_date"))

        if "metadata" in raw_df.columns and "status" in raw_df.schema["metadata"].dataType.fieldNames():
            select_exprs.append(col("metadata.status").alias("status"))
        else:
            select_exprs.append(lit(None).alias("status"))

        if "metadata" in raw_df.columns and "category" in raw_df.schema["metadata"].dataType.fieldNames():
            select_exprs.append(col("metadata.category").alias("category"))
        else:
            select_exprs.append(lit(None).alias("category"))

        if "metadata" in raw_df.columns and "coming_into_force" in raw_df.schema["metadata"].dataType.fieldNames():
            select_exprs.append(to_date(col("metadata.coming_into_force"), "yyyy-MM-dd").alias("coming_into_force"))
        else:
            select_exprs.append(lit(None).cast("date").alias("coming_into_force"))

        legis_df = raw_df.select(*select_exprs).dropDuplicates(["uri"])

        legis_df.write.format("org.neo4j.spark.DataSource").mode("Append").option("query", """
            UNWIND event AS row
            MERGE (l:Legislation {uri: row.uri})
            SET l.title = row.title, 
                l.description = row.description,
                l.modified_date = row.modified_date,
                l.valid_date = row.valid_date,
                l.enactment_date = row.enactment_date,
                l.status = row.status,
                l.category = row.category,
                l.coming_into_force = row.coming_into_force
        """).save()

    def _write_part_nodes(self, raw_df):
        print("Writing Part Nodes...")
        parts_df = raw_df.select(
            col("legislation_url").alias("legis_uri"),
            explode_outer("parts").alias("part")
        ).filter(col("part").isNotNull()) \
         .withColumn("part_id", concat(col("legis_uri"), lit("#part_"), coalesce(col("part.part_number"), md5(col("part").cast("string")))))

        parts_df.write.format("org.neo4j.spark.DataSource").mode("Append").option("query", """
            UNWIND event AS row
            MATCH (l:Legislation {uri: row.legis_uri})
            MERGE (p:Part {id: row.part_id})
            SET p.number = row.`part.part_number`,
                p.order = row.`part.order`,
                p.title = row.`part.title`,
                p.uri = row.`part.uri`,
                p.status = row.`part.status`,
                p.restrict_start_date = row.`part.restrict_start_date`,
                p.restrict_end_date = row.`part.restrict_end_date`
            MERGE (l)-[:HAS_PART]->(p)
        """).save()
        return parts_df

    def _write_chapter_nodes(self, parts_df):
        print("Writing Chapter Nodes...")
        chapters_df = parts_df.select(
            col("part_id"),
            explode_outer("part.chapters").alias("chapter")
        ).filter(col("chapter").isNotNull()) \
         .withColumn("chapter_id", coalesce(col("chapter.uri"), concat(col("part_id"), lit("#chapter_"), coalesce(col("chapter.chapter_number"), md5(col("chapter").cast("string"))))))

        chapters_df.write.format("org.neo4j.spark.DataSource").mode("Append").option("query", """
            UNWIND event AS row
            MATCH (p:Part {id: row.part_id})
            MERGE (c:Chapter {id: row.chapter_id})
            SET c.number = row.`chapter.chapter_number`, 
                c.order = row.`chapter.order`,
                c.title = row.`chapter.title`,
                c.uri = row.`chapter.uri`,
                c.status = row.`chapter.status`,
                c.restrict_start_date = date(row.`chapter.restrict_start_date`),
                c.restrict_end_date = date(row.`chapter.restrict_end_date`)
            MERGE (p)-[:HAS_CHAPTER]->(c)
        """).save()
        return chapters_df

    def _write_section_nodes(self, chapters_df):
        print("Writing Section Nodes...")
        sections_df = chapters_df.select(
            col("chapter_id"),
            explode_outer("chapter.sections").alias("section")
        ).filter(col("section").isNotNull()) \
         .withColumn("sec_id", coalesce(col("section.uri"), concat(col("chapter_id"), lit("#sec_"), coalesce(col("section.section_number"), md5(col("section").cast("string"))))))
        
        sections_df.write.format("org.neo4j.spark.DataSource").mode("Append").option("query", """
            UNWIND event AS row
            MATCH (c:Chapter {id: row.chapter_id})
            MERGE (s:Section {id: row.sec_id})
            SET s.number = row.`section.section_number`, 
                s.order = row.`section.order`,
                s.title = row.`section.title`, 
                s.uri = row.`section.uri`,
                s.restrict_extent = row.`section.restrict_extent`,
                s.restrict_start_date = date(row.`section.restrict_start_date`),
                s.restrict_end_date = date(row.`section.restrict_end_date`)
            MERGE (c)-[:HAS_SECTION]->(s)
        """).save()
        return sections_df

    def _write_paragraph_nodes(self, sections_df):
        print("Writing Paragraph Nodes...")
        paragraphs_df = sections_df.select(
            col("sec_id"),
            explode_outer("section.paragraphs").alias("paragraph")
        ).filter(col("paragraph").isNotNull()) \
         .withColumn("para_id", coalesce(col("paragraph.uri"), concat(col("sec_id"), lit("#para_"), coalesce(col("paragraph.paragraph_number"), md5(col("paragraph").cast("string"))))))

        paragraphs_df.write.format("org.neo4j.spark.DataSource").mode("Append").option("query", """
            UNWIND event AS row
            MATCH (s:Section {id: row.sec_id})
            MERGE (pa:Paragraph {id: row.para_id})
            SET pa.number = row.`paragraph.paragraph_number`,
                pa.order = row.`paragraph.order`,
                pa.text = row.`paragraph.text`, 
                pa.uri = row.`paragraph.uri`
            MERGE (s)-[:HAS_PARAGRAPH]->(pa)
        """).save()
        return paragraphs_df

    def _write_schedules_nodes(self, raw_df):
        if "schedules" not in raw_df.columns or raw_df.schema["schedules"].dataType.simpleString() == 'array<string>':
            return None, None, None

        print("Writing Schedule Nodes...")
        schedules_df = raw_df.select(
            col("legislation_url").alias("legis_uri"),
            explode_outer("schedules").alias("schedule")
        ).filter(col("schedule").isNotNull()) \
         .withColumn("sched_id", coalesce(col("schedule.uri"), concat(col("legis_uri"), lit("#sched_"), coalesce(col("schedule.schedule_number"), md5(col("schedule").cast("string"))))))
         
        schedules_df.write.format("org.neo4j.spark.DataSource").mode("Append").option("query", """
            UNWIND event AS row
            MATCH (l:Legislation {uri: row.legis_uri})
            MERGE (sc:Schedule {id: row.sched_id})
            SET sc.number = row.`schedule.schedule_number`,
                sc.order = row.`schedule.order`,
                sc.title = row.`schedule.title`,
                sc.reference = row.`schedule.reference`,
                sc.uri = row.`schedule.uri`
            MERGE (l)-[:HAS_SCHEDULE]->(sc)
        """).save()

        print("Writing Schedule Paragraph Nodes...")
        sched_paras_df = schedules_df.select(
            col("sched_id"),
            explode_outer("schedule.paragraphs").alias("paragraph")
        ).filter(col("paragraph").isNotNull()) \
         .withColumn("para_id", coalesce(col("paragraph.uri"), concat(col("sched_id"), lit("#spara_"), coalesce(col("paragraph.paragraph_number"), md5(col("paragraph").cast("string"))))))

        sched_paras_df.write.format("org.neo4j.spark.DataSource").mode("Append").option("query", """
            UNWIND event AS row
            MATCH (sc:Schedule {id: row.sched_id})
            MERGE (p:ScheduleParagraph {id: row.para_id})
            SET p.number = row.`paragraph.paragraph_number`,
                p.order = row.`paragraph.order`,
                p.crossheading = row.`paragraph.crossheading`,
                p.text = row.`paragraph.text`,
                p.uri = row.`paragraph.uri`
            MERGE (sc)-[:HAS_PARAGRAPH]->(p)
        """).save()

        sched_para_comm_df = sched_paras_df.select(col("para_id").alias("parent_id"), explode_outer("paragraph.commentaries").alias("commentary")).filter(col("commentary").isNotNull())

        sched_subpara_comm_df = None
        if "subparagraphs" in sched_paras_df.schema["paragraph"].dataType.fieldNames():
            print("Writing Schedule Sub-paragraph Nodes...")
            sched_subparas_df = sched_paras_df.select(
                col("para_id"),
                explode_outer("paragraph.subparagraphs").alias("subparagraph")
            ).filter(col("subparagraph").isNotNull()) \
             .withColumn("subpara_id", coalesce(col("subparagraph.uri"), concat(col("para_id"), lit("#ssub_"), coalesce(col("subparagraph.subparagraph_number"), md5(col("subparagraph").cast("string"))))))

            sched_subparas_df.write.format("org.neo4j.spark.DataSource").mode("Append").option("query", """
                UNWIND event AS row
                MATCH (p:ScheduleParagraph {id: row.para_id})
                MERGE (sp:ScheduleSubparagraph {id: row.subpara_id})
                SET sp.number = row.`subparagraph.subparagraph_number`,
                    sp.order = row.`subparagraph.order`,
                    sp.text = row.`subparagraph.text`,
                    sp.uri = row.`subparagraph.uri`
                MERGE (p)-[:HAS_SUBPARAGRAPH]->(sp)
            """).save()

            sched_subpara_comm_df = sched_subparas_df.select(col("subpara_id").alias("parent_id"), explode_outer("subparagraph.commentaries").alias("commentary")).filter(col("commentary").isNotNull())

        return sched_paras_df, sched_para_comm_df, sched_subpara_comm_df

    def _write_single_commentary(self, df, parent_label):
        if df is not None:
            fields = df.schema["commentary"].dataType.fieldNames()
            type_col = col("commentary.type") if "type" in fields else lit(None)
            text_col = col("commentary.text") if "text" in fields else lit(None)
            
            flat_df = df.select(
                col("parent_id"),
                col("commentary.ref_id").alias("ref_id"),
                type_col.alias("type"),
                text_col.alias("text")
            ).filter(col("ref_id").isNotNull()).dropDuplicates(["parent_id", "ref_id"])
            
            flat_df.write.format("org.neo4j.spark.DataSource").mode("Append").option("query", f"""
                UNWIND event AS row
                WITH row WHERE row.ref_id IS NOT NULL
                MATCH (parent:{parent_label} {{id: row.parent_id}})
                MERGE (com:Commentary {{id: row.ref_id}})
                SET com.type = row.type, com.text = row.text
                MERGE (parent)-[:HAS_COMMENTARY]->(com)
            """).save()

    def _write_commentaries(self, sec_comm_df, para_comm_df, sched_para_comm_df, sched_subpara_comm_df):
        print("Writing Commentary Nodes...")
        self._write_single_commentary(sec_comm_df, "Section")
        self._write_single_commentary(para_comm_df, "Paragraph")
        self._write_single_commentary(sched_para_comm_df, "ScheduleParagraph")
        self._write_single_commentary(sched_subpara_comm_df, "ScheduleSubparagraph")

    def _write_citations(self, all_comms):
        comm_fields = all_comms.schema["commentary"].dataType.fieldNames()
        if "citations" not in comm_fields:
            return
            
        print("Writing Citation Nodes... (Sequential & Strict Match)")
        citations_df = all_comms.select(
            col("commentary.ref_id").alias("comm_id"),
            explode_outer("commentary.citations").alias("citation")
        ).filter(col("citation").isNotNull()) \
         .filter(col("citation.uri").isNotNull())
        
        cit_fields = citations_df.schema["citation"].dataType.fieldNames()
        
        citations_flat = citations_df.select(
            col("comm_id"),
            col("citation.id").alias("cit_id") if "id" in cit_fields else lit(None).alias("cit_id"),
            col("citation.uri").alias("cit_uri") if "uri" in cit_fields else lit(None).alias("cit_uri"),
            col("citation.title").alias("cit_title") if "title" in cit_fields else lit(None).alias("cit_title"),
            col("citation.year").alias("cit_year") if "year" in cit_fields else lit(None).alias("cit_year"),
            col("citation.class").alias("cit_class") if "class" in cit_fields else lit(None).alias("cit_class"),
            col("citation.text").alias("cit_text") if "text" in cit_fields else lit(None).alias("cit_text")
        ).filter(col("cit_id").isNotNull()).dropDuplicates(["comm_id", "cit_id"])

        citations_flat = citations_flat.withColumn("norm_uri", regexp_replace(col("cit_uri"), r"/id/", "/"))
        citations_flat = citations_flat.coalesce(1)

        citations_flat.write.format("org.neo4j.spark.DataSource").mode("Append").option("query", """
            UNWIND event AS row
            WITH row WHERE row.comm_id IS NOT NULL AND row.cit_id IS NOT NULL
            
            MATCH (com:Commentary {id: row.comm_id})
            MERGE (cit:Citation {id: row.cit_id})
            SET cit.uri = row.cit_uri,
                cit.title = row.cit_title,
                cit.year = row.cit_year,
                cit.class = row.cit_class,
                cit.text = row.cit_text
            MERGE (com)-[:HAS_CITATION]->(cit)
            
            WITH cit, row
            WHERE row.norm_uri IS NOT NULL
            MATCH (leg:Legislation {uri: row.norm_uri})
            MERGE (cit)-[:CITES_ACT]->(leg)
        """).save()

    def _write_citation_subrefs(self, all_comms):
        comm_fields = all_comms.schema["commentary"].dataType.fieldNames()
        if "citation_subrefs" not in comm_fields:
            return
            
        print("Writing Citation SubRefs... (Sequential & Strict Match)")
        subrefs_df = all_comms.select(
            col("commentary.ref_id").alias("comm_id"),
            explode_outer("commentary.citation_subrefs").alias("subref")
        ).filter(col("subref").isNotNull()) \
         .filter(col("subref.uri").isNotNull())
        
        sub_fields = subrefs_df.schema["subref"].dataType.fieldNames()

        subrefs_flat = subrefs_df.select(
            col("comm_id"),
            col("subref.id").alias("sub_id") if "id" in sub_fields else lit(None).alias("sub_id"),
            col("subref.citation_ref").alias("citation_ref") if "citation_ref" in sub_fields else lit(None).alias("citation_ref"),
            col("subref.uri").alias("sub_uri") if "uri" in sub_fields else lit(None).alias("sub_uri"),
            col("subref.section_ref").alias("sub_section_ref") if "section_ref" in sub_fields else lit(None).alias("sub_section_ref"),
            col("subref.text").alias("sub_text") if "text" in sub_fields else lit(None).alias("sub_text")
        ).filter(col("sub_id").isNotNull()).dropDuplicates(["comm_id", "sub_id"])

        subrefs_flat = subrefs_flat.withColumn("base_uri", 
            regexp_replace(col("sub_uri"), r"(http://www\.legislation\.gov\.uk)/id/([^/]+/[0-9]+/[0-9]+).*", "$1/$2")
        )
        subrefs_flat = subrefs_flat.coalesce(1)

        subrefs_flat.write.format("org.neo4j.spark.DataSource").mode("Append").option("query", """
            UNWIND event AS row
            WITH row WHERE row.comm_id IS NOT NULL AND row.sub_id IS NOT NULL
            
            MERGE (sub:CitationSubRef {id: row.sub_id})
            SET sub.uri = row.sub_uri, 
                sub.section_ref = row.sub_section_ref, 
                sub.text = row.sub_text
                
            WITH sub, row
            MATCH (com:Commentary {id: row.comm_id})
            OPTIONAL MATCH (cit:Citation {id: row.citation_ref})
            
            FOREACH (_ IN CASE WHEN cit IS NOT NULL THEN [1] ELSE [] END |
                MERGE (cit)-[:HAS_SUBREF]->(sub)
            )
            FOREACH (_ IN CASE WHEN cit IS NULL THEN [1] ELSE [] END |
                MERGE (com)-[:HAS_SUBREF]->(sub)
            )
            
            WITH sub, row
            WHERE row.base_uri IS NOT NULL
            MATCH (leg:Legislation {uri: row.base_uri})
            MERGE (sub)-[:REFERENCES]->(leg)
        """).save()

    def _write_super_relationships(self, raw_df):
        if "super" not in raw_df.columns:
            return

        print("Writing Super Relationships...")
        super_fields = raw_df.schema["super"].dataType.fieldNames()
        super_df = raw_df.select(
            col("legislation_url").alias("legis_uri"),
            col("super.supersedes").alias("supersedes") if "supersedes" in super_fields else lit(None).alias("supersedes"),
            col("super.superseded_by").alias("superseded_by") if "superseded_by" in super_fields else lit(None).alias("superseded_by")
        )

        super_df.write.format("org.neo4j.spark.DataSource").mode("Append").option("query", """            UNWIND event AS row
            WITH row WHERE row.legis_uri IS NOT NULL
            MATCH (l:Legislation {uri: row.legis_uri})
            
            FOREACH (_ IN CASE WHEN row.supersedes IS NOT NULL THEN [1] ELSE [] END |
                MERGE (target:Legislation {uri: row.supersedes})
                MERGE (l)-[:SUPERSEDES]->(target)
            )
            
            FOREACH (_ IN CASE WHEN row.superseded_by IS NOT NULL THEN [1] ELSE [] END |
                MERGE (target:Legislation {uri: row.superseded_by})
                MERGE (l)-[:SUPERSEDED_BY]->(target)
            )
        """).save()

    def _write_explanatory_notes_nodes(self, raw_df):
        if "explanatory_notes" not in raw_df.columns:
            return None
            
        print("Writing Explanatory Notes Nodes...")
        notes_base_df = raw_df.select(
            col("legislation_url").alias("legis_uri"),
            col("explanatory_notes")
        ).filter(col("explanatory_notes").isNotNull()) \
         .withColumn("notes_id", coalesce(col("explanatory_notes.uri"), md5(col("explanatory_notes").cast("string"))))
        
        notes_base_df.write.format("org.neo4j.spark.DataSource").mode("Append").option("query", """
            UNWIND event AS row
            MATCH (l:Legislation {uri: row.legis_uri}) 
            MERGE (en:ExplanatoryNotes {id: row.notes_id})
            SET en.uri = row.`explanatory_notes.uri`
            MERGE (l)-[:HAS_EXPLANATORY_NOTES]->(en)
        """).save()
        
        notes_paras_df = notes_base_df.select(
            col("explanatory_notes.uri").alias("notes_id"),
            col("legis_uri"),
            explode_outer("explanatory_notes.paragraphs").alias("paragraph")
        ).filter(col("paragraph").isNotNull()) \
         .withColumn("para_id", concat(col("notes_id"), lit("#enp_"), md5(col("paragraph.text").cast("string"))))
         
        notes_paras_df.write.format("org.neo4j.spark.DataSource").mode("Append").option("query", """
            UNWIND event AS row
            MATCH (en:ExplanatoryNotes {id: row.notes_id})
            MERGE (p:ExplanatoryNotesParagraph {id: row.para_id})
            SET p.text = row.`paragraph.text`,
                p.uri = row.`paragraph.uri`
            MERGE (en)-[:HAS_PARAGRAPH]->(p)
        """).save()
        
        return notes_paras_df

    def _write_explanatory_notes_citations(self, notes_paras_df):
        if notes_paras_df is None:
            return
            
        comm_fields = notes_paras_df.schema["paragraph"].dataType.fieldNames()
        if "citations" not in comm_fields:
            return
            
        print("Writing Explanatory Notes Citation Nodes... (Sequential & Strict Match)")
        citations_df = notes_paras_df.select(
            col("para_id"),
            explode_outer("paragraph.citations").alias("citation")
        ).filter(col("citation").isNotNull())
        
        cit_fields = citations_df.schema["citation"].dataType.fieldNames()
        
        citations_flat = citations_df.select(
            col("para_id"),
            col("citation.id").alias("cit_id") if "id" in cit_fields else lit(None).alias("cit_id"),
            col("citation.uri").alias("cit_uri") if "uri" in cit_fields else lit(None).alias("cit_uri"),
            col("citation.title").alias("cit_title") if "title" in cit_fields else lit(None).alias("cit_title"),
            col("citation.year").alias("cit_year") if "year" in cit_fields else lit(None).alias("cit_year"),
            col("citation.class").alias("cit_class") if "class" in cit_fields else lit(None).alias("cit_class"),
            col("citation.text").alias("cit_text") if "text" in cit_fields else lit(None).alias("cit_text")
        ).filter(col("cit_id").isNotNull()).dropDuplicates(["para_id", "cit_id"])

        citations_flat = citations_flat.withColumn("norm_uri", regexp_replace(col("cit_uri"), r"/id/", "/"))
        citations_flat = citations_flat.coalesce(1)

        citations_flat.write.format("org.neo4j.spark.DataSource").mode("Append").option("query", """
            UNWIND event AS row
            WITH row WHERE row.para_id IS NOT NULL AND row.cit_id IS NOT NULL
            
            MATCH (p:ExplanatoryNotesParagraph {id: row.para_id})
            MERGE (cit:Citation {id: row.cit_id})
            SET cit.uri = row.cit_uri,
                cit.title = row.cit_title,
                cit.year = row.cit_year,
                cit.class = row.cit_class,
                cit.text = row.cit_text
            MERGE (p)-[:HAS_CITATION]->(cit)
            
            WITH cit, row
            WHERE row.norm_uri IS NOT NULL
            MATCH (leg:Legislation {uri: row.norm_uri})
            MERGE (cit)-[:CITES_ACT]->(leg)
        """).save()

    def load_full_hierarchy_to_neo4j(self, json_dir=None):
        if json_dir is None:
            json_dir = f"{self.json_output_dir}/*/*.json"
            
        spark = SparkSession.builder \
            .appName("Legislation Full Graph Builder") \
            .config("spark.jars.packages", "org.neo4j:neo4j-connector-apache-spark_2.12:5.3.2_for_spark_3") \
            .config("neo4j.url", self.uri) \
            .config("neo4j.authentication.basic.username", self.user) \
            .config("neo4j.authentication.basic.password", self.password) \
            .getOrCreate()
            
        raw_df = spark.read \
            .option("multiline", "true") \
            .option("mode", "PERMISSIVE") \
            .option("columnNameOfCorruptRecord", "_corrupt_record") \
            .option("recursiveFileLookup", "true") \
            .option("pathGlobFilter", "*.json") \
            .json(json_dir)

        if "_corrupt_record" in raw_df.columns:
            raw_df = raw_df.filter(col("_corrupt_record").isNull()).drop("_corrupt_record")
            
        raw_df = raw_df.filter(col("legislation_url").isNotNull() & (col("legislation_url") != ""))
        
        self._write_legislation_nodes(raw_df)
        self._write_super_relationships(raw_df)
        parts_df = self._write_part_nodes(raw_df)
        chapters_df = self._write_chapter_nodes(parts_df)
        sections_df = self._write_section_nodes(chapters_df)
        paragraphs_df = self._write_paragraph_nodes(sections_df)
        
        sched_paras_df, sched_para_comm_df, sched_subpara_comm_df = self._write_schedules_nodes(raw_df)
        notes_paras_df = self._write_explanatory_notes_nodes(raw_df)
        self._write_explanatory_notes_citations(notes_paras_df)

        sec_comm_df = sections_df.select(col("sec_id").alias("parent_id"), explode_outer("section.commentaries").alias("commentary")).filter(col("commentary").isNotNull())
        para_comm_df = paragraphs_df.select(col("para_id").alias("parent_id"), explode_outer("paragraph.commentaries").alias("commentary")).filter(col("commentary").isNotNull())

        self._write_commentaries(sec_comm_df, para_comm_df, sched_para_comm_df, sched_subpara_comm_df)

        all_comms = sec_comm_df.select("commentary").unionByName(para_comm_df.select("commentary"), allowMissingColumns=True)
        if sched_para_comm_df is not None:
            all_comms = all_comms.unionByName(sched_para_comm_df.select("commentary"), allowMissingColumns=True)
        if sched_subpara_comm_df is not None:
            all_comms = all_comms.unionByName(sched_subpara_comm_df.select("commentary"), allowMissingColumns=True)

        self._write_citations(all_comms)
        self._write_citation_subrefs(all_comms)

        print("Graph load complete!")


In [9]:
from neo4j import GraphDatabase

def setup_neo4j_constraints(uri, user, password, database):
    """
    Connects directly to Neo4j to ensure unique constraints and indexes exist 
    before Spark starts pushing data. This prevents duplicate nodes and makes MERGE fast.
    """
    print("Setting up Neo4j constraints...")
    constraints = [
        "CREATE CONSTRAINT leg_uri_unique IF NOT EXISTS FOR (l:Legislation) REQUIRE l.uri IS UNIQUE;",
        "CREATE CONSTRAINT part_id_unique IF NOT EXISTS FOR (p:Part) REQUIRE p.id IS UNIQUE;",
        "CREATE CONSTRAINT chap_id_unique IF NOT EXISTS FOR (c:Chapter) REQUIRE c.id IS UNIQUE;",
        "CREATE CONSTRAINT sec_id_unique IF NOT EXISTS FOR (s:Section) REQUIRE s.id IS UNIQUE;",
        "CREATE CONSTRAINT para_id_unique IF NOT EXISTS FOR (pa:Paragraph) REQUIRE pa.id IS UNIQUE;",
        "CREATE CONSTRAINT sched_id_unique IF NOT EXISTS FOR (s:Schedule) REQUIRE s.id IS UNIQUE;",
        "CREATE CONSTRAINT sched_para_id_unique IF NOT EXISTS FOR (p:ScheduleParagraph) REQUIRE p.id IS UNIQUE;",
        "CREATE CONSTRAINT sched_subpara_id_unique IF NOT EXISTS FOR (sp:ScheduleSubparagraph) REQUIRE sp.id IS UNIQUE;",
        "CREATE CONSTRAINT com_id_unique IF NOT EXISTS FOR (com:Commentary) REQUIRE com.id IS UNIQUE;",
        "CREATE CONSTRAINT cit_id_unique IF NOT EXISTS FOR (cit:Citation) REQUIRE cit.id IS UNIQUE;",
        "CREATE CONSTRAINT sub_id_unique IF NOT EXISTS FOR (sub:CitationSubRef) REQUIRE sub.id IS UNIQUE;",
        "CREATE CONSTRAINT en_id_unique IF NOT EXISTS FOR (en:ExplanatoryNotes) REQUIRE en.id IS UNIQUE;",
        "CREATE CONSTRAINT ep_id_unique IF NOT EXISTS FOR (ep:ExplanatoryNotesParagraph) REQUIRE ep.id IS UNIQUE;"
    ]
    
    driver = GraphDatabase.driver(uri, auth=(user, password))
    with driver.session(database=database) as session:
        for query in constraints:
            session.run(query)
    driver.close()
    print("Constraints successfully applied.\n")

In [10]:
setup_neo4j_constraints(NEO4J_URI, NEO4J_USER, NEO4J_PASSWORD, NEO4J_DATABASE)
loader = LegislationGraphLoader(NEO4J_URI, NEO4J_USER, NEO4J_PASSWORD, JSON_OUTPUT_DIR)
loader.load_full_hierarchy_to_neo4j()


Setting up Neo4j constraints...
Constraints successfully applied.



26/02/23 22:18:08 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.
                                                                                

Writing Legislation Nodes...


                                                                                

Writing Super Relationships...


                                                                                

Writing Part Nodes...


                                                                                

Writing Chapter Nodes...


                                                                                

Writing Section Nodes...


                                                                                

Writing Paragraph Nodes...


                                                                                

Writing Schedule Nodes...


                                                                                

Writing Schedule Paragraph Nodes...


                                                                                

Writing Schedule Sub-paragraph Nodes...


                                                                                

Writing Explanatory Notes Nodes...


                                                                                

Writing Explanatory Notes Citation Nodes... (Sequential & Strict Match)


                                                                                

Writing Commentary Nodes...


                                                                                

Writing Citation Nodes... (Sequential & Strict Match)


26/02/23 22:24:04 WARN DAGScheduler: Broadcasting large task binary with size 1379.6 KiB
                                                                                

Writing Citation SubRefs... (Sequential & Strict Match)


26/02/23 22:24:42 WARN DAGScheduler: Broadcasting large task binary with size 1375.9 KiB
[Stage 35:>                                                         (0 + 1) / 1]

Graph load complete!


                                                                                