In [1]:
# Load dotenv
from dotenv import load_dotenv
import os

load_dotenv()

LEGISLATION_URL_PREFIX = os.getenv('LEGISLATION_URL_PREFIX')
LEGISLATION_URI_LIST_FILE = os.getenv('LEGISLATION_URI_LIST_FILE')
JSON_OUTPUT_DIR = os.getenv('JSON_OUTPUT_DIR', 'json_out')
NEO4J_URI = os.getenv('NEO4J_URI')
NEO4J_USER = os.getenv('NEO4J_USER')
NEO4J_PASSWORD = os.getenv('NEO4J_PASSWORD')
NEO4J_DATABASE = os.getenv('NEO4J_DATABASE', 'neo4j')

In [None]:
# Initialize pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType, StructType, StructField, DoubleType
from pyspark.sql.window import Window

# Initialize Spark with Neo4j Connector
neo4j_maven_pkg = "org.neo4j:neo4j-connector-apache-spark_2.12:5.3.10_for_spark_3"
spark = (
    SparkSession.builder.appName("PSC_Loader_Spark")
    .config("spark.jars.packages", neo4j_maven_pkg)
    .config("neo4j.url", NEO4J_URI)
    .config("neo4j.authentication.basic.user", NEO4J_USER)
    .config("neo4j.authentication.basic.password", NEO4J_PASSWORD)
    .config("neo4j.database", NEO4J_DATABASE)
    .getOrCreate()
)

spark.conf.set("spark.sql.legacy.timeParserPolicy", "LEGACY")

# Check Spark and Connector versions
print(f"Spark version: {spark.version}")
print(f"Scala version: {spark.sparkContext.version.split('.')[1]}")
print(f"Neo4j Connector version: {neo4j_maven_pkg.split(':')[2]}")

Ivy Default Cache set to: /Users/pedroleitao/.ivy2/cache
The jars for the packages stored in: /Users/pedroleitao/.ivy2/jars
org.neo4j#neo4j-connector-apache-spark_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-ac8bec2d-f51f-44df-a998-2529e9716c9f;1.0
	confs: [default]
	found org.neo4j#neo4j-connector-apache-spark_2.12;5.3.10_for_spark_3 in central


:: loading settings :: url = jar:file:/Volumes/Home/pedroleitao/miniconda3/envs/legal-legislation-explorer/lib/python3.11/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


	found org.neo4j#neo4j-connector-apache-spark_2.12_common;5.3.10_for_spark_3 in central
	found org.neo4j#caniuse-core;1.3.0 in central
	found org.neo4j#caniuse-api;1.3.0 in central
	found org.jetbrains.kotlin#kotlin-stdlib;2.1.20 in central
	found org.jetbrains#annotations;13.0 in central
	found org.neo4j#caniuse-neo4j-detection;1.3.0 in central
	found org.neo4j.driver#neo4j-java-driver-slim;4.4.21 in central
	found org.reactivestreams#reactive-streams;1.0.4 in central
	found io.netty#netty-handler;4.1.127.Final in central
	found io.netty#netty-common;4.1.127.Final in central
	found io.netty#netty-resolver;4.1.127.Final in central
	found io.netty#netty-buffer;4.1.127.Final in central
	found io.netty#netty-transport;4.1.127.Final in central
	found io.netty#netty-transport-native-unix-common;4.1.127.Final in central
	found io.netty#netty-codec;4.1.127.Final in central
	found io.netty#netty-tcnative-classes;2.0.73.Final in central
	found io.projectreactor#reactor-core;3.6.11 in central
	f

Spark version: 3.5.1
Scala version: 5
Neo4j Connector version: 5.3.10_for_spark_3


In [None]:
from pyspark.sql.functions import col, explode_outer, concat, lit, coalesce, md5, to_date, to_timestamp

def load_full_hierarchy_to_neo4j(json_dir=f"{JSON_OUTPUT_DIR}/*.json"):
    
    # Read the multi-line JSON files
    raw_df = spark.read.option("multiline", "true").json(json_dir)
    
    # Legislation Nodes (With Date Conversions)
    print("Writing Legislation Nodes...")
    legis_df = raw_df.select(
        col("legislation_url").alias("uri"),
        col("identifier.title").alias("title"),
        col("identifier.description").alias("description"),
        to_date(col("identifier.modified"), "yyyy-MM-dd").alias("modified_date"),
        to_date(col("identifier.valid_date"), "yyyy-MM-dd").alias("valid_date"),
        to_date(col("metadata.enactment_date"), "yyyy-MM-dd").alias("enactment_date")
    ).dropDuplicates(["uri"])

    legis_df.write.format("org.neo4j.spark.DataSource").mode("Append").option("query", """
        UNWIND event AS row
        MERGE (l:Legislation {uri: row.uri})
        SET l.title = row.title, 
            l.description = row.description,
            l.modified_date = row.modified_date,
            l.valid_date = row.valid_date,
            l.enactment_date = row.enactment_date
    """).save()

        
    # Part Nodes
    print("Writing Part Nodes...")
    parts_df = raw_df.select(
        col("legislation_url").alias("legis_uri"),
        explode_outer("parts").alias("part")
    ).filter(col("part").isNotNull()) \
     .withColumn("part_num", col("part.part_number")) \
     .withColumn("part_title", col("part.title")) \
     .withColumn("part_id", concat(col("legis_uri"), lit("#part_"), coalesce(col("part_num"), md5(col("part").cast("string")))))

    parts_df.write.format("org.neo4j.spark.DataSource").mode("Append").option("query", """
        UNWIND event AS row
        MATCH (l:Legislation {uri: row.legis_uri})
        MERGE (p:Part {id: row.part_id})
        SET p.number = row.part_num, p.title = row.part_title
        MERGE (l)-[:HAS_PART]->(p)
    """).save()

    # Chapter Nodes
    print("Writing Chapter Nodes...")
    chapters_df = parts_df.select(
        col("part_id"),
        explode_outer("part.chapters").alias("chapter")
    ).filter(col("chapter").isNotNull()) \
     .withColumn("chapter_num", col("chapter.chapter_number")) \
     .withColumn("chapter_title", col("chapter.title")) \
     .withColumn("chapter_id", concat(col("part_id"), lit("#chapter_"), coalesce(col("chapter_num"), md5(col("chapter").cast("string")))))

    chapters_df.write.format("org.neo4j.spark.DataSource").mode("Append").option("query", """
        UNWIND event AS row
        MATCH (p:Part {id: row.part_id})
        MERGE (c:Chapter {id: row.chapter_id})
        SET c.number = row.chapter_num, c.title = row.chapter_title
        MERGE (p)-[:HAS_CHAPTER]->(c)
    """).save()

    # Section Nodes (With Safe Date Conversions)
    print("Writing Section Nodes...")
    sections_df = chapters_df.select(
        col("chapter_id"),
        explode_outer("chapter.sections").alias("section")
    ).filter(col("section").isNotNull()) \
     .withColumn("sec_num", col("section.section_number")) \
     .withColumn("sec_title", col("section.title")) \
     .withColumn("sec_uri", col("section.uri")) \
     .withColumn("sec_id", coalesce(col("section.uri"), concat(col("chapter_id"), lit("#sec_"), coalesce(col("sec_num"), md5(col("section").cast("string"))))))
    
    # Check if 'valid_start_date' exists in the inferred 'section' struct
    section_schema_fields = sections_df.schema["section"].dataType.fieldNames()
    if "valid_start_date" in section_schema_fields:
        sections_df = sections_df.withColumn("valid_start_date", to_date(col("section.valid_start_date"), "yyyy-MM-dd"))
    else:
        sections_df = sections_df.withColumn("valid_start_date", lit(None).cast("date"))

    sections_df.write.format("org.neo4j.spark.DataSource").mode("Append").option("query", """
        UNWIND event AS row
        MATCH (c:Chapter {id: row.chapter_id})
        MERGE (s:Section {id: row.sec_id})
        SET s.number = row.sec_num, 
            s.title = row.sec_title, 
            s.uri = row.sec_uri,
            s.valid_start_date = row.valid_start_date
        MERGE (c)-[:HAS_SECTION]->(s)
    """).save()

    # Paragraph Nodes (With Safe Date Conversions)
    print("Writing Paragraph Nodes...")
    paragraphs_df = sections_df.select(
        col("sec_id"),
        explode_outer("section.paragraphs").alias("paragraph")
    ).filter(col("paragraph").isNotNull()) \
     .withColumn("para_num", col("paragraph.paragraph_number")) \
     .withColumn("para_text", col("paragraph.text")) \
     .withColumn("para_uri", col("paragraph.uri")) \
     .withColumn("para_id", coalesce(col("paragraph.uri"), concat(col("sec_id"), lit("#para_"), coalesce(col("para_num"), md5(col("paragraph").cast("string"))))))
    
    # Check if 'valid_start_date' exists in the inferred 'paragraph' struct
    paragraph_schema_fields = paragraphs_df.schema["paragraph"].dataType.fieldNames()
    if "valid_start_date" in paragraph_schema_fields:
        paragraphs_df = paragraphs_df.withColumn("valid_start_date", to_date(col("paragraph.valid_start_date"), "yyyy-MM-dd"))
    else:
        paragraphs_df = paragraphs_df.withColumn("valid_start_date", lit(None).cast("date"))

    paragraphs_df.write.format("org.neo4j.spark.DataSource").mode("Append").option("query", """
        UNWIND event AS row
        MATCH (s:Section {id: row.sec_id})
        MERGE (pa:Paragraph {id: row.para_id})
        SET pa.number = row.para_num, 
            pa.text = row.para_text, 
            pa.uri = row.para_uri,
            pa.valid_start_date = row.valid_start_date
        MERGE (s)-[:HAS_PARAGRAPH]->(pa)
    """).save()

    # Commentary Nodes
    print("Writing Commentary Nodes...")
    
    # 6a. Commentaries linked to Sections
    sec_comm_df = sections_df.select(
        col("sec_id").alias("parent_id"),
        explode_outer("section.commentaries").alias("commentary")
    ).filter(col("commentary").isNotNull())
    
    sec_comm_df.write.format("org.neo4j.spark.DataSource").mode("Append").option("query", """
        UNWIND event AS row
        // FIREWALL: Skip any row that managed to pass through with a null ID
        WITH row WHERE row.commentary.ref_id IS NOT NULL
        
        MATCH (s:Section {id: row.parent_id})
        MERGE (com:Commentary {id: row.commentary.ref_id})
        SET com.type = row.commentary.type, com.text = row.commentary.text
        MERGE (s)-[:HAS_COMMENTARY]->(com)
    """).save()

    # 6b. Commentaries linked to Paragraphs
    para_comm_df = paragraphs_df.select(
        col("para_id").alias("parent_id"),
        explode_outer("paragraph.commentaries").alias("commentary")
    ).filter(col("commentary").isNotNull())

    para_comm_df.write.format("org.neo4j.spark.DataSource").mode("Append").option("query", """
        UNWIND event AS row
        WITH row WHERE row.commentary.ref_id IS NOT NULL
        
        MATCH (pa:Paragraph {id: row.parent_id})
        MERGE (com:Commentary {id: row.commentary.ref_id})
        SET com.type = row.commentary.type, com.text = row.commentary.text
        MERGE (pa)-[:HAS_COMMENTARY]->(com)
    """).save()

    # Combine all commentaries for downstream Citation and SubRef processing
    all_commentaries_df = sec_comm_df.unionByName(para_comm_df).select("commentary")

    # Citation Nodes & Edges
    print("Writing Citation Nodes...")
    citations_df = all_commentaries_df.select(
        col("commentary.ref_id").alias("comm_id"),
        explode_outer("commentary.citations").alias("citation")
    ).filter(col("citation").isNotNull())

    citations_df.write.format("org.neo4j.spark.DataSource").mode("Append").option("query", """
        UNWIND event AS row
        WITH row WHERE row.comm_id IS NOT NULL AND row.citation.uri IS NOT NULL
        
        MATCH (com:Commentary {id: row.comm_id})
        MERGE (cit:Legislation {uri: row.citation.uri})
        ON CREATE SET cit.title = row.citation.title, cit.year = row.citation.year, cit.class = row.citation.class
        MERGE (com)-[:CITES]->(cit)
    """).save()

    # Citation SubRef Nodes & Edges
    print("Writing Citation SubRefs...")
    subrefs_df = all_commentaries_df.select(
        col("commentary.ref_id").alias("comm_id"),
        explode_outer("commentary.citation_subrefs").alias("subref")
    ).filter(col("subref").isNotNull())

    subrefs_df.write.format("org.neo4j.spark.DataSource").mode("Append").option("query", """
        UNWIND event AS row
        WITH row WHERE row.comm_id IS NOT NULL AND row.subref.id IS NOT NULL
        
        MATCH (com:Commentary {id: row.comm_id})
        MERGE (sub:CitationSubRef {id: row.subref.id})
        SET sub.uri = row.subref.uri, 
            sub.section_ref = row.subref.section_ref, 
            sub.text = row.subref.text
        MERGE (com)-[:HAS_SUBREF]->(sub)
        
        WITH sub, row
        WHERE row.subref.uri IS NOT NULL
        MERGE (leg:Legislation {uri: row.subref.uri})
        MERGE (sub)-[:REFERENCES]->(leg)
    """).save()

    print("Graph load complete!")

In [4]:
from neo4j import GraphDatabase

def setup_neo4j_constraints(uri, user, password, database):
    """
    Connects directly to Neo4j to ensure unique constraints exist 
    before Spark starts pushing data.
    """
    print("Setting up Neo4j constraints...")
    constraints = [
        "CREATE CONSTRAINT leg_uri_unique IF NOT EXISTS FOR (l:Legislation) REQUIRE l.uri IS UNIQUE;",
        "CREATE CONSTRAINT part_id_unique IF NOT EXISTS FOR (p:Part) REQUIRE p.id IS UNIQUE;",
        "CREATE CONSTRAINT chap_id_unique IF NOT EXISTS FOR (c:Chapter) REQUIRE c.id IS UNIQUE;",
        "CREATE CONSTRAINT sec_id_unique IF NOT EXISTS FOR (s:Section) REQUIRE s.id IS UNIQUE;",
        "CREATE CONSTRAINT para_id_unique IF NOT EXISTS FOR (pa:Paragraph) REQUIRE pa.id IS UNIQUE;",
        "CREATE CONSTRAINT com_id_unique IF NOT EXISTS FOR (com:Commentary) REQUIRE com.id IS UNIQUE;",
        "CREATE CONSTRAINT sub_id_unique IF NOT EXISTS FOR (sub:CitationSubRef) REQUIRE sub.id IS UNIQUE;"
    ]
    
    driver = GraphDatabase.driver(uri, auth=(user, password))
    with driver.session(database=database) as session:
        for query in constraints:
            session.run(query)
    driver.close()
    print("Constraints successfully applied.")

In [5]:
setup_neo4j_constraints(NEO4J_URI, NEO4J_USER, NEO4J_PASSWORD, NEO4J_DATABASE)
load_full_hierarchy_to_neo4j()

Setting up Neo4j constraints...
Constraints successfully applied.


                                                                                

Writing Legislation Nodes...


                                                                                

Writing Part Nodes...


                                                                                

Writing Chapter Nodes...


                                                                                

Writing Section Nodes...


                                                                                

Writing Paragraph Nodes...


                                                                                

Writing Commentary Nodes...


                                                                                

Writing Citation Nodes...


                                                                                

Writing Citation SubRefs...




Graph load complete!


                                                                                