In [1]:
# Load dotenv
from dotenv import load_dotenv
import os

load_dotenv()

LEGISLATION_URL_PREFIX = os.getenv('LEGISLATION_URL_PREFIX')
LEGISLATION_URI_LIST_FILE = os.getenv('LEGISLATION_URI_LIST_FILE')
JSON_OUTPUT_DIR = os.getenv('JSON_OUTPUT_DIR', 'json_out')
NEO4J_URI = os.getenv('NEO4J_URI')
NEO4J_USER = os.getenv('NEO4J_USER')
NEO4J_PASSWORD = os.getenv('NEO4J_PASSWORD')
NEO4J_DATABASE = os.getenv('NEO4J_DATABASE', 'neo4j')

In [2]:
# Initialize pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType, StructType, StructField, DoubleType
from pyspark.sql.window import Window

# Initialize Spark with Neo4j Connector
neo4j_maven_pkg = "org.neo4j:neo4j-connector-apache-spark_2.12:5.3.10_for_spark_3"
spark = (
    SparkSession.builder.appName("PSC_Loader_Spark")
    .config("spark.jars.packages", neo4j_maven_pkg)
    .config("neo4j.url", NEO4J_URI)
    .config("neo4j.authentication.basic.user", NEO4J_USER)
    .config("neo4j.authentication.basic.password", NEO4J_PASSWORD)
    .config("neo4j.database", NEO4J_DATABASE)
    .getOrCreate()
)

spark.conf.set("spark.sql.legacy.timeParserPolicy", "LEGACY")

# Check Spark and Connector versions
print(f"Spark version: {spark.version}")
print(f"Scala version: {spark.sparkContext.version.split('.')[1]}")
print(f"Neo4j Connector version: {neo4j_maven_pkg.split(':')[2]}")

26/02/21 13:36:36 WARN Utils: Your hostname, Pedros-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.0.181 instead (on interface en0)
26/02/21 13:36:36 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Ivy Default Cache set to: /Users/pedroleitao/.ivy2/cache
The jars for the packages stored in: /Users/pedroleitao/.ivy2/jars
org.neo4j#neo4j-connector-apache-spark_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-066e6650-4697-4d95-984d-24c33e30a245;1.0
	confs: [default]


:: loading settings :: url = jar:file:/Users/pedroleitao/miniconda3/envs/legal-legislation-explorer/lib/python3.11/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


	found org.neo4j#neo4j-connector-apache-spark_2.12;5.3.10_for_spark_3 in central
	found org.neo4j#neo4j-connector-apache-spark_2.12_common;5.3.10_for_spark_3 in central
	found org.neo4j#caniuse-core;1.3.0 in central
	found org.neo4j#caniuse-api;1.3.0 in central
	found org.jetbrains.kotlin#kotlin-stdlib;2.1.20 in central
	found org.jetbrains#annotations;13.0 in central
	found org.neo4j#caniuse-neo4j-detection;1.3.0 in central
	found org.neo4j.driver#neo4j-java-driver-slim;4.4.21 in central
	found org.reactivestreams#reactive-streams;1.0.4 in central
	found io.netty#netty-handler;4.1.127.Final in central
	found io.netty#netty-common;4.1.127.Final in central
	found io.netty#netty-resolver;4.1.127.Final in central
	found io.netty#netty-buffer;4.1.127.Final in central
	found io.netty#netty-transport;4.1.127.Final in central
	found io.netty#netty-transport-native-unix-common;4.1.127.Final in central
	found io.netty#netty-codec;4.1.127.Final in central
	found io.netty#netty-tcnative-classes;2

Spark version: 3.5.1
Scala version: 5
Neo4j Connector version: 5.3.10_for_spark_3


In [3]:
from pyspark.sql.functions import col, explode_outer, concat, lit, coalesce, md5, to_date, regexp_replace

# PySpark Graph Builder
def load_full_hierarchy_to_neo4j(json_dir=f"{JSON_OUTPUT_DIR}/*.json"):
    
    spark = SparkSession.builder \
        .appName("Legislation Full Graph Builder") \
        .config("spark.jars.packages", "org.neo4j:neo4j-connector-apache-spark_2.12:5.3.2_for_spark_3") \
        .config("neo4j.url", NEO4J_URI) \
        .config("neo4j.authentication.basic.username", NEO4J_USER) \
        .config("neo4j.authentication.basic.password", NEO4J_PASSWORD) \
        .getOrCreate()
        
    raw_df = spark.read \
        .option("multiline", "true") \
        .option("mode", "PERMISSIVE") \
        .option("columnNameOfCorruptRecord", "_corrupt_record") \
        .option("recursiveFileLookup", "true") \
        .option("pathGlobFilter", "*.json") \
        .json(json_dir)

    if "_corrupt_record" in raw_df.columns:
        raw_df = raw_df.filter(col("_corrupt_record").isNull()).drop("_corrupt_record")
        
    # Filter out any legislation without a valid URL (since this is our primary key for linking)
    raw_df = raw_df.filter(col("legislation_url").isNotNull() & (col("legislation_url") != ""))
    
    # Root Legislation Nodes
    print("Writing Legislation Nodes...")
    select_exprs = [
        col("legislation_url").alias("uri"),
        col("identifier.title").alias("title"),
        col("identifier.description").alias("description")
    ]
    
    if "identifier" in raw_df.columns and "modified" in raw_df.schema["identifier"].dataType.fieldNames():
        select_exprs.append(to_date(col("identifier.modified"), "yyyy-MM-dd").alias("modified_date"))
    else:
        select_exprs.append(lit(None).cast("date").alias("modified_date"))
        
    if "identifier" in raw_df.columns and "valid_date" in raw_df.schema["identifier"].dataType.fieldNames():
        select_exprs.append(to_date(col("identifier.valid_date"), "yyyy-MM-dd").alias("valid_date"))
    else:
        select_exprs.append(lit(None).cast("date").alias("valid_date"))

    if "metadata" in raw_df.columns and "enactment_date" in raw_df.schema["metadata"].dataType.fieldNames():
        select_exprs.append(to_date(col("metadata.enactment_date"), "yyyy-MM-dd").alias("enactment_date"))
    else:
        select_exprs.append(lit(None).cast("date").alias("enactment_date"))

    legis_df = raw_df.select(*select_exprs).dropDuplicates(["uri"])

    legis_df.write.format("org.neo4j.spark.DataSource").mode("Append").option("query", """
        UNWIND event AS row
        MERGE (l:Legislation {uri: row.uri})
        SET l.title = row.title, 
            l.description = row.description,
            l.modified_date = row.modified_date,
            l.valid_date = row.valid_date,
            l.enactment_date = row.enactment_date
    """).save()

    # Main Body Hierarchy
    print("Writing Part Nodes...")
    parts_df = raw_df.select(
        col("legislation_url").alias("legis_uri"),
        explode_outer("parts").alias("part")
    ).filter(col("part").isNotNull()) \
     .withColumn("part_num", col("part.part_number")) \
     .withColumn("part_id", concat(col("legis_uri"), lit("#part_"), coalesce(col("part_num"), md5(col("part").cast("string")))))

    parts_df.write.format("org.neo4j.spark.DataSource").mode("Append").option("query", """
        UNWIND event AS row
        MATCH (l:Legislation {uri: row.legis_uri})
        MERGE (p:Part {id: row.part_id})
        SET p.number = row.part_num, p.title = row.part.title
        MERGE (l)-[:HAS_PART]->(p)
    """).save()

    # Chapter Nodes
    print("Writing Chapter Nodes...")
    chapters_df = parts_df.select(
        col("part_id"),
        explode_outer("part.chapters").alias("chapter")
    ).filter(col("chapter").isNotNull()) \
     .withColumn("chapter_num", col("chapter.chapter_number")) \
     .withColumn("chapter_uri", col("chapter.uri")) \
     .withColumn("chapter_id", coalesce(col("chapter_uri"), concat(col("part_id"), lit("#chapter_"), coalesce(col("chapter_num"), md5(col("chapter").cast("string"))))))

    chapters_df.write.format("org.neo4j.spark.DataSource").mode("Append").option("query", """
        UNWIND event AS row
        MATCH (p:Part {id: row.part_id})
        MERGE (c:Chapter {id: row.chapter_id})
        SET c.number = row.chapter_num, 
            c.title = row.chapter.title,
            c.uri = row.chapter_uri
        MERGE (p)-[:HAS_CHAPTER]->(c)
    """).save()

    print("Writing Section Nodes...")
    sections_df = chapters_df.select(
        col("chapter_id"),
        explode_outer("chapter.sections").alias("section")
    ).filter(col("section").isNotNull()) \
     .withColumn("sec_id", coalesce(col("section.uri"), concat(col("chapter_id"), lit("#sec_"), coalesce(col("section.section_number"), md5(col("section").cast("string"))))))
    
    sections_df.write.format("org.neo4j.spark.DataSource").mode("Append").option("query", """
        UNWIND event AS row
        MATCH (c:Chapter {id: row.chapter_id})
        MERGE (s:Section {id: row.sec_id})
        SET s.number = row.section.section_number, 
            s.title = row.section.title, 
            s.uri = row.section.uri
        MERGE (c)-[:HAS_SECTION]->(s)
    """).save()

    print("Writing Paragraph Nodes...")
    paragraphs_df = sections_df.select(
        col("sec_id"),
        explode_outer("section.paragraphs").alias("paragraph")
    ).filter(col("paragraph").isNotNull()) \
     .withColumn("para_id", coalesce(col("paragraph.uri"), concat(col("sec_id"), lit("#para_"), coalesce(col("paragraph.paragraph_number"), md5(col("paragraph").cast("string"))))))

    paragraphs_df.write.format("org.neo4j.spark.DataSource").mode("Append").option("query", """
        UNWIND event AS row
        MATCH (s:Section {id: row.sec_id})
        MERGE (pa:Paragraph {id: row.para_id})
        SET pa.number = row.paragraph.paragraph_number, 
            pa.text = row.paragraph.text, 
            pa.uri = row.paragraph.uri
        MERGE (s)-[:HAS_PARAGRAPH]->(pa)
    """).save()

    # Schedules Hierarchy
    sched_para_comm_df = None
    sched_subpara_comm_df = None

    if "schedules" in raw_df.columns:
        print("Writing Schedule Nodes...")
        schedules_df = raw_df.select(
            col("legislation_url").alias("legis_uri"),
            explode_outer("schedules").alias("schedule")
        ).filter(col("schedule").isNotNull()) \
         .withColumn("sched_id", coalesce(col("schedule.uri"), concat(col("legis_uri"), lit("#sched_"), coalesce(col("schedule.schedule_number"), md5(col("schedule").cast("string"))))))
         
        schedules_df.write.format("org.neo4j.spark.DataSource").mode("Append").option("query", """
            UNWIND event AS row
            MATCH (l:Legislation {uri: row.legis_uri})
            MERGE (sc:Schedule {id: row.sched_id})
            SET sc.number = row.schedule.schedule_number,
                sc.title = row.schedule.title,
                sc.reference = row.schedule.reference,
                sc.uri = row.schedule.uri
            MERGE (l)-[:HAS_SCHEDULE]->(sc)
        """).save()

        print("Writing Schedule Paragraph Nodes...")
        sched_paras_df = schedules_df.select(
            col("sched_id"),
            explode_outer("schedule.paragraphs").alias("paragraph")
        ).filter(col("paragraph").isNotNull()) \
         .withColumn("para_id", coalesce(col("paragraph.uri"), concat(col("sched_id"), lit("#spara_"), coalesce(col("paragraph.paragraph_number"), md5(col("paragraph").cast("string"))))))

        sched_paras_df.write.format("org.neo4j.spark.DataSource").mode("Append").option("query", """
            UNWIND event AS row
            MATCH (sc:Schedule {id: row.sched_id})
            MERGE (p:ScheduleParagraph {id: row.para_id})
            SET p.number = row.paragraph.paragraph_number,
                p.crossheading = row.paragraph.crossheading,
                p.text = row.paragraph.text,
                p.uri = row.paragraph.uri
            MERGE (sc)-[:HAS_PARAGRAPH]->(p)
        """).save()

        sched_para_comm_df = sched_paras_df.select(col("para_id").alias("parent_id"), explode_outer("paragraph.commentaries").alias("commentary")).filter(col("commentary").isNotNull())

        if "subparagraphs" in sched_paras_df.schema["paragraph"].dataType.fieldNames():
            print("Writing Schedule Sub-paragraph Nodes...")
            sched_subparas_df = sched_paras_df.select(
                col("para_id"),
                explode_outer("paragraph.subparagraphs").alias("subparagraph")
            ).filter(col("subparagraph").isNotNull()) \
             .withColumn("subpara_id", coalesce(col("subparagraph.uri"), concat(col("para_id"), lit("#ssub_"), coalesce(col("subparagraph.subparagraph_number"), md5(col("subparagraph").cast("string"))))))

            sched_subparas_df.write.format("org.neo4j.spark.DataSource").mode("Append").option("query", """
                UNWIND event AS row
                MATCH (p:ScheduleParagraph {id: row.para_id})
                MERGE (sp:ScheduleSubparagraph {id: row.subpara_id})
                SET sp.number = row.subparagraph.subparagraph_number,
                    sp.text = row.subparagraph.text,
                    sp.uri = row.subparagraph.uri
                MERGE (p)-[:HAS_SUBPARAGRAPH]->(sp)
            """).save()

            sched_subpara_comm_df = sched_subparas_df.select(col("subpara_id").alias("parent_id"), explode_outer("subparagraph.commentaries").alias("commentary")).filter(col("commentary").isNotNull())

    # Commentary Extraction & Linking
    print("Writing Commentary Nodes...")
    
    sec_comm_df = sections_df.select(col("sec_id").alias("parent_id"), explode_outer("section.commentaries").alias("commentary")).filter(col("commentary").isNotNull())
    para_comm_df = paragraphs_df.select(col("para_id").alias("parent_id"), explode_outer("paragraph.commentaries").alias("commentary")).filter(col("commentary").isNotNull())

    # Safely flatten commentary structs before writing to Neo4j
    def write_commentaries(df, parent_label):
        if df is not None:
            fields = df.schema["commentary"].dataType.fieldNames()
            type_col = col("commentary.type") if "type" in fields else lit(None)
            text_col = col("commentary.text") if "text" in fields else lit(None)
            
            flat_df = df.select(
                col("parent_id"),
                col("commentary.ref_id").alias("ref_id"),
                type_col.alias("type"),
                text_col.alias("text")
            ).filter(col("ref_id").isNotNull()).dropDuplicates(["parent_id", "ref_id"])
            
            flat_df.write.format("org.neo4j.spark.DataSource").mode("Append").option("query", f"""
                UNWIND event AS row
                WITH row WHERE row.ref_id IS NOT NULL
                MATCH (parent:{parent_label} {{id: row.parent_id}})
                MERGE (com:Commentary {{id: row.ref_id}})
                SET com.type = row.type, com.text = row.text
                MERGE (parent)-[:HAS_COMMENTARY]->(com)
            """).save()

    write_commentaries(sec_comm_df, "Section")
    write_commentaries(para_comm_df, "Paragraph")
    write_commentaries(sched_para_comm_df, "ScheduleParagraph")
    write_commentaries(sched_subpara_comm_df, "ScheduleSubparagraph")

    # Citations and Cross-Links (Fixed Topology & URIs)
    all_comms = sec_comm_df.select("commentary").unionByName(para_comm_df.select("commentary"), allowMissingColumns=True)
    if sched_para_comm_df is not None:
        all_comms = all_comms.unionByName(sched_para_comm_df.select("commentary"), allowMissingColumns=True)
    if sched_subpara_comm_df is not None:
        all_comms = all_comms.unionByName(sched_subpara_comm_df.select("commentary"), allowMissingColumns=True)

    comm_fields = all_comms.schema["commentary"].dataType.fieldNames()
    
    if "citations" in comm_fields:
        print("Writing Citation Nodes... (Sequential & Strict Match)")
        citations_df = all_comms.select(
            col("commentary.ref_id").alias("comm_id"),
            explode_outer("commentary.citations").alias("citation")
        ).filter(col("citation").isNotNull())
        
        cit_fields = citations_df.schema["citation"].dataType.fieldNames()
        
        citations_flat = citations_df.select(
            col("comm_id"),
            col("citation.id").alias("cit_id") if "id" in cit_fields else lit(None).alias("cit_id"),
            col("citation.uri").alias("cit_uri") if "uri" in cit_fields else lit(None).alias("cit_uri"),
            col("citation.title").alias("cit_title") if "title" in cit_fields else lit(None).alias("cit_title"),
            col("citation.year").alias("cit_year") if "year" in cit_fields else lit(None).alias("cit_year"),
            col("citation.class").alias("cit_class") if "class" in cit_fields else lit(None).alias("cit_class"),
            col("citation.text").alias("cit_text") if "text" in cit_fields else lit(None).alias("cit_text")
        ).filter(col("cit_id").isNotNull()).dropDuplicates(["comm_id", "cit_id"])

        # Strip '/id/' from the URI so it matches our root Legislation nodes perfectly
        citations_flat = citations_flat.withColumn("norm_uri", regexp_replace(col("cit_uri"), r"/id/", "/"))
        citations_flat = citations_flat.coalesce(1)

        citations_flat.write.format("org.neo4j.spark.DataSource").mode("Append").option("query", """
            UNWIND event AS row
            WITH row WHERE row.comm_id IS NOT NULL AND row.cit_id IS NOT NULL
            
            // Create the Citation Node and link it to the Commentary
            MATCH (com:Commentary {id: row.comm_id})
            MERGE (cit:Citation {id: row.cit_id})
            SET cit.uri = row.cit_uri,
                cit.title = row.cit_title,
                cit.year = row.cit_year,
                cit.class = row.cit_class,
                cit.text = row.cit_text
            MERGE (com)-[:HAS_CITATION]->(cit)
            
            // Strict Match: Only draw CITES_ACT if the target Legislation actually exists
            WITH cit, row
            WHERE row.norm_uri IS NOT NULL
            MATCH (leg:Legislation {uri: row.norm_uri})
            MERGE (cit)-[:CITES_ACT]->(leg)
        """).save()

    if "citation_subrefs" in comm_fields:
        print("Writing Citation SubRefs... (Sequential & Strict Match)")
        subrefs_df = all_comms.select(
            col("commentary.ref_id").alias("comm_id"),
            explode_outer("commentary.citation_subrefs").alias("subref")
        ).filter(col("subref").isNotNull())
        
        sub_fields = subrefs_df.schema["subref"].dataType.fieldNames()

        subrefs_flat = subrefs_df.select(
            col("comm_id"),
            col("subref.id").alias("sub_id") if "id" in sub_fields else lit(None).alias("sub_id"),
            col("subref.citation_ref").alias("citation_ref") if "citation_ref" in sub_fields else lit(None).alias("citation_ref"),
            col("subref.uri").alias("sub_uri") if "uri" in sub_fields else lit(None).alias("sub_uri"),
            col("subref.section_ref").alias("sub_section_ref") if "section_ref" in sub_fields else lit(None).alias("sub_section_ref"),
            col("subref.text").alias("sub_text") if "text" in sub_fields else lit(None).alias("sub_text")
        ).filter(col("sub_id").isNotNull()).dropDuplicates(["comm_id", "sub_id"])

        # Extract the Base URI from the deep SubRef URI (e.g., .../ukpga/2009/26/section/55/1 -> .../ukpga/2009/26)
        # This allows us to map the subref directly to the parent Act node
        subrefs_flat = subrefs_flat.withColumn("base_uri", 
            regexp_replace(col("sub_uri"), r"(http://www\.legislation\.gov\.uk)/id/([^/]+/[0-9]+/[0-9]+).*", "$1/$2")
        )
        subrefs_flat = subrefs_flat.coalesce(1)

        subrefs_flat.write.format("org.neo4j.spark.DataSource").mode("Append").option("query", """
            UNWIND event AS row
            WITH row WHERE row.comm_id IS NOT NULL AND row.sub_id IS NOT NULL
            
            // Create the SubRef node
            MERGE (sub:CitationSubRef {id: row.sub_id})
            SET sub.uri = row.sub_uri, 
                sub.section_ref = row.sub_section_ref, 
                sub.text = row.sub_text
                
            // Conditional Link: Link to Citation if citation_ref exists, otherwise link to Commentary
            WITH sub, row
            MATCH (com:Commentary {id: row.comm_id})
            OPTIONAL MATCH (cit:Citation {id: row.citation_ref})
            
            FOREACH (_ IN CASE WHEN cit IS NOT NULL THEN [1] ELSE [] END |
                MERGE (cit)-[:HAS_SUBREF]->(sub)
            )
            FOREACH (_ IN CASE WHEN cit IS NULL THEN [1] ELSE [] END |
                MERGE (com)-[:HAS_SUBREF]->(sub)
            )
            
            // Strict Match: Link SubRef to the base Legislation Act if it exists
            WITH sub, row
            WHERE row.base_uri IS NOT NULL
            MATCH (leg:Legislation {uri: row.base_uri})
            MERGE (sub)-[:REFERENCES]->(leg)
        """).save()

    print("Graph load complete!")

In [4]:
from neo4j import GraphDatabase

def setup_neo4j_constraints(uri, user, password, database):
    """
    Connects directly to Neo4j to ensure unique constraints and indexes exist 
    before Spark starts pushing data. This prevents duplicate nodes and makes MERGE fast.
    """
    print("Setting up Neo4j constraints...")
    constraints = [
        "CREATE CONSTRAINT leg_uri_unique IF NOT EXISTS FOR (l:Legislation) REQUIRE l.uri IS UNIQUE;",
        "CREATE CONSTRAINT part_id_unique IF NOT EXISTS FOR (p:Part) REQUIRE p.id IS UNIQUE;",
        "CREATE CONSTRAINT chap_id_unique IF NOT EXISTS FOR (c:Chapter) REQUIRE c.id IS UNIQUE;",
        "CREATE CONSTRAINT sec_id_unique IF NOT EXISTS FOR (s:Section) REQUIRE s.id IS UNIQUE;",
        "CREATE CONSTRAINT para_id_unique IF NOT EXISTS FOR (pa:Paragraph) REQUIRE pa.id IS UNIQUE;",
        "CREATE CONSTRAINT sched_id_unique IF NOT EXISTS FOR (s:Schedule) REQUIRE s.id IS UNIQUE;",
        "CREATE CONSTRAINT sched_para_id_unique IF NOT EXISTS FOR (p:ScheduleParagraph) REQUIRE p.id IS UNIQUE;",
        "CREATE CONSTRAINT sched_subpara_id_unique IF NOT EXISTS FOR (sp:ScheduleSubparagraph) REQUIRE sp.id IS UNIQUE;",
        "CREATE CONSTRAINT com_id_unique IF NOT EXISTS FOR (com:Commentary) REQUIRE com.id IS UNIQUE;",
        "CREATE CONSTRAINT cit_id_unique IF NOT EXISTS FOR (cit:Citation) REQUIRE cit.id IS UNIQUE;", # NEW
        "CREATE CONSTRAINT sub_id_unique IF NOT EXISTS FOR (sub:CitationSubRef) REQUIRE sub.id IS UNIQUE;"
    ]
    
    driver = GraphDatabase.driver(uri, auth=(user, password))
    with driver.session(database=database) as session:
        for query in constraints:
            session.run(query)
    driver.close()
    print("Constraints successfully applied.\n")

In [5]:
setup_neo4j_constraints(NEO4J_URI, NEO4J_USER, NEO4J_PASSWORD, NEO4J_DATABASE)
load_full_hierarchy_to_neo4j(json_dir=JSON_OUTPUT_DIR)

Setting up Neo4j constraints...
Constraints successfully applied.



26/02/21 13:37:22 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.
                                                                                

Writing Legislation Nodes...


                                                                                

Writing Part Nodes...


                                                                                

Writing Chapter Nodes...


                                                                                

Writing Section Nodes...


                                                                                

Writing Paragraph Nodes...


                                                                                

Writing Schedule Nodes...


                                                                                

Writing Schedule Paragraph Nodes...


                                                                                

Writing Schedule Sub-paragraph Nodes...


                                                                                

Writing Commentary Nodes...


                                                                                

Writing Citation Nodes... (Sequential & Strict Match)


26/02/21 13:39:23 WARN DAGScheduler: Broadcasting large task binary with size 1350.7 KiB
                                                                                

Writing Citation SubRefs... (Sequential & Strict Match)


26/02/21 13:39:36 WARN DAGScheduler: Broadcasting large task binary with size 1344.6 KiB
[Stage 29:>                                                         (0 + 1) / 1]

Graph load complete!


                                                                                