# Neo4j Parallel Spark Loader Benchmarking

This notebooks evaluates the performance of the `neo4j-parallel-spark-loader` library vs loading serially. 

## Imports

In [1]:
import os
import timeit
from typing import Literal

from pyspark.sql import DataFrame, SparkSession

from neo4j_parallel_spark_loader import bipartite, monopartite, predefined_components, ingest_spark_dataframe
from benchmarking.utils.spark import create_spark_session
from benchmarking.utils.database import *

## Create Spark Session

In [2]:
spark_session: SparkSession = create_spark_session()

Ivy Default Cache set to: /Users/alexandergilmore/.ivy2/cache
The jars for the packages stored in: /Users/alexandergilmore/.ivy2/jars
org.neo4j#neo4j-connector-apache-spark_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-ec73106a-7a09-4927-94cc-0c5f64a8b167;1.0
	confs: [default]


:: loading settings :: url = jar:file:/Users/alexandergilmore/Documents/projects/neo4j-parallel-spark-loader/.venv/lib/python3.12/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


	found org.neo4j#neo4j-connector-apache-spark_2.12;5.1.0_for_spark_3 in central
	found org.neo4j#neo4j-connector-apache-spark_2.12_common;5.1.0 in central
	found org.neo4j.driver#neo4j-java-driver;4.4.12 in central
	found org.reactivestreams#reactive-streams;1.0.4 in local-m2-cache
	found org.apache.xbean#xbean-asm6-shaded;4.10 in central
	found org.neo4j#neo4j-cypher-dsl;2022.9.0 in central
	found org.apiguardian#apiguardian-api;1.1.2 in local-m2-cache
:: resolution report :: resolve 140ms :: artifacts dl 5ms
	:: modules in use:
	org.apache.xbean#xbean-asm6-shaded;4.10 from central in [default]
	org.apiguardian#apiguardian-api;1.1.2 from local-m2-cache in [default]
	org.neo4j#neo4j-connector-apache-spark_2.12;5.1.0_for_spark_3 from central in [default]
	org.neo4j#neo4j-connector-apache-spark_2.12_common;5.1.0 from central in [default]
	org.neo4j#neo4j-cypher-dsl;2022.9.0 from central in [default]
	org.neo4j.driver#neo4j-java-driver;4.4.12 from central in [default]
	org.reactivestreams

## Load Data

In [3]:
def load_data_into_spark_dataframe(category: Literal["bipartite", "monopartite", "predefined_components"]) -> DataFrame:
    file_path = f"data/{category}_data.csv"
    # csv_name = os.listdir(file_path)[0]

    return spark_session.read.option("header", True).csv(file_path)

In [4]:
bp_sdf = load_data_into_spark_dataframe("bipartite")

In [5]:
bp_sdf.sparkSession.conf.get("neo4j.url")

'neo4j://localhost:7687'

In [6]:
mp_sdf = load_data_into_spark_dataframe("monopartite")

In [7]:
pc_sdf = load_data_into_spark_dataframe("predefined_components")

## Benchmarking

### Set Up Tasks

In [8]:
ingest_functions = {
    "bipartite": {"serial": load_bipartite_relationships_in_serial, 
                  "parallel": load_bipartite_relationships_in_parallel,
                  "nodes": load_bipartite_nodes},
    "monopartite": {"serial": load_monopartite_relationships_in_serial, 
                    "parallel": load_monopartite_relationships_in_parallel,
                    "nodes": load_monopartite_nodes},
    "predefined_components": {"serial": load_bipartite_relationships_in_serial, # using bipartite data
                              "parallel": load_predefined_components_relationships_in_parallel,
                              "nodes": load_bipartite_nodes},
}

In [None]:
sample_sizes = [10, 100, 1_000, 10_000, 100_000]
sample_fractions = [0.0001, 0.001, 0.01, 0.1, 1.0]

sdfs = {0: bp_sdf, 2: mp_sdf, 4: pc_sdf}

unsampled_tasks = [
{"graph_structure": "bipartite", "load_strategy": "serial", "num_groups": None},
{"graph_structure": "bipartite", "load_strategy": "parallel", "num_groups": 3},
{"graph_structure": "monopartite", "load_strategy": "serial",  "num_groups": None},
{"graph_structure": "monopartite", "load_strategy": "parallel", "num_groups": 5},
{"graph_structure": "predefined_components", "load_strategy": "serial",  "num_groups": None},
{"graph_structure": "predefined_components", "load_strategy": "parallel", "num_groups": 3},
]

In [10]:
from benchmarking.utils.results import create_results_dataframe, append_results_to_dataframe, generate_benchmark_results, save_dataframe

### Run Benchmarking

In [11]:
from benchmarking.utils.healthcheck import healthcheck
from benchmarking.utils.neo4j_driver import create_neo4j_driver

In [12]:
neo4j_driver = create_neo4j_driver()

In [13]:
results_df = create_results_dataframe()

In [14]:
from datetime import datetime

ts = str(datetime.now())

counter = 0
for idx in range(0, len(unsampled_tasks), 2):
    print(unsampled_tasks[idx].get("graph_structure"))
    for s in sample_sizes:
        sampled_sdf = sdfs.get(idx).sample(s / 100000.0)

        # create constraints
        # create_constraints(spark_session=spark_session)

        graph_structure = unsampled_tasks[idx].get("graph_structure")

        # load nodes
        fn = ingest_functions.get(graph_structure).get("nodes")
        fn(sampled_sdf)

        # load relationships
        load_strategy = unsampled_tasks[idx].get("load_strategy")
        num_groups = unsampled_tasks[idx].get("num_groups")
        # idx
        results_row = generate_benchmark_results(spark_dataframe=sampled_sdf, 
                                                 graph_structure=graph_structure, 
                                                 ingest_function=ingest_functions.get(graph_structure).get(load_strategy),
                                                 load_strategy=load_strategy,
                                                 num_groups=num_groups if load_strategy == "parallel" else None)
        results_df = append_results_to_dataframe(results_df, results_row)
        
        save_dataframe(results_df, ts)

        # clean up relationships
        delete_relationships(spark_session=spark_session)

        # idx + 1
        load_strategy = unsampled_tasks[idx+1].get("load_strategy")
        num_groups = unsampled_tasks[idx+1].get("num_groups")
        results_row = generate_benchmark_results(spark_dataframe=sampled_sdf, 
                                                 graph_structure=graph_structure, 
                                                 ingest_function=ingest_functions.get(graph_structure).get(load_strategy),
                                                 load_strategy=load_strategy,
                                                 num_groups=num_groups if load_strategy == "parallel" else None)
        results_df = append_results_to_dataframe(results_df, results_row)

        save_dataframe(results_df, ts)

        # refresh database
        restore_database(neo4j_driver=neo4j_driver)

        healthcheck(neo4j_driver=neo4j_driver)
        
        counter+=1
        print(f"{round(counter / 30.0, 4) * 100}% Complete")
        

neo4j_driver.close()
        

bipartite


  return pd.concat([target_dataframe, pd.DataFrame([new_data])], ignore_index=False)
                                                                                


Waiting for Neo4j to Start...

3.3300000000000005% Complete

Waiting for Neo4j to Start...

6.67% Complete

Waiting for Neo4j to Start...

10.0% Complete


                                                                                


Waiting for Neo4j to Start...

13.33% Complete


                                                                                


Waiting for Neo4j to Start...

16.669999999999998% Complete
monopartite

Waiting for Neo4j to Start...

20.0% Complete

Waiting for Neo4j to Start...

23.330000000000002% Complete


                                                                                


Waiting for Neo4j to Start...

26.669999999999998% Complete


                                                                                


Waiting for Neo4j to Start...

30.0% Complete


                                                                                


Waiting for Neo4j to Start...

33.33% Complete
predefined_components

Waiting for Neo4j to Start...

36.67% Complete

Waiting for Neo4j to Start...

40.0% Complete

Waiting for Neo4j to Start...

43.33% Complete

Waiting for Neo4j to Start...

46.67% Complete


                                                                                


Waiting for Neo4j to Start...

50.0% Complete
