# Neo4j Parallel Spark Loader Benchmarking

This notebook evaluates the performance of the `neo4j-parallel-spark-loader` library versus loading serially.

## Imports

In [7]:
import os
import timeit
from typing import Literal

from pyspark.sql import DataFrame, SparkSession

from neo4j_parallel_spark_loader import bipartite, monopartite, predefined_components, ingest_spark_dataframe
from benchmarking.utils.spark import create_spark_session
from benchmarking.utils.database import *

In [2]:
from benchmarking.utils.results import _get_package_version

# Show the package version reported by the benchmarking utilities so the
# benchmark output can be tied to a specific release.
package_version = _get_package_version()
print(package_version)

0.2.0


## Create Spark Session

In [2]:
# Shared Spark session: used below to read the CSV data and passed to the
# database helper calls (constraints, relationship cleanup, restore).
spark_session: SparkSession = create_spark_session()
...

Ivy Default Cache set to: /Users/alexandergilmore/.ivy2/cache
The jars for the packages stored in: /Users/alexandergilmore/.ivy2/jars
org.neo4j#neo4j-connector-apache-spark_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-0310085a-87f5-44a4-89fd-f5d8c6974497;1.0
	confs: [default]


:: loading settings :: url = jar:file:/Users/alexandergilmore/Documents/projects/neo4j-parallel-spark-loader/.venv/lib/python3.12/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


	found org.neo4j#neo4j-connector-apache-spark_2.12;5.1.0_for_spark_3 in central
	found org.neo4j#neo4j-connector-apache-spark_2.12_common;5.1.0 in central
	found org.neo4j.driver#neo4j-java-driver;4.4.12 in central
	found org.reactivestreams#reactive-streams;1.0.4 in local-m2-cache
	found org.apache.xbean#xbean-asm6-shaded;4.10 in central
	found org.neo4j#neo4j-cypher-dsl;2022.9.0 in central
	found org.apiguardian#apiguardian-api;1.1.2 in local-m2-cache
:: resolution report :: resolve 155ms :: artifacts dl 5ms
	:: modules in use:
	org.apache.xbean#xbean-asm6-shaded;4.10 from central in [default]
	org.apiguardian#apiguardian-api;1.1.2 from local-m2-cache in [default]
	org.neo4j#neo4j-connector-apache-spark_2.12;5.1.0_for_spark_3 from central in [default]
	org.neo4j#neo4j-connector-apache-spark_2.12_common;5.1.0 from central in [default]
	org.neo4j#neo4j-cypher-dsl;2022.9.0 from central in [default]
	org.neo4j.driver#neo4j-java-driver;4.4.12 from central in [default]
	org.reactivestreams

## Load Data

In [14]:
def load_data_into_spark_dataframe(category: Literal["bipartite", "monopartite", "predefined_components"]) -> DataFrame:
    """
    Read the benchmark CSV for one graph structure into a Spark DataFrame.

    The data is expected under ``data/<category>/``. The first visible entry
    (in sorted order) of that directory is read with the first row treated as
    the header.

    Parameters
    ----------
    category : Literal["bipartite", "monopartite", "predefined_components"]
        Name of the sub-directory of ``data/`` to read from.

    Returns
    -------
    DataFrame
        The CSV contents, read through the notebook-level ``spark_session``.

    Raises
    ------
    FileNotFoundError
        If ``data/<category>/`` does not exist or holds no visible entry.
    """
    file_path = "data/" + category + "/"

    # os.listdir returns entries in arbitrary order and includes hidden or
    # marker files (.DS_Store, _SUCCESS, .*.crc). Skip those and sort so the
    # same file is picked on every run and on every machine.
    candidates = sorted(
        name for name in os.listdir(file_path) if not name.startswith((".", "_"))
    )
    if not candidates:
        raise FileNotFoundError(f"No data file found in '{file_path}'")

    return spark_session.read.option("header", True).csv(file_path + candidates[0])

In [15]:
# Read the CSV found under data/bipartite/ into a Spark DataFrame.
bp_sdf = load_data_into_spark_dataframe("bipartite")

In [None]:
# Read the CSV found under data/monopartite/ into a Spark DataFrame.
mp_sdf = load_data_into_spark_dataframe("monopartite")

In [None]:
# Read the CSV found under data/predefined_components/ into a Spark DataFrame.
pc_sdf = load_data_into_spark_dataframe("predefined_components")

## Benchmarking

### Set Up Tasks

In [16]:
# Dispatch table: graph structure -> role -> loader callable.
#   "serial"   : relationship loader used for the serial benchmark run
#   "parallel" : relationship loader used for the parallel benchmark run
#   "nodes"    : node loader run before either relationship load
ingest_functions = {
    "bipartite": dict(
        serial=load_bipartite_relationships_in_serial,
        parallel=load_bipartite_relationships_in_parallel,
        nodes=load_bipartite_nodes,
    ),
    "monopartite": dict(
        serial=load_monopartite_relationships_in_serial,
        parallel=load_monopartite_relationships_in_parallel,
        nodes=load_monopartite_nodes,
    ),
    # The predefined-components benchmark uses bipartite data, so its serial
    # and node loaders are the bipartite ones.
    "predefined_components": dict(
        serial=load_bipartite_relationships_in_serial,
        parallel=load_predefined_components_relationships_in_parallel,
        nodes=load_bipartite_nodes,
    ),
}

In [None]:
# Sample sizes to benchmark: 10, 100, 1_000, 10_000, 100_000.
sample_sizes = [10 ** exponent for exponent in range(1, 6)]

# Source DataFrames keyed by the index in `unsampled_tasks` of the first task
# of each graph structure (each structure owns two consecutive tasks).
sdfs = dict(zip((0, 2, 4), (bp_sdf, mp_sdf, pc_sdf)))

# One serial task followed by one parallel task per graph structure, in the
# order bipartite, monopartite, predefined_components.
unsampled_tasks = [
    {"graph_structure": structure, "load_strategy": strategy, "num_groups": groups}
    for structure in ("bipartite", "monopartite", "predefined_components")
    for strategy, groups in (("serial", None), ("parallel", 1))
]

In [17]:
from benchmarking.utils.results import create_results_dataframe, append_results_to_dataframe, generate_benchmark_results, save_dataframe

ImportError: cannot import name 'save_dataframe' from 'benchmarking.utils.results' (/Users/alexandergilmore/Documents/projects/neo4j-parallel-spark-loader/benchmarking/utils/results.py)

### Run Benchmarking

In [13]:
# Results table returned by create_results_dataframe(); the benchmark loop
# appends one row to it per benchmark run.
results_df = create_results_dataframe()

In [None]:
# Run every benchmark: for each graph structure and each sample size, load the
# nodes once, then time each load strategy for that structure in task order.
#
# `unsampled_tasks` holds consecutive (serial, parallel) pairs per graph
# structure, and `sdfs` is keyed by the index of the first task of each pair,
# so the outer loop steps through the task list two entries at a time.
# (The previous `range(0, 2, len(unsampled_tasks))` had stop and step swapped
# and therefore only ever visited index 0.)
#
# NOTE(review): the recorded import of `save_dataframe` raised ImportError;
# confirm it is exported by benchmarking.utils.results before running.
tasks_per_structure = 2
total_runs = len(unsampled_tasks) * len(sample_sizes)

counter = 0  # number of benchmark runs completed so far
for idx in range(0, len(unsampled_tasks), tasks_per_structure):
    structure_tasks = unsampled_tasks[idx:idx + tasks_per_structure]
    graph_structure = structure_tasks[0].get("graph_structure")
    loaders = ingest_functions.get(graph_structure)

    for s in sample_sizes:
        # presumably each source CSV holds ~100,000 rows, so this fraction
        # targets roughly `s` rows — TODO confirm against the data files
        sampled_sdf = sdfs.get(idx).sample(s / 100000.0)

        # create constraints
        create_constraints(spark_session=spark_session)

        # load nodes
        fn = loaders.get("nodes")
        fn(sampled_sdf)

        # load relationships once per strategy, on the same sample
        for position, task in enumerate(structure_tasks):
            load_strategy = task.get("load_strategy")
            num_groups = task.get("num_groups")
            results_row = generate_benchmark_results(
                spark_dataframe=sampled_sdf,
                graph_structure=graph_structure,
                ingest_function=loaders.get(load_strategy),
                num_groups=num_groups if load_strategy == "parallel" else None,
            )
            results_df = append_results_to_dataframe(results_df, results_row)

            # persist after every run so partial results survive a failure
            save_dataframe(results_df)
            counter += 1

            # clean up relationships before the next strategy reuses the nodes;
            # the last strategy is followed by the database refresh instead
            if position < len(structure_tasks) - 1:
                delete_relationships(spark_session=spark_session)

        # refresh database
        restore_database(spark_session=spark_session)

        print(f"{round(100 * counter / total_runs, 2)}% Complete")