# Neo4j Parallel Spark Loader Benchmarking

This notebook evaluates the performance of the `neo4j-parallel-spark-loader` library versus loading serially.

## Imports

In [11]:
import os
from typing import Literal

from pyspark.sql import DataFrame, SparkSession

from neo4j_parallel_spark_loader import bipartite, monopartite, predefined_components, ingest_spark_dataframe
from benchmarking.utils.spark import create_spark_session

## Create Spark Session

In [10]:
# Notebook-level Spark session built by the project helper; the data-loading
# function defined below reads this global rather than taking it as a parameter.
spark_session: SparkSession = create_spark_session()
...

## Load Data

In [14]:
def load_data_into_spark_dataframe(category: Literal["bipartite", "monopartite", "predefined_components"]) -> DataFrame:
    """Read the benchmark data file for ``category`` into a Spark DataFrame.

    The file is looked up in ``data/<category>/`` (relative to the current
    working directory) and read as CSV with the ``header`` option enabled,
    using the notebook-level ``spark_session``.

    Args:
        category: Name of the sub-directory of ``data/`` to load from.

    Returns:
        The DataFrame produced by ``spark_session.read.csv`` for the selected file.

    Raises:
        FileNotFoundError: If the directory does not exist or contains no
            usable (non-hidden) entry.
    """
    dir_path = os.path.join("data", category)

    # os.listdir returns entries in arbitrary order and includes hidden/marker
    # files (e.g. ".DS_Store", ".part-0000.csv.crc", "_SUCCESS"). Skip those and
    # sort so the same file is picked on every run and on every machine.
    candidates = sorted(
        name for name in os.listdir(dir_path) if not name.startswith((".", "_"))
    )
    if not candidates:
        raise FileNotFoundError(f"No data file found in {dir_path!r}")

    data_path = os.path.join(dir_path, candidates[0])
    return spark_session.read.option("header", True).csv(data_path)

In [15]:
# Load the bipartite sample relationships from data/bipartite/ into a Spark DataFrame.
bp_sdf = load_data_into_spark_dataframe("bipartite")

In [16]:
# Preview the loaded rows to sanity-check the columns
# (relationship_prop, relationship_type, source, target).
bp_sdf.show()

+-----------------+-----------------+------+------+
|relationship_prop|relationship_type|source|target|
+-----------------+-----------------+------+------+
|           prop_b|            REL_A|     a|     e|
|           prop_d|            REL_C|     b|     d|
|           prop_c|            REL_B|     b|     d|
|           prop_b|            REL_C|     a|     e|
|           prop_c|            REL_B|     c|     d|
|           prop_c|            REL_A|     a|     d|
|           prop_b|            REL_C|     c|     e|
|           prop_a|            REL_C|     b|     d|
|           prop_d|            REL_C|     a|     f|
|           prop_d|            REL_B|     c|     d|
|           prop_c|            REL_C|     c|     f|
|           prop_d|            REL_B|     a|     f|
|           prop_e|            REL_A|     a|     e|
|           prop_b|            REL_A|     b|     f|
|           prop_a|            REL_A|     a|     d|
|           prop_c|            REL_C|     c|     e|
|           