In [1]:
import os
from io import BytesIO
from zipfile import ZipFile

import requests
from neo4j_parallel_spark_loader import ingest_spark_dataframe
from neo4j_parallel_spark_loader.monopartite import group_and_batch_spark_dataframe
from pyspark.sql import SparkSession


StatementMeta(medium, 18, 2, Finished, Available, Finished)

## Create Spark session
Update the values below for your environment.

In [2]:
# Neo4j connection settings. Each value is read from the environment variable
# of the same name so real credentials never need to be saved in the notebook.
# When a variable is unset, the original placeholder string is used and must
# be replaced by hand before the write cells will work.
username = os.environ.get("NEO4J_USER", "NEO4J_USER")
password = os.environ.get("NEO4J_PASSWORD", "NEO4J_PASSWORD")
url = os.environ.get("NEO4J_URL", "NEO4J_URL")
dbname = os.environ.get("NEO4J_DATABASE", "NEO4J_DATABASE")

# Number of Spark executors; used below to size the relationship groups.
spark_executor_count = 5

# Register the connection settings on the session so that later writes with
# format("org.neo4j.spark.DataSource") do not have to repeat them.
spark = (
    SparkSession.builder
    .appName("TwitchGamers")
    .config("neo4j.url", url)
    # NOTE(review): the URL is also set under the un-prefixed key "url", as in
    # the original notebook — confirm whether anything still reads this key.
    .config("url", url)
    .config("neo4j.authentication.basic.username", username)
    .config("neo4j.authentication.basic.password", password)
    .config("neo4j.database", dbname)
    .getOrCreate()
)

StatementMeta(medium, 18, 3, Finished, Available, Finished)

## Download data

In [3]:
# Download the ZIP archive. The timeout stops a stalled connection from
# hanging the notebook, and raise_for_status() reports an HTTP error here
# instead of letting an error page fail later as a corrupt ZIP file.
response = requests.get("https://snap.stanford.edu/data/twitch_gamers.zip", timeout=60)
response.raise_for_status()

# Read the edge-list CSV straight from the in-memory archive. The context
# managers close both the archive and the member file once the text is read.
with ZipFile(BytesIO(response.content)) as zip_file:
    with zip_file.open("large_twitch_edges.csv") as file:
        content = file.read().decode('utf-8')

# Distribute the CSV lines as an RDD and let Spark parse them, taking the
# column names from the header row.
rdd = spark.sparkContext.parallelize(content.splitlines())
twitch_df = spark.read.csv(rdd, header=True)

# twitch_df is the Spark DataFrame holding the edge list.
# Preview the rows and confirm the schema.
twitch_df.show()
twitch_df.printSchema()

StatementMeta(medium, 18, 4, Finished, Available, Finished)

+------------+------------+
|numeric_id_1|numeric_id_2|
+------------+------------+
|       98343|      141493|
|       98343|       58736|
|       98343|      140703|
|       98343|      151401|
|       98343|      157118|
|       98343|      125430|
|       98343|        3635|
|       98343|         495|
|       98343|      116648|
|       98343|        1679|
|       98343|      123861|
|       98343|       89631|
|       98343|      113417|
|       98343|      145281|
|       98343|       10408|
|       98343|        3181|
|       98343|       40675|
|       98343|       95914|
|       98343|      155127|
|       98343|      124827|
+------------+------------+
only showing top 20 rows

root
 |-- numeric_id_1: string (nullable = true)
 |-- numeric_id_2: string (nullable = true)



In [5]:
# Count the edge rows that were read from the CSV.
twitch_df.count()

StatementMeta(medium, 18, 6, Finished, Available, Finished)

6797557

## Load nodes

In [8]:
# Build the node list: every id that appears at either end of an edge,
# exposed under a single column name and de-duplicated.
source_ids = twitch_df.select('numeric_id_1').withColumnRenamed('numeric_id_1', 'nodeId')
target_ids = twitch_df.select('numeric_id_2').withColumnRenamed('numeric_id_2', 'nodeId')

# dropDuplicates() applies to the combined column, not to one side only.
node_df = source_ids.union(target_ids).dropDuplicates()
node_df.count()

StatementMeta(medium, 18, 9, Finished, Available, Finished)

168114

In [10]:
# Write one :Node per row of node_df through the Neo4j Spark connector.
node_write_options = {
    "labels": ":Node",
    # Column nodeId supplies the node property `id`, which the relationship
    # query later matches on.
    "node.keys": "nodeId:id",
    # NOTE(review): presumably asks the connector to create a node key
    # constraint on the key property — confirm against the connector docs.
    "schema.optimization.node.keys": "KEY",
}

node_writer = node_df.write.format("org.neo4j.spark.DataSource").mode("Overwrite")
node_writer.options(**node_write_options).save()

StatementMeta(medium, 18, 11, Finished, Available, Finished)

## Load relationships in parallel

In [13]:
# Assign every edge row a group and a batch. The group count is derived from
# the executor count: 2 * 5 - 1 = 9 with the setting above.
group_count = 2 * spark_executor_count - 1

rel_batch_df = group_and_batch_spark_dataframe(
    spark_dataframe=twitch_df,
    source_col='numeric_id_1',
    target_col='numeric_id_2',
    num_groups=group_count,
)

StatementMeta(medium, 18, 14, Finished, Available, Finished)

In [14]:
# Preview the edge rows together with the group and batch columns added by
# group_and_batch_spark_dataframe.
rel_batch_df.show()

StatementMeta(medium, 18, 15, Finished, Available, Finished)

+------------+------------+------+-----+
|numeric_id_1|numeric_id_2| group|batch|
+------------+------------+------+-----+
|      111207|      159752|1 -- 5|    3|
|      111207|      145773|1 -- 5|    3|
|      111207|       42059|1 -- 6|    8|
|      111207|       29852|1 -- 7|    4|
|      111207|       75443|1 -- 1|    1|
|      111207|       12216|1 -- 2|    6|
|      111207|      123168|0 -- 1|    5|
|      111207|       40798|1 -- 1|    1|
|      111207|       52070|0 -- 1|    5|
|      111207|      118432|1 -- 3|    2|
|      111207|      140508|1 -- 2|    6|
|      111207|       78550|1 -- 8|    0|
|      111207|       76995|1 -- 2|    6|
|      111207|       73856|1 -- 3|    2|
|      111207|      154308|1 -- 2|    6|
|      111207|      129693|0 -- 1|    5|
|      111207|       63938|1 -- 1|    1|
|      111207|       21381|1 -- 4|    7|
|      111207|       28488|1 -- 2|    6|
|      111207|       13564|1 -- 1|    1|
+------------+------------+------+-----+
only showing top

In [15]:
# Cypher applied to each DataFrame row, which is exposed to the query as
# `event`: look up both endpoint nodes by id and MERGE the relationship
# between them.
query = """
    MATCH(source:Node {id: event.numeric_id_1})
    MATCH(target:Node {id: event.numeric_id_2})
    MERGE(source)-[r:RELATES_TO]->(target)
    """

# Hand the grouped and batched rows to the parallel loader.
rel_write_options = {"query": query}
ingest_spark_dataframe(
    spark_dataframe=rel_batch_df,
    save_mode="Overwrite",
    options=rel_write_options,
)

StatementMeta(medium, 18, 16, Finished, Available, Finished)

## Delete relationships

In [19]:
# Work out how many delete passes of 10000 relationships are needed to cover
# every edge row. One pass is added on top of the full batches so that a
# trailing partial batch is also covered.
rel_count = twitch_df.count()
full_batches, _ = divmod(rel_count, 10000)
batch_count = full_batches + 1
print(rel_count, batch_count)

StatementMeta(medium, 18, 20, Finished, Available, Finished)

6797557 680


In [24]:
from pyspark.sql.functions import lit

# Driver DataFrame for the delete: one row per delete pass. Each row carries
# only the constant 1 in column "id".
constant_id = lit(1).alias("id")
del_df = spark.range(batch_count).select(constant_id)
print(del_df.count())

StatementMeta(medium, 18, 25, Finished, Available, Finished)

680


In [26]:
# Each execution of this query removes at most 10000 RELATES_TO relationships.
del_query = "MATCH ()-[r:RELATES_TO]->() WITH r LIMIT 10000 DELETE r"

# coalesce(1) collapses del_df to a single partition before the write.
del_writer = (
    del_df.coalesce(1).write
    .format("org.neo4j.spark.DataSource")
    .mode("Overwrite")
)

# NOTE(review): batch.size of 1 presumably makes the connector send one row
# per transaction, so each row triggers one delete pass — confirm against the
# connector documentation.
del_writer = del_writer.option("query", del_query).option("batch.size", 1)
del_writer.save()

StatementMeta(medium, 18, 27, Finished, Available, Finished)

## Load relationships serially

In [27]:
# Load the same edges through a single partition, reusing the relationship
# `query` defined in the parallel-load cell, for comparison with that load.
serial_writer = (
    twitch_df.coalesce(1).write
    .format("org.neo4j.spark.DataSource")
    .mode("Overwrite")
)
serial_writer.option("query", query).save()

StatementMeta(medium, 18, 28, Finished, Available, Finished)