# Example to Read / Write to Neo4j with Spark

Documentation: https://neo4j.com/developer/spark/

NOTE: At the time this example was put together, the Neo4j connector for Spark DataFrames was not available from a Maven repository in the required version. Therefore we install the jar manually:

In [1]:
# Copy the Neo4j connector jar from the work directory into /usr/local/spark/jars
# (presumably the directory this Spark installation loads jars from — confirm for your image).
! sudo cp /home/jovyan/work/jars/neo4j-connector-apache-spark_2.12-4.1.0_for_spark_3.jar /usr/local/spark/jars/neo4j-connector-apache-spark_2.12-4.1.0_for_spark_3.jar

In [2]:
import pyspark
from pyspark.sql import SparkSession

In [3]:
# NEO4J  CONFIGURATION
# Bolt URL of the Neo4j server; it is passed as the "url" option to the
# connector on every read and write in this notebook.
bolt_url = "bolt://neo4j:7687"

In [4]:
# Spark init
# NOTE: the connector is not configured through
#   .config("spark.jars.packages", "neo4j-contrib:neo4j-connector-apache-spark_2.12:4.1.0")
# because that package was NOT AVAILABLE, so no such entry appears in the builder.
spark = (
    SparkSession.builder
    .master("local")
    .appName('jupyter-pyspark')
    .getOrCreate()
)

# Keep a handle on the SparkContext and restrict logging to errors
sc = spark.sparkContext
sc.setLogLevel("ERROR")

21/12/09 16:37:42 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [43]:
# Load the sample stock quotes from a local JSON file (read with "multiline" enabled)
json_reader = spark.read.option("multiline", "true")
df = json_reader.json("/home/jovyan/datasets/json-samples/stocks.json")

# Display the DataFrame as a pandas table
df.toPandas()

Unnamed: 0,price,symbol
0,126.82,AAPL
1,3098.12,AMZN
2,251.11,FB
3,1725.05,GOOG
4,128.39,IBM
5,212.55,MSFT
6,78.0,NET
7,497.0,NFLX
8,823.8,TSLA
9,45.11,TWTR


In [44]:
# Erase the stocks nodes if they exist, with a Cypher query.
# DETACH DELETE is used so that relationships attached to those nodes are removed
# as well; a plain DELETE is rejected by Neo4j when a matched node still has
# relationships.
(
    df.write.format("org.neo4j.spark.DataSource")
    .mode("Overwrite")
    .option("url", bolt_url)
    .option("query", "MATCH (s:stocks) DETACH DELETE s")
    .save()
)

In [51]:
# Write the DataFrame back to Neo4j as nodes labelled "stocks", using symbol as the node key
stocks_writer = (
    df.write.format("org.neo4j.spark.DataSource")
    .mode("Overwrite")
    .option("url", bolt_url)
    .option("labels", "stocks")
    .option("node.keys", "symbol")
)
stocks_writer.save()

In [48]:
# Read the nodes labelled "stocks" back from Neo4j into a new DataFrame
df1 = (
    spark.read.format("org.neo4j.spark.DataSource")
    .option("url", bolt_url)
    .option("labels", "stocks")
    .load()
)

# Display the result as a pandas table
df1.toPandas()

Unnamed: 0,<id>,<labels>,symbol,price
0,0,[stocks],AAPL,126.82
1,1,[stocks],AMZN,3098.12
2,2,[stocks],FB,251.11
3,3,[stocks],GOOG,1725.05
4,4,[stocks],IBM,128.39
5,5,[stocks],MSFT,212.55
6,6,[stocks],NET,78.0
7,7,[stocks],NFLX,497.0
8,8,[stocks],TSLA,823.8
9,9,[stocks],TWTR,45.11


In [47]:
# A Cypher query can be used as the read source, too:
# here, the symbol and price of every stocks node priced below 100.
cipher_ql = "MATCH (s:stocks) where s.price < 100  RETURN s.symbol, s.price"
query_reader = (
    spark.read.format("org.neo4j.spark.DataSource")
    .option("url", bolt_url)
    .option("query", cipher_ql)
)
df2 = query_reader.load()
df2.toPandas()

Unnamed: 0,s.symbol,s.price
0,NET,78.0
1,TWTR,45.11


# Full example 

- Table of Pets in shelters
- Make Pets Label, Make Shelters Label
- Make Relationship of Pets Found in Shelter
- 100% Cypher Queries

In [133]:
# Sample pets table: one tuple per pet, with fields in the order listed in cols
cols = ["id", "name", "type", "shelter", "years_at_shelter"]
data = [
    (1, "Fido", "Dog", "SPCA", 1),
    (2, "Felix", "Cat", "SPCA", 2),
    (3, "Rover", "Dog", "SPCA", 1),
]
pets = spark.createDataFrame(data=data, schema=cols)

# Display the DataFrame as a pandas table
pets.toPandas()

Unnamed: 0,id,name,type,shelter,years_at_shelter
0,1,Fido,Dog,SPCA,1
1,2,Felix,Cat,SPCA,2
2,3,Rover,Dog,SPCA,1


In [136]:
# Write data with Cypher queries. Inside each query, "event" refers to one row of
# the DataFrame being written.
cypher_writes = [
    # 1) MERGE a Pets node for each row, matched on name and type
    (pets, "MERGE (p:Pets {name: event.name , type: event.type})"),
    # 2) MERGE a Shelters node for each distinct shelter name
    (pets.select("shelter").distinct(), "MERGE (s:Shelters {name: event.shelter})"),
    # 3) MERGE a FOUND_IN relationship from each pet to its shelter
    (pets, '''
MATCH (p:Pets {name: event.name}), (s:Shelters {name: event.shelter})
MERGE (p)-[:FOUND_IN {years: event.years_at_shelter}]->(s)
'''),
]

# Run the writes in order: nodes first, then the relationships that connect them.
# After the loop, cipher_ql holds the last query.
for frame, cipher_ql in cypher_writes:
    (
        frame.write.format("org.neo4j.spark.DataSource")
        .mode("Overwrite")
        .option("url", bolt_url)
        .option("query", cipher_ql)
        .save()
    )

In [135]:
# Generic execution of a Cypher statement: the write path needs a DataFrame to
# drive it, so a one-row DataFrame is created for that purpose.
# NOTE: this rebinds df.
df = spark.createDataFrame(data=[{'row': 1}])

# Delete every node together with its relationships
cipher_ql = '''
MATCH (n)
DETACH DELETE n
'''
generic_writer = (
    df.write.format("org.neo4j.spark.DataSource")
    .mode("Overwrite")
    .option("url", bolt_url)
    .option("query", cipher_ql)
)
generic_writer.save()
