# Unit I
# Graph Database Model

- Examples From Video Lecture 

NOTE: At the time of integration, the Spark DataFrame integration for Neo4j did not have a Maven repository set up with the proper version. Therefore we install the connector jar manually:

In [1]:
# Copy the Neo4j Spark connector jar from the work directory into Spark's jars directory (uses sudo).
! sudo cp /home/jovyan/work/jars/neo4j-connector-apache-spark_2.12-4.1.0_for_spark_3.jar /usr/local/spark/jars/neo4j-connector-apache-spark_2.12-4.1.0_for_spark_3.jar

In [1]:
import pyspark
from pyspark.sql import SparkSession
# Neo4j connection settings: Bolt endpoint used by every read/write below
bolt_url = "bolt://neo4j:7687"

# Create (or reuse) a local Spark session for this notebook
spark = (
    SparkSession.builder
    .master("local")
    .appName('jupyter-pyspark')
    .getOrCreate()
)

# Keep a handle on the SparkContext and only log errors
sc = spark.sparkContext
sc.setLogLevel("ERROR")

22/04/01 17:25:44 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


## Cypher Query Language

- These examples should be run from the Neo4j Web Client at http://localhost:7474

### Merge/Create data

```
merge(f1:Faculty{name:'Mike'})-[t1:TEACHES{semester:'fall2021'}]->(c1:Course{code:'IST769'})
merge(f1)-[t2:TEACHES{semester:'fall2021'}]->(c2:Course{code:'IST256'})
merge(f1)-[p1:PROFESSOR_OF_RECORD]->(c3:Course{code:'IST659'})
merge(f2:Faculty{name:'Jill'})-[t3:TEACHES{semester:'fall2021'}]->(c3)
merge(f1)-[:TEACHES{semester:'spring2020'}]->(c3)
```

### Match Queries to Read Data

```
# pull the entire graph
match (f:Faculty)-[t:TEACHES]->(c:Course) return f,c,t

# pull the instructors of IST659
match (f:Faculty)-[t:TEACHES]->(c:Course) where c.code="IST659" return f,t,c

# get data in a table: List of professors of record (note relationship direction)
match (c:Course)<-[:PROFESSOR_OF_RECORD]-(f:Faculty) return c.code, f.name

# Sorting: ORDER BY comes after RETURN
match (c:Course) return c.code order by c.code

```

### Querying the Movie Graph

```

1. Build a graph of movies and reviewers
MATCH (a:Person)-[d:REVIEWED]->(m:Movie) 
RETURN a,d,m


2. Do the same thing, but only for reviews with ratings over 70

MATCH (a:Person)-[d:REVIEWED]->(m:Movie) 
where d.rating > 70
RETURN a,d,m

3. Build a graph of actors and directors with movies

MATCH (p1:Person)-[a:ACTED_IN]->(m:Movie)<-[d:DIRECTED]-(p2:Person)
RETURN p1,a,m,d,p2


4. Build a graph of people who acted in and directed the same movie
MATCH (p:Person)-[a:ACTED_IN]->(m:Movie)<-[d:DIRECTED]-(p)
RETURN p,a,m,d


```


### Updates and Deletes


```
# Setting attributes - they are all ischool courses

match (c:Course) set c.college = "iSchool" return c

# Make IST769 a graduate-level course of type Data Science
match (c:Course {code:"IST769"}) set c.level = "Graduate", c.type="Data Science" return c

# Make Jill the Professor of Record for IST659 instead of Mike

# Add the jill relationship
match (j:Faculty{name:"Jill"}),(c:Course{code:"IST659"})
merge (j)-[p:PROFESSOR_OF_RECORD]->(c)
return j,p,c

# Remove the mike relationship
match (m:Faculty{name:"Mike"})-[p:PROFESSOR_OF_RECORD]->(c:Course{code:"IST659"})
delete p
return m,c

```


### Graph Machine Learning Example

- This example calculates the "shortest path" in a weighted graph

```
## Make the weighted graph

CREATE (a:Location {name: 'A'}), (b:Location {name: 'B'}), (c:Location {name: 'C'}), (d:Location {name: 'D'}), (e:Location {name: 'E'}), (f:Location {name: 'F'}), (a)-[:ROAD {cost: 50}]->(b), (a)-[:ROAD {cost: 50}]->(c), (a)-[:ROAD {cost: 100}]->(d), (b)-[:ROAD {cost: 40}]->(d), (c)-[:ROAD {cost: 40}]->(d), (c)-[:ROAD {cost: 80}]->(e), (d)-[:ROAD {cost: 30}]->(e), (d)-[:ROAD {cost: 80}]->(f), (e)-[:ROAD {cost: 40}]->(f);


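## Then call the algo
## NOTE: the named graph 'myGraph' must already exist in the GDS graph catalog. Project it first, e.g.:
## CALL gds.graph.project('myGraph', 'Location', 'ROAD', {relationshipProperties: 'cost'})
## (on GDS 1.x the procedure is named gds.graph.create)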

MATCH (source:Location {name:'A'}), (target:Location {name:'F'})
CALL gds.shortestPath.dijkstra.stream('myGraph',{
   sourceNode:source, targetNode:target, relationshipWeightProperty:'cost'
})
YIELD index,sourceNode,targetNode,totalCost,nodeIds,costs,path
RETURN
   index,
   gds.util.asNode(sourceNode).name AS sourceNodeName,
   gds.util.asNode(targetNode).name AS targetNodeName,
   totalCost,
   [nodeId IN nodeIds | gds.util.asNode(nodeId).name] AS nodeNames,
   costs, nodes(path) as path
ORDER BY index

```


## Neo4j Spark Example

- sample example of Faculty and Courses

In [3]:
# Start from a clean slate: delete every node along with its relationships.
cipher_ql = '''
MATCH (n)
DETACH DELETE n
'''

# The write needs a DataFrame to send, so use a one-row placeholder.
df = spark.createDataFrame(data=[{'row': 1}])
(
    df.write
    .format("org.neo4j.spark.DataSource")
    .mode("Overwrite")
    .option("url", bolt_url)
    .option("query", cipher_ql)
    .save()
)

                                                                                

In [4]:
# Column names for the teaching-assignment rows
cols = ["facname", "factitle", "course", "level", "taught", "is_por"]

# One tuple per faculty/course/semester assignment; the last field is the is_por flag
data = [
    ('Mike', 'PoP', "IST256", "UGrad", "Fall2021", False),
    ('Mike', 'PoP', "IST659", "Grad", "Spring2021", True),
    ('Mike', 'PoP', "IST769", "Grad", "Fall2021", False),
    ('Jill', 'Adjunct', "IST659", "Grad", "Fall2021", False),
]

profs = spark.createDataFrame(data=data, schema=cols)
profs.toPandas()

Unnamed: 0,facname,factitle,course,level,taught,is_por
0,Mike,PoP,IST256,UGrad,Fall2021,False
1,Mike,PoP,IST659,Grad,Spring2021,True
2,Mike,PoP,IST769,Grad,Fall2021,False
3,Jill,Adjunct,IST659,Grad,Fall2021,False


In [6]:
# Write data with a Cypher query. "event" refers to each row of the DataFrame being written.

print("Faculty...")
cipher_ql = "MERGE (f:Faculty {name: event.facname , title: event.factitle})"

# One row per distinct (name, title) pair
x = profs.select("facname", "factitle").distinct()
(
    x.write
    .format("org.neo4j.spark.DataSource")
    .mode("Overwrite")
    .option("url", bolt_url)
    .option("query", cipher_ql)
    .save()
)


Faculty...


                                                                                

In [7]:
print("Courses...")
cipher_ql = "MERGE (c:Courses {code: event.course, level: event.level })"

# One row per distinct (course, level) pair, merged as a Courses node
(
    profs.select("course", "level")
    .distinct()
    .write
    .format("org.neo4j.spark.DataSource")
    .mode("Overwrite")
    .option("url", bolt_url)
    .option("query", cipher_ql)
    .save()
)


Courses...


                                                                                

In [8]:
print("Teaching...")

# For each row, look up the course and faculty nodes and merge a TEACHES
# relationship carrying the semester taught.
cipher_ql = '''
MATCH (c:Courses {code: event.course}), (f:Faculty {name: event.facname})
MERGE (f)-[:TEACHES {semester: event.taught}]->(c)
'''
(
    profs.write
    .format("org.neo4j.spark.DataSource")
    .mode("Overwrite")
    .option("url", bolt_url)
    .option("query", cipher_ql)
    .save()
)



Teaching...


In [9]:
print("Professor of Record...")

# Only rows flagged is_por are sent; each merges a PROFESSOR_OF_RECORD
# relationship between the matched faculty and course nodes.
cipher_ql = '''
MATCH (c:Courses {code: event.course}), (f:Faculty {name: event.facname})
MERGE (f)-[:PROFESSOR_OF_RECORD]->(c)
'''
(
    profs.where("is_por")
    .write
    .format("org.neo4j.spark.DataSource")
    .mode("Overwrite")
    .option("url", bolt_url)
    .option("query", cipher_ql)
    .save()
)

Professor of Record...


In [10]:
# Read back the teaching information for Grad-level courses as a DataFrame.

cipher_ql = '''
MATCH (f:Faculty)-[t:TEACHES]->(c:Courses) 
WHERE c.level='Grad' 
RETURN c.code, c.level, t.semester, f.name, f.title
'''
df2 = (
    spark.read
    .format("org.neo4j.spark.DataSource")
    .option("url", bolt_url)
    .option("query", cipher_ql)
    .load()
)
df2.toPandas()

Unnamed: 0,c.code,c.level,t.semester,f.name,f.title
0,IST769,Grad,Fall2021,Mike,PoP
1,IST659,Grad,Fall2021,Jill,Adjunct
2,IST659,Grad,Spring2021,Mike,PoP


In [None]:
# Optional cleanup: empty the graph database.
# Something must be sent to Neo4j, so a one-row DataFrame is written.

cipher_ql = '''
MATCH (n)
DETACH DELETE n
'''

# Dummy one-row DataFrame
df = spark.createDataFrame(data=[{'row': 1}])
(
    df.write
    .format("org.neo4j.spark.DataSource")
    .mode("Overwrite")
    .option("url", bolt_url)
    .option("query", cipher_ql)
    .save()
)
