Notebook Content:
- Create a Spark GraphFrame.
- GraphFrame operations.
- Motif finding in GraphFrames.
- Subgraphs.

In [1]:
import findspark
findspark.init()
import pyspark 
from pyspark.sql import SQLContext
from pyspark.sql import SparkSession
from graphframes import *

In [2]:
# 1) Create a GraphFrame
# 2) Run operations on the GraphFrame

In [3]:
# Degree metrics you can query on a GraphFrame:
# in-degree: the number of edges coming into a node
# out-degree: the number of edges going out of a node

In [4]:
# Obtain a SparkSession (reusing an existing one if present) under the app name 'graphframes'.
spark = (
    SparkSession.builder
    .appName('graphframes')
    .getOrCreate()
)

In [5]:
# Raw records for the first GraphFrame.
# Each vertex record is (id, name, age).
data = [
    ('a', 'Alice', 34),
    ('b', 'bob', 30),
    ('c', 'charlie', 26),
]

# Each edge record is (source id, destination id, relationship label).
relations = [
    ('a', 'b', 'friend'),
    ('b', 'c', 'follow'),
    ('c', 'b', 'follow'),
]

In [6]:
# Vertex DataFrame; GraphFrame looks up vertices by the 'id' column.
vert = spark.createDataFrame(data, schema=['id', 'name', 'age'])

In [7]:
# Edge DataFrame; 'src' and 'dst' hold the ids of the connected vertices.
# NOTE: the third column label 'relationshps' is misspelled (the second graph
# in this notebook uses 'relationship'). It is kept as-is so the recorded
# output of graphframe.edges.show() still matches.
edge = spark.createDataFrame(relations, schema=['src', 'dst', 'relationshps'])

In [8]:
# Build the GraphFrame from the vertex and edge DataFrames.
# Construction is lazy: nothing is computed until an action is triggered.
graphframe = GraphFrame(vert, edge)

In [9]:
# Displaying the object prints the schemas of its vertex (v) and edge (e)
# DataFrames, not their rows.
graphframe

GraphFrame(v:[id: string, name: string ... 1 more field], e:[src: string, dst: string ... 1 more field])

**Graph operations**

In [10]:
# inDegrees is also lazy: it evaluates to a DataFrame (id, inDegree) and
# shows only the schema until an action such as show() is called.
graphframe.inDegrees

DataFrame[id: string, inDegree: int]

In [11]:
# Vertices whose in-degree is zero (here 'a') do not appear in the result.
graphframe.inDegrees.show()

+---+--------+
| id|inDegree|
+---+--------+
|  c|       1|
|  b|       2|
+---+--------+



In [12]:
# Keep a handle on the in-degree DataFrame so it can be filtered like any other DataFrame.
df_in = graphframe.inDegrees

In [13]:
# Keep only the vertices that have at least two incoming edges.
df_in.filter('inDegree>=2').show()

+---+--------+
| id|inDegree|
+---+--------+
|  b|       2|
+---+--------+



In [14]:
# The vertex DataFrame the graph was built from.
graphframe.vertices.show()

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  a|  Alice| 34|
|  b|    bob| 30|
|  c|charlie| 26|
+---+-------+---+



In [15]:
# The edge DataFrame the graph was built from.
graphframe.edges.show()

+---+---+------------+
|src|dst|relationshps|
+---+---+------------+
|  a|  b|      friend|
|  b|  c|      follow|
|  c|  b|      follow|
+---+---+------------+



In [16]:
# Order the vertices by in-degree (ascending).
graphframe.inDegrees.sort('inDegree').show()

+---+--------+
| id|inDegree|
+---+--------+
|  c|       1|
|  b|       2|
+---+--------+



In [17]:
# Order the same result by vertex id instead.
graphframe.inDegrees.sort('id').show()

+---+--------+
| id|inDegree|
+---+--------+
|  b|       2|
|  c|       1|
+---+--------+



**Motif finding**

In [18]:
# Vertices (id, name, age) of a second, larger graph used in the sections that follow.
# NOTE: this rebinds `vert`, which previously held the first graph's vertices.
vert = spark.createDataFrame(
    [
        ('a', 'Alice', 34),
        ('b', 'Bob', 36),
        ('c', 'Charlie', 30),
        ('d', 'David', 29),
        ('e', 'Esther', 32),
        ('f', 'Fanny', 36),
    ],
    schema=["id", "name", "age"],
)

In [19]:
# Directed edges (src, dst, relationship) of the second graph.
edg = spark.createDataFrame(
    [
        ('a', 'e', 'friend'),
        ('f', 'b', 'follow'),
        ('c', 'e', 'friend'),
        ('a', 'b', 'friend'),
        ('b', 'c', 'follow'),
        ('c', 'b', 'follow'),
        ('f', 'c', 'follow'),
        ('e', 'f', 'follow'),
        ('e', 'd', 'friend'),
        ('d', 'a', 'friend'),
    ],
    schema=["src", "dst", "relationship"],
)

In [20]:
# Second GraphFrame, built from the six-vertex / ten-edge data defined above.
gf = GraphFrame(vert, edg)

In [21]:
# Motif finding: search the graph for a structural pattern.
# This pattern matches every edge e1 that goes from a node n1 to a node n2,
# so the result has one row per edge.
gf.find('(n1)-[e1]->(n2)').show()

+----------------+--------------+----------------+
|              n1|            e1|              n2|
+----------------+--------------+----------------+
| [e, Esther, 32]|[e, f, follow]|  [f, Fanny, 36]|
|[c, Charlie, 30]|[c, e, friend]| [e, Esther, 32]|
|  [a, Alice, 34]|[a, e, friend]| [e, Esther, 32]|
| [e, Esther, 32]|[e, d, friend]|  [d, David, 29]|
|  [f, Fanny, 36]|[f, c, follow]|[c, Charlie, 30]|
|    [b, Bob, 36]|[b, c, follow]|[c, Charlie, 30]|
|  [f, Fanny, 36]|[f, b, follow]|    [b, Bob, 36]|
|[c, Charlie, 30]|[c, b, follow]|    [b, Bob, 36]|
|  [a, Alice, 34]|[a, b, friend]|    [b, Bob, 36]|
|  [d, David, 29]|[d, a, friend]|  [a, Alice, 34]|
+----------------+--------------+----------------+



In [22]:
# Two-hop paths n1 -> n2 -> n3: reusing the name n2 in both terms ties the
# destination of e1 to the source of e2.
m = gf.find('(n1)-[e1]->(n2); (n2)-[e2]->(n3)')
m.show()

+----------------+--------------+----------------+--------------+----------------+
|              n1|            e1|              n2|            e2|              n3|
+----------------+--------------+----------------+--------------+----------------+
|[c, Charlie, 30]|[c, e, friend]| [e, Esther, 32]|[e, f, follow]|  [f, Fanny, 36]|
|  [a, Alice, 34]|[a, e, friend]| [e, Esther, 32]|[e, f, follow]|  [f, Fanny, 36]|
|  [f, Fanny, 36]|[f, c, follow]|[c, Charlie, 30]|[c, e, friend]| [e, Esther, 32]|
|    [b, Bob, 36]|[b, c, follow]|[c, Charlie, 30]|[c, e, friend]| [e, Esther, 32]|
|  [d, David, 29]|[d, a, friend]|  [a, Alice, 34]|[a, e, friend]| [e, Esther, 32]|
|[c, Charlie, 30]|[c, e, friend]| [e, Esther, 32]|[e, d, friend]|  [d, David, 29]|
|  [a, Alice, 34]|[a, e, friend]| [e, Esther, 32]|[e, d, friend]|  [d, David, 29]|
| [e, Esther, 32]|[e, f, follow]|  [f, Fanny, 36]|[f, c, follow]|[c, Charlie, 30]|
|  [f, Fanny, 36]|[f, b, follow]|    [b, Bob, 36]|[b, c, follow]|[c, Charlie, 30]|
|[c,

In [23]:
# The motif result is a regular DataFrame: keep the paths whose starting vertex is older than 30.
m.filter('n1.age>30').show()

+---------------+--------------+----------------+--------------+----------------+
|             n1|            e1|              n2|            e2|              n3|
+---------------+--------------+----------------+--------------+----------------+
| [a, Alice, 34]|[a, e, friend]| [e, Esther, 32]|[e, f, follow]|  [f, Fanny, 36]|
| [f, Fanny, 36]|[f, c, follow]|[c, Charlie, 30]|[c, e, friend]| [e, Esther, 32]|
|   [b, Bob, 36]|[b, c, follow]|[c, Charlie, 30]|[c, e, friend]| [e, Esther, 32]|
| [a, Alice, 34]|[a, e, friend]| [e, Esther, 32]|[e, d, friend]|  [d, David, 29]|
|[e, Esther, 32]|[e, f, follow]|  [f, Fanny, 36]|[f, c, follow]|[c, Charlie, 30]|
| [f, Fanny, 36]|[f, b, follow]|    [b, Bob, 36]|[b, c, follow]|[c, Charlie, 30]|
| [a, Alice, 34]|[a, b, friend]|    [b, Bob, 36]|[b, c, follow]|[c, Charlie, 30]|
|[e, Esther, 32]|[e, f, follow]|  [f, Fanny, 36]|[f, b, follow]|    [b, Bob, 36]|
| [f, Fanny, 36]|[f, c, follow]|[c, Charlie, 30]|[c, b, follow]|    [b, Bob, 36]|
|   [b, Bob, 36]

In [24]:
# Every named motif element is a struct column, which is why filters address
# nested fields such as n1.age or e1.relationship.
m.printSchema()

root
 |-- n1: struct (nullable = false)
 |    |-- id: string (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- age: long (nullable = true)
 |-- e1: struct (nullable = false)
 |    |-- src: string (nullable = true)
 |    |-- dst: string (nullable = true)
 |    |-- relationship: string (nullable = true)
 |-- n2: struct (nullable = false)
 |    |-- id: string (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- age: long (nullable = true)
 |-- e2: struct (nullable = false)
 |    |-- src: string (nullable = true)
 |    |-- dst: string (nullable = true)
 |    |-- relationship: string (nullable = true)
 |-- n3: struct (nullable = false)
 |    |-- id: string (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- age: long (nullable = true)



In [25]:
# Same two-hop pattern with anonymous edges ([]): the edges must still exist,
# but they are left out of the result, so only the vertex columns are returned.
gf.find('(n1)-[]->(n2); (n2)-[]->(n3)').show()

+----------------+----------------+----------------+
|              n1|              n2|              n3|
+----------------+----------------+----------------+
|[c, Charlie, 30]| [e, Esther, 32]|  [f, Fanny, 36]|
|  [a, Alice, 34]| [e, Esther, 32]|  [f, Fanny, 36]|
|  [f, Fanny, 36]|[c, Charlie, 30]| [e, Esther, 32]|
|    [b, Bob, 36]|[c, Charlie, 30]| [e, Esther, 32]|
|  [d, David, 29]|  [a, Alice, 34]| [e, Esther, 32]|
|[c, Charlie, 30]| [e, Esther, 32]|  [d, David, 29]|
|  [a, Alice, 34]| [e, Esther, 32]|  [d, David, 29]|
| [e, Esther, 32]|  [f, Fanny, 36]|[c, Charlie, 30]|
|  [f, Fanny, 36]|    [b, Bob, 36]|[c, Charlie, 30]|
|[c, Charlie, 30]|    [b, Bob, 36]|[c, Charlie, 30]|
|  [a, Alice, 34]|    [b, Bob, 36]|[c, Charlie, 30]|
| [e, Esther, 32]|  [f, Fanny, 36]|    [b, Bob, 36]|
|  [f, Fanny, 36]|[c, Charlie, 30]|    [b, Bob, 36]|
|    [b, Bob, 36]|[c, Charlie, 30]|    [b, Bob, 36]|
|  [d, David, 29]|  [a, Alice, 34]|    [b, Bob, 36]|
| [e, Esther, 32]|  [d, David, 29]|  [a, Alice

In [26]:
# Pairs of vertices connected in both directions. Each pair is reported twice,
# once for each assignment of n1 and n2.
gf.find('(n1)-[e1]->(n2); (n2)-[e2]->(n1)').show()

+----------------+--------------+----------------+--------------+
|              n1|            e1|              n2|            e2|
+----------------+--------------+----------------+--------------+
|[c, Charlie, 30]|[c, b, follow]|    [b, Bob, 36]|[b, c, follow]|
|    [b, Bob, 36]|[b, c, follow]|[c, Charlie, 30]|[c, b, follow]|
+----------------+--------------+----------------+--------------+



In [27]:
# To exclude edges that also have a reverse edge, negate the second term of the pattern.
# A first attempt would be the line below:
# gf.find('(n1)-[e1]->(n2); !(n2)-[e1]->(n1)').show()
# but it raises an error, because a negated term cannot contain a named edge:
# an edge that must not exist has nothing to return, so it has to be anonymous ([]).

In [28]:
# Working version: the edge inside the negated term is anonymous ([]).
# Returns every edge n1 -> n2 for which no edge n2 -> n1 exists.
gf.find('(n1)-[e1]->(n2); !(n2)-[]->(n1)').show()

+----------------+--------------+----------------+
|              n1|            e1|              n2|
+----------------+--------------+----------------+
|  [f, Fanny, 36]|[f, b, follow]|    [b, Bob, 36]|
|  [a, Alice, 34]|[a, e, friend]| [e, Esther, 32]|
| [e, Esther, 32]|[e, f, follow]|  [f, Fanny, 36]|
|[c, Charlie, 30]|[c, e, friend]| [e, Esther, 32]|
|  [a, Alice, 34]|[a, b, friend]|    [b, Bob, 36]|
| [e, Esther, 32]|[e, d, friend]|  [d, David, 29]|
|  [f, Fanny, 36]|[f, c, follow]|[c, Charlie, 30]|
|  [d, David, 29]|[d, a, friend]|  [a, Alice, 34]|
+----------------+--------------+----------------+



In [29]:
# Two-hop paths whose first edge is a "friend" edge and whose middle vertex is 32 years old.
(
    m.filter('e1.relationship = "friend"')
     .filter('n2.age = 32')
     .show()
)

+----------------+--------------+---------------+--------------+--------------+
|              n1|            e1|             n2|            e2|            n3|
+----------------+--------------+---------------+--------------+--------------+
|[c, Charlie, 30]|[c, e, friend]|[e, Esther, 32]|[e, f, follow]|[f, Fanny, 36]|
|  [a, Alice, 34]|[a, e, friend]|[e, Esther, 32]|[e, f, follow]|[f, Fanny, 36]|
|[c, Charlie, 30]|[c, e, friend]|[e, Esther, 32]|[e, d, friend]|[d, David, 29]|
|  [a, Alice, 34]|[a, e, friend]|[e, Esther, 32]|[e, d, friend]|[d, David, 29]|
+----------------+--------------+---------------+--------------+--------------+



**Subgraph**

In [30]:
# Full vertex set of the second graph, for comparison with the subgraphs built next.
gf.vertices.show()

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  a|  Alice| 34|
|  b|    Bob| 36|
|  c|Charlie| 30|
|  d|  David| 29|
|  e| Esther| 32|
|  f|  Fanny| 36|
+---+-------+---+



In [31]:
# Filtering the vertices also removes every edge that touches a removed vertex.
g1 = gf.filterVertices('age>35')

In [32]:
# Only the vertices older than 35 remain.
g1.vertices.show()

+---+-----+---+
| id| name|age|
+---+-----+---+
|  b|  Bob| 36|
|  f|Fanny| 36|
+---+-----+---+



In [33]:
# Only edges whose two endpoints both survived the vertex filter remain.
g1.edges.show()

+---+---+------------+
|src|dst|relationship|
+---+---+------------+
|  f|  b|      follow|
+---+---+------------+



In [34]:
# Filtering by edges does not remove any vertices, so isolated vertices can be left behind.
# Here 'f' becomes an isolated vertex: none of the remaining "friend" edges touches it.
g2 = gf.filterEdges('relationship="friend"')

In [35]:
# All six vertices are still present after the edge filter.
g2.vertices.show()

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  a|  Alice| 34|
|  b|    Bob| 36|
|  c|Charlie| 30|
|  d|  David| 29|
|  e| Esther| 32|
|  f|  Fanny| 36|
+---+-------+---+



In [36]:
# Only the "friend" edges remain.
g2.edges.show()

+---+---+------------+
|src|dst|relationship|
+---+---+------------+
|  a|  e|      friend|
|  c|  e|      friend|
|  a|  b|      friend|
|  e|  d|      friend|
|  d|  a|      friend|
+---+---+------------+



In [37]:
# Multiple conditions: filter the vertices, then the edges, and finally
# discard any vertex that is left without an edge.
g3 = (
    gf.filterVertices('age>30')
      .filterEdges('relationship="friend"')
      .dropIsolatedVertices()
)

In [38]:
# Fanny (36) passes the age filter but has no remaining "friend" edge,
# so dropIsolatedVertices() removed her.
g3.vertices.show()

+---+------+---+
| id|  name|age|
+---+------+---+
|  e|Esther| 32|
|  b|   Bob| 36|
|  a| Alice| 34|
+---+------+---+



In [39]:
# "friend" edges whose two endpoints are both older than 30.
g3.edges.show()

+---+---+------------+
|src|dst|relationship|
+---+---+------------+
|  a|  e|      friend|
|  a|  b|      friend|
+---+---+------------+



In [40]:
# A more complex selection: start from a motif and keep the "follow" edges
# that go from a younger vertex to an older one.
df_for_sub_graph = (
    gf.find('(n1)-[e1]->(n2)')
      .filter('e1.relationship = "follow"')
      .filter('n1.age < n2.age')
)

In [41]:
# One row per selected edge, with the source and destination vertices as struct columns.
df_for_sub_graph.show()

+----------------+--------------+--------------+
|              n1|            e1|            n2|
+----------------+--------------+--------------+
| [e, Esther, 32]|[e, f, follow]|[f, Fanny, 36]|
|[c, Charlie, 30]|[c, b, follow]|  [b, Bob, 36]|
+----------------+--------------+--------------+



In [42]:
# Rebuild a GraphFrame from the motif result. A GraphFrame needs a vertex and
# an edge DataFrame, so flatten the e1 struct back into plain edge columns
# and reuse the original vertices.
edges = df_for_sub_graph.select(
    'e1.src',
    'e1.dst',
    'e1.relationship',
)
# Drop the vertices that none of the selected edges touches.
new_gf_sub = GraphFrame(gf.vertices, edges).dropIsolatedVertices()

In [43]:
# Only the endpoints of the selected edges are left.
new_gf_sub.vertices.show()

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  f|  Fanny| 36|
|  e| Esther| 32|
|  c|Charlie| 30|
|  b|    Bob| 36|
+---+-------+---+



In [44]:
# The selected "follow" edges, back in plain (src, dst, relationship) form.
new_gf_sub.edges.show()

+---+---+------------+
|src|dst|relationship|
+---+---+------------+
|  e|  f|      follow|
|  c|  b|      follow|
+---+---+------------+



In [45]:
# Save the subgraph's vertices and edges as Parquet so the GraphFrame can be
# re-created later from the two saved DataFrames.
# mode('overwrite') replaces output left by a previous run. With the default
# save mode the write raises an error when the target path already exists,
# so re-running the notebook from a fresh kernel would fail on this cell.
new_gf_sub.vertices.write.mode('overwrite').parquet('vertices')
new_gf_sub.edges.write.mode('overwrite').parquet('edges')