# GraphFrames example
Last updated November 7, 2019

w261 - Machine learning at Scale   
Kyle Hamilton

Spark packages
https://spark-packages.org/

In [8]:
import os
PACKAGES = ['graphframes:graphframes:0.7.0-spark2.3-s_2.11']
os.environ['PYSPARK_SUBMIT_ARGS'] = f"--packages {','.join(PACKAGES)} pyspark-shell"
print(os.environ['PYSPARK_SUBMIT_ARGS'])

--packages graphframes:graphframes:0.7.0-spark2.3-s_2.11 pyspark-shell


In [9]:
# start Spark Session
from pyspark.sql import SparkSession
app_name = "graphframe_demo"
master = "local[*]"
spark = SparkSession\
        .builder\
        .appName(app_name)\
        .master(master)\
        .getOrCreate()
sc = spark.sparkContext

In [10]:
spark

In [11]:
# Create a Vertex DataFrame with unique ID column "id"
v = spark.createDataFrame([
  ("a", "Alice", 34),
  ("b", "Bob", 36),
  ("c", "Charlie", 30),
], ["id", "name", "age"])
# Create an Edge DataFrame with "src" and "dst" columns
e = spark.createDataFrame([
  ("a", "b", "friend"),
  ("b", "c", "follow"),
  ("c", "b", "follow"),
], ["src", "dst", "relationship"])

In [12]:
v.show()

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  a|  Alice| 34|
|  b|    Bob| 36|
|  c|Charlie| 30|
+---+-------+---+



In [13]:
e.show()

+---+---+------------+
|src|dst|relationship|
+---+---+------------+
|  a|  b|      friend|
|  b|  c|      follow|
|  c|  b|      follow|
+---+---+------------+



In [14]:
# Create a GraphFrame
from graphframes import *
g = GraphFrame(v, e)

# Query: Get in-degree of each vertex.
g.inDegrees.show()

# Query: Count the number of "follow" connections in the graph.
g.edges.filter("relationship = 'follow'").count()

# Run PageRank algorithm, and show results.
results = g.pageRank(resetProbability=0.01, maxIter=20)
results.vertices.select("id", "pagerank").show()

+---+--------+
| id|inDegree|
+---+--------+
|  c|       1|
|  b|       2|
+---+--------+

+---+------------------+
| id|          pagerank|
+---+------------------+
|  b|1.0905890109440908|
|  a|              0.01|
|  c|1.8994109890559092|
+---+------------------+



In [15]:
!cat test_graph.txt

2	{'3': 1}
3	{'2': 1}
4	{'1': 1, '2': 1}
5	{'4': 1, '2': 1, '6': 1}
6	{'2': 1, '5': 1}
7	{'2': 1, '5': 1}
8	{'2': 1, '5': 1}
9	{'2': 1, '5': 1}
10	{'5': 1}
11	{'5': 1}

In [16]:
graphData = spark.read.format("com.databricks.spark.csv") \
    .option("delimiter", "\t") \
    .option("header", "false") \
    .load('test_graph.txt')

In [17]:
graphData.take(1)

[Row(_c0='2', _c1="{'3': 1}")]

In [18]:
v = graphData.select('_c0').withColumnRenamed('_c0','id')

In [19]:
v.show()

+---+
| id|
+---+
|  2|
|  3|
|  4|
|  5|
|  6|
|  7|
|  8|
|  9|
| 10|
| 11|
+---+



In [20]:
import json
import ast
def getEdges(row):
    node_id, nodes = row
    nodes = ast.literal_eval(nodes)
    for node in nodes: 
        yield int(node_id), int(node)

In [21]:
e = spark.createDataFrame(graphData.rdd.flatMap(getEdges), ["src", "dst"])

In [22]:
e.show()

+---+---+
|src|dst|
+---+---+
|  2|  3|
|  3|  2|
|  4|  1|
|  4|  2|
|  5|  4|
|  5|  2|
|  5|  6|
|  6|  2|
|  6|  5|
|  7|  2|
|  7|  5|
|  8|  2|
|  8|  5|
|  9|  2|
|  9|  5|
| 10|  5|
| 11|  5|
+---+---+



In [79]:
# Create a GraphFrame
from graphframes import *
g = GraphFrame(v, e)

# Query: Get in-degree of each vertex.
g.inDegrees.show()

# Run PageRank algorithm, and show results.
results = g.pageRank(resetProbability=0.01, maxIter=20)
results.vertices.select("id", "pagerank").show()

+---+--------+
| id|inDegree|
+---+--------+
|  6|       1|
|  5|       6|
|  1|       1|
|  3|       1|
|  2|       7|
|  4|       1|
+---+--------+

+---+--------------------+
| id|            pagerank|
+---+--------------------+
|  8|0.010000000000000014|
|  4| 0.02956384319612453|
|  6| 0.02956384319612453|
| 11|0.010000000000000014|
| 10|0.010000000000000014|
|  5|0.059284062207826185|
|  3|   6.360951017965119|
|  9|0.010000000000000014|
|  7|0.010000000000000014|
|  2|  3.4706372334348052|
+---+--------------------+



In [82]:
# from pyspark.sql import Row
# from pyspark.sql import types
# import pyspark.sql.functions as F
# # Not what we want
# graphData.select(F.explode(graphData._c1).alias("key", "value")).show()