# GraphFrames

### Setup

In [2]:
# https://spark-packages.org/package/graphframes/graphframes

# pyspark --packages graphframes:graphframes:0.7.0-spark2.4-s_2.11

# or, to autoload the package every time:
# cp /opt/spark/spark-2.4.4-bin-hadoop2.7/conf/spark-defaults.conf.template /opt/spark/spark-2.4.4-bin-hadoop2.7/conf/spark-defaults.conf

# add export SPARK_OPTS="--packages graphframes:graphframes:0.7.0-spark2.4-s_2.11" to ~/.bashrc

# add this line to spark-defaults.conf: spark.jars.packages graphframes:graphframes:0.7.0-spark2.4-s_2.11

In [3]:
# Run once per kernel so the GraphFrames Python API can be imported in Jupyter.
# The GraphFrames jar is handed to addPyFile so that `from graphframes import ...`
# resolves in later cells.

from pyspark import SparkContext

# Location of the GraphFrames jar on this machine; adjust to the local Spark install.
GRAPHFRAMES_JAR = '/opt/spark/spark-current/jars/graphframes-0.7.0-spark2.4-s_2.11.jar'

# getOrCreate() reuses an already-running context instead of raising
# "Cannot run multiple SparkContexts at once" when this cell is re-executed.
sc = SparkContext.getOrCreate()
sc.addPyFile(GRAPHFRAMES_JAR)

### Using

In [17]:

from pyspark.sql import SparkSession
from graphframes import *

In [19]:
# Obtain the session for this notebook (application name 'goose'),
# reusing an existing session when one is already active.
spark = (
    SparkSession.builder
    .appName('goose')
    .getOrCreate()
)

In [38]:
# Vertex table: GraphFrame requires a column named 'id'.
vertices = spark.createDataFrame(
    [
        ("a", 'Hadoop', 500),
        ("b", 'Java', 600),
    ],
    ['id', 'name', 'num'],
)

# Edge table: GraphFrame requires columns named 'src' and 'dst'.
edges = spark.createDataFrame(
    [
        ('a', 'b', 'also_purchased'),
        ('b', 'a', 'looked_at'),
    ],
    ['src', 'dst', 'type'],
)

graphDF = GraphFrame(vertices, edges)

# Per-vertex edge counts: one row per vertex id with its inDegree / outDegree.
inDF = graphDF.inDegrees
outDF = graphDF.outDegrees

# Outer join + fill with 0 so that a vertex with no incoming (or no outgoing)
# edges still appears in the combined table; an inner join would drop it,
# because each degree table only lists vertices that have at least one such edge.
allDF = (
    inDF.join(outDF, 'id', 'outer')
    .fillna(0, subset=['inDegree', 'outDegree'])
)
allDF.show()


# graphDF.edges.groupBy({'count', 'id'}).show()
# graphDF.edges.filter("type == 'also_purchased'").show()
# graphDF.find('(a)-[e1]->(b); (a)-[e2]->(c)').filter('b != c') # (motif pattern; edges go in square brackets) finds all paths where a user bought a then b, and a then c; the filter keeps b and c distinct so the same edge is not matched twice

# # AggregateMessages in the GraphFrame API sends data between vertices to compute compound results (like an average)
# # in the joined inDegrees/outDegrees table, the vertex with the greatest inDegree relative to its outDegree is the most visited and least started from

# graphDF.bfs("name == 'Mark'", "age > 10", maxPathLength=3).show() # breadth first search

+---+--------+---------+
| id|inDegree|outDegree|
+---+--------+---------+
|  b|       1|        1|
|  a|       1|        1|
+---+--------+---------+

