In [1]:
from pyspark.sql import SparkSession
from graphframes import *

In [2]:
# Creating vertices and edges for the formation of the graph

In [3]:
# Read the two source tables by name: one row per character (vertices) and
# one row per dialogue link between characters (edges).
characters = sqlContext.table("characters_csv")
dialogues = sqlContext.table("edges_graphs_csv")

In [4]:
# Vertices correspond to the various characters of The Simpsons series in my analysis

In [5]:
display(characters)

In [6]:
characters.printSchema()

In [7]:
# Characters are connected through their speaking state; these relationships form the edges between the vertices

In [8]:
display(dialogues)

In [9]:
dialogues.printSchema()

In [10]:
# Creating the graph 

In [11]:
# Build the graph from the character table (vertices) and the dialogue
# table (edges), then print its summary representation.
g = GraphFrame(characters, dialogues)
# Parenthesised single-argument print behaves the same on Python 2 and 3,
# and matches the print(...) style used by the other cells.
print(g)

In [12]:
# There are 6723 vertices and 158271 edges in the graph created

In [13]:
# Print each label on its own line, followed by the row count of the
# corresponding graph DataFrame (vertices first, then edges).
for _label, _frame in (
    ("Total Number of characters: ", g.vertices),
    ("Total Number of conversation in Graph: ", g.edges),
):
    print(_label)
    print(_frame.count())

In [14]:
# Render the graph's vertex DataFrame.
# `display` is not defined in this notebook; presumably the Databricks built-in.
display(g.vertices)

In [15]:
# Render the graph's edge DataFrame.
display(g.edges)

In [16]:
# Render the in-degree table of the graph (incoming edges per vertex).
display(g.inDegrees)

In [17]:
# Render the out-degree table of the graph (outgoing edges per vertex).
display(g.outDegrees)

In [18]:
# Render the total degree table of the graph (edges touching each vertex).
display(g.degrees)

In [19]:
# Keep only the vertex rows whose id equals 1 and show them.
a = g.vertices.where("id=1")
display(a)

In [20]:
# Counting the TRUE and FALSE speaking relationships between the characters

In [21]:
# Count the edges whose `relationship` column equals 'FALSE'.
scenes = g.edges.filter("relationship = 'FALSE'").count()
# A single pre-formatted string inside print(...) produces the same output
# as the old `print "...", scenes` statement and also runs on Python 3.
print("The number of follow edges is %s" % scenes)

In [22]:
# Count the edges whose `relationship` column equals 'TRUE'.
conversations = g.edges.filter("relationship = 'TRUE'").count()
# A single pre-formatted string inside print(...) produces the same output
# as the old `print "...", conversations` statement and also runs on Python 3.
print("The number of follow edges is %s" % conversations)

In [23]:
# The graph contains 132112 TRUE follow edges and 26159 FALSE follow edges

In [24]:
# Motif Finding

In [25]:
# Motif search: look for two elements joined by an edge in each direction
# (one edge from "1" to "2" and another from "2" back to "1"), then show
# the matches.
# NOTE(review): "1", "2", "3" and "4" inside the pattern appear to be names
# for the motif's vertices/edges rather than vertex ids -- confirm this is
# the intent; letter names would be less ambiguous.
motifs = g.find("(1)-[3]->(2); (2)-[4]->(1)")
display(motifs)

In [26]:
# Breadth First Search Algorithm

In [27]:
# Breadth-first search starting at vertices matching `fromExpr` and ending
# at vertices matching `toExpr`; show the paths found.
paths = g.bfs(fromExpr="character = 'Homer Simpson'", toExpr="id =1")
display(paths)

In [28]:
# Breadth-first search from Homer Simpson to vertex id 1, using only edges
# whose relationship is not 'FALSE' and capping the path length at 3.
filteredPaths = g.bfs(
    fromExpr="character = 'Homer Simpson'",
    toExpr="id =1",
    edgeFilter="relationship != 'FALSE'",
    maxPathLength=3,
)
display(filteredPaths)

In [29]:
# Calculating Page Rank

In [30]:
# Run PageRank until the given tolerance is reached, then show the vertices
# of the returned result.
results = g.pageRank(
    tol=0.01,
    resetProbability=0.15,
)
display(results.vertices)

In [31]:
# Shortest Path Algorithm

In [32]:
# Compute shortest paths with respect to the landmark vertices "1" and "2"
# and show the result. This rebinds `results`, replacing the PageRank output.
# NOTE(review): landmarks are passed as strings here, while other cells
# compare `id` against the numeric literal 1 -- confirm the type of the
# `id` column so the landmarks actually match vertices.
results = g.shortestPaths(landmarks=["1", "2"])
display(results)