In [21]:
from pyspark.sql import SparkSession
from graphframes import GraphFrame

# Create the Spark session for this notebook, or reuse one that is already
# running in the kernel.
spark = (
    SparkSession.builder
    .appName("SocialNetworkAnalysis")
    .getOrCreate()
)

# Register a checkpoint directory on the SparkContext. GraphFrames'
# connectedComponents() (used further down) checkpoints intermediate results
# and needs this to be set before it is called.
spark.sparkContext.setCheckpointDir('/checkpoints')

In [12]:
# Load the edge list: comma-separated, first row is the header, column types
# inferred from the data (the preview below shows integer `src` / `dst`).
edges_df = (
    spark.read
    .options(delimiter=",", header=True, inferSchema=True)
    .csv("musae_facebook_edges.csv")
)

In [13]:
# Preview the first 10 edges as a list of Row objects.
edges_df.take(10)

[Row(src=0, dst=18427),
 Row(src=1, dst=21708),
 Row(src=1, dst=22208),
 Row(src=1, dst=22171),
 Row(src=1, dst=6829),
 Row(src=1, dst=16590),
 Row(src=1, dst=20135),
 Row(src=1, dst=8894),
 Row(src=1, dst=15785),
 Row(src=1, dst=10281)]

In [14]:
# Derive the vertex set from the edge endpoints: project each endpoint column
# to a common "id" column and de-duplicate it.
from_vertices = edges_df.select(edges_df["src"].alias("id")).distinct()
to_vertices = edges_df.select(edges_df["dst"].alias("id")).distinct()

# union() keeps duplicate rows, so ids that occur both as a source and as a
# destination are collapsed with a final distinct().
vertices_df = from_vertices.union(to_vertices).distinct()

In [15]:
# Preview the first 10 vertex ids as a list of Row objects.
vertices_df.take(10)

[Row(id=148),
 Row(id=463),
 Row(id=471),
 Row(id=496),
 Row(id=18800),
 Row(id=833),
 Row(id=5300),
 Row(id=1088),
 Row(id=1238),
 Row(id=1342)]

In [16]:
# Assemble the graph: vertices carry an "id" column, edges carry "src"/"dst"
# columns that refer to those ids.
social_graph = GraphFrame(v=vertices_df, e=edges_df)

# Display the graph's schema summary.
social_graph

GraphFrame(v:[id: int], e:[src: int, dst: int])

In [17]:
# a. Top 5 nodes by out-degree, together with their number of outgoing edges.
out_degree = social_graph.outDegrees  # columns: id, outDegree

# Sort by out-degree, largest first, and keep the first five rows.
top_out_degree_nodes = (
    out_degree
    .orderBy(out_degree["outDegree"].desc())
    .limit(5)
)

# Only five rows remain, so collecting to pandas for display is cheap.
top_out_degree_nodes.toPandas()

Unnamed: 0,id,outDegree
0,1387,472
1,2442,365
2,701,364
3,8139,261
4,5458,247


In [18]:
# b. Top 5 nodes by in-degree, together with their number of incoming edges.
in_degree = social_graph.inDegrees  # columns: id, inDegree

# Sort by in-degree, largest first, and keep the first five rows.
top_in_degree_nodes = (
    in_degree
    .orderBy(in_degree["inDegree"].desc())
    .limit(5)
)

# Only five rows remain, so collecting to pandas for display is cheap.
top_in_degree_nodes.toPandas()

Unnamed: 0,id,inDegree
0,21729,643
1,19743,605
2,16895,551
3,14497,430
4,19347,385


In [19]:
# c. PageRank for every node; report the 5 highest-ranked nodes.
# resetProbability=0.15 is the chance of jumping to a random vertex at each
# step; tol=0.01 runs the algorithm until it converges within that tolerance
# instead of for a fixed number of iterations.
pagerank = social_graph.pageRank(
    resetProbability=0.15,
    tol=0.01,
)

# The result is a new graph whose vertices carry a "pagerank" column.
top_pagerank_nodes = (
    pagerank.vertices
    .sort("pagerank", ascending=False)
    .limit(5)
)
top_pagerank_nodes.toPandas()

Unnamed: 0,id,pagerank
0,21729,117.55737
1,22208,66.337474
2,21781,55.797611
3,22440,54.630145
4,22057,47.474781


In [22]:
# d. Connected components; report the 5 components containing the most nodes.
# Each vertex comes back labelled with a "component" id. This call relies on
# the Spark checkpoint directory having been set beforehand.
connected_components = social_graph.connectedComponents()

# Count the vertices per component and keep the five largest.
top_components = (
    connected_components
    .groupBy("component")
    .count()
    .sort("count", ascending=False)
    .limit(5)
)
top_components.toPandas()

Unnamed: 0,component,count
0,0,22470


In [25]:
# e. Triangle count per vertex; report the 5 vertices in the most triangles.
triangle_counts = social_graph.triangleCount()  # columns: count, id

# Sort by triangle count, largest first, and keep the first five rows.
top_triangles = (
    triangle_counts
    .orderBy(triangle_counts["count"].desc())
    .limit(5)
)
top_triangles.toPandas()

Unnamed: 0,count,id
0,16219,16895
1,14050,14497
2,10199,1387
3,9754,19743
4,9254,21729
