In [1]:
from pyspark.sql import SparkSession
# Obtain (or reuse) a Hive-enabled Spark session configured with the "local" master.
sparkSession = (
    SparkSession.builder
    .enableHiveSupport()
    .master("local")
    .getOrCreate()
)

In [2]:
# Path of the Parquet dataset that is read to build the `graph` DataFrame.
graphPath = "/data/graphDFSample"

In [5]:
import itertools

from pyspark.sql.functions import explode, collect_list, size, col, sort_array, udf
from pyspark.sql import Row
from pyspark.sql import Window
from pyspark.sql.types import ArrayType, IntegerType, StructField, StructType

# Row factory with the fields 'user1' and 'user2'; emit_pairs calls it once per user pair.
ResultPair = Row('user1', 'user2')

def emit_pairs(users):
    """Build a ResultPair for every 2-element combination of ``users``.

    Combinations are generated by ``itertools.combinations``, so each pair
    keeps the relative order the two elements had in ``users`` and no element
    is paired with itself.

    :param users: iterable of user identifiers.
    :return: list of ``ResultPair(user1, user2)`` rows; empty when ``users``
             has fewer than two elements.
    """
    pairs = []
    for first, second in itertools.combinations(users, 2):
        pairs.append(ResultPair(first, second))
    return pairs

# Spark UDF wrapper around emit_pairs. The declared return type is an array of
# structs with two nullable integer fields, "user1" and "user2".
emit_pairs_udf = udf(
    emit_pairs,
    ArrayType(
        StructType([
            StructField("user1", IntegerType(), True),
            StructField("user2", IntegerType(), True),
        ])
    ),
)

# Build one row per (friend, user1, user2): two users that both list `friend`
# in their `friends` array.
graph = (
    sparkSession.read.parquet(graphPath)
    # One row per (user, friend) edge.
    .withColumn("friend", explode("friends"))
    # Invert the relation: for every friend, the sorted list of users listing it.
    .groupBy("friend")
    .agg(sort_array(collect_list("user")).alias("users"))
    # A friend listed by fewer than two users cannot produce a pair.
    .withColumn("users_size", size("users"))
    .filter(col("users_size") > 1)
    # One row per pair of users taken from the sorted list.
    .withColumn("pair", explode(emit_pairs_udf("users")))
    .drop("users")
    .drop("users_size")
    # Flatten the struct into top-level columns.
    .withColumn("user1", col("pair.user1"))
    .withColumn("user2", col("pair.user2"))
)

In [None]:
from pyspark.sql.functions import col, count, row_number

# Number of highest-ranked pairs to report.
topPairsLimit = 50

# For every (user1, user2) pair, count the non-null `friend` values in `graph`.
friendsCount = (
    graph
    .groupBy(["user1", "user2"])
    .agg(count("friend").alias("friends_count"))
)

# The window has no partitioning, so row_number() ranks all pairs globally by
# descending friends_count.
window = Window.orderBy(col("friends_count").desc())

top50 = (
    friendsCount
    .withColumn("row_number", row_number().over(window))
    # row_number() starts at 1, so `<=` is needed to keep exactly
    # `topPairsLimit` rows (the previous `< 50` kept only 49).
    .filter(col("row_number") <= topPairsLimit)
    .select(col("friends_count"), col("user1"), col("user2"))
    .orderBy(col("friends_count").desc())
    .collect()
)

# Each collected entry has three fields (friends_count, user1, user2) that fill
# the three %s placeholders. The parenthesised form works as a print statement
# on Python 2 and as a function call on Python 3.
for entry in top50:
    print('%s %s %s' % entry)