In [0]:
import numpy as np
from operator import add

edgesRDD = sc.textFile('/FileStore/tables/comp4651-project/musae_squirrel_edges.csv', 8)
header = edgesRDD.first()
edgesRDD = (
  edgesRDD.filter(lambda line: line != header)
      .map(lambda line: tuple(map(int, line.split(','))))
      .distinct()
)

def join_list(list1, list2):
  if list1 is None:
    return list2
  if list2 is None:
    return list1
  else:
    return list1+list2

nodesRDD = (
  edgesRDD.groupByKey().mapValues(list)
    .fullOuterJoin(edgesRDD.map(lambda kv: (kv[1], kv[0])).groupByKey().mapValues(list))
    .mapValues(lambda v: join_list(v[0], v[1]))
    .mapValues(np.unique)
).cache()

numNodes = nodesRDD.count()

In [0]:
adjanceyDict = sc.broadcast(dict(nodesRDD.collect()))

In [0]:
def findClusterCoefficient(neighbors):
  numNeighbors = len(neighbors)
  maxEdges = numNeighbors * (numNeighbors - 1)
  if maxEdges == 0:
    return 0
  edgesBetweenNeighbors = 0
  for neighbor in neighbors:
      edgesBetweenNeighbors += len(np.intersect1d(neighbors, adjanceyDict.value[neighbor]))
  return min(1.0, edgesBetweenNeighbors/maxEdges)
  
clusterCoefficient = nodesRDD.mapValues(findClusterCoefficient).cache()

In [0]:
from pyspark.sql.functions import expr

ccDF = clusterCoefficient.toDF(['id', 'cc'])
ccStat = (
  ccDF.withColumn('group', expr('int(cc/0.02)'))
    .groupBy('group')
    .count()
    .selectExpr('group * 0.02 as cc','count as freq')
    .orderBy('cc')
    .withColumn('logC', expr('log10(cc)'))
    .withColumn('p', expr('freq/{}'.format(numNodes)))
    .withColumn('pdf', expr('p*50'))
    .withColumn('cdf', expr('sum(p) over (order by cc)'))
)

In [0]:
display(ccStat)

cc,freq,logC,p,pdf,cdf
,156,,0.0299942318784849,1.4997115939242454,0.0299942318784849
0.0,181,,0.0348009998077292,1.7400499903864644,0.0647952316862141
0.02,29,-1.6989700043360187,0.0055758507979234,0.2787925398961738,0.0703710824841376
0.04,39,-1.3979400086720375,0.0074985579696212,0.3749278984810613,0.0778696404537588
0.06,42,-1.2218487496163564,0.0080753701211305,0.4037685060565276,0.0859450105748894
0.08,64,-1.0969100130080565,0.0123053258988656,0.6152662949432801,0.098250336473755
0.1,81,-1.0,0.0155739280907517,0.7786964045375889,0.1138242645645068
0.12,89,-0.9208187539523752,0.0171120938281099,0.8556046914054989,0.1309363583926168
0.14,86,-0.8538719643217619,0.0165352816766006,0.8267640838300326,0.1474716400692174
0.16,123,-0.7958800173440752,0.0236492982118823,1.1824649105941163,0.1711209382810997


In [0]:
clusterCoefficient.map(lambda kv: '{} {}'.format(kv[0], kv[1])).coalesce(1).saveAsTextFile('/FileStore/tables/comp4651-project/output1')