In [0]:
edgesDF = (
  sqlContext.read
    .format('com.databricks.spark.csv')
    .options(delimiter=',', header='true', inferschema='true')
    .load("/FileStore/tables/comp4651-project/musae_squirrel_edges.csv")
)

trafficDF = (
    sqlContext.read
    .format('com.databricks.spark.csv')
    .options(delimiter=',', header='true', inferschema='true')
    .load("/FileStore/tables/comp4651-project/musae_squirrel_target.csv")
)

In [0]:
num_nodes = trafficDF.count()
num_edges = edgesDF.count()
print("num of nodes: {}".format(num_nodes))
print("num of edges: {}".format(num_edges))

In [0]:
### PART 1.1 - Analysis the distribution of node degree ###
# The graph is undirected, deg of a node equals the sum of count in id1 and id2

nodeDeg = (
  edgesDF.select('id1')
          .union(edgesDF.select('id2'))
          .groupBy('id1')
          .count()
          .toDF('id', 'degree')
)

In [0]:
from pyspark.sql.functions import expr
# determine the distribution function of node degree

nodeDegStat = (
  nodeDeg.groupBy('degree').count()
    .withColumn('logD', expr('log10(degree)')) # degree on log scale
    .withColumn('pdf', expr('count/{}'.format(num_nodes))) # probalistic distribution function (pdf)
    .withColumn('logp', expr('log10(pdf)')) # pdf on log scale
    .withColumn('cdf', expr('sum(pdf) over (order by degree)')) # cummulative distribution function (cdf)
    .withColumn('ccdf', expr('1-cdf')) # comptemporary cummulative distribution fucntion (ccdf)
    .withColumn('log1-P', expr('log10(ccdf)')) # ccdf on log scale
)

In [0]:
# drop the last row for display purpose
display(nodeDegStat.head(nodeDegStat.count()-1))

degree,count,logD,pdf,logp,cdf,ccdf,log1-P
1,137,0.0,0.0263410882522591,-1.5793662866184253,0.0263410882522591,0.9736589117477408,-0.0115931567655334
2,157,0.3010299956639812,0.0301865025956546,-1.5201872013655984,0.0565275908479138,0.943472409152086,-0.0252707957939166
3,182,0.4771212547196624,0.034993270524899,-1.4560154657897573,0.0915208613728129,0.9084791386271872,-0.0416850409295503
4,192,0.6020599913279624,0.0369159776965968,-1.4327856250712825,0.1284368390694097,0.8715631609305903,-0.0597011347161443
5,202,0.6989700043360189,0.0388386848682945,-1.4107354843282085,0.1672755239377042,0.8327244760622957,-0.0794986700449898
6,186,0.7781512503836436,0.0357623533935781,-1.446573909556916,0.2030378773312824,0.7969621226687176,-0.0985623188885397
7,225,0.8450980400142568,0.0432609113631993,-1.3639043356634697,0.2462987886944818,0.7537012113055181,-0.1228007867543748
8,171,0.9030899869919436,0.0328782926360315,-1.4830907433826783,0.2791770813305134,0.7208229186694866,-0.1421714133532814
9,160,0.9542425094393248,0.030763314747164,-1.5119668711189074,0.3099403960776773,0.6900596039223226,-0.1611133954415923
10,147,1.0,0.0282637954239569,-1.548769519026656,0.3382041915016343,0.6617958084983657,-0.1792759877832906


In [0]:
# perform linear regrssion on logD against logp (pdf)

# perform linear regression on logD against log1-P (ccdf)

In [0]:
### PART 1.2 - analysis the distribution of node traffic ###

trafficStat = (trafficDF
                 .groupBy(expr('cast(log10(target) as decimal(5, 1)) as logT')) # group the traffic data in log scale, round down to 1 decimal space
                 .count()
                 .orderBy('logT')
                 .withColumn('pdf', expr('count/{}'.format(num_nodes))) # probalistic distribution function
                 .withColumn('cdf', expr('sum(pdf) over (order by logT)')) # cummulative distribution function
              )

In [0]:
display(trafficStat)

logT,count,pdf,cdf
1.2,1,0.00019227071716977504,0.00019227071716977504
1.6,3,0.0005768121515093252,0.0007690828686791001
1.7,6,0.0011536243030186,0.0019227071716977
1.8,14,0.0026917900403768,0.0046144972120746
1.9,20,0.0038454143433955,0.0084599115554701
2.0,34,0.0065372043837723,0.0149971159392424
2.1,35,0.0067294751009421,0.0217265910401845
2.2,23,0.0044222264949048,0.0261488175350894
2.3,41,0.0078830994039607,0.0340319169390501
2.4,44,0.0084599115554701,0.0424918284945202


In [0]:
logTraffic = trafficDF.selectExpr('log10(target) as logT')
print(logTraffic.selectExpr('mean(logT) as mean').first())
print(logTraffic.selectExpr('percentile_approx(logT, 0.5) as median').first())
print(trafficDF.selectExpr('log10(target) as logT').selectExpr('stddev(logT) as sd').first())

In [0]:
### PART 1.3 - analysis the relationship between the node degree and traffic size ###

nodeDF = nodeDeg.join(trafficDF, nodeDeg.id == trafficDF.id)
display(nodeDF)

id,degree,id.1,target
4935,428,4935,3173
4101,10,4101,29367
1959,120,1959,21075
1829,17,1829,66
3749,421,3749,487
2659,5,2659,25639
1088,3,1088,9410
3918,11,3918,434
148,184,148,222045
1645,32,1645,43592
