In [0]:
from operator import add

# Read the edge list as raw text lines, asking for at least 8 partitions.
edgesRDD = sc.textFile('/FileStore/tables/comp4651-project/musae_squirrel_edges.csv', 8)

# The first line is the CSV header; every line equal to it is dropped below.
header = edgesRDD.first()

# Parse each remaining comma-separated line into a tuple of ints.
edgesRDD = edgesRDD.filter(lambda row: row != header).map(
  lambda row: tuple(int(field) for field in row.split(','))
)

def join_list(list1, list2):
  """Concatenate two lists, treating None as "no list".

  Returns list2 when list1 is None (even if list2 is None as well), list1
  when only list2 is None, and list1 + list2 otherwise.
  """
  if list1 is None:
    return list2
  return list1 if list2 is None else list1 + list2

# Adjacency in both directions: for every node, the targets of its outgoing
# edges and the sources of its incoming edges, merged into one neighbor list.
# The full outer join keeps nodes that appear on only one side of any edge;
# join_list handles the resulting None on the missing side.
nodesRDD = (
  edgesRDD.groupByKey().mapValues(list)
    .fullOuterJoin(
      edgesRDD.map(lambda edge: (edge[1], edge[0])).groupByKey().mapValues(list)
    )
    .mapValues(lambda sides: join_list(*sides))
).cache()

# Counting also materializes the cached RDD.
numNodes = nodesRDD.count()

In [0]:
def computeContribs(node_neighbors_rank):
  """Yield one (neighbor, share) pair for each neighbor of a node.

  node_neighbors_rank is indexed as (node, (neighbors, rank)). The rank is
  split evenly, so every share equals rank / len(neighbors). Nothing is
  yielded when neighbors is empty.
  """
  payload = node_neighbors_rank[1]
  adjacent, score = payload[0], payload[1]
  degree = len(adjacent)
  for target in adjacent:
    yield (target, score / degree)


# Damping constant: each update keeps (1 - alpha) of the summed contributions
# and adds a flat alpha on top.
alpha = 0.15

# Every node starts with a rank of 1.0.
ranks = nodesRDD.mapValues(lambda _: 1.0)

# Run a fixed 100 update rounds: spread each node's rank over its neighbors,
# sum what each node receives, then apply the damping formula.
for iteration in range(100):
  contribs = nodesRDD.join(ranks).flatMap(computeContribs)
  ranks = contribs.reduceByKey(add).mapValues(lambda total: alpha + (1.0 - alpha) * total)

In [0]:
# Turn the (node id, rank) pairs into a DataFrame with named columns.
pageRankDF = ranks.toDF(schema=['id', 'rank'])

In [0]:
from pyspark.sql.functions import expr

# Bin the ranks into 0.1-wide buckets and derive the empirical distribution
# columns (frequency, pdf, cdf, ccdf and their logs) used by later cells.
# NOTE(review): logR uses log10 while logp and log1-P use log(), which
# evaluates as the natural log here — confirm the mixed bases are intended
# before reading a slope fitted on these columns as a power-law exponent.
pageRankStat = (
  pageRankDF
    # Bucket index: rank divided by the 0.1 bin width, cast to int.
    .withColumn('group', expr('int(rank/0.1)'))
    .groupBy('group')
    .count()
    # Relabel each bucket as group * 0.1 and rename the row count to freq.
    .selectExpr('group * 0.1 as rank','count as freq')
    .orderBy('rank')
    .withColumn('logR', expr('log10(rank)'))
    # p: share of all numNodes nodes that fall into this bucket.
    .withColumn('p', expr('freq/{}'.format(numNodes)))
    # pdf: p scaled by 10, i.e. divided by the 0.1 bin width.
    .withColumn('pdf', expr('p*10'))
    .withColumn('logp', expr('log(pdf)'))
    # cdf: running total of p in increasing rank order; the window has no
    # PARTITION BY clause.
    .withColumn('cdf', expr('sum(p) over (order by rank)'))
    .withColumn('ccdf', expr('1-cdf'))
    .withColumn('log1-P', expr('log(ccdf)'))
)

In [0]:
# Regression input: rows with logR below 0.75, reduced to the label column
# (log1-P) and the single predictor column (logR).
lrDF = pageRankStat.filter('logR < 0.75').select('log1-P', 'logR')

from pyspark.ml.feature import VectorAssembler

# Pack the single predictor column into a vector column named "features".
vectorizer = VectorAssembler(inputCols=["logR"], outputCol="features")

from pyspark.ml.regression import LinearRegression
from pyspark.ml.regression import LinearRegressionModel
from pyspark.ml import Pipeline

# Linear regression with log1-P as the label; predictions are written to
# the Fit_log1-P column.
lr = LinearRegression(
  predictionCol="Fit_log1-P",
  labelCol="log1-P",
  maxIter=200,
  regParam=0.01,
)

# Chain the assembler and the regressor, then fit both stages on lrDF.
lrPipeline = Pipeline(stages=[vectorizer, lr])

lrModel = lrPipeline.fit(lrDF)

In [0]:
# Pull the intercept and the coefficient vector out of the fitted
# regression, which is the second stage of the pipeline model.
intercept = lrModel.stages[1].intercept
weights = lrModel.stages[1].coefficients

# Name each coefficient after the assembler input column it multiplies.
# The assembler (first pipeline stage) defines the coefficient order, so the
# names are read from it rather than inferred from lrDF's column order.
featuresNoLabel = lrModel.stages[0].getInputCols()

# Pair every weight with its feature name.
coefficents = list(zip(weights, featuresNoLabel))

# Build a readable equation: the intercept followed by one signed
# "(|weight| * feature)" term per coefficient.
equation = "log1-P = {intercept}".format(intercept=intercept)
for weight, name in coefficents:
    symbol = "+" if (weight > 0) else "-"
    equation += (" {} ({} * {})".format(symbol, abs(weight), name))

# Finally here is our equation
print("Linear Regression Equation: " + equation)

# Score the rows the model was fitted on so the fitted and observed values
# can be compared side by side.
resultsDF = lrModel.transform(lrDF).select("logR","log1-P", "Fit_log1-P")

display(resultsDF)

logR,log1-P,Fit_log1-P
-1.0,-0.021573084228504,0.7811263786592846
-0.6989700043360187,-0.1775263044864692,0.0273001987688115
-0.5228787452803376,-0.3949457417167903,-0.4136598485289955
-0.3979400086720376,-0.5931461271341023,-0.7265259811216614
-0.3010299956639812,-0.745442808747794,-0.9692038070900196
-0.2218487496163563,-0.8576699939596247,-1.1674860284194688
-0.1549019599857432,-0.9728498930659606,-1.3351312578321155
-0.0969100130080563,-1.1211681739585218,-1.4803521610121346
-0.0457574905606751,-1.2105959838597804,-1.608446075717276
0.0,-1.3561813008281156,-1.7230299869804926


In [0]:
# ranks.map(lambda kv: '{} {}'.format(kv[0], kv[1])).coalesce(1).saveAsTextFile('/FileStore/tables/comp4651-project/output2')

In [0]:
# Add the fitted line -1.723 - 2.5 * logR as a column, capped at 0: log1-P is
# the log of ccdf = 1 - cdf, so a positive fitted value cannot be valid.
# NOTE(review): -1.723 and 2.5 are hard-coded here; they look like rounded
# values of the fitted intercept and slope — confirm against lrModel if the
# regression is re-run.
pageRankStat = pageRankStat.withColumn(
  'fit_log1-P', expr('case when (-1.723 - (2.5 * logR) > 0) then 0 else (-1.723 - (2.5 * logR)) end')
)

In [0]:
# Show the binned distribution for buckets whose rank is below 25.
display(pageRankStat.filter('rank<25'))

rank,freq,logR,p,pdf,logp,cdf,ccdf,log1-P,fit_log1-P
0.1,111,-1.0,0.021342049605845,0.2134204960584503,-1.5444908994666928,0.021342049605845,0.9786579503941548,-0.021573084228504,0.0
0.2,735,-0.6989700043360187,0.1413189771197846,1.4131897711978467,0.34584939843381,0.1626610267256297,0.8373389732743703,-0.1775263044864692,0.0
0.3,851,-0.5228787452803376,0.1636223803114785,1.6362238031147855,0.4923910277943473,0.3262834070371082,0.6737165929628918,-0.3949457417167903,-0.4158031367991561
0.4,630,-0.3979400086720376,0.1211305518169582,1.2113055181695829,0.1916987186065516,0.4474139588540665,0.5525860411459335,-0.5931461271341023,-0.7281499783199061
0.5,406,-0.3010299956639812,0.0780619111709286,0.7806191117092867,-0.2476679411772941,0.5254758700249952,0.4745241299750048,-0.745442808747794,-0.9704250108400472
0.6,262,-0.2218487496163563,0.050374927898481,0.5037492789848106,-0.6856765970179299,0.5758507979234763,0.4241492020765237,-0.8576699939596247,-1.168378125959109
0.7,240,-0.1549019599857432,0.046144972120746,0.4614497212074601,-0.7733821774370355,0.6219957700442222,0.3780042299557777,-0.9728498930659606,-1.3357451000356422
0.8,271,-0.0969100130080563,0.052105364353009,0.5210536435300904,-0.6519022798993258,0.6741011343972313,0.3258988656027687,-1.1211681739585218,-1.480724967479859
0.9,145,-0.0457574905606751,0.0278792539896173,0.2787925398961738,-1.2772873583584523,0.7019803883868487,0.2980196116131512,-1.2105959838597804,-1.6086062735983122
1.0,210,0.0,0.0403768506056527,0.4037685060565276,-0.906913570061558,0.7423572389925015,0.2576427610074985,-1.3561813008281156,-1.723
