In [0]:
# Root folder holding the raw MUSAE squirrel CSVs and the graph-metric outputs.
DATA_DIR = "/FileStore/tables/comp4651-project"


def _readCsv(relPath, delimiter, header):
    """Load DATA_DIR/relPath as a DataFrame with inferred column types.

    relPath   -- file path relative to DATA_DIR
    delimiter -- field separator character
    header    -- 'true' if the first line holds column names, else 'false'
    """
    return (
        sqlContext.read
        .format('com.databricks.spark.csv')
        .options(delimiter=delimiter, header=header, inferschema='true')
        .load(DATA_DIR + "/" + relPath)
    )


def _readNodeMetric(outputDir, metricCol):
    """Load a headerless, space-delimited two-column file as (id, metricCol).

    NOTE(review): only part-00000 of the output folder is read -- confirm the
    job that produced it wrote a single partition.
    """
    return (
        _readCsv(outputDir + "/part-00000", delimiter=' ', header='false')
        .selectExpr('_c0 as id', '_c1 as ' + metricCol)
    )


# Edge list with endpoint columns id1 and id2.
edgesDF = _readCsv("musae_squirrel_edges.csv", delimiter=',', header='true')

# Degree = number of edge rows in which a node appears as either endpoint.
# union() is positional, so the stacked column keeps the name 'id1'.
degDF = (
    edgesDF
    .select('id1')
    .union(edgesDF.select('id2'))
    .groupBy('id1')
    .count()
    .toDF('id', 'degree')
)

# Per-node target values, keyed by id.
trafficDF = _readCsv("musae_squirrel_target.csv", delimiter=',', header='true')

# Per-node metrics, each stored as `<id> <value>` lines in its own output folder.
distDF = _readNodeMetric("output", "dist")
pageRankDF = _readNodeMetric("output2", "rank")
ccDF = _readNodeMetric("output1", "cc")


In [0]:
from pyspark.sql.functions import expr
# Build one feature row per node: join every per-node frame on 'id' (join's
# default inner join, so a node missing from any input is dropped), then derive
# closeness as the reciprocal of dist. Cached because later cells reuse it.
# NOTE(review): a dist of 0 would presumably give a NULL closeness under Spark's
# default division semantics -- confirm the dist input never contains 0.
nodeDF = (
    degDF
    .join(trafficDF, ['id'])
    .join(distDF, ['id'])
    .join(pageRankDF, ['id'])
    .join(ccDF, ['id'])
    .withColumn('closeness', expr('1/dist'))
    .cache()
)

In [0]:
# Render the joined per-node table to check the feature columns before modelling.
display(nodeDF)

id,degree,target,dist,rank,cc,closeness
4935,428,3173,2.651797731205537,2.727547480521806,0.2297790496618442,0.3771026682134571
4101,10,29367,3.576619880792155,0.7743373263671957,0.0111111111111111,0.2795935920868724
1959,120,21075,2.6516054604883674,0.8992977500702961,0.4025560224089636,0.3771300123268798
1829,17,66,3.1567006344933666,0.2582510915060318,0.1139705882352941,0.3167864538920696
3749,421,487,2.5721976542972507,2.67729724665704,0.2496861214794706,0.3887726117506354
2659,5,25639,3.0696019996154584,0.3276410416106629,0.25,0.3257751331036643
1088,3,9410,3.744087675447029,0.242535716845384,0.25,0.2670877625430082
3918,11,434,3.1234378004229955,0.3287670087519936,0.1863636363636363,0.3201600492459218
148,184,222045,2.472601422803307,1.4618800627478894,0.214807555238774,0.4044323483670295
1645,32,43592,2.85445106710248,0.5130127371845676,0.3729838709677419,0.350330055233733


In [0]:
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline

# Fixed seed so the train/test split and the forest's random sampling are
# reproducible across Restart & Run All.
SEED = 42

# Pack the five numeric node metrics into the single vector column the
# regressor reads as its features.
vectorizer = VectorAssembler(
    inputCols=["degree", "dist", "rank", "closeness", "cc"],
    outputCol="features",
)

# Compares the predicted column with the true `target`; reports RMSE unless
# metricName is overridden for a particular evaluate() call.
regEval = RegressionEvaluator(predictionCol="Prediction_target", labelCol="target", metricName="rmse")

# Split with weights 0.15 (test) and 0.85 (training); seeded for repeatability.
(split15DF, split85DF) = nodeDF.randomSplit([0.15, 0.85], seed=SEED)

# Cache both splits: cross-validation and evaluation read them repeatedly.
testSetDF = split15DF.cache()
trainingSetDF = split85DF.cache()

# Random forest regressor: 25 trees, each at most 8 levels deep.
rf = RandomForestRegressor(
    featuresCol="features",
    labelCol="target",
    predictionCol="Prediction_target",
    maxDepth=8,
    numTrees=25,
    seed=SEED,
)

# Pipeline: assemble the feature vector, then fit/apply the forest.
rfPipeline = Pipeline(stages=[vectorizer, rf])

In [0]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
# Tune rf.maxBins over the values 50 and 100.
paramGrid = (
    ParamGridBuilder()
    .addGrid(rf.maxBins, [50, 100])
    .build()
)

# 3-fold cross-validation of the whole pipeline, scored with regEval.
# The explicit seed fixes the fold assignment so the selected model is
# reproducible from run to run.
crossval = CrossValidator(
    estimator=rfPipeline,
    estimatorParamMaps=paramGrid,
    evaluator=regEval,
    numFolds=3,
    seed=42,
)

# Fit every grid point on the training split and keep the best-scoring model.
rfModel = crossval.fit(trainingSetDF).bestModel

In [0]:
# Score the held-out test rows with the tuned pipeline.
resultsDF = rfModel.transform(testSetDF)

# Two metrics from the same evaluator: its configured metric first, then r2 by
# overriding metricName for that single call.
rmseRF = regEval.evaluate(resultsDF)
r2RF = regEval.evaluate(resultsDF, {regEval.metricName: 'r2'})

# Report both figures rounded to two decimals.
print("RF Root Mean Squared Error: {0:.2f}".format(rmseRF))
print("RF r2: {0:.2f}".format(r2RF))