In [1]:
rawTextRdd = sc.textFile('hdfs:/user/root/data')
print(rawTextRdd.take(5))

[u'AT\tV\tAP\tRH\tPE', u'14.96\t41.76\t1024.07\t73.17\t463.26', u'25.18\t62.96\t1020.04\t59.08\t444.37', u'5.11\t39.4\t1012.16\t92.14\t488.56', u'20.86\t57.32\t1010.24\t76.64\t446.48']


In [2]:
powerPlantDF = (sqlContext.read
                .format('com.databricks.spark.csv')
                .option('delimiter','\t')
                .option('header',True)
                .option('inferschema',True)
                .load('/user/root/data'))

In [3]:
print(powerPlantDF.dtypes)

[('AT', 'double'), ('V', 'double'), ('AP', 'double'), ('RH', 'double'), ('PE', 'double')]


In [4]:
from pyspark.sql.types import *
customSchema = StructType([\
                          StructField('AT', DoubleType(), True), \
                          StructField('V', DoubleType(), True), \
                          StructField('AP', DoubleType(), True), \
                          StructField('RH', DoubleType(), True), \
                          StructField('PE', DoubleType(), True)])
altPowerPlantDF = (sqlContext.read
                   .format('com.databricks.spark.csv')
                   .option('delimiter', '\t')
                   .option('header', True)
                   .load('/user/root/data', schema = customSchema))

In [5]:
print(altPowerPlantDF.dtypes)

[('AT', 'double'), ('V', 'double'), ('AP', 'double'), ('RH', 'double'), ('PE', 'double')]


In [6]:
sqlContext.sql('DROP TABLE IF EXISTS power_plant')
sqlContext.registerDataFrameAsTable(powerPlantDF, 'power_plant')

In [7]:
sqlContext.sql('SELECT * FROM power_plant').show()

+-----+-----+-------+-----+------+
|   AT|    V|     AP|   RH|    PE|
+-----+-----+-------+-----+------+
|14.96|41.76|1024.07|73.17|463.26|
|25.18|62.96|1020.04|59.08|444.37|
| 5.11| 39.4|1012.16|92.14|488.56|
|20.86|57.32|1010.24|76.64|446.48|
|10.82| 37.5|1009.23|96.62| 473.9|
|26.27|59.44|1012.23|58.77|443.67|
|15.89|43.96|1014.02|75.24|467.35|
| 9.48|44.71|1019.12|66.43|478.42|
|14.64| 45.0|1021.78|41.25|475.98|
|11.74|43.56|1015.14|70.72| 477.5|
|17.99|43.72|1008.64|75.04|453.02|
|20.14|46.93|1014.66|64.22|453.99|
|24.34| 73.5|1011.31|84.15|440.29|
|25.71|58.59|1012.77|61.83|451.28|
|26.19|69.34|1009.48|87.59|433.99|
|21.42|43.79|1015.76|43.08|462.19|
|18.21| 45.0|1022.86|48.84|467.54|
|11.04|41.74| 1022.6|77.51| 477.2|
|14.45|52.75|1023.97|63.59|459.85|
|13.97|38.47|1015.15|55.28| 464.3|
+-----+-----+-------+-----+------+
only showing top 20 rows



In [8]:
sqlContext.sql('DESC power_plant').show()

+--------+---------+-------+
|col_name|data_type|comment|
+--------+---------+-------+
|      AT|   double|   null|
|       V|   double|   null|
|      AP|   double|   null|
|      RH|   double|   null|
|      PE|   double|   null|
+--------+---------+-------+



In [9]:
df = sqlContext.table('power_plant')
df.describe().show()

+-------+------------------+------------------+------------------+------------------+------------------+
|summary|                AT|                 V|                AP|                RH|                PE|
+-------+------------------+------------------+------------------+------------------+------------------+
|  count|             47840|             47840|             47840|             47840|             47840|
|   mean|19.651231187290996| 54.30580372073594|1013.2590781772572| 73.30897784280918|454.36500940635506|
| stddev| 7.452161658340004|12.707361709685806| 5.938535418520816|14.599658352081477| 17.06628146683769|
|    min|              1.81|             25.36|            992.89|             25.56|            420.26|
|    max|             37.11|             81.56|            1033.3|            100.16|            495.76|
+-------+------------------+------------------+------------------+------------------+------------------+



In [10]:
powerPlantDF.describe().show()

+-------+------------------+------------------+------------------+------------------+------------------+
|summary|                AT|                 V|                AP|                RH|                PE|
+-------+------------------+------------------+------------------+------------------+------------------+
|  count|             47840|             47840|             47840|             47840|             47840|
|   mean|19.651231187290996| 54.30580372073594|1013.2590781772572| 73.30897784280918|454.36500940635506|
| stddev| 7.452161658340004|12.707361709685806| 5.938535418520816|14.599658352081477| 17.06628146683769|
|    min|              1.81|             25.36|            992.89|             25.56|            420.26|
|    max|             37.11|             81.56|            1033.3|            100.16|            495.76|
+-------+------------------+------------------+------------------+------------------+------------------+



In [11]:
from pyspark.ml.feature import VectorAssembler
datasetDF = sqlContext.table('power_plant')

vectorizer = VectorAssembler()
vectorizer.setInputCols(['AT','V','AP','RH','PE'])
vectorizer.setOutputCol('features')

VectorAssembler_48e4b81a405789fe3efa

In [12]:
seed = 1800009193L
(split20DF, split80DF) = datasetDF.randomSplit([0.2, 0.8],seed)
testSetDF = split20DF.cache()
trainSetDF = split80DF.cache()

In [13]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.regression import LinearRegressionModel
from pyspark.ml import Pipeline

lr = LinearRegression()
print(lr.explainParams())

aggregationDepth: suggested depth for treeAggregate (>= 2). (default: 2)
elasticNetParam: the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty. (default: 0.0)
featuresCol: features column name. (default: features)
fitIntercept: whether to fit an intercept term. (default: True)
labelCol: label column name. (default: label)
maxIter: max number of iterations (>= 0). (default: 100)
predictionCol: prediction column name. (default: prediction)
regParam: regularization parameter (>= 0). (default: 0.0)
solver: the solver algorithm for optimization. If this is not set or empty, default value is 'auto'. (default: auto)
standardization: whether to standardize the training features before fitting the model. (default: True)
tol: the convergence tolerance for iterative algorithms (>= 0). (default: 1e-06)
weightCol: weight column name. If this is not set or empty, we treat all instance weights as 1.0. (undefined)


In [14]:
lr.setPredictionCol('Predicted_PE') \
  .setLabelCol('PE') \
  .setMaxIter(100) \
  .setRegParam(0.1)
    
lrPipeline = Pipeline()
lrPipeline.setStages([vectorizer, lr])
lrModel = lrPipeline.fit(trainSetDF)

In [15]:
intercept = lrModel.stages[1].intercept
weights = lrModel.stages[1].coefficients
featuresNoLabel = [col for col in datasetDF.columns if col != 'PE']
coefficients = zip(weights, featuresNoLabel)
coefficients.sort(key=lambda tup:abs(tup[0]), reverse=True)
equation = 'y = {intercept}'.format(intercept=intercept)
variables = []
for x in coefficients:
    weights = abs(x[0])
    name = x[1]
    symbol = '+' if (x[0] > 0) else '-'
    equation += ('{} ({} * {})'.format(symbol, weights, name))
    
print('Linear Regression Equation: ' + equation)

Linear Regression Equation: y = 31.3242038532- (0.137526662241 * AT)- (0.0183191257147 * V)- (0.0106030467712 * RH)+ (0.00568761368905 * AP)


In [16]:
predictionsAndLabelsDF = lrModel.transform(testSetDF).select('AT', 'V', 'AP', 'RH', 'PE', 'Predicted_PE')
predictionsAndLabelsDF.show()

+----+-----+-------+-----+------+------------------+
|  AT|    V|     AP|   RH|    PE|      Predicted_PE|
+----+-----+-------+-----+------+------------------+
|1.81|39.42|1026.92|76.97|490.55|490.71855974599214|
| 3.2|41.31| 997.67|98.84|489.86|  489.454047901701|
|3.38|41.31| 998.79|97.76|489.11| 488.7509457128594|
| 3.4|39.64| 1011.1|83.43|459.86| 461.8501608124722|
|3.51|35.47|1017.53|86.56|489.07| 489.0282619422462|
|3.63|38.44|1016.16|87.38|487.87| 487.8269943183257|
|3.91|35.47|1016.92|86.03|488.67|488.60411141713064|
|3.94| 39.9|1008.06|97.49|488.81| 488.4768802277945|
| 4.0| 39.9|1009.64|97.16|490.79| 490.3189997148778|
|4.15| 39.9|1007.62|95.69| 489.8|489.38352538876603|
|4.15| 39.9|1008.84|96.68|491.22| 490.6980468699973|
|4.23|38.44|1016.46|76.64| 489.0|488.90895566401764|
|4.24| 39.9|1009.28|96.74|491.25| 490.7153825899118|
|4.43|38.91|1019.04|88.17| 491.9| 491.4571139787961|
|4.44|38.44|1016.14|75.35|486.53|486.59921701959126|
|4.61|40.27|1012.32|77.28|492.85|492.366505406

In [17]:
from pyspark.ml.evaluation import RegressionEvaluator
regEval = RegressionEvaluator(predictionCol='Predicted_PE', labelCol='PE', metricName='rmse')
rmse = regEval.evaluate(predictionsAndLabelsDF)
print('Root Mean Squared Error: %.2f' % rmse)

Root Mean Squared Error: 0.33


In [18]:
r2 = regEval.evaluate(predictionsAndLabelsDF, {regEval.metricName: 'r2'})
print('r2: {0:.2f}'.format(r2))

r2: 1.00


In [19]:
sqlContext.sql('DROP TABLE IF EXISTS Power_Plant_RMSE_Evalutation')
predictionsAndLabelsDF.selectExpr('PE', 'Predicted_PE', 'PE - Predicted_PE Residual_Error', '(PE - Predicted_PE) / {} Within_RSME'.format(rmse)).registerTempTable('Power_Plant_RMSE_Evaluation')

In [20]:
sqlContext.sql('SELECT * FROM Power_Plant_RMSE_Evaluation').show()

+------+------------------+--------------------+--------------------+
|    PE|      Predicted_PE|      Residual_Error|         Within_RSME|
+------+------------------+--------------------+--------------------+
|490.55|490.71855974599214|-0.16855974599212686| -0.5111964452428789|
|489.86|  489.454047901701|  0.4059520982989966|  1.2311437014091593|
|489.11| 488.7509457128594|  0.3590542871406228|  1.0889152339140062|
|459.86| 461.8501608124722| -1.9901608124721974|  -6.035623314507048|
|489.07| 489.0282619422462| 0.04173805775377559|  0.1265803209982806|
|487.87| 487.8269943183257|0.043005681674287644|   0.130424683946602|
|488.67|488.60411141713064| 0.06588858286937693| 0.19982237838973463|
|488.81| 488.4768802277945|  0.3331197722055208|  1.0102628144654047|
|490.79| 490.3189997148778|  0.4710002851222157|  1.4284173842674466|
| 489.8|489.38352538876603|  0.4164746112339799|  1.2630556574679737|
|491.22| 490.6980468699973|  0.5219531300027143|   1.582943680119478|
| 489.0|488.90895566

In [21]:
sqlContext.sql('SELECT case when Within_RSME <= 1.0 AND Within_RSME >= -1.0 then 1 \
                    when  Within_RSME <= 2.0 AND Within_RSME >= -2.0 then 2 else 3 \
                end RSME_Multiple, COUNT(*) AS count \
                FROM Power_Plant_RMSE_Evaluation \
                GROUP BY case when Within_RSME <= 1.0 AND Within_RSME >= -1.0 then 1  when  Within_RSME <= 2.0 AND Within_RSME >= -2.0 then 2 else 3 end').orderBy('count',ascending=False).show()

+-------------+-----+
|RSME_Multiple|count|
+-------------+-----+
|            1| 6498|
|            2| 2774|
|            3|  325|
+-------------+-----+



In [22]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
crossval = CrossValidator(estimator=lrPipeline, evaluator=regEval, numFolds=3)
regParam = [x / 100.0 for x in range(1,11)]

paramGrid = (ParamGridBuilder()
             .addGrid(lr.regParam, regParam)
             .build())
crossval.setEstimatorParamMaps(paramGrid)

cvModel = crossval.fit(trainSetDF).bestModel

In [23]:
predictionsAndLabelsDF = cvModel.transform(testSetDF)
regEval = RegressionEvaluator(predictionCol='Predicted_PE', labelCol='PE', metricName='rmse')
rmseNew = regEval.evaluate(predictionsAndLabelsDF, {regEval.metricName: 'rmse'})
r2New = regEval.evaluate(predictionsAndLabelsDF, {regEval.metricName: 'r2'})
print("Original Root Mean Squared Error: {0:2.2f}".format(rmse))
print('New Root Mean Squared Error: {0:2.2f}'.format(rmseNew))
print('Old r2: {0:2.2f}'.format(r2))
print('New r2: {0:2.2f}'.format(r2New))

Original Root Mean Squared Error: 0.33
New Root Mean Squared Error: 0.04
Old r2: 1.00
New r2: 1.00


In [24]:
print('Regularization parameter of the best model: {0:0.2f}'.format(cvModel.stages[-1]._java_obj.parent().getRegParam()))

Regularization parameter of the best model: 0.01


In [25]:
from pyspark.ml.regression import DecisionTreeRegressor
dt = DecisionTreeRegressor(featuresCol='indexedFeatures')
dt.setLabelCol('PE') \
  .setPredictionCol('Predicted_PE') \
  .setFeaturesCol('features') \
  .setMaxBins(100)
    
dtPipeline = Pipeline()
dtPipeline.setStages([vectorizer, dt])


Pipeline_436c8b80b3abe2884834

In [28]:
crossval.setEstimator(dtPipeline)
paramGrid = (ParamGridBuilder()
             .addGrid(dt.maxDepth, [2, 3])
             .build())
crossval.setEstimatorParamMaps(paramGrid)
dtModel = crossval.fit(trainSetDF).bestModel

In [29]:
predictionsAndLabelsDF = dtModel.transform(testSetDF).select('AT', 'V', 'AP', 'RH', 'PE', 'Predicted_PE')
rmseDT = regEval.evaluate(predictionsAndLabelsDF, {regEval.metricName: 'rmse'})
r2DT = regEval.evaluate(predictionsAndLabelsDF, {regEval.metricName: 'r2'})
print('LR Root Mean Square Error: {0:.2f}'.format(rmseNew))
print('DT Root Mean Square Error: {0:.2f}'.format(rmseDT))
print('LR r2: {0:.2f}'.format(r2New))
print('DT r2: {0:.2f}'.format(r2DT))

LR Root Mean Square Error: 0.04
DT Root Mean Square Error: 2.40
LR r2: 1.00
DT r2: 0.98


In [30]:
print(dtModel.stages[-1]._java_obj.toDebugString())

DecisionTreeRegressionModel (uid=DecisionTreeRegressor_4537bd93b92c282504bd) of depth 3 with 15 nodes
  If (feature 4 <= 456.42)
   If (feature 4 <= 441.85)
    If (feature 4 <= 434.94)
     Predict: 431.3608518665608
    Else (feature 4 > 434.94)
     Predict: 438.4313747703612
   Else (feature 4 > 441.85)
    If (feature 4 <= 448.77)
     Predict: 445.0990078057242
    Else (feature 4 > 448.77)
     Predict: 452.40946752098984
  Else (feature 4 > 456.42)
   If (feature 4 <= 472.55)
    If (feature 4 <= 464.78)
     Predict: 460.9821208717142
    Else (feature 4 > 464.78)
     Predict: 468.5305376790533
   Else (feature 4 > 472.55)
    If (feature 4 <= 481.18)
     Predict: 476.68302154974776
    Else (feature 4 > 481.18)
     Predict: 485.6522113289762



In [34]:
from pyspark.ml.regression import RandomForestRegressor

rf = RandomForestRegressor()
rf.setLabelCol('PE') \
  .setPredictionCol('Predicted_PE') \
  .setFeaturesCol('features') \
  .setSeed(100088121L) \
  .setMaxDepth(8) \
  .setNumTrees(30)
    
rfPipeline = Pipeline()
rfPipeline.setStages([vectorizer, rf])

Pipeline_4dee8e87dee921841ff2

In [35]:
crossval.setEstimator(rfPipeline)
paramGrid = paramGrid = (ParamGridBuilder()
                         .addGrid(rf.maxBins, [50,100])
                         .build())
crossval.setEstimatorParamMaps(paramGrid)
rfModel = crossval.fit(trainSetDF).bestModel

In [36]:
predictionsAndLabelsDF = rfModel.transform(testSetDF)
rmseRF = regEval.evaluate(predictionsAndLabelsDF, {regEval.metricName: 'rmse'})
r2RF = regEval.evaluate(predictionsAndLabelsDF, {regEval.metricName: 'r2'})
print('LR Root Mean Square Error: {0:.2f}'.format(rmseNew))
print('DT Root Mean Square Error: {0:.2f}'.format(rmseDT))
print('RF Root Mean Square Error: {0:.2f}'.format(rmseRF))
print('LR r2: {0:.2f}'.format(r2New))
print('DT r2: {0:.2f}'.format(r2DT))
print('RF r2: {0:.2f}'.format(r2RF))

LR Root Mean Square Error: 0.04
DT Root Mean Square Error: 2.40
RF Root Mean Square Error: 0.72
LR r2: 1.00
DT r2: 0.98
RF r2: 1.00


In [38]:
print(rfModel.stages[-1]._java_obj.toDebugString())

RandomForestRegressionModel (uid=rfr_46cc0c8ae202) with 30 trees
  Tree 0 (weight 1.0):
    If (feature 0 <= 17.53)
     If (feature 0 <= 11.92)
      If (feature 3 <= 94.46)
       If (feature 4 <= 479.65)
        If (feature 1 <= 46.18)
         If (feature 2 <= 1026.23)
          If (feature 1 <= 44.2)
           If (feature 3 <= 92.85)
            Predict: 474.9973241650295
           Else (feature 3 > 92.85)
            Predict: 471.27209302325474
          Else (feature 1 > 44.2)
           If (feature 4 <= 473.01)
            Predict: 469.9552830188679
           Else (feature 4 > 473.01)
            Predict: 476.420569105691
         Else (feature 2 > 1026.23)
          If (feature 4 <= 474.93)
           If (feature 3 <= 87.97)
            Predict: 472.82949152542375
           Else (feature 3 > 87.97)
            Predict: 466.18600000000004
          Else (feature 4 > 474.93)
           If (feature 3 <= 68.97)
            Predict: 475.67272727272723
           Else (feature 3