In [1]:
from pyspark.sql import functions as F

In [2]:
df = spark.read.options(header=True,inferSchema=True)\
            .csv("hdfs://localhost:9000/user/local/hadoop_tmp/hdfs/data/boston/housing.csv")

In [3]:
def dfShape(df):
    return (df.count(),len(df.columns))

In [4]:
dfShape(df)

(489, 4)

In [5]:
df.show(5)

+-----+-----+-------+--------+
|   RM|LSTAT|PTRATIO|    MEDV|
+-----+-----+-------+--------+
|6.575| 4.98|   15.3|504000.0|
|6.421| 9.14|   17.8|453600.0|
|7.185| 4.03|   17.8|728700.0|
|6.998| 2.94|   18.7|701400.0|
|7.147| 5.33|   18.7|760200.0|
+-----+-----+-------+--------+
only showing top 5 rows



In [6]:
df.describe(['MEDV']).show()
#labels.agg(F.mean(labels['MEDV']),F.min(labels['MEDV']),F.max(labels['MEDV'])).show()

+-------+-----------------+
|summary|             MEDV|
+-------+-----------------+
|  count|              489|
|   mean|454342.9447852761|
| stddev|165340.2776526678|
|    min|         105000.0|
|    max|        1024800.0|
+-------+-----------------+



In [7]:
df.describe().toPandas()

Unnamed: 0,summary,RM,LSTAT,PTRATIO,MEDV
0,count,489.0,489.0,489.0,489.0
1,mean,6.240288343558291,12.939631901840492,18.51656441717792,454342.9447852761
2,stddev,0.6436497627572434,7.081989789065132,2.111267502630993,165340.2776526678
3,min,3.561,1.98,12.6,105000.0
4,max,8.398,37.97,22.0,1024800.0


### Correlation between colums

In [8]:
import six
for i in df.columns:
    if not( isinstance(df.select(i).take(1)[0][0], six.string_types)):
        print( "Correlation to MEDV for ", i, df.stat.corr('MEDV',i))

Correlation to MEDV for  RM 0.6972092210185153
Correlation to MEDV for  LSTAT -0.7606700599726771
Correlation to MEDV for  PTRATIO -0.5190335035317891
Correlation to MEDV for  MEDV 1.0


In [9]:
from pyspark.ml.feature import VectorAssembler

vectorAssembler = VectorAssembler(inputCols = ['RM', 'PTRATIO','LSTAT'], outputCol = 'features')
vhouse_df = vectorAssembler.transform(df)
vhouse_df = vhouse_df.select(['features', 'MEDV'])
vhouse_df.show(3)

+-----------------+--------+
|         features|    MEDV|
+-----------------+--------+
|[6.575,15.3,4.98]|504000.0|
|[6.421,17.8,9.14]|453600.0|
|[7.185,17.8,4.03]|728700.0|
+-----------------+--------+
only showing top 3 rows



### Split Test Train & Grid Search

In [10]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit , CrossValidator

In [34]:
regParam = [x / 100.0 for x in range(1, 11)]
elasticNetParam = [x / 10.0 for x in range(0, 10)]
maxIter = [10,20,30,40,50,60,70,80,90,100]

In [35]:
train_df, test_df = vhouse_df.randomSplit([0.8, 0.2], seed=12345)

In [36]:
lr = LinearRegression(featuresCol = 'features', labelCol='MEDV')

In [37]:
paramGrid = ParamGridBuilder()\
    .addGrid(lr.maxIter, maxIter) \
    .addGrid(lr.regParam, regParam) \
    .addGrid(lr.fitIntercept, [False, True])\
    .addGrid(lr.elasticNetParam, elasticNetParam)\
    .build()

In [38]:
tvs = TrainValidationSplit(estimator=lr,
                           estimatorParamMaps=paramGrid,
                           evaluator=RegressionEvaluator(predictionCol="prediction", \
                                 labelCol="MEDV",metricName="r2"),
                           # 80% of the data will be used for training, 20% for validation.
                           trainRatio=0.8)

In [39]:
cv = CrossValidator(estimator=lr,
                           estimatorParamMaps=paramGrid,
                           evaluator=RegressionEvaluator(predictionCol="prediction", \
                                 labelCol="MEDV",metricName="r2"),
                           # 80% of the data will be used for training, 20% for validation.
                           numFolds=4)

In [41]:
model_cv = cv.fit(train_df)
print("Max iteration  : %g " %model_cv.bestModel._java_obj.getMaxIter())
print("Reg  : %g " %model_cv.bestModel._java_obj.getRegParam())
print("Elastic net Param : %g " %model_cv.bestModel._java_obj.getElasticNetParam())
print("r2 : %g " % model_cv.bestModel.summary.r2)
print("RMSE : %g " % model_cv.bestModel.summary.rootMeanSquaredError)

Max iteration  : 10 
Reg  : 0.01 
Elastic net Param : 0.1 
r2 : 0.712085 
RMSE : 87353 


In [43]:
model_cv.bestModel.transform(test_df)\
    .select("features", "MEDV", "prediction")\
    .show()

+------------------+--------+------------------+
|          features|    MEDV|        prediction|
+------------------+--------+------------------+
|[4.368,20.2,30.63]|184800.0|  76109.8764788371|
|[4.652,20.2,28.28]|220500.0|124609.12927582697|
| [4.88,20.2,30.62]|214200.0|116556.30454812758|
|[5.012,14.7,12.12]|321300.0| 447594.3364285894|
|[5.019,21.2,34.41]|302400.0| 64468.64070087747|
|[5.036,20.2,25.68]|203700.0|183765.70096763282|
|[5.414,20.1,23.97]|147000.0|234645.47210200696|
|[5.468,14.7,26.42]|327600.0| 324539.7029210242|
|[5.531,20.2,27.38]|178500.0| 203862.1903438287|
|[5.597,14.7,21.45]|323400.0|389955.56564486324|
|[5.608,20.2,12.13]|585900.0|379467.94412209874|
|[5.682,17.9,10.21]|405300.0|454720.49344761483|
|[5.707,19.2,12.01]|457800.0| 409504.4950281492|
|[5.709,14.7,15.79]|407400.0| 461703.1488986014|
|[5.713,20.2,17.11]|317100.0| 332375.3795964286|
|[5.731,17.8,13.61]|405300.0|422872.01295196696|
|[5.741,19.7,13.15]|392700.0|389057.58571744815|
|[5.786,17.9,14.15]|

In [45]:
model_cv.avgMetrics

[0.6618622540030612,
 0.19916570386400564,
 0.19916570255574637,
 0.19916570124748723,
 0.19916569993922797,
 0.1991656986309686,
 0.19916569732270956,
 0.19916569601445153,
 0.1991656947061921,
 0.19916569339793372,
 0.6939121344307069,
 0.6941305072585557,
 0.6935410650547306,
 0.6938491327907486,
 0.6937867263443125,
 0.693786726210195,
 0.6937867260760775,
 0.6937867259419433,
 0.6937867258078361,
 0.6938785635327135,
 0.661862255621605,
 0.19916570193173136,
 0.1991656993152122,
 0.19916569669869424,
 0.19916569408217688,
 0.1991656914656601,
 0.19916568884914312,
 0.1991656862326272,
 0.1991656836161115,
 0.1991656809995964,
 0.6939121363903578,
 0.6935410672013673,
 0.6937867283108919,
 0.6937867280426598,
 0.6937867277744089,
 0.6939010035764551,
 0.6940064729336122,
 0.6940064726745444,
 0.6939337182328892,
 0.6939337179799903,
 0.6618622572401476,
 0.19916569999945619,
 0.19916569607467885,
 0.19916569214990226,
 0.1991656882251272,
 0.1991656843003532,
 0.1991656803755798,
 

In [42]:
model_tvs = tvs.fit(train_df)
print("Max iteration  : %g " %model_tvs.bestModel._java_obj.getMaxIter())
print("Reg  : %g " %model_tvs.bestModel._java_obj.getRegParam())
print("Elastic net Param : %g " %model_tvs.bestModel._java_obj.getElasticNetParam())
print("r2 : %g " % model_tvs.bestModel.summary.r2)
print("RMSE : %g " % model_tvs.bestModel.summary.rootMeanSquaredError)

Max iteration  : 10 
Reg  : 0.07 
Elastic net Param : 0.1 
r2 : 0.71208 
RMSE : 87353.8 


In [44]:
model_tvs.bestModel.transform(test_df)\
    .select("features", "MEDV", "prediction")\
    .show()

+------------------+--------+------------------+
|          features|    MEDV|        prediction|
+------------------+--------+------------------+
|[4.368,20.2,30.63]|184800.0| 78118.47277935286|
|[4.652,20.2,28.28]|220500.0|126342.06004970887|
| [4.88,20.2,30.62]|214200.0|118324.87428180943|
|[5.012,14.7,12.12]|321300.0|447929.92387361615|
|[5.019,21.2,34.41]|302400.0| 66447.45132532733|
|[5.036,20.2,25.68]|203700.0|185161.00657040492|
|[5.414,20.1,23.97]|147000.0|235755.58030992572|
|[5.468,14.7,26.42]|327600.0|  325531.394695051|
|[5.531,20.2,27.38]|178500.0|205129.37445671618|
|[5.597,14.7,21.45]|323400.0|390584.79509729904|
|[5.608,20.2,12.13]|585900.0|  379772.032233943|
|[5.682,17.9,10.21]|405300.0|454770.13610121654|
|[5.707,19.2,12.01]|457800.0| 409710.1594425755|
|[5.709,14.7,15.79]|407400.0|461935.91846787545|
|[5.713,20.2,17.11]|317100.0|332933.11615758983|
|[5.731,17.8,13.61]|405300.0|423100.95442548033|
|[5.741,19.7,13.15]|392700.0|389339.07235469273|
|[5.786,17.9,14.15]|

In [46]:
model_tvs.validationMetrics

[0.38563589706854573,
 0.006572126590778438,
 0.006572123805872243,
 0.006572121020971489,
 0.006572118236068847,
 0.006572115451164762,
 0.006572112666268781,
 0.006572109881369581,
 0.006572107096464275,
 0.00657210431156674,
 0.5131185263856959,
 0.5128004786311163,
 0.5131444821928804,
 0.5131444840162747,
 0.5131444858397518,
 0.5131444876631843,
 0.5131444894866215,
 0.5131444913101126,
 0.5131168735977578,
 0.5131168754167843,
 0.38563590782500323,
 0.006572123958677345,
 0.00657211838886862,
 0.006572112819067,
 0.00657210724926327,
 0.00657210167946809,
 0.006572096109666248,
 0.00657209053986485,
 0.006572084970066894,
 0.006572079400264941,
 0.5131185345303937,
 0.5131444903237172,
 0.513144493970682,
 0.5131444976174844,
 0.5131168817427509,
 0.5131168853808117,
 0.5131168890188836,
 0.5131168926569452,
 0.5131168962949867,
 0.5131168999330591,
 0.38563591858147395,
 0.006572121326575697,
 0.006572112971872435,
 0.006572104617166841,
 0.006572096262466354,
 0.00657208790776

In [28]:
tvs.extractParamMap()

{Param(parent='TrainValidationSplit_43e1a6b40849e3569f76', name='estimator', doc='estimator to be cross-validated'): LinearRegression_4e8a873151d9c76152e0,
 Param(parent='TrainValidationSplit_43e1a6b40849e3569f76', name='estimatorParamMaps', doc='estimator param maps'): [{Param(parent='LinearRegression_4e8a873151d9c76152e0', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0.0,
   Param(parent='LinearRegression_4e8a873151d9c76152e0', name='fitIntercept', doc='whether to fit an intercept term.'): False,
   Param(parent='LinearRegression_4e8a873151d9c76152e0', name='regParam', doc='regularization parameter (>= 0).'): 0.1},
  {Param(parent='LinearRegression_4e8a873151d9c76152e0', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0.5,
   Param(parent='LinearRegressio

### Linear Regression

In [22]:
lr = LinearRegression(featuresCol = ['RM', 'PTRATIO','LSTAT'], labelCol='MEDV', maxIter=10, regParam=0.3, elasticNetParam=0.8)
lr_model = lr.fit(train_df)
print("Coefficients: " + str(lr_model.coefficients))
print("Intercept: " + str(lr_model.intercept))

TypeError: Invalid param value given for param "featuresCol". Could not convert <class 'list'> to string type

In [14]:
trainingSummary = lr_model.summary
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)

RMSE: 87352.487652
r2: 0.712089


In [15]:
train_df.describe().show()

+-------+-----------------+
|summary|             MEDV|
+-------+-----------------+
|  count|              387|
|   mean|451586.8217054264|
| stddev|163007.6660105721|
|    min|         105000.0|
|    max|        1024800.0|
+-------+-----------------+



In [16]:
lr_predictions = lr_model.transform(test_df)
lr_predictions.select("prediction","MEDV","features").show(5)

+------------------+--------+------------------+
|        prediction|    MEDV|          features|
+------------------+--------+------------------+
| 76919.74061009579|184800.0|[4.368,20.2,30.63]|
|125304.77728333254|220500.0|[4.652,20.2,28.28]|
|117127.82628462097|214200.0| [4.88,20.2,30.62]|
| 447978.3120163485|321300.0|[5.012,14.7,12.12]|
|  64995.0872057993|302400.0|[5.019,21.2,34.41]|
+------------------+--------+------------------+
only showing top 5 rows



### Test Evaluation

In [17]:
lr_evaluator = RegressionEvaluator(predictionCol="prediction", \
                 labelCol="MEDV",metricName="r2")
print("R Squared (R2) on test data = %g" % lr_evaluator.evaluate(lr_predictions))

R Squared (R2) on test data = 0.731085


In [18]:
test_result = lr_model.evaluate(test_df)
print("Root Mean Squared Error (RMSE) on test data = %g" % test_result.rootMeanSquaredError)

Root Mean Squared Error (RMSE) on test data = 89965.1


### Decision tree regression

In [33]:
from pyspark.ml.regression import DecisionTreeRegressor

dt = DecisionTreeRegressor(featuresCol ='features', labelCol = 'MEDV')
dt_model = dt.fit(train_df)
dt_predictions = dt_model.transform(test_df)
dt_evaluator = RegressionEvaluator(
    labelCol="MEDV", predictionCol="prediction", metricName="rmse")
rmse = dt_evaluator.evaluate(dt_predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

Root Mean Squared Error (RMSE) on test data = 86634.9


In [34]:
dt_predictions.select("prediction","MEDV","features").show(5)

+-----------------+--------+------------------+
|       prediction|    MEDV|          features|
+-----------------+--------+------------------+
|439090.9090909091|577500.0| [3.561,20.2,7.12]|
|         204600.0|184800.0|[4.368,20.2,30.63]|
|         204600.0|375900.0|[4.628,20.2,34.37]|
|         322140.0|247800.0|[4.903,14.7,29.29]|
|         204600.0|289800.0|[4.906,20.2,34.77]|
+-----------------+--------+------------------+
only showing top 5 rows

