In [1]:
from pyspark.sql import functions as F

In [2]:
df = spark.read.options(header=True,inferSchema=True)\
            .csv("hdfs://localhost:9000/user/local/hadoop_tmp/hdfs/data/boston/housing.csv")

In [3]:
def dfShape(df):
    return (df.count(),len(df.columns))

In [4]:
dfShape(df)

(489, 4)

In [5]:
df.show(5)

+-----+-----+-------+--------+
|   RM|LSTAT|PTRATIO|    MEDV|
+-----+-----+-------+--------+
|6.575| 4.98|   15.3|504000.0|
|6.421| 9.14|   17.8|453600.0|
|7.185| 4.03|   17.8|728700.0|
|6.998| 2.94|   18.7|701400.0|
|7.147| 5.33|   18.7|760200.0|
+-----+-----+-------+--------+
only showing top 5 rows



In [6]:
df.describe(['MEDV']).show()
#labels.agg(F.mean(labels['MEDV']),F.min(labels['MEDV']),F.max(labels['MEDV'])).show()

+-------+-----------------+
|summary|             MEDV|
+-------+-----------------+
|  count|              489|
|   mean|454342.9447852761|
| stddev|165340.2776526678|
|    min|         105000.0|
|    max|        1024800.0|
+-------+-----------------+



In [7]:
df.describe().toPandas()

Unnamed: 0,summary,RM,LSTAT,PTRATIO,MEDV
0,count,489.0,489.0,489.0,489.0
1,mean,6.240288343558291,12.939631901840492,18.51656441717792,454342.9447852761
2,stddev,0.6436497627572434,7.081989789065132,2.111267502630993,165340.2776526678
3,min,3.561,1.98,12.6,105000.0
4,max,8.398,37.97,22.0,1024800.0


### Correlation between colums

In [8]:
import six
for i in df.columns:
    if not( isinstance(df.select(i).take(1)[0][0], six.string_types)):
        print( "Correlation to MEDV for ", i, df.stat.corr('MEDV',i))

Correlation to MEDV for  RM 0.6972092210185153
Correlation to MEDV for  LSTAT -0.7606700599726771
Correlation to MEDV for  PTRATIO -0.5190335035317891
Correlation to MEDV for  MEDV 1.0


In [9]:
from pyspark.ml.feature import VectorAssembler

vectorAssembler = VectorAssembler(inputCols = ['RM', 'PTRATIO','LSTAT'], outputCol = 'features')
vhouse_df = vectorAssembler.transform(df)
vhouse_df = vhouse_df.select(['features', 'MEDV'])
vhouse_df.show(3)

+-----------------+--------+
|         features|    MEDV|
+-----------------+--------+
|[6.575,15.3,4.98]|504000.0|
|[6.421,17.8,9.14]|453600.0|
|[7.185,17.8,4.03]|728700.0|
+-----------------+--------+
only showing top 3 rows



### Split Test Train & Grid Search

In [79]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit , CrossValidator

In [73]:
regParam = [x / 100.0 for x in range(1, 11)]

In [17]:
train_df, test_df = vhouse_df.randomSplit([0.8, 0.2], seed=12345)

In [75]:
lr = LinearRegression(featuresCol = 'features', labelCol='MEDV')

In [74]:
paramGrid = ParamGridBuilder()\
    .addGrid(lr.maxIter, [10,50,100]) \
    .addGrid(lr.regParam, regParam) \
    .addGrid(lr.fitIntercept, [False, True])\
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])\
    .build()

In [76]:
tvs = TrainValidationSplit(estimator=lr,
                           estimatorParamMaps=paramGrid,
                           evaluator=RegressionEvaluator(predictionCol="prediction", \
                                 labelCol="MEDV",metricName="r2"),
                           # 80% of the data will be used for training, 20% for validation.
                           trainRatio=0.8)

In [80]:
cv = CrossValidator(estimator=lr,
                           estimatorParamMaps=paramGrid,
                           evaluator=RegressionEvaluator(predictionCol="prediction", \
                                 labelCol="MEDV",metricName="r2"),
                           # 80% of the data will be used for training, 20% for validation.
                           numFolds=4)

In [91]:
model_cv = cv.fit(train_df)
model_tvs = tvs.fit(train_df)

In [93]:
model_tvs.validationMetrics

[0.5131185182409854,
 0.5131185182409854,
 0.5131185182409854,
 0.5131185182409854,
 0.5131185182409854,
 0.5131185182409854,
 0.5131185182409854,
 0.5131185182409854,
 0.5131185182409854,
 0.5131185182409854,
 0.5131185182409854,
 0.5131185182409854,
 0.5131185182409854,
 0.5131185182409854,
 0.5131185182409854,
 0.5131185182409854,
 0.5131185182409854,
 0.5131185182409854,
 0.5131185182409854,
 0.5131185182409854,
 0.5131185182409854,
 0.5131185182409854,
 0.5131185182409854,
 0.5131185182409854,
 0.5131185182409854,
 0.5131185182409854,
 0.5131185182409854,
 0.5131185182409854,
 0.5131185182409854,
 0.5131185182409854,
 0.5131185182409854,
 0.5131185182409854,
 0.5131185182409854,
 0.5131185182409854,
 0.5131185182409854,
 0.5131185182409854,
 0.5131185182409854,
 0.5131185182409854,
 0.5131185182409854,
 0.5131185182409854,
 0.5131185182409854,
 0.5131185182409854,
 0.5131185182409854,
 0.5131185182409854,
 0.5131185182409854,
 0.5131185182409854,
 0.5131185182409854,
 0.5131185182

In [92]:
model_cv.avgMetrics

[0.6939121324710533,
 0.6939121324710533,
 0.6939121324710533,
 0.6939121324710533,
 0.6939121324710533,
 0.6939121324710533,
 0.6939121324710533,
 0.6939121324710533,
 0.6939121324710533,
 0.6939121324710533,
 0.6939121324710533,
 0.6939121324710533,
 0.6939121324710533,
 0.6939121324710533,
 0.6939121324710533,
 0.6939121324710533,
 0.6939121324710533,
 0.6939121324710533,
 0.6939121324710533,
 0.6939121324710533,
 0.6939121324710533,
 0.6939121324710533,
 0.6939121324710533,
 0.6939121324710533,
 0.6939121324710533,
 0.6939121324710533,
 0.6939121324710533,
 0.6939121324710533,
 0.6939121324710533,
 0.6939121324710533,
 0.6939121324710533,
 0.6939121324710533,
 0.6939121324710533,
 0.6939121324710533,
 0.6939121324710533,
 0.6939121324710533,
 0.6939121324710533,
 0.6939121324710533,
 0.6939121324710533,
 0.6939121324710533,
 0.6939121324710533,
 0.6939121324710533,
 0.6939121324710533,
 0.6939121324710533,
 0.6939121324710533,
 0.6939121324710533,
 0.6939121324710533,
 0.6939121324

In [90]:
model.transform(test_df)\
    .select("features", "MEDV", "prediction")\
    .show()

+------------------+--------+------------------+
|          features|    MEDV|        prediction|
+------------------+--------+------------------+
|[4.368,20.2,30.63]|184800.0| 76918.89830818563|
|[4.652,20.2,28.28]|220500.0|125304.03720786475|
| [4.88,20.2,30.62]|214200.0|117127.08996457863|
|[5.012,14.7,12.12]|321300.0| 447978.3960759249|
|[5.019,21.2,34.41]|302400.0|64994.227507203934|
|[5.036,20.2,25.68]|203700.0| 184301.8723585916|
|[5.414,20.1,23.97]|147000.0|235013.98892759974|
|[5.468,14.7,26.42]|327600.0| 324601.5232483241|
|[5.531,20.2,27.38]|178500.0|204154.88523820415|
|[5.597,14.7,21.45]|323400.0| 389995.5700026052|
|[5.608,20.2,12.13]|585900.0|379842.09756461263|
|[5.682,17.9,10.21]|405300.0| 454963.0731951903|
|[5.707,19.2,12.01]|457800.0|409784.84002832876|
|[5.709,14.7,15.79]|407400.0| 461734.5573167448|
|[5.713,20.2,17.11]|317100.0| 332662.3554522417|
|[5.731,17.8,13.61]|405300.0| 423060.7684083177|
|[5.741,19.7,13.15]|392700.0|389337.66233480664|
|[5.786,17.9,14.15]|

In [89]:
model_cv.avgMetrics

AttributeError: 'LinearRegressionModel' object has no attribute 'avgMetrics'

In [28]:
tvs.extractParamMap()

{Param(parent='TrainValidationSplit_43e1a6b40849e3569f76', name='estimator', doc='estimator to be cross-validated'): LinearRegression_4e8a873151d9c76152e0,
 Param(parent='TrainValidationSplit_43e1a6b40849e3569f76', name='estimatorParamMaps', doc='estimator param maps'): [{Param(parent='LinearRegression_4e8a873151d9c76152e0', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0.0,
   Param(parent='LinearRegression_4e8a873151d9c76152e0', name='fitIntercept', doc='whether to fit an intercept term.'): False,
   Param(parent='LinearRegression_4e8a873151d9c76152e0', name='regParam', doc='regularization parameter (>= 0).'): 0.1},
  {Param(parent='LinearRegression_4e8a873151d9c76152e0', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0.5,
   Param(parent='LinearRegressio

### Linear Regression

In [22]:
lr = LinearRegression(featuresCol = ['RM', 'PTRATIO','LSTAT'], labelCol='MEDV', maxIter=10, regParam=0.3, elasticNetParam=0.8)
lr_model = lr.fit(train_df)
print("Coefficients: " + str(lr_model.coefficients))
print("Intercept: " + str(lr_model.intercept))

TypeError: Invalid param value given for param "featuresCol". Could not convert <class 'list'> to string type

In [14]:
trainingSummary = lr_model.summary
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)

RMSE: 87352.487652
r2: 0.712089


In [15]:
train_df.describe().show()

+-------+-----------------+
|summary|             MEDV|
+-------+-----------------+
|  count|              387|
|   mean|451586.8217054264|
| stddev|163007.6660105721|
|    min|         105000.0|
|    max|        1024800.0|
+-------+-----------------+



In [16]:
lr_predictions = lr_model.transform(test_df)
lr_predictions.select("prediction","MEDV","features").show(5)

+------------------+--------+------------------+
|        prediction|    MEDV|          features|
+------------------+--------+------------------+
| 76919.74061009579|184800.0|[4.368,20.2,30.63]|
|125304.77728333254|220500.0|[4.652,20.2,28.28]|
|117127.82628462097|214200.0| [4.88,20.2,30.62]|
| 447978.3120163485|321300.0|[5.012,14.7,12.12]|
|  64995.0872057993|302400.0|[5.019,21.2,34.41]|
+------------------+--------+------------------+
only showing top 5 rows



### Test Evaluation

In [17]:
lr_evaluator = RegressionEvaluator(predictionCol="prediction", \
                 labelCol="MEDV",metricName="r2")
print("R Squared (R2) on test data = %g" % lr_evaluator.evaluate(lr_predictions))

R Squared (R2) on test data = 0.731085


In [18]:
test_result = lr_model.evaluate(test_df)
print("Root Mean Squared Error (RMSE) on test data = %g" % test_result.rootMeanSquaredError)

Root Mean Squared Error (RMSE) on test data = 89965.1


### Decision tree regression

In [33]:
from pyspark.ml.regression import DecisionTreeRegressor

dt = DecisionTreeRegressor(featuresCol ='features', labelCol = 'MEDV')
dt_model = dt.fit(train_df)
dt_predictions = dt_model.transform(test_df)
dt_evaluator = RegressionEvaluator(
    labelCol="MEDV", predictionCol="prediction", metricName="rmse")
rmse = dt_evaluator.evaluate(dt_predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

Root Mean Squared Error (RMSE) on test data = 86634.9


In [34]:
dt_predictions.select("prediction","MEDV","features").show(5)

+-----------------+--------+------------------+
|       prediction|    MEDV|          features|
+-----------------+--------+------------------+
|439090.9090909091|577500.0| [3.561,20.2,7.12]|
|         204600.0|184800.0|[4.368,20.2,30.63]|
|         204600.0|375900.0|[4.628,20.2,34.37]|
|         322140.0|247800.0|[4.903,14.7,29.29]|
|         204600.0|289800.0|[4.906,20.2,34.77]|
+-----------------+--------+------------------+
only showing top 5 rows

