In [1]:
from pyspark.sql import functions as F

In [2]:
# Load the housing CSV from HDFS: the first row supplies the column names and
# the column types are inferred from the data.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("hdfs://localhost:9000/user/local/hadoop_tmp/hdfs/data/boston/housing.csv")
)

In [3]:
def dfShape(df):
    """Return the dimensions of ``df`` as a ``(row_count, column_count)`` tuple.

    ``df`` must expose a ``count()`` method and a ``columns`` sequence.
    """
    n_rows = df.count()
    n_cols = len(df.columns)
    return (n_rows, n_cols)

In [4]:
# Report the dataset dimensions as (rows, columns).
dfShape(df)

(489, 4)

In [5]:
# Preview the first five rows.
df.show(5)

+-----+-----+-------+--------+
|   RM|LSTAT|PTRATIO|    MEDV|
+-----+-----+-------+--------+
|6.575| 4.98|   15.3|504000.0|
|6.421| 9.14|   17.8|453600.0|
|7.185| 4.03|   17.8|728700.0|
|6.998| 2.94|   18.7|701400.0|
|7.147| 5.33|   18.7|760200.0|
+-----+-----+-------+--------+
only showing top 5 rows



In [6]:
# Mean, minimum and maximum of the MEDV column.
features = df.select('MEDV')
features.agg(
    F.mean('MEDV'),
    F.min('MEDV'),
    F.max('MEDV'),
).show()

+-----------------+---------+---------+
|        avg(MEDV)|min(MEDV)|max(MEDV)|
+-----------------+---------+---------+
|454342.9447852761| 105000.0|1024800.0|
+-----------------+---------+---------+



In [22]:
# Summary statistics (count, mean, stddev, min, max) for each column,
# converted to a pandas DataFrame for display.
df.describe().toPandas()

Unnamed: 0,summary,RM,LSTAT,PTRATIO,MEDV
0,count,489.0,489.0,489.0,489.0
1,mean,6.240288343558291,12.939631901840492,18.51656441717792,454342.9447852761
2,stddev,0.6436497627572434,7.081989789065132,2.111267502630993,165340.2776526678
3,min,3.561,1.98,12.6,105000.0
4,max,8.398,37.97,22.0,1024800.0


### Correlation between columns

In [7]:
import six  # retained: other cells may still expect `six` to be imported
from pyspark.sql.types import NumericType

# Print the correlation of every numeric column with MEDV (MEDV itself is
# included and yields 1.0).
# Column types are read from the DataFrame schema rather than by fetching the
# first row of each column: that avoids one Spark job per column, and it cannot
# be fooled by a null first value or by non-numeric, non-string types, which
# would make `corr` fail.
for field in df.schema.fields:
    if isinstance(field.dataType, NumericType):
        print( "Correlation to MEDV for ", field.name, df.stat.corr('MEDV', field.name))

Correlation to MEDV for  RM 0.6972092210185153
Correlation to MEDV for  LSTAT -0.7606700599726771
Correlation to MEDV for  PTRATIO -0.5190335035317891
Correlation to MEDV for  MEDV 1.0


In [8]:
from pyspark.ml.feature import VectorAssembler

# Pack the three predictor columns into a single vector column named
# 'features', then keep only that vector and the MEDV column.
vectorAssembler = VectorAssembler(
    inputCols=['RM', 'PTRATIO', 'LSTAT'],
    outputCol='features',
)
vhouse_df = vectorAssembler.transform(df).select('features', 'MEDV')
vhouse_df.show(3)

+-----------------+--------+
|         features|    MEDV|
+-----------------+--------+
|[6.575,15.3,4.98]|504000.0|
|[6.421,17.8,9.14]|453600.0|
|[7.185,17.8,4.03]|728700.0|
+-----------------+--------+
only showing top 3 rows



### Train/Test Split

In [9]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

In [12]:
# Split the rows roughly 80/20 into training and test sets; the seed is fixed
# so the split can be reproduced.
train_df, test_df = vhouse_df.randomSplit([0.8, 0.2], seed=12345)

In [36]:
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit

# Base estimator for the hyper-parameter search. It is created in this cell so
# the grid does not depend on an `lr` defined further down the notebook, which
# raises NameError when the notebook is run top to bottom on a fresh kernel.
# The grid is keyed by this instance's params, so this same object should be
# the estimator that gets tuned.
lr = LinearRegression(featuresCol='features', labelCol='MEDV', maxIter=10)

# 2 x 2 x 3 = 12 candidate parameter combinations.
paramGrid = (
    ParamGridBuilder()
    .addGrid(lr.regParam, [0.1, 0.01])
    .addGrid(lr.fitIntercept, [False, True])
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
    .build()
)

In [40]:
# Train/validation search over `paramGrid` for the estimator `lr`.
tvs = TrainValidationSplit(estimator=lr,
                           estimatorParamMaps=paramGrid,
                           # The label column in this notebook is 'MEDV'. The
                           # evaluator's default label column name is 'label',
                           # which the data does not contain, so it must be
                           # set explicitly. RMSE is the evaluator's default
                           # metric and is spelled out here for clarity.
                           evaluator=RegressionEvaluator(labelCol="MEDV",
                                                         predictionCol="prediction",
                                                         metricName="rmse"),
                           # 80% of the data will be used for training, 20% for validation.
                           trainRatio=0.8,
                           # Fixed seed so the internal train/validation split is repeatable.
                           seed=12345)

### Linear Regression

In [13]:
# Fit an elastic-net regularised linear regression of MEDV on the assembled
# feature vector, then report the fitted coefficients and intercept.
lr = LinearRegression(
    featuresCol='features',
    labelCol='MEDV',
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8,
)
lr_model = lr.fit(train_df)
print("Coefficients: {}".format(lr_model.coefficients))
print("Intercept: {}".format(lr_model.intercept))

Coefficients: [78314.13156753127,-20854.535137968243,-11125.031194918274]
Intercept: 496864.9292104244


In [14]:
# Goodness of fit on the training data.
trainingSummary = lr_model.summary
print("RMSE: {:f}".format(trainingSummary.rootMeanSquaredError))
print("r2: {:f}".format(trainingSummary.r2))

RMSE: 87352.487652
r2: 0.712089


In [15]:
# Summary statistics of the training split, to put the RMSE in context of the
# MEDV scale.
train_df.describe().show()

+-------+-----------------+
|summary|             MEDV|
+-------+-----------------+
|  count|              387|
|   mean|451586.8217054264|
| stddev|163007.6660105721|
|    min|         105000.0|
|    max|        1024800.0|
+-------+-----------------+



In [16]:
# Score the held-out rows and show predictions next to the actual MEDV values.
lr_predictions = lr_model.transform(test_df)
lr_predictions.select(["prediction", "MEDV", "features"]).show(n=5)

+------------------+--------+------------------+
|        prediction|    MEDV|          features|
+------------------+--------+------------------+
| 76919.74061009579|184800.0|[4.368,20.2,30.63]|
|125304.77728333254|220500.0|[4.652,20.2,28.28]|
|117127.82628462097|214200.0| [4.88,20.2,30.62]|
| 447978.3120163485|321300.0|[5.012,14.7,12.12]|
|  64995.0872057993|302400.0|[5.019,21.2,34.41]|
+------------------+--------+------------------+
only showing top 5 rows



### Test Evaluation

In [17]:
# R-squared of the linear model on the test predictions.
lr_evaluator = RegressionEvaluator(
    metricName="r2",
    labelCol="MEDV",
    predictionCol="prediction",
)
print("R Squared (R2) on test data = {:g}".format(lr_evaluator.evaluate(lr_predictions)))

R Squared (R2) on test data = 0.731085


In [18]:
# RMSE of the linear model on the test split, taken from the model's own
# evaluation summary.
test_result = lr_model.evaluate(test_df)
print("Root Mean Squared Error (RMSE) on test data = {:g}".format(test_result.rootMeanSquaredError))

Root Mean Squared Error (RMSE) on test data = 89965.1


### Decision tree regression

In [33]:
from pyspark.ml.regression import DecisionTreeRegressor

# Fit a decision-tree regressor on the training split and measure its RMSE on
# the test split, for comparison with the linear model.
dt = DecisionTreeRegressor(featuresCol='features', labelCol='MEDV')
dt_model = dt.fit(train_df)
dt_predictions = dt_model.transform(test_df)
dt_evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="MEDV",
    predictionCol="prediction",
)
rmse = dt_evaluator.evaluate(dt_predictions)
print("Root Mean Squared Error (RMSE) on test data = {:g}".format(rmse))

Root Mean Squared Error (RMSE) on test data = 86634.9


In [34]:
# Show the tree's predictions next to the actual MEDV for five test rows.
dt_predictions.select("prediction","MEDV","features").show(5)

+-----------------+--------+------------------+
|       prediction|    MEDV|          features|
+-----------------+--------+------------------+
|439090.9090909091|577500.0| [3.561,20.2,7.12]|
|         204600.0|184800.0|[4.368,20.2,30.63]|
|         204600.0|375900.0|[4.628,20.2,34.37]|
|         322140.0|247800.0|[4.903,14.7,29.29]|
|         204600.0|289800.0|[4.906,20.2,34.77]|
+-----------------+--------+------------------+
only showing top 5 rows

