In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder.appName('pyspark-training-spark-ml').master('local[2]').getOrCreate()

https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data?select=data_description.txt

In [25]:
import pyspark.sql.functions as f

In [51]:
df = spark.read.option('header', 'true').option('inferSchema', 'true').csv('D:\\data\\house-prices\\data')\
      .withColumn('MoSold', f.col('MoSold').cast('string')).withColumn('YrSold', f.col('YrSold').cast('string'))\
      .fillna('X').fillna(0)

In [52]:
df.printSchema()

root
 |-- Id: integer (nullable = true)
 |-- MSSubClass: integer (nullable = true)
 |-- MSZoning: string (nullable = false)
 |-- LotFrontage: string (nullable = false)
 |-- LotArea: integer (nullable = true)
 |-- Street: string (nullable = false)
 |-- Alley: string (nullable = false)
 |-- LotShape: string (nullable = false)
 |-- LandContour: string (nullable = false)
 |-- Utilities: string (nullable = false)
 |-- LotConfig: string (nullable = false)
 |-- LandSlope: string (nullable = false)
 |-- Neighborhood: string (nullable = false)
 |-- Condition1: string (nullable = false)
 |-- Condition2: string (nullable = false)
 |-- BldgType: string (nullable = false)
 |-- HouseStyle: string (nullable = false)
 |-- OverallQual: integer (nullable = true)
 |-- OverallCond: integer (nullable = true)
 |-- YearBuilt: integer (nullable = true)
 |-- YearRemodAdd: integer (nullable = true)
 |-- RoofStyle: string (nullable = false)
 |-- RoofMatl: string (nullable = false)
 |-- Exterior1st: string (nulla

### Analyze data

In [29]:
df.count()

1460

In [50]:
df.show()

+---+----------+--------+-----------+-------+------+-----+--------+-----------+---------+---------+---------+------------+----------+----------+--------+----------+-----------+-----------+---------+------------+---------+--------+-----------+-----------+----------+----------+---------+---------+----------+--------+--------+------------+------------+----------+------------+----------+---------+-----------+-------+---------+----------+----------+--------+--------+------------+---------+------------+------------+--------+--------+------------+------------+-----------+------------+----------+----------+-----------+----------+-----------+------------+----------+----------+----------+----------+----------+----------+-----------+-------------+---------+-----------+--------+------+-----+-----------+-------+------+------+--------+-------------+---------+
| Id|MSSubClass|MSZoning|LotFrontage|LotArea|Street|Alley|LotShape|LandContour|Utilities|LotConfig|LandSlope|Neighborhood|Condition1|Condition

In [87]:
categorical_columns = []
numeric_cols = []

In [88]:
for col in df.drop('id').dtypes:
    dist_count = df.select(col[0]).distinct().count()
    if col[1] == 'string' and dist_count <= 20:
            categorical_columns.append((col[0], dist_count))
    if col[1] == 'int':
        numeric_cols.append((col[0], dist_count))

### Feature engineering

In [79]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.feature import StandardScaler

#### one hot encoding of categorical variables

In [89]:
stages = []
indexers = [
     StringIndexer(inputCol=c[0], outputCol='{0}_indexed'.format(c[0]))
     for c in categorical_columns
   ]
input_cols = [indexer.getOutputCol() for indexer in indexers]
output_cols = ['{}_encoded'.format(indexer.getOutputCol()) for indexer in indexers]
encoders = OneHotEncoder(inputCols=input_cols, outputCols=output_cols)
stages += indexers
stages += [encoders]

#### label indexing

In [90]:
other_columns = ['Id', 'SalePrice']

#### assembling all the features using a vector assembler

In [92]:
assembler_input_cols = encoders.getOutputCols() + [c[0] for c in numeric_cols]
assembler = VectorAssembler(inputCols=assembler_input_cols, outputCol="vectorized_features")
stages += [assembler]

#### creating a pipeline to perform all the stages

In [93]:
pipeline = Pipeline().setStages(stages)
assembled = pipeline.fit(df)
df_assembled = assembled.transform(df)

In [95]:
df_vectorized = df_assembled.select('id', 'vectorized_features', 'SalePrice')

#### scaling the data with the help of a StandardScaler

In [96]:
scaler = StandardScaler(inputCol="vectorized_features", outputCol="features",
                        withStd=True, withMean=False)

In [97]:
scaler_model = scaler.fit(df_vectorized)

In [98]:
df_scaled = scaler_model.transform(df_vectorized).drop('vectorized_features')

In [99]:
df_scaled.show()

+---+---------+--------------------+
| id|SalePrice|            features|
+---+---------+--------------------+
|  1|   208500|(248,[0,4,5,7,10,...|
|  2|   181500|(248,[0,4,5,7,10,...|
|  3|   223500|(248,[0,4,5,8,10,...|
|  4|   140000|(248,[0,4,5,8,10,...|
|  5|   250000|(248,[0,4,5,8,10,...|
|  6|   143000|(248,[0,4,5,8,10,...|
|  7|   307000|(248,[0,4,5,7,10,...|
|  8|   200000|(248,[0,4,5,8,10,...|
|  9|   129900|(248,[1,4,5,7,10,...|
| 10|   118000|(248,[0,4,5,7,10,...|
| 11|   129500|(248,[0,4,5,7,10,...|
| 12|   345000|(248,[0,4,5,8,10,...|
| 13|   144000|(248,[0,4,5,9,10,...|
| 14|   279500|(248,[0,4,5,8,10,...|
| 15|   157000|(248,[0,4,5,8,10,...|
| 16|   132000|(248,[1,4,5,7,10,...|
| 17|   149000|(248,[0,4,5,8,10,...|
| 18|    90000|(248,[0,4,5,7,10,...|
| 19|   159000|(248,[0,4,5,7,10,...|
| 20|   139000|(248,[0,4,5,7,10,...|
+---+---------+--------------------+
only showing top 20 rows



In [100]:
df_train, df_test = df_scaled.randomSplit([0.7, 0.3], seed=42)

### Building the model

#### GBT

In [138]:
from pyspark.ml.regression import GBTRegressor

In [139]:
gbt = GBTRegressor(labelCol="SalePrice", featuresCol="features", maxIter=10)

In [140]:
model_gbt = gbt.fit(df_train)

In [167]:
df_test_pred = model_gbt.transform(df_test)

In [142]:
df_test_pred.show()

+---+---------+--------------------+------------------+
| id|SalePrice|            features|        prediction|
+---+---------+--------------------+------------------+
|  3|   223500|(248,[0,4,5,8,10,...|220640.24699448474|
|  7|   307000|(248,[0,4,5,7,10,...|308889.53814303805|
|  9|   129900|(248,[1,4,5,7,10,...|125480.20081940897|
| 10|   118000|(248,[0,4,5,7,10,...| 116740.6184230562|
| 14|   279500|(248,[0,4,5,8,10,...|275338.76993820764|
| 15|   157000|(248,[0,4,5,8,10,...|153250.34923845256|
| 16|   132000|(248,[1,4,5,7,10,...|133967.88312893853|
| 20|   139000|(248,[0,4,5,7,10,...|141951.47160832153|
| 22|   139400|(248,[1,4,6,7,11,...| 142642.9041373419|
| 24|   129900|(248,[1,4,5,7,10,...|126171.63334842934|
| 25|   154000|(248,[0,4,5,8,10,...|153699.69127334736|
| 29|   207500|(248,[0,4,5,8,10,...|209012.96758646137|
| 30|    68500|(248,[1,4,5,8,10,...| 61472.14922023655|
| 31|    40000|(248,[4,7,10,13,1...| 61472.14922023655|
| 33|   179900|(248,[0,4,5,7,10,...| 177763.7654

In [143]:
from pyspark.ml.evaluation import RegressionEvaluator

In [171]:
rmse_evaluator = RegressionEvaluator(
    labelCol="SalePrice", predictionCol="prediction", metricName="rmse")

In [172]:
r2_evaluator = RegressionEvaluator(
    labelCol="SalePrice", predictionCol="prediction", metricName="r2")

In [173]:
r2_evaluator.evaluate(df_test_pred)

0.9233182028059377

In [174]:
rmse_evaluator.evaluate(df_test_pred)

20595.511164465355

#### Random Forest

In [146]:
from pyspark.ml.regression import RandomForestRegressor

In [147]:
rf = RandomForestRegressor(labelCol="SalePrice", featuresCol="features", numTrees=10)

In [148]:
model_rf = rf.fit(df_train)

In [175]:
df_test_pred = model_rf.transform(df_test)

In [176]:
r2_evaluator.evaluate(df_test_pred)

0.9589008311603939

In [177]:
rmse_evaluator.evaluate(df_test_pred)

15077.972725251162

#### Decision Tree

In [151]:
from pyspark.ml.regression import DecisionTreeRegressor

In [152]:
dt = DecisionTreeRegressor(labelCol="SalePrice", featuresCol="features")

In [153]:
model_dt = dt.fit(df_train)

In [178]:
df_test_pred = model_dt.transform(df_test)

In [179]:
r2_evaluator.evaluate(df_test_pred)

0.926599674256011

In [180]:
rmse_evaluator.evaluate(df_test_pred)

20150.017491138606

#### Linear Regression

In [156]:
from pyspark.ml.regression import LinearRegression

In [157]:
lr = LinearRegression(labelCol="SalePrice", featuresCol="features")

In [158]:
model_lr = lr.fit(df_train)

In [181]:
df_test_pred = model_lr.transform(df_test)

In [182]:
r2_evaluator.evaluate(df_test_pred)

0.999988851644559

In [183]:
rmse_evaluator.evaluate(df_test_pred)

248.3315145272539