#### Data Description
https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data

#### Building Machine Learning Models With Spark ML

* Linear Regression : https://en.wikipedia.org/wiki/Linear_regression, https://www.youtube.com/watch?v=zPG4NjIkCjc
* Decision trees    : http://www.r2d3.us/visual-intro-to-machine-learning-part-1/  (nice visualization for decision tree)
* Random forest     : https://en.wikipedia.org/wiki/Random_forest
* GB Decision Trees : https://en.wikipedia.org/wiki/Gradient_boosting
* Clustering        : https://spark.apache.org/docs/2.3.0/ml-clustering.html, https://www.datascience.com/blog/k-means-clustering
* Cross-validation  : https://en.wikipedia.org/wiki/Cross-validation_(statistics)
* Collaborative filtering: https://spark.apache.org/docs/2.3.0/mllib-collaborative-filtering.html, https://bugra.github.io/work/notes/2014-04-19/alternating-least-squares-method-for-collaborative-filtering/
* Metrics:
  * RMSE: https://en.wikipedia.org/wiki/Root-mean-square_deviation
  * R2:   https://en.wikipedia.org/wiki/Coefficient_of_determination

#### Data Loading

In [3]:
# import necessary libs
import numpy  as np
import pandas as pd

# general spark modules
from pyspark.sql import Row
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql.functions import pandas_udf, PandasUDFType #https://databricks.com/blog/2017/10/30/introducing-vectorized-udfs-for-pyspark.html

# spark ml modules 
from pyspark.ml.linalg import DenseVector
from pyspark.ml.feature import StandardScaler
from pyspark.ml.regression import LinearRegression, DecisionTreeRegressor, RandomForestRegressor
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorIndexer

In [4]:
# Read the training/testing CSVs as Spark DataFrames (first row is the header,
# column types are inferred). The `spark` session is used here because it is the
# same entry point the feature-building cell relies on for createDataFrame; the
# legacy `sqlContext` handle is not needed.
# NOTE(review): `test` is not referenced again in the cells visible here.
train = spark.read.format('csv').options(header='true', inferSchema='true').load('/FileStore/tables/housing/train.csv')
test  = spark.read.format('csv').options(header='true', inferSchema='true').load('/FileStore/tables/housing/test.csv')

In [5]:
# Print the column names and the types inferred while reading the CSV
train.printSchema()

In [6]:
# Show the first 5 rows of the columns used in the rest of the notebook
train.select('1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'KitchenAbvGr', 'BedroomAbvGr', 'TotRmsAbvGrd', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'PoolArea', 'SalePrice').show(5)

In [7]:
# Helper that changes the data type of several DataFrame columns at once
def convertColumn(df, names, newType):
    """Return ``df`` with each column listed in ``names`` cast to ``newType``.

    Parameters:
        df: DataFrame-like object supporting ``df[name]`` and ``withColumn``.
        names: iterable of column names to cast (applied in order).
        newType: target type handed to each column's ``cast``.

    Returns:
        The DataFrame produced by the successive ``withColumn`` calls; when
        ``names`` is empty the input ``df`` is returned unchanged.
    """
    converted = df
    for column_name in names:
        casted_column = converted[column_name].cast(newType)
        # withColumn with an existing name replaces that column
        converted = converted.withColumn(column_name, casted_column)
    return converted

# Names of the columns kept for modelling (features plus the `SalePrice` target).
# NOTE(review): 'GrLivArea' is listed twice (positions 0 and 4). The order of this
# list matters: the feature-building cell slices every row by position
# (row[1:-2]), which skips position 0 -- so the leading duplicate looks
# intentional; confirm before de-duplicating.
columns = [
    'GrLivArea',
    '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea',
    'KitchenAbvGr', 'BedroomAbvGr', 'TotRmsAbvGrd',
    'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'PoolArea',
    'SalePrice',
]

# Convert the listed `train` columns to `FloatType()`
train = convertColumn(train, columns, FloatType())

In [8]:
# Basic summary statistics for two floor-area columns and the sale price
train.describe('1stFlrSF', '2ndFlrSF', 'SalePrice').show()

#### Data Preprocessing

In [10]:
# Log-transform the target variable (log(1 + x)) to improve stability of the algorithms.
# NOTE: this definition rebinds the name `log1p`, which the star-import from
# pyspark.sql.functions at the top of the notebook also provides.
@pandas_udf('double', PandasUDFType.SCALAR)
def log1p(v):
    """Scalar pandas UDF: apply ``np.log1p`` to the batch of values it receives."""
    transformed = np.log1p(v)
    return transformed

sale_price_log = log1p(train.SalePrice)
train = train.withColumn('SalePriceLog', sale_price_log)

In [11]:
# Render `train`, which now carries the added `SalePriceLog` column
display(train)

In [12]:
# NOTE(review): this repeats the previous cell's display(train) with no change to
# `train` in between -- candidate for removal.
display(train)

In [13]:
## Select important columns
# Keep only the modelling columns plus the log-transformed target.
# The membership check keeps the append idempotent: `columns` is a shared list, and
# adding 'SalePriceLog' a second time would put an extra trailing column in front of
# the positional row slice used when the feature vectors are built.
if 'SalePriceLog' not in columns:
    columns.append('SalePriceLog')
train = train.select(columns)

In [14]:
# Turn every row into a (features, label) pair, purely by position:
#   row[1:-2] -> feature vector (skips the first column and the last two)
#   row[-1]   -> label (the last column)
input_data = train.rdd.map(
    lambda row: (DenseVector(row[1:-2]), row[-1])
)

# Wrap the pairs in a two-column DataFrame that Spark ML can consume
df_for_ml = spark.createDataFrame(input_data, ["features", "label"])

# Fit a StandardScaler on the feature vectors ...
standardScaler = StandardScaler(inputCol="features", outputCol="features_scaled")
scaler = standardScaler.fit(df_for_ml)

# ... and add the scaled vectors as a new column
scaled_with_raw = scaler.transform(df_for_ml)

# Keep the label and the scaled vectors only; the scaled column is renamed back to
# "features" so downstream estimators pick it up by that name
scaled_df = scaled_with_raw.select('label', col('features_scaled').alias("features"))

In [15]:
# Peek at the first 10 rows of (label, scaled features)
scaled_df.rdd.take(10)

In [16]:
# Split the data into train and test sets (weights 0.8 / 0.2, fixed seed)
# NOTE(review): the StandardScaler was fitted on the full dataset before this
# split, so the held-out rows contributed to the scaling statistics.
train_data, test_data = scaled_df.randomSplit([0.8,  0.2],  seed=1234)

##### Linear Regression

In [18]:
# Linear regression with a fixed regularisation setting
lr = LinearRegression(
    labelCol="label",
    maxIter=10000,
    regParam=0.2,
    elasticNetParam=0.5,
)

# Fit on the training split
linearModel = lr.fit(train_data)

# Training-set metrics are read from the fitted model's summary
trainingSummary = linearModel.summary
print("RMSE train: %f" % trainingSummary.rootMeanSquaredError)
print("r2   train: %f" % trainingSummary.r2)

# Evaluators comparing the `prediction` column against `label`
evaluator_columns = dict(labelCol="label", predictionCol="prediction")
evaluator_rmse = RegressionEvaluator(metricName="rmse", **evaluator_columns)
evaluator_r2   = RegressionEvaluator(metricName="r2", **evaluator_columns)

# Score the held-out split
predicted = linearModel.transform(test_data)
rmse = evaluator_rmse.evaluate(predicted)
r2   = evaluator_r2.evaluate(predicted)

print("\nRMSE test: %f" % rmse)
print("r2   test: %f" % r2)

##### Linear Regression with Cross-Validation

In [20]:
# Linear regression whose hyper-parameters are chosen by cross-validation
lr = LinearRegression(labelCol="label")

# Candidate values: 4 x 4 x 4 = 64 parameter combinations
paramGrid = (
    ParamGridBuilder()
    .addGrid(lr.regParam,        [0.2, 0.3, 0.5,  0.7])
    .addGrid(lr.elasticNetParam, [0.2, 0.5,  0.7, 0.8])
    .addGrid(lr.maxIter,         [100, 1000, 5000, 10000])
    .build()
)

# 3-fold cross-validation with a fixed seed; a default RegressionEvaluator
# scores each candidate
crossval = CrossValidator(
    estimator=lr,
    estimatorParamMaps=paramGrid,
    evaluator=RegressionEvaluator(),
    numFolds=3,
    seed=2018,
)

# Run cross-validation and keep the best set of parameters
cvModel = crossval.fit(train_data)

# Training-set metrics of the winning model
trainingSummary = cvModel.bestModel.summary
print("RMSE train: %f" % trainingSummary.rootMeanSquaredError)
print("r2   train: %f" % trainingSummary.r2)

# Evaluators comparing the `prediction` column against `label`
evaluator_columns = dict(labelCol="label", predictionCol="prediction")
evaluator_rmse = RegressionEvaluator(metricName="rmse", **evaluator_columns)
evaluator_r2   = RegressionEvaluator(metricName="r2", **evaluator_columns)

# Score the held-out split
predicted = cvModel.transform(test_data)
rmse = evaluator_rmse.evaluate(predicted)
r2   = evaluator_r2.evaluate(predicted)

print("\nRMSE test: %f" % rmse)
print("r2   test: %f" % r2)

#### Decision-Tree with Cross-Validation

In [22]:
# Automatically identify categorical features, and index them.
# The indexer is fitted once on the full scaled dataset and then used as an
# already-fitted transformer stage inside the pipeline.
featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures").fit(scaled_df)

# Decision-tree regressor that reads the indexed feature vectors.
dt = DecisionTreeRegressor(featuresCol="indexedFeatures")

# Chain indexer and tree in a Pipeline
pipeline = Pipeline(stages=[featureIndexer, dt])

# Tree depths to compare
paramGrid = ParamGridBuilder()\
    .addGrid(dt.maxDepth, [3,  10, 25])\
    .build()

# 3-fold cross-validation. The explicit seed pins the fold assignment so that
# re-running the cell selects the same model; it matches the seed used by the
# linear-regression cross-validator in this notebook.
crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=RegressionEvaluator(),
                          numFolds=3,
                          seed=2018)

# Run cross-validation, and choose the best set of parameters.
cvModel = crossval.fit(train_data)

# Generate predictions
predicted = cvModel.transform(test_data)

# Select (prediction, true label) and compute test error
evaluator_rmse = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
evaluator_r2   = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="r2")

rmse = evaluator_rmse.evaluate(predicted)
r2   = evaluator_r2.evaluate(predicted)

print("\nRMSE test: %f" % rmse)
print("r2   test: %f" % r2)

In [23]:
# Take stage 1 of the best pipeline model (the pipeline was built as
# [featureIndexer, dt], so stage 1 is the fitted decision tree)
tree = cvModel.bestModel.stages[1]

In [24]:
# Display the feature importances of the selected tree
tree.featureImportances

#### Random forest

In [26]:
# Automatically identify categorical features, and index them (fitted on the
# full scaled dataset and used as an already-fitted pipeline stage).
featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures").fit(scaled_df)

# Random-forest regressor. The forest relies on random sampling, so the seed is
# set explicitly to make the reported metrics and feature importances
# reproducible across runs.
rf = RandomForestRegressor(featuresCol="indexedFeatures", seed=2018)

# Chain indexer and forest in a Pipeline
pipeline = Pipeline(stages=[featureIndexer, rf])

# Train the model on the training split
model = pipeline.fit(train_data)

# Generate predictions
predicted = model.transform(test_data)

# Select (prediction, true label) and compute test error
evaluator_rmse = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
evaluator_r2   = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="r2")

rmse = evaluator_rmse.evaluate(predicted)
r2   = evaluator_r2.evaluate(predicted)

print("\nRMSE test: %f" % rmse)
print("r2   test: %f" % r2)

In [27]:
# An additional outcome from tree models is feature importances (can be used for feature selection).
# Stage 1 of the fitted pipeline is the random-forest model (stage 0 is the indexer).
rf_model = model.stages[1]
print(rf_model.featureImportances)

#### Gradient-boosted tree regression

In [29]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator

# Automatically identify categorical features, and index them.
# `maxCategories` is not set here, so the VectorIndexer default threshold decides
# which features are treated as categorical.
indexer_spec = VectorIndexer(inputCol="features", outputCol="indexedFeatures")
featureIndexer = indexer_spec.fit(scaled_df)

# Gradient-boosted trees limited to 10 boosting iterations
gbt = GBTRegressor(featuresCol="indexedFeatures", maxIter=10)

# Chain the fitted indexer and the GBT estimator in a Pipeline
pipeline = Pipeline(stages=[featureIndexer, gbt])

# Train the model on the training split
model = pipeline.fit(train_data)

# Evaluators comparing the `prediction` column against `label`
evaluator_columns = dict(labelCol="label", predictionCol="prediction")
evaluator_rmse = RegressionEvaluator(metricName="rmse", **evaluator_columns)
evaluator_r2   = RegressionEvaluator(metricName="r2", **evaluator_columns)

# Score the held-out split
predicted = model.transform(test_data)
rmse = evaluator_rmse.evaluate(predicted)
r2   = evaluator_r2.evaluate(predicted)

print("\nRMSE test: %f" % rmse)
print("r2   test: %f" % r2)

#### Clustering

In [31]:
# https://rsandstroem.github.io/sparkkmeans.html
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

# Train a k-means model with 3 clusters and a fixed seed on the scaled features
kmeans = KMeans(k=3, seed=1)
model = kmeans.fit(scaled_df)

# Assign every row to a cluster
predictions = model.transform(scaled_df)

# Evaluate the clustering with the Silhouette score
evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(predictions)
print("Silhouette with squared euclidean distance = " + str(silhouette))

# Print the fitted cluster centers, one per line
centers = model.clusterCenters()
print("Cluster Centers: ")
for cluster_center in centers:
    print(cluster_center)

In [32]:
# Render the fitted k-means model together with the data it was trained on.
# NOTE(review): presumably relies on the hosting notebook environment's `display`
# accepting a (model, DataFrame) pair -- confirm it is available where this runs.
display(model, scaled_df)