In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.appName("GradientBooster").getOrCreate()

23/12/19 11:59:53 WARN Utils: Your hostname, perezs-zeenbook resolves to a loopback address: 127.0.1.1; using 192.168.7.202 instead (on interface wlo1)
23/12/19 11:59:53 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/12/19 11:59:53 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
df = spark.read.csv("../data/realestate.csv", sep=",", inferSchema=True, header=True)

df = df.withColumn("TransactionYear", F.floor("TransactionDate")).drop(
    "TransactionDate"
)


df.show(5)

+---+--------+-------------+-----------------------+--------+---------+---------------+---------------+
| No|HouseAge|DistanceToMRT|NumberConvenienceStores|Latitude|Longitude|PriceOfUnitArea|TransactionYear|
+---+--------+-------------+-----------------------+--------+---------+---------------+---------------+
|  1|    32.0|     84.87882|                     10|24.98298|121.54024|           37.9|           2012|
|  2|    19.5|     306.5947|                      9|24.98034|121.53951|           42.2|           2012|
|  3|    13.3|     561.9845|                      5|24.98746|121.54391|           47.3|           2013|
|  4|    13.3|     561.9845|                      5|24.98746|121.54391|           54.8|           2013|
|  5|     5.0|     390.5684|                      5|24.97937|121.54245|           43.1|           2012|
+---+--------+-------------+-----------------------+--------+---------+---------------+---------------+
only showing top 5 rows



In [10]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.regression import GBTRegressor
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator


# train test split
label_column = "PriceOfUnitArea"
feature_columns = [col for col in df.columns if col != label_column]
df_ = df.withColumnRenamed(label_column, "label")
train, test = df_.randomSplit([0.8, 0.2], seed=42)

# Create model pipeline
# The regressor should be fed a 'features' column containing a vector with all the features
# and a 'label' column
vector_assembler = VectorAssembler(inputCols=feature_columns, outputCol="x_vectorized")
std_scaler = StandardScaler(inputCol="x_vectorized", outputCol="features")
g_booster = GBTRegressor(seed=42)
pipe = Pipeline(stages=[vector_assembler, std_scaler, g_booster])

# param grid for the regressor
paramGrid = (
    ParamGridBuilder()
    .addGrid(g_booster.maxDepth, [2, 3, 4])
    .addGrid(g_booster.maxIter, [10, 20, 30])
    .build()
)

# create CrossValidator object
rmse_evaluator = RegressionEvaluator(metricName="rmse")
crossval = CrossValidator(
    estimator=pipe,
    estimatorParamMaps=paramGrid,
    evaluator=rmse_evaluator,
    numFolds=10,
    seed=42,
)  # use 3+ folds in practice

# Fit CrossValidator
cvModel = crossval.fit(train)

In [11]:
import numpy as np

# print best params
print("Best params: ")
for param_info, param_value in cvModel.getEstimatorParamMaps()[
    np.argmin(cvModel.avgMetrics)
].items():
    param_name = param_info.name
    print(param_name, param_value)
print()  # prints \n

# print evaluations metrics
cv_error = min(cvModel.avgMetrics)
print("Cross Validation Error: ", round(cv_error, 5))
training_error = rmse_evaluator.evaluate(cvModel.transform(train))
print("Training Error: ", round(training_error, 5))

Best params: 
maxDepth 2
maxIter 30

Cross Validation Error:  7.23166
Training Error:  6.44899


In [12]:
# print test error
test_error = rmse_evaluator.evaluate(cvModel.transform(test))
print("Test Error: ", round(test_error, 5))

Test Error:  7.45349


In [6]:
spark.stop()