# 10 Machine Learning with MLlib

In [None]:
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session that the rest of the notebook works with.
spark = (
    SparkSession.builder
    .appName("ml-test")
    .getOrCreate()
)
# Bare expression so the notebook renders the session object.
spark

In [None]:
# Read the cleaned Airbnb listings (Parquet) and inspect the inferred schema.
airbnbDF = spark.read.parquet("data/sf-airbnb/sf-airbnb-clean.parquet")
airbnbDF.printSchema()

In [None]:
# Quick look at the data: column names, a sample of a few fields, and the row count.
# Only the last expression of a cell is echoed by the notebook.
airbnbDF.columns
airbnbDF.select(
    "neighbourhood_cleansed",
    "room_type",
    "bedrooms",
    "bathrooms",
    "number_of_reviews",
    "price",
).show(5)
airbnbDF.count()

Since executors perform their own partitioning, setting `seed` is not enough to generate the same split every time. For this reason, `trainDF` and `testDF` must be saved after the split and reloaded whenever the same split is needed.

In [None]:
# 80/20 train/test split with a fixed seed.
(trainDF, testDF) = airbnbDF.randomSplit(weights=[0.8, 0.2], seed=1337)

# Report the size of each split (triggers two count jobs).
f"Train data size {trainDF.count()} and test size {testDF.count()}"

In [None]:
# Cache the training split: it is reused by every model fit and by the
# price analysis in the cells below.
trainDF.cache()

In [None]:
from pyspark.ml.feature import VectorAssembler

# Pack the single numeric feature ("bedrooms") into the vector column MLlib expects.
vec_assembler = VectorAssembler(outputCol="features", inputCols=["bedrooms"])

# Apply the same assembler to both splits.
vec_train_df, vec_test_df = (vec_assembler.transform(part) for part in (trainDF, testDF))

vec_train_df.select("bedrooms", "features", "price").show()
# Correlation between the feature and the label on the training split.
vec_train_df.corr("bedrooms", "price")

In [None]:
from pyspark.ml.regression import LinearRegression

# Linear regression of price on the assembled feature vector.
lr = LinearRegression(labelCol="price", featuresCol="features")

# Fit on the already-assembled training data and display the fitted model.
lr_model = lr.fit(vec_train_df)
lr_model

In [None]:
from pyspark.ml import Pipeline

# Chain feature assembly and regression so the raw split can be fitted directly.
pipeline = Pipeline().setStages([vec_assembler, lr])

# Fitting the pipeline runs every stage in order on the training split.
pipeline_model = pipeline.fit(trainDF)
pipeline_model

In [None]:
# Predictions of the single-feature (bedrooms-only) pipeline on the held-out split.
predDF = pipeline_model.transform(testDF)
predDF.select(
    "bedrooms",
    "features",
    "price",
    "prediction",
).show(5)

So far we have only used numerical features. In order to use categorical variables without introducing an inherent order, we can use `StringIndexer` and `OneHotEncoder`. An important aspect is handling the values of categorical variables that are not present in the training data. We can explicitly decide how to handle these cases with the `handleInvalid` parameter of `StringIndexer`:
- `'error'`: default value, throw an error
- `'skip'`: skip data points not present in labels
- `'keep'`: assign the last value of the index. If the cardinality of categorical variable is n, then indices are $0, \dots, n-1$. Unknown values are indexed as $n$.

In [None]:
from pyspark.ml.feature import StringIndexer

# Toy data for handleInvalid="keep": "bird" occurs only in the test rows.
train = spark.createDataFrame(
    data=[[0, "cat"], [0, "dog"], [0, "cat"], [0, "cat"]],
    schema=["id", "spec"],
)
test = spark.createDataFrame(
    data=[[0, "cat"], [0, "dog"], [0, "cat"], [0, "bird"]],
    schema=["id", "spec"],
)

# Fit on the training rows only; afterwards `string_indexer` names the fitted
# model rather than the estimator.
string_indexer = StringIndexer(inputCol="spec", outputCol="features", handleInvalid="keep")
string_indexer = string_indexer.fit(train)

# Replace `test` with its indexed version and show how the unseen label is encoded.
test = string_indexer.transform(test)
test.show()

In [None]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer

# Categorical features: every string column is indexed, then one-hot encoded.
categoricalCols = [field for field, dtype in trainDF.dtypes if dtype == "string"]

indexOutputCols = [field + "Index" for field in categoricalCols]
oheOutputCols = [field + "OHE" for field in categoricalCols]

# handleInvalid="keep" lets categories unseen during fitting get their own index
# instead of raising an error at transform time.
stringIndexer = StringIndexer(inputCols=categoricalCols, outputCols=indexOutputCols, handleInvalid="keep")
oheEncoder = OneHotEncoder(inputCols=indexOutputCols, outputCols=oheOutputCols)

# Numeric features: every double column except the label ("price").
# Plain booleans are combined with `and` (short-circuiting), not bitwise `&`.
numericalCols = [field for field, dtype in trainDF.dtypes if dtype == "double" and field != "price"]

assemblerInputs = oheOutputCols + numericalCols

# Rebinds `vec_assembler`, replacing the single-feature assembler defined earlier.
vec_assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")


In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression

# Regression stage: predict price from the assembled feature vector.
lr = LinearRegression(labelCol="price", featuresCol="features")

# Full pipeline: index categoricals -> one-hot encode -> assemble -> regress.
pipeline = Pipeline(
    stages=[
        stringIndexer,
        oheEncoder,
        vec_assembler,
        lr,
    ]
)

In [None]:
# Fit the full-feature pipeline on the training split.
pipelineModel = pipeline.fit(trainDF)

# Predictions on the held-out split. Note the spelling: `predDf` (lowercase f)
# is a different variable from `predDF`, which holds the single-feature
# pipeline's predictions.
predDf = pipelineModel.transform(testDF)
predDf.select(
    "features",
    "price",
    "prediction",
).show()

In [None]:
from pyspark.ml.evaluation import RegressionEvaluator

# Evaluate the full-feature pipeline on the test split.
# The predictions to score are `predDf` (from pipelineModel); `predDF` holds the
# earlier single-feature pipeline's output and would report the wrong model.
regressionEvaluator = RegressionEvaluator(predictionCol="prediction", labelCol="price", metricName="rmse")
rmse = regressionEvaluator.evaluate(predDf)
print(f"RMSE: {rmse}")

# Same predictions, scored with the coefficient of determination.
regressionEvaluator = RegressionEvaluator(predictionCol="prediction", labelCol="price", metricName="r2")
r2 = regressionEvaluator.evaluate(predDf)
print(f"r2: {r2}")


In [None]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import col, lit, mean

# Baseline model: predict the mean training price for every listing.
# The aggregate is a single row, so read it directly instead of going through pandas.
price_avg = trainDF.select(mean(col("price")).alias("avg_price")).first()["avg_price"]

# Attach the baseline next to the full pipeline's predictions (`predDf`), not the
# single-feature `predDF`, so the two columns shown side by side are comparable.
predDf = predDf.withColumn("avg_prediction", lit(price_avg))
predDf.select("prediction", "avg_prediction").show()


# RMSE of the constant baseline on the test split.
regressionEvaluator = RegressionEvaluator(predictionCol="avg_prediction", labelCol="price", metricName="rmse")
rmse = regressionEvaluator.evaluate(predDf)
print(f"RMSE: {rmse}")

In [None]:
from pyspark.ml.evaluation import RegressionEvaluator

# Re-score the constant baseline; `predDf` must already carry the
# "avg_prediction" column.
regressionEvaluator = RegressionEvaluator(
    labelCol="price",
    predictionCol="avg_prediction",
    metricName="rmse",
)
rmse = regressionEvaluator.evaluate(predDf)
print(f"RMSE: {rmse}")

In [None]:
# Pull only the label column to the driver as a pandas DataFrame for plotting.
df = (
    trainDF
    .select("price")
    .toPandas()
)
df

In [None]:
# Check which pandas dtype the price column received after toPandas().
df.dtypes

In [None]:
# Rebind df to a copy ordered by ascending price.
df = df.sort_values("price")

In [None]:
# Histogram of prices. Bin counts do not depend on row order, so no sort is
# needed before plotting.
df.hist(bins=500)

In [None]:
import numpy as np

# Add the natural log of price as a new column and plot its distribution.
# NOTE(review): np.log gives -inf for a price of 0 — confirm the cleaned data has none.
df["log_values"] = np.log(df["price"].to_numpy())
df["log_values"].hist(bins=500)

In [None]:
# Persist the fitted pipeline to "temp/", replacing anything already saved there.
pipelineModel.write().overwrite().save("temp/")

In [None]:
from pyspark.ml import PipelineModel

# Load a fitted pipeline back from the "temp" directory and display it.
temp_model = PipelineModel.load("temp")
temp_model