# Machine Learning with Spark

In [17]:
from pyspark.sql import SparkSession

# Start the Spark session for this notebook (getOrCreate reuses an existing
# session if one is already active instead of building a second one).
spark = (
  SparkSession.builder
  .appName('MLIntro')
  .getOrCreate()
)

In [26]:
# Read the housing CSV: the first row supplies the column names and the
# column types are inferred from the data rather than left as strings.
train = (
  spark.read
  .options(header=True, inferSchema=True)
  .csv('housing.csv')
)

In [14]:
# Preview the first five rows.
# NOTE(review): the saved output for this cell already contains `ocean_prox_num`,
# so it was captured after the feature-engineering cell had run; on a fresh
# top-to-bottom run the raw `ocean_proximity` column is shown here instead.
train.show(n=5)

+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+--------------+
|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|ocean_prox_num|
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+--------------+
|  -122.23|   37.88|              41.0|      880.0|         129.0|     322.0|     126.0|       8.3252|          452600.0|           3.0|
|  -122.22|   37.86|              21.0|     7099.0|        1106.0|    2401.0|    1138.0|       8.3014|          358500.0|           3.0|
|  -122.24|   37.85|              52.0|     1467.0|         190.0|     496.0|     177.0|       7.2574|          352100.0|           3.0|
|  -122.25|   37.85|              52.0|     1274.0|         235.0|     558.0|     219.0|       5.6431|          341300.0|           3.0|
|  -122.25|   37.85|              52.0|  

`VectorAssembler` is used by PySpark to combine a list of columns into a single vector column. It is usually used to merge raw features and features produced by transformations into the one feature vector that ML models are trained on.

`StringIndexer` converts a column of string labels into a column of numeric label indices (by default, the most frequent label gets index 0).

In [27]:
from pyspark.ml.feature import StringIndexer, VectorAssembler

# Encode the categorical `ocean_proximity` strings as numeric indices.
string_indexer = StringIndexer(
  inputCol='ocean_proximity',
  outputCol='ocean_prox_num'
)
# Guard so the cell can be re-run safely: after the first run the source column
# has been dropped from `train`, and fitting the indexer again would fail on
# the missing input column.
if 'ocean_proximity' in train.columns:
  train = string_indexer.fit(train).transform(train)
  train = train.drop('ocean_proximity')

# Pack the predictor columns into a single vector column, which is the input
# format the regression stage below is configured to read.
# NOTE(review): `total_bedrooms` is not included — presumably because it
# contains missing values, which VectorAssembler rejects by default; confirm.
feature_assembler = VectorAssembler(
  inputCols=[
    'longitude',
    'latitude',
    'housing_median_age',
    'total_rooms',
    'population',
    'households',
    'median_income',
    'ocean_prox_num'
  ],
  outputCol="Independent Variables"
)
# Same re-run guard: the assembler refuses to overwrite an existing output column.
if 'Independent Variables' not in train.columns:
  train = feature_assembler.transform(train)

# Show only a small preview; the frame is wide and the default prints 20 rows.
train.show(5)

+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+--------------+---------------------+
|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|ocean_prox_num|Independent Variables|
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+--------------+---------------------+
|  -122.23|   37.88|              41.0|      880.0|         129.0|     322.0|     126.0|       8.3252|          452600.0|           3.0| [-122.23,37.88,41...|
|  -122.22|   37.86|              21.0|     7099.0|        1106.0|    2401.0|    1138.0|       8.3014|          358500.0|           3.0| [-122.22,37.86,21...|
|  -122.24|   37.85|              52.0|     1467.0|         190.0|     496.0|     177.0|       7.2574|          352100.0|           3.0| [-122.24,37.85,52...|
|  -122.25|   37.85|              52.0|     12

In [37]:
# Keep only what the model needs: the assembled feature vector and the target.
training_data = train.select('Independent Variables', 'median_house_value')
training_data.show(n=5)

+---------------------+------------------+
|Independent Variables|median_house_value|
+---------------------+------------------+
| [-122.23,37.88,41...|          452600.0|
| [-122.22,37.86,21...|          358500.0|
| [-122.24,37.85,52...|          352100.0|
| [-122.25,37.85,52...|          341300.0|
| [-122.25,37.85,52...|          342200.0|
+---------------------+------------------+
only showing top 5 rows



In [38]:
# Split the data into train (75%) and test (25%) sets.
# A fixed seed keeps the split — and therefore the fitted coefficients and the
# reported error metrics — the same from one run to the next.
train_data, test_data = training_data.randomSplit([0.75, 0.25], seed=42)

In [44]:
from pyspark.ml.regression import LinearRegression

# Configure the linear-regression estimator: which column holds the feature
# vector and which column holds the value to predict.
regressor = (
  LinearRegression()
  .setFeaturesCol('Independent Variables')
  .setLabelCol('median_house_value')
)

# Fit on the training split only; the test split is held back for evaluation.
regression = regressor.fit(train_data)

In [47]:
# Fitted coefficients of the linear model — one weight per column fed to the
# VectorAssembler (presumably in the same order as its inputCols; confirm).
regression.coefficients

DenseVector([-42106.9038, -42139.5922, 1150.0765, -2.1397, -44.1553, 153.6768, 38813.7272, -970.7164])

In [50]:
# Score the held-out test split. The returned summary carries both the
# per-row predictions and the aggregate error metrics used further down.
pred_results = regression.evaluate(test_data)

# Peek at the predictions next to the true house values.
pred_results.predictions.show(n=5)

+---------------------+------------------+------------------+
|Independent Variables|median_house_value|        prediction|
+---------------------+------------------+------------------+
| [-124.3,41.84,17....|          103600.0| 98878.21950594336|
| [-124.26,40.58,52...|          111400.0| 166869.0367788598|
| [-124.25,40.28,32...|           76100.0|134565.36655658018|
| [-124.23,41.75,11...|           73200.0| 69576.07339649973|
| [-124.19,41.78,15...|           74600.0| 51038.55300127342|
+---------------------+------------------+------------------+
only showing top 5 rows



In [52]:
# Report the error metrics computed over the test set.
mae = pred_results.meanAbsoluteError
mse = pred_results.meanSquaredError
print(f"The Mean Absolute Error is {mae}")
print(f"The Mean Squared Error is {mse}")

The Mean Absolute Error is 51700.73401185502
The Mean Squared Error is 4984424611.6106825
