In [1]:
import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.linalg.Vectors

import org.apache.log4j._
Logger.getLogger("org").setLevel(Level.ERROR)

val spark = SparkSession.builder().getOrCreate()

Intitializing Scala interpreter ...

Spark Web UI available at http://MSI:4040
SparkContext available as 'sc' (version = 3.0.0, master = local[*], app id = local-1605152316365)
SparkSession available as 'spark'


import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.linalg.Vectors
import org.apache.log4j._
spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@7d5bdaa


In [2]:
val housing_df = spark.read.option("header","true").option("inferSchema","true").option("multiline", "true").format("csv").load("../../data/ml_scala/USA_Housing.csv")

housing_df: org.apache.spark.sql.DataFrame = [Avg Area Income: double, Avg Area House Age: double ... 5 more fields]


In [3]:
housing_df.printSchema()
housing_df.describe().show()
housing_df.show(5)

root
 |-- Avg Area Income: double (nullable = true)
 |-- Avg Area House Age: double (nullable = true)
 |-- Avg Area Number of Rooms: double (nullable = true)
 |-- Avg Area Number of Bedrooms: double (nullable = true)
 |-- Area Population: double (nullable = true)
 |-- Price: double (nullable = true)
 |-- Address: string (nullable = true)

+-------+------------------+------------------+------------------------+---------------------------+------------------+------------------+--------------------+
|summary|   Avg Area Income|Avg Area House Age|Avg Area Number of Rooms|Avg Area Number of Bedrooms|   Area Population|             Price|             Address|
+-------+------------------+------------------+------------------------+---------------------------+------------------+------------------+--------------------+
|  count|              5000|              5000|                    5000|                       5000|              5000|              5000|                5000|
|   mean| 68583.108

In [4]:
// Cleaner way of printing out one row of data
val colnames = housing_df.columns
val firstrow = housing_df.head(1)(0)
println("\n")
println("Example Data Row")
for(ind <- Range(1,colnames.length)){
  println(colnames(ind))
  println(firstrow(ind))
  println("\n")
}



Example Data Row
Avg Area House Age
5.682861321615587


Avg Area Number of Rooms
7.009188142792237


Avg Area Number of Bedrooms
4.09


Area Population
23086.800502686456


Price
1059033.5578701235


Address
208 Michael Ferry Apt. 674
Laurabury, NE 37010-5101




colnames: Array[String] = Array(Avg Area Income, Avg Area House Age, Avg Area Number of Rooms, Avg Area Number of Bedrooms, Area Population, Price, Address)
firstrow: org.apache.spark.sql.Row =
[79545.45857431678,5.682861321615587,7.009188142792237,4.09,23086.800502686456,1059033.5578701235,208 Michael Ferry Apt. 674
Laurabury, NE 37010-5101]


In [5]:
housing_df.columns

res2: Array[String] = Array(Avg Area Income, Avg Area House Age, Avg Area Number of Rooms, Avg Area Number of Bedrooms, Area Population, Price, Address)


In [6]:
// Prep dataframe to include features and label
val df = (housing_df.select(housing_df("Price").as("label"),
                            $"Avg Area Income", $"Avg Area House Age", $"Avg Area Number of Rooms",
                            $"Avg Area Number of Bedrooms", $"Area Population"))

df: org.apache.spark.sql.DataFrame = [label: double, Avg Area Income: double ... 4 more fields]


In [7]:
df.printSchema()

root
 |-- label: double (nullable = true)
 |-- Avg Area Income: double (nullable = true)
 |-- Avg Area House Age: double (nullable = true)
 |-- Avg Area Number of Rooms: double (nullable = true)
 |-- Avg Area Number of Bedrooms: double (nullable = true)
 |-- Area Population: double (nullable = true)



In [8]:
val assembler = (new VectorAssembler()
                 .setInputCols(Array("Avg Area Income", "Avg Area House Age",
                                    "Avg Area Number of Rooms", "Avg Area Number of Bedrooms",
                                    "Area Population")).setOutputCol("features"))

val output = assembler.transform(df).select($"label",$"features")

output.show(5)

+------------------+--------------------+
|             label|            features|
+------------------+--------------------+
|1059033.5578701235|[79545.4585743167...|
|  1505890.91484695|[79248.6424548256...|
|1058987.9878760849|[61287.0671786567...|
|1260616.8066294468|[63345.2400462279...|
| 630943.4893385402|[59982.1972257080...|
+------------------+--------------------+
only showing top 5 rows



assembler: org.apache.spark.ml.feature.VectorAssembler = VectorAssembler: uid=vecAssembler_5905608f201e, handleInvalid=error, numInputCols=5
output: org.apache.spark.sql.DataFrame = [label: double, features: vector]


In [9]:
// Create model and fit data
val lr = new LinearRegression()

val lrModel = lr.fit(output)

val trainingSummary = lrModel.summary

trainingSummary.residuals.show(5)

+-------------------+
|          residuals|
+-------------------+
|-164813.48488342203|
| 10953.223229608731|
|-194028.75820535584|
| 139392.73897870723|
|-214445.27695671446|
+-------------------+
only showing top 5 rows



lr: org.apache.spark.ml.regression.LinearRegression = linReg_37170cd6be88
lrModel: org.apache.spark.ml.regression.LinearRegressionModel = LinearRegressionModel: uid=linReg_37170cd6be88, numFeatures=5
trainingSummary: org.apache.spark.ml.regression.LinearRegressionTrainingSummary = org.apache.spark.ml.regression.LinearRegressionTrainingSummary@319756bd


In [10]:
// Examine Predictions
trainingSummary.predictions.show(5)

+------------------+--------------------+------------------+
|             label|            features|        prediction|
+------------------+--------------------+------------------+
|1059033.5578701235|[79545.4585743167...|1223847.0427535456|
|  1505890.91484695|[79248.6424548256...|1494937.6916173412|
|1058987.9878760849|[61287.0671786567...|1253016.7460814407|
|1260616.8066294468|[63345.2400462279...|1121224.0676507396|
| 630943.4893385402|[59982.1972257080...| 845388.7662952547|
+------------------+--------------------+------------------+
only showing top 5 rows



In [11]:
// Model Coefficients
println(s"Coefficients: ${lrModel.coefficients} Intercept: ${lrModel.intercept}")
println(s"numIterations: ${trainingSummary.totalIterations}")
println(s"objectiveHistory: ${trainingSummary.objectiveHistory.toList}")

Coefficients: [21.578049448352026,165637.02694091276,120659.94881629614,1651.1390539904344,15.200743923741493] Intercept: -2637299.033328577
numIterations: 1
objectiveHistory: List(0.0)


In [12]:
// Model Performance
println(s"RMSE: ${trainingSummary.rootMeanSquaredError}")
println(s"MSE: ${trainingSummary.meanSquaredError}")
println(s"r2: ${trainingSummary.r2}")

RMSE: 101092.70158252295
MSE: 1.0219734313253036E10
r2: 0.9180238195089548
