In [1]:
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit}

// See less warnings
import org.apache.log4j._
Logger.getLogger("org").setLevel(Level.ERROR)

// Start a simple Spark Session
import org.apache.spark.sql.SparkSession

Intitializing Scala interpreter ...

Spark Web UI available at http://192.168.0.22:4040
SparkContext available as 'sc' (version = 2.4.5, master = local[*], app id = local-1586260261553)
SparkSession available as 'spark'


import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit}
import org.apache.log4j._
import org.apache.spark.sql.SparkSession


In [2]:
val spark = SparkSession.builder().getOrCreate()

// Prepare training and test data.
val data = spark.read
  .option("header","true")
  .option("inferSchema","true")
  .format("csv")
  .load("usa_housing.csv")

spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@60986f43
data: org.apache.spark.sql.DataFrame = [Avg Area Income: double, Avg Area House Age: double ... 4 more fields]


In [3]:
// Check out the Data
data.printSchema()

root
 |-- Avg Area Income: double (nullable = true)
 |-- Avg Area House Age: double (nullable = true)
 |-- Avg Area Number of Rooms: double (nullable = true)
 |-- Avg Area Number of Bedrooms: double (nullable = true)
 |-- Area Population: double (nullable = true)
 |-- Price: double (nullable = true)



In [4]:
// See an example of what the data looks like
// by printing out a Row
val colnames = data.columns
val firstrow = data.head(1)(0)
println("\n")
println("Example Data Row")
for(ind <- Range(1,colnames.length)){
  println(colnames(ind))
  println(firstrow(ind))
  println("\n")
}



Example Data Row
Avg Area House Age
5.682861321615587


Avg Area Number of Rooms
7.009188142792237


Avg Area Number of Bedrooms
4.09


Area Population
23086.800502686456


Price
1059033.5578701235




colnames: Array[String] = Array(Avg Area Income, Avg Area House Age, Avg Area Number of Rooms, Avg Area Number of Bedrooms, Area Population, Price)
firstrow: org.apache.spark.sql.Row = [79545.45857431678,5.682861321615587,7.009188142792237,4.09,23086.800502686456,1059033.5578701235]


In [5]:
// This will allow us to join multiple feature columns
// into a single column of an array of feautre values
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.linalg.Vectors

import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.linalg.Vectors


In [6]:
// Rename Price to label column for naming convention.
// Grab only numerical columns from the data
val df = data.select(
    data("Price").as("label"),
    $"Avg Area Income",
    $"Avg Area House Age",
    $"Avg Area Number of Rooms",
    $"Area Population")

df: org.apache.spark.sql.DataFrame = [label: double, Avg Area Income: double ... 3 more fields]


In [7]:
// An assembler converts the input values to a vector
// A vector is what the ML algorithm reads to train a model

// Set the input columns from which we are supposed to read the values
// Set the name of the column where the vector will be stored
val assembler = new VectorAssembler()
    .setInputCols(Array(
        "Avg Area Income",
        "Avg Area House Age",
        "Avg Area Number of Rooms",
        "Area Population"))
    .setOutputCol("features")

assembler: org.apache.spark.ml.feature.VectorAssembler = vecAssembler_acc092b68fdc


In [8]:
// Use the assembler to transform our DataFrame to the two columns
val output = assembler.transform(df).select($"label",$"features")

output: org.apache.spark.sql.DataFrame = [label: double, features: vector]


In [9]:
output.show()

+------------------+--------------------+
|             label|            features|
+------------------+--------------------+
|1059033.5578701235|[79545.4585743167...|
|  1505890.91484695|[79248.6424548256...|
|1058987.9878760849|[61287.0671786567...|
|1260616.8066294468|[63345.2400462279...|
| 630943.4893385402|[59982.1972257080...|
|1068138.0743935304|[80175.7541594853...|
|1502055.8173744078|[64698.4634278877...|
|1573936.5644777215|[78394.3392775308...|
| 798869.5328331633|[59927.6608133496...|
|1545154.8126419624|[81885.9271840956...|
| 1707045.722158058|[80527.4720829228...|
| 663732.3968963273|[50593.6954970428...|
|1042814.0978200928|[39033.8092369823...|
|1291331.5184858206|[73163.6634410467...|
|1402818.2101658515|[69391.3801843616...|
|1306674.6599511993|[73091.8667458232...|
|1556786.6001947748|[79706.9630576574...|
| 528485.2467305964|[61929.0770180892...|
|1019425.9367578316|[63508.1942994299...|
|1030591.4292116085|[62085.2764034048...|
+------------------+--------------

In [13]:
// Create a Linear Regression Model object
val lr = new LinearRegression()

lr: org.apache.spark.ml.regression.LinearRegression = linReg_f989e16d785f


In [14]:
// Fit the model to the data

// Note: Later we will see why we should split
// the data first, but for now we will fit to all the data.
val lrModel = lr.fit(output)

lrModel: org.apache.spark.ml.regression.LinearRegressionModel = linReg_f989e16d785f


In [15]:
// Print the coefficients and intercept for linear regression
println(s"Coefficients: ${lrModel.coefficients} Intercept: ${lrModel.intercept}")

Coefficients: [21.58274357311781,165657.8724329605,121598.16461647583,15.196119819750825] Intercept: -2637560.67254761


In [16]:
// Summarize the model over the training set and print out some metrics!
// Explore this in the spark-shell for more methods to call
val trainingSummary = lrModel.summary

println(s"numIterations: ${trainingSummary.totalIterations}")
println(s"objectiveHistory: ${trainingSummary.objectiveHistory.toList}")

trainingSummary.residuals.show()

println(s"RMSE: ${trainingSummary.rootMeanSquaredError}")
println(s"MSE: ${trainingSummary.meanSquaredError}")
println(s"r2: ${trainingSummary.r2}")

numIterations: 1
objectiveHistory: List(0.0)
+-------------------+
|          residuals|
+-------------------+
|-164759.92057681922|
|  9690.547545112902|
| -193522.9705765734|
|  139506.9957614462|
| -214819.1296369878|
|  147.1563499146141|
|-170004.15564831533|
| 1675.3336070652585|
|  30782.38727523212|
|  79276.89880586648|
| -64967.26304096077|
| 34514.368293934385|
|   89443.8314024884|
|  -16448.1027878148|
|  95327.46257374831|
|  65660.76816228265|
| 31450.784453772707|
| 42636.212100361125|
|-115709.42082964187|
|-167589.65318883688|
+-------------------+
only showing top 20 rows

RMSE: 101108.8122249438
MSE: 1.0222991909538944E10
r2: 0.9179976891524392


trainingSummary: org.apache.spark.ml.regression.LinearRegressionTrainingSummary = org.apache.spark.ml.regression.LinearRegressionTrainingSummary@69fcaef6
