In [1]:
import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.linalg.Vectors

import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit}

import org.apache.log4j._
Logger.getLogger("org").setLevel(Level.ERROR)

val spark = SparkSession.builder().getOrCreate()

Intitializing Scala interpreter ...

Spark Web UI available at http://MSI:4040
SparkContext available as 'sc' (version = 3.0.0, master = local[*], app id = local-1605304695636)
SparkSession available as 'spark'


import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit}
import org.apache.log4j._
spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@57b6eb4b


In [2]:
val df_house = (spark.read.option("header","true").option("multiline","true").option("inferSchema","true").format("csv")
          .load("../../data/ml_scala/USA_Housing.csv"))

df_house: org.apache.spark.sql.DataFrame = [Avg Area Income: double, Avg Area House Age: double ... 5 more fields]


In [3]:
df_house.printSchema()
df_house.describe().show()
df_house.show(5)

root
 |-- Avg Area Income: double (nullable = true)
 |-- Avg Area House Age: double (nullable = true)
 |-- Avg Area Number of Rooms: double (nullable = true)
 |-- Avg Area Number of Bedrooms: double (nullable = true)
 |-- Area Population: double (nullable = true)
 |-- Price: double (nullable = true)
 |-- Address: string (nullable = true)

+-------+------------------+------------------+------------------------+---------------------------+------------------+------------------+--------------------+
|summary|   Avg Area Income|Avg Area House Age|Avg Area Number of Rooms|Avg Area Number of Bedrooms|   Area Population|             Price|             Address|
+-------+------------------+------------------+------------------------+---------------------------+------------------+------------------+--------------------+
|  count|              5000|              5000|                    5000|                       5000|              5000|              5000|                5000|
|   mean| 68583.108

In [4]:
// Better visualization on what a record looks like
val colnames = df_house.columns
val firstrow = df_house.head(1)(0)
println("\n")
println("Example Data Row")
for(ind <- Range(1,colnames.length)){
  println(colnames(ind))
  println(firstrow(ind))
  println("\n")
}



Example Data Row
Avg Area House Age
5.682861321615587


Avg Area Number of Rooms
7.009188142792237


Avg Area Number of Bedrooms
4.09


Area Population
23086.800502686456


Price
1059033.5578701235


Address
208 Michael Ferry Apt. 674
Laurabury, NE 37010-5101




colnames: Array[String] = Array(Avg Area Income, Avg Area House Age, Avg Area Number of Rooms, Avg Area Number of Bedrooms, Area Population, Price, Address)
firstrow: org.apache.spark.sql.Row =
[79545.45857431678,5.682861321615587,7.009188142792237,4.09,23086.800502686456,1059033.5578701235,208 Michael Ferry Apt. 674
Laurabury, NE 37010-5101]


In [5]:
// Prep dataframe to include features and label matrix
val df = (df_house.select(df_house("Price").as("label"),
                            $"Avg Area Income", $"Avg Area House Age", $"Avg Area Number of Rooms",
                            $"Avg Area Number of Bedrooms", $"Area Population"))

val assembler = (new VectorAssembler()
                 .setInputCols(Array("Avg Area Income", "Avg Area House Age",
                                    "Avg Area Number of Rooms", "Avg Area Number of Bedrooms",
                                    "Area Population")).setOutputCol("features"))

val output = assembler.transform(df).select($"label",$"features")

output.show(5)

+------------------+--------------------+
|             label|            features|
+------------------+--------------------+
|1059033.5578701235|[79545.4585743167...|
|  1505890.91484695|[79248.6424548256...|
|1058987.9878760849|[61287.0671786567...|
|1260616.8066294468|[63345.2400462279...|
| 630943.4893385402|[59982.1972257080...|
+------------------+--------------------+
only showing top 5 rows



df: org.apache.spark.sql.DataFrame = [label: double, Avg Area Income: double ... 4 more fields]
assembler: org.apache.spark.ml.feature.VectorAssembler = VectorAssembler: uid=vecAssembler_f6e9234bc1cb, handleInvalid=error, numInputCols=5
output: org.apache.spark.sql.DataFrame = [label: double, features: vector]


In [6]:
// Train test split
val Array(training, test) = output.select("label","features").randomSplit(Array(0.8,0.2))

training: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [label: double, features: vector]
test: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [label: double, features: vector]


In [7]:
// Create basic regression model
val lr = new LinearRegression()

// Create hyper-paramter grid for tuning
val paramGrid = (new ParamGridBuilder().addGrid(lr.regParam, Array(0.1,0.01))
                 .addGrid(lr.fitIntercept).addGrid(lr.elasticNetParam, Array(0.0,0.5,0.01))
                 .build())

// Validation for hyperparameter tuning - similiar to CV but only trains once
val trainValidationSplit = (new TrainValidationSplit()
                            .setEstimator(lr)
                            .setEvaluator(new RegressionEvaluator().setMetricName("r2") )
                            .setEstimatorParamMaps(paramGrid)
                            .setTrainRatio(0.8) )

//  Train the model
val model = trainValidationSplit.fit(training)

// Show predictions against test labels
model.transform(test).select("features", "label", "prediction").show()

+--------------------+------------------+------------------+
|            features|             label|        prediction|
+--------------------+------------------+------------------+
|[60167.6726073388...| 88591.77016003926|158079.80872222036|
|[48735.9245124086...| 151527.0826265551| 368883.0853910432|
|[47685.2575946853...|  294170.746352692|249798.80601892946|
|[49601.0616347867...| 302307.4010604978|367703.17610806273|
|[17796.6311895433...|302355.83597895555| 92036.40603054548|
|[59141.7964422585...| 313651.5032332925| 526641.9332406619|
|[47018.0671117179...|377618.96990141843|438257.60227354383|
|[51218.6782600275...| 385678.1666731234|393547.35729224375|
|[48829.1727051231...|412057.44010888686| 385186.5530622406|
|[66469.3694730564...| 412269.2033995612| 658565.0222779778|
|[48510.9101270763...|414165.22036082624|523686.46361347754|
|[58198.0323119737...|420122.99953232025| 245132.2733218181|
|[63787.5363538975...|433247.15658337076| 578652.7563742837|
|[46800.3725884912...|  

lr: org.apache.spark.ml.regression.LinearRegression = linReg_66d61659e810
paramGrid: Array[org.apache.spark.ml.param.ParamMap] =
Array({
	linReg_66d61659e810-elasticNetParam: 0.0,
	linReg_66d61659e810-fitIntercept: true,
	linReg_66d61659e810-regParam: 0.1
}, {
	linReg_66d61659e810-elasticNetParam: 0.0,
	linReg_66d61659e810-fitIntercept: true,
	linReg_66d61659e810-regParam: 0.01
}, {
	linReg_66d61659e810-elasticNetParam: 0.0,
	linReg_66d61659e810-fitIntercept: false,
	linReg_66d61659e810-regParam: 0.1
}, {
	linReg_66d61659e810-elasticNetParam: 0.0,
	linReg_66d61659e810-fitIntercept: false,
	linReg_66d61659e810-regParam: 0.01
}, {
	linReg_66d61659e810-elasticNetParam: 0.5,
	linReg_66d61659e810-fitIntercept: true,
	linReg_66d61659e810-regParam: 0.1
}, {
	linReg_66d61...


In [8]:
// Model Performance
model.validationMetrics

res4: Array[Double] = Array(0.9184783661169755, 0.9184783711250816, 0.49917048882672466, 0.4991704770823169, 0.9184783625794555, 0.9184783706577778, 0.4963969247843364, 0.4969666689739859, 0.9184783660802863, 0.9184783712342668, 0.4969666761192497, 0.49696666531763534)


In [9]:
val prod_model = model.bestModel

prod_model: org.apache.spark.ml.Model[_] = LinearRegressionModel: uid=linReg_66d61659e810, numFeatures=5


In [10]:
spark.stop()