In [1]:
import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.linalg.Vectors

import org.apache.log4j._
Logger.getLogger("org").setLevel(Level.ERROR)

val spark = SparkSession.builder().getOrCreate()

Intitializing Scala interpreter ...

Spark Web UI available at http://MSI:4040
SparkContext available as 'sc' (version = 3.0.0, master = local[*], app id = local-1605206710326)
SparkSession available as 'spark'


import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.linalg.Vectors
import org.apache.log4j._
spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@7d5bdaa


In [2]:
val df = (spark.read.option("header","true").option("inferSchema","true")
          .option("multiline","true").format("csv")
          .load("../../data/ml_scala/Ecommerce Customers"))

df: org.apache.spark.sql.DataFrame = [Email: string, Address: string ... 6 more fields]


In [3]:
df.printSchema()
df.describe().show()
df.show(5)

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)

+-------+-----------------+--------------------+-----------+------------------+------------------+------------------+--------------------+-------------------+
|summary|            Email|             Address|     Avatar|Avg Session Length|       Time on App|   Time on Website|Length of Membership|Yearly Amount Spent|
+-------+-----------------+--------------------+-----------+------------------+------------------+------------------+--------------------+-------------------+
|  count|              500|                 500|        500|               500|               500|               500|                 500|                500|
|

In [4]:
// Print out one row of data to better visualize
val colnames = df.columns
val firstrow = df.head(1)(0)
println("\n")
println("Example Data Row")
for(ind <- Range(1,colnames.length)){
  println(colnames(ind))
  println(firstrow(ind))
  println("\n")
}



Example Data Row
Address
835 Frank Tunnel
Wrightmouth, MI 82180-9605


Avatar
Violet


Avg Session Length
34.49726772511229


Time on App
12.655651149166752


Time on Website
39.57766801952616


Length of Membership
4.082620632952961


Yearly Amount Spent
587.9510539684005




colnames: Array[String] = Array(Email, Address, Avatar, Avg Session Length, Time on App, Time on Website, Length of Membership, Yearly Amount Spent)
firstrow: org.apache.spark.sql.Row =
[mstephenson@fernandez.com,835 Frank Tunnel
Wrightmouth, MI 82180-9605,Violet,34.49726772511229,12.655651149166752,39.57766801952616,4.082620632952961,587.9510539684005]


In [5]:
df.columns

res2: Array[String] = Array(Email, Address, Avatar, Avg Session Length, Time on App, Time on Website, Length of Membership, Yearly Amount Spent)


In [6]:
val df2 = (df.select(df("Yearly Amount Spent").as("label"),
                     $"Avg Session Length", $"Time on App", 
                     $"Time on Website", $"Length of Membership"))

df2.show(5)

+------------------+------------------+------------------+------------------+--------------------+
|             label|Avg Session Length|       Time on App|   Time on Website|Length of Membership|
+------------------+------------------+------------------+------------------+--------------------+
| 587.9510539684005| 34.49726772511229|12.655651149166752| 39.57766801952616|   4.082620632952961|
| 392.2049334443264|31.926272026360156|11.109460728682564|37.268958868297744|    2.66403418213262|
|487.54750486747207|33.000914755642675|11.330278057777512| 37.11059744212085|   4.104543202376424|
| 581.8523440352178| 34.30555662975554|13.717513665142508| 36.72128267790313|  3.1201787827480914|
| 599.4060920457634| 33.33067252364639|12.795188551078114| 37.53665330059473|   4.446308318351435|
+------------------+------------------+------------------+------------------+--------------------+
only showing top 5 rows



df2: org.apache.spark.sql.DataFrame = [label: double, Avg Session Length: double ... 3 more fields]


In [7]:
val assembler = (new VectorAssembler()
                 .setInputCols(Array("Avg Session Length", "Time on App",
                                     "Time on Website", "Length of Membership"))
                 .setOutputCol("features"))

val output = assembler.transform(df2).select($"label",$"features")

output.show(5)

+------------------+--------------------+
|             label|            features|
+------------------+--------------------+
| 587.9510539684005|[34.4972677251122...|
| 392.2049334443264|[31.9262720263601...|
|487.54750486747207|[33.0009147556426...|
| 581.8523440352178|[34.3055566297555...|
| 599.4060920457634|[33.3306725236463...|
+------------------+--------------------+
only showing top 5 rows



assembler: org.apache.spark.ml.feature.VectorAssembler = VectorAssembler: uid=vecAssembler_f9c5b31fc3ba, handleInvalid=error, numInputCols=4
output: org.apache.spark.sql.DataFrame = [label: double, features: vector]


In [8]:
// Create model and fit data
val lr = new LinearRegression()

val lrModel = lr.fit(output)

val trainingSummary = lrModel.summary

// Show some residuals
trainingSummary.residuals.show(5)

// Show some prediction
trainingSummary.predictions.show(5)

+-------------------+
|          residuals|
+-------------------+
|-6.7882340900329154|
| 11.841128565334486|
| -17.65262700858898|
| 11.454889631168953|
|  7.783382437305136|
+-------------------+
only showing top 5 rows

+------------------+--------------------+------------------+
|             label|            features|        prediction|
+------------------+--------------------+------------------+
| 587.9510539684005|[34.4972677251122...| 594.7392880584334|
| 392.2049334443264|[31.9262720263601...| 380.3638048789919|
|487.54750486747207|[33.0009147556426...|505.20013187606105|
| 581.8523440352178|[34.3055566297555...| 570.3974544040489|
| 599.4060920457634|[33.3306725236463...| 591.6227096084583|
+------------------+--------------------+------------------+
only showing top 5 rows



lr: org.apache.spark.ml.regression.LinearRegression = linReg_8b8f8ca06978
lrModel: org.apache.spark.ml.regression.LinearRegressionModel = LinearRegressionModel: uid=linReg_8b8f8ca06978, numFeatures=4
trainingSummary: org.apache.spark.ml.regression.LinearRegressionTrainingSummary = org.apache.spark.ml.regression.LinearRegressionTrainingSummary@59bf2548


In [9]:
println(s"Coefficients: ${lrModel.coefficients} Intercept: ${lrModel.intercept}")
println(s"numIterations: ${trainingSummary.totalIterations}")
println(s"objectiveHistory: ${trainingSummary.objectiveHistory.toList}")

// Model Performance
println(s"RMSE: ${trainingSummary.rootMeanSquaredError}")
println(s"MSE: ${trainingSummary.meanSquaredError}")
println(s"r2: ${trainingSummary.r2}")

Coefficients: [25.73427108467836,38.70915381082902,0.4367388355862311,61.577323754875984] Intercept: -1051.5942552993702
numIterations: 1
objectiveHistory: List(0.0)
RMSE: 9.923256785022243
MSE: 98.47102522148998
r2: 0.9843155370226726


In [10]:
spark.stop()