In [2]:
// Import LinearRegression
import org.apache.spark.ml.regression.LinearRegression

// Set the Error reporting
import org.apache.log4j._
Logger.getLogger("org").setLevel(Level.ERROR)

// Start a simple Spark Session
import org.apache.spark.sql.SparkSession

// Import VectorAssembler and Vectors
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.linalg.Vectors

import org.apache.spark.ml.regression.LinearRegression
import org.apache.log4j._
import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.linalg.Vectors


In [3]:
val spark = SparkSession.builder().getOrCreate()

// Use Spark to read in the ecommerce_customers csv file.
val data = spark.read
    .option("header","true")
    .option("inferSchema","true")
    .format("csv")
    .load("ecommerce_customers")
// Print the Schema of the DataFrame
data.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)



spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@6ef07d6
data: org.apache.spark.sql.DataFrame = [Email: string, Address: string ... 6 more fields]


In [4]:
val col_names = data.columns
val first_row = data.head(2)(1)
println("\n")
println("example data row")
for (ind <- Range(1, col_names.length)){
println(col_names(ind))
println(first_row(ind))
println("\n")}



example data row
Address
 MI 82180-9605&quot;


Avatar
Violet


Avg Session Length
34.49726772511229


Time on App
12.655651149166752


Time on Website
39.57766801952616


Length of Membership
4.082620632952961


Yearly Amount Spent
587.9510539684005




col_names: Array[String] = Array(Email, Address, Avatar, Avg Session Length, Time on App, Time on Website, Length of Membership, Yearly Amount Spent)
first_row: org.apache.spark.sql.Row = [Wrightmouth, MI 82180-9605&quot;,Violet,34.49726772511229,12.655651149166752,39.57766801952616,4.082620632952961,587.9510539684005]


In [5]:
data.head(2)

res2: Array[org.apache.spark.sql.Row] = Array([mstephenson@fernandez.com,835 Frank Tunnel,null,null,null,null,null,null], [Wrightmouth, MI 82180-9605&quot;,Violet,34.49726772511229,12.655651149166752,39.57766801952616,4.082620632952961,587.9510539684005])


In [6]:
// In PySpark, if your dataset is small (can fit into memory of driver), you can do
data.take(2)(0)

res3: org.apache.spark.sql.Row = [mstephenson@fernandez.com,835 Frank Tunnel,null,null,null,null,null,null]


In [7]:
data.columns

res4: Array[String] = Array(Email, Address, Avatar, Avg Session Length, Time on App, Time on Website, Length of Membership, Yearly Amount Spent)


In [8]:
// Rename the Yearly Amount Spent Column as "label"
// Also grab only the numerical columns from the data
// Set all of this as a new dataframe called df
val df = data.select(
    data("Yearly Amount Spent").as("label"),
//     $"Email",
//     $"Address",
//     $"Avatar",
    $"Avg Session Length",
    $"Time on App",
    $"Time on Website",
    $"Length of Membership")

val df_no_null = df.na.drop()

df: org.apache.spark.sql.DataFrame = [label: double, Avg Session Length: double ... 3 more fields]
df_no_null: org.apache.spark.sql.DataFrame = [label: double, Avg Session Length: double ... 3 more fields]


In [9]:
df.printSchema()

root
 |-- label: double (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)



In [10]:
// Set the input columns from which we are supposed to read the values.
// Call this new object assembler
// Use VectorAssembler to convert the input columns of df
// to a single output column of an array called "features"

val assembler = new VectorAssembler()
    .setInputCols(Array(
        "Avg Session Length",
        "Time on App",
        "Time on Website",
        "Length of Membership"))
    .setOutputCol("features")

// An assembler converts the input values to a vector
// A vector is what the ML algorithm reads to train a model

assembler: org.apache.spark.ml.feature.VectorAssembler = VectorAssembler: uid=vecAssembler_b8342e89bdea, handleInvalid=error, numInputCols=4


In [11]:
// Use the assembler to transform our DataFrame to the two columns: label and features
val output = assembler
    .transform(df_no_null)
    .select(
        $"label",
        $"features")

output: org.apache.spark.sql.DataFrame = [label: double, features: vector]


In [12]:
output.show()

+------------------+--------------------+
|             label|            features|
+------------------+--------------------+
| 587.9510539684005|[34.4972677251122...|
| 392.2049334443264|[31.9262720263601...|
|487.54750486747207|[33.0009147556426...|
| 581.8523440352178|[34.3055566297555...|
| 599.4060920457634|[33.3306725236463...|
|  637.102447915074|[33.8710378793419...|
| 521.5721747578274|[32.0215955013870...|
| 570.2004089636195|[33.9877728956856...|
| 492.6060127179966|[33.9925727749537...|
|408.64035107262754|[29.5324289670579...|
| 573.4158673313865|[33.1903340437226...|
|470.45273330095546|[32.3879758531538...|
| 461.7807421962299|[30.7377203726281...|
|457.84769594494855|[32.1253868972878...|
| 407.7045475495441|[32.3388993230671...|
|452.31567548003545|[32.1878120459321...|
|  605.061038804892|[32.6178560628234...|
| 534.7057438060227|[32.9127851111597...|
| 700.9170916173961|[34.5075509985266...|
|423.17999168059777|[33.0293319535068...|
+------------------+--------------

In [13]:
// Create a Linear Regression Model object
val lr = new LinearRegression()

lr: org.apache.spark.ml.regression.LinearRegression = linReg_6f4af6b46b81


In [14]:
// Fit the model to the data and call this model lrModel
val lrModel = lr.fit(output)

lrModel: org.apache.spark.ml.regression.LinearRegressionModel = LinearRegressionModel: uid=linReg_6f4af6b46b81, numFeatures=4


In [15]:
// Print the coefficients and intercept for linear regression
println(s"Coefficients: ${lrModel.coefficients}, Intercept: ${lrModel.intercept}")

Coefficients: [25.856618008426018,38.765463575921366,0.6400889131953854,61.646207880881704], Intercept: -1063.872486876065


In [16]:
// Summarize the model over the training set and print out some metrics!
// Use the .summary method off your model to create an object
// called trainingSummary
val trainingSummary = lrModel.summary
// Show the residuals, the RMSE, the MSE, and the R^2 Values.
trainingSummary.residuals.show()
println(s"RMSE: ${trainingSummary.rootMeanSquaredError}")
println(s"MSE: ${trainingSummary.meanSquaredError}")
println(s"r2: ${trainingSummary.r2}")


// Great Job!

+-------------------+
|          residuals|
+-------------------+
| -7.772623457978966|
|  11.82555250437241|
|-17.879141870095964|
| 11.081305443736483|
|   7.32385327834254|
|-1.7670434550307164|
|  4.558311825657142|
| 10.520236692071762|
|-16.417217125194043|
| 10.585380731138741|
| 12.029113007876617|
|  9.946068850417475|
| 10.837290579785645|
| 20.531726174145206|
| -4.098013724803593|
| -4.499385360994438|
|  8.863806504862396|
|-0.6646742992182908|
|-3.0793993430667115|
|-15.022236568735764|
+-------------------+
only showing top 20 rows

RMSE: 9.925307424896866
MSE: 98.51172747871286
r2: 0.9839070379164451


trainingSummary: org.apache.spark.ml.regression.LinearRegressionTrainingSummary = org.apache.spark.ml.regression.LinearRegressionTrainingSummary@7b4bf261
