In [3]:
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression

In [4]:
# Create the Spark session for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName('lr_example')
    .getOrCreate()
)

In [5]:
# Load the customer CSV; the first line holds the column names and the
# column types are inferred from the values.
data = spark.read.csv(
    'Ecommerce_Customers.csv',
    header=True,
    inferSchema=True,
)

In [6]:
# Print each column's name, inferred type and nullability.
data.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)



In [9]:
# Peek at the second record: head(2) returns the first two rows and [1] picks
# the second one; each field value is printed on its own line.
second_row = data.head(2)[1]
for item in second_row:
    print(item)

hduke@hotmail.com
4547 Archer CommonDiazchester, CA 06566-8576
DarkGreen
31.92627202636016
11.109460728682564
37.268958868297744
2.66403418213262
392.2049334443264


### Setting up the DataFrame for Machine Learning

In [10]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [11]:
# List the column names; the feature columns are selected by name in the next step.
data.columns

['Email',
 'Address',
 'Avatar',
 'Avg Session Length',
 'Time on App',
 'Time on Website',
 'Length of Membership',
 'Yearly Amount Spent']

In [12]:
# Create an assembler that packs the four numeric columns into a single
# vector column named 'features'.
assembler = VectorAssembler(
    inputCols=[
        'Avg Session Length',
        'Time on App',
        'Time on Website',
        'Length of Membership',
    ],
    outputCol='features',
)

In [13]:
# Transform the data: the result keeps every original column and gains the
# assembled 'features' vector column.
output = assembler.transform(data)

In [14]:
# Check the schema of the transformed DataFrame, including the new 'features' column.
output.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)
 |-- features: vector (nullable = true)



In [15]:
# Show the first row, including its assembled feature vector.
output.head(1)

[Row(Email='mstephenson@fernandez.com', Address='835 Frank TunnelWrightmouth, MI 82180-9605', Avatar='Violet', Avg Session Length=34.49726772511229, Time on App=12.65565114916675, Time on Website=39.57766801952616, Length of Membership=4.0826206329529615, Yearly Amount Spent=587.9510539684005, features=DenseVector([34.4973, 12.6557, 39.5777, 4.0826]))]

In [16]:
# Keep only what the regression needs: the feature vector and the label column.
final_data = output.select(['features', 'Yearly Amount Spent'])

In [17]:
# Preview the first rows of the modelling DataFrame (features + label).
final_data.show()

+--------------------+-------------------+
|            features|Yearly Amount Spent|
+--------------------+-------------------+
|[34.4972677251122...|  587.9510539684005|
|[31.9262720263601...|  392.2049334443264|
|[33.0009147556426...| 487.54750486747207|
|[34.3055566297555...|  581.8523440352177|
|[33.3306725236463...|  599.4060920457634|
|[33.8710378793419...|   637.102447915074|
|[32.0215955013870...|  521.5721747578274|
|[32.7391429383803...|  549.9041461052942|
|[33.9877728956856...|  570.2004089636196|
|[31.9365486184489...|  427.1993848953282|
|[33.9925727749537...|  492.6060127179966|
|[33.8793608248049...|  522.3374046069357|
|[29.5324289670579...|  408.6403510726275|
|[33.1903340437226...|  573.4158673313865|
|[32.3879758531538...|  470.4527333009554|
|[30.7377203726281...|  461.7807421962299|
|[32.1253868972878...| 457.84769594494855|
|[32.3388993230671...| 407.70454754954415|
|[32.1878120459321...|  452.3156754800354|
|[32.6178560628234...|   605.061038804892|
+----------

In [18]:
# Hold out roughly 30% of the rows for testing. randomSplit samples rows at
# random, so a fixed seed is passed to keep the split (and every metric
# computed from it) repeatable from one run of the notebook to the next.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)

In [19]:
# Summary statistics of the label in the training split.
train_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                356|
|   mean| 496.40421118421193|
| stddev|    80.278630852135|
|    min| 256.67058229005585|
|    max|  725.5848140556806|
+-------+-------------------+



In [20]:
# Summary statistics of the label in the test split.
test_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                144|
|   mean|  506.5077774146937|
| stddev|  76.67962549771605|
|    min| 298.76200786180766|
|    max|  765.5184619388373|
+-------+-------------------+



In [21]:
# Configure the linear regression estimator: inputs are read from the
# 'features' column and the target is the yearly amount spent.
lr = LinearRegression(
    featuresCol='features',
    labelCol='Yearly Amount Spent',
)

In [22]:
# Fit the regression on the training split only; the test split is kept for evaluation.
lr_model = lr.fit(train_data)

### See how well our model performs
In order to do that we need to evaluate it against some test data.

In [23]:
# Evaluate the fitted model on the held-out test split; the returned summary
# exposes the residuals and the error metrics used below.
test_results = lr_model.evaluate(test_data)

In [25]:
# A residual is the difference between the actual value in the test data and
# the value the model predicted for it (Spark reports it as label - prediction).
test_results.residuals.show()

+-------------------+
|          residuals|
+-------------------+
| -2.913301673078365|
| 12.979241716608158|
| 1.2355801759314318|
|   4.70665469725202|
|0.44342517202721865|
|-3.6460846282450348|
| 3.4220298779197833|
|  5.284577674447803|
|-17.385298549858362|
| 18.723096558700036|
|  18.84076177432877|
|-0.3433107292131581|
| -8.555888820183213|
|0.15004551054386184|
|-12.257761111968875|
|  9.302838768159575|
| -8.400034346136977|
|-16.976751280389465|
|  8.104227894420433|
| -4.802788351861295|
+-------------------+
only showing top 20 rows



### Get some of those regression evaluation metrics

In [26]:
# Root mean squared error (RMSE) on the test data
test_results.rootMeanSquaredError

11.069953156370984

In [27]:
# R squared (coefficient of determination) on the test data
test_results.r2

0.9790126282805428

That 0.979 indicates that we are explaining a lot of the variance.

Judging by the root mean squared error (RMSE) and the residuals alone, this is a very good model. To see why, look at the summary statistics of the original data:

In [28]:
# Summary statistics of the label over the full dataset, to put the RMSE in context.
final_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                500|
|   mean|  499.3140382585909|
| stddev|   79.3147815497068|
|    min| 256.67058229005585|
|    max|  765.5184619388373|
+-------+-------------------+



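The root mean squared error `is the square root of the average squared difference between the true values in the test data and the values the model predicts`; it is expressed in the same units as the label (dollars).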

We can see from final_data that the average yearly amount spent is about 500 dollars with a standard deviation of 79.31 dollars, a minimum of 256.67 dollars and a maximum of 765.52 dollars.

So, in comparison, being off by a root mean squared error of about 11 dollars when the actual average is about 500 dollars and the standard deviation is about 80 dollars is actually pretty good.

To confirm that `I have a good fitting model` I can also check the R2 value, which is 0.979, meaning my model explains about 98% of the variance in the data.

Having such a good root mean squared error and such a good R2 value should ping something in the back of your mind: double check the data and double check the way you fitted the model, to make sure you did not accidentally evaluate the model on the same data it was trained on.

### How to deploy this model on data for which we only have the features

Let's say we have some customers for whom we only have the features, but we don't know how much they are actually going to spend in a year. To mimic that:

In [29]:
# Keep only the feature vectors of the test rows, dropping the label column.
unlabeled_data = test_data.select('features')

In [30]:
# Preview the feature-only rows.
unlabeled_data.show()

+--------------------+
|            features|
+--------------------+
|[30.8794843441274...|
|[31.1695067987115...|
|[31.2606468698795...|
|[31.3584771924370...|
|[31.3895854806643...|
|[31.4252268808548...|
|[31.4459724827577...|
|[31.5316044825729...|
|[31.5702008293202...|
|[31.6005122003032...|
|[31.6098395733896...|
|[31.6610498227460...|
|[31.7207699002873...|
|[31.7216523605090...|
|[31.8093003166791...|
|[31.8512531286083...|
|[31.8648325480987...|
|[31.9048571310136...|
|[31.9549038566348...|
|[31.9673209478824...|
+--------------------+
only showing top 20 rows



In [31]:
# Apply the fitted model to the feature-only rows; the result gains a 'prediction' column.
predictions_un = lr_model.transform(unlabeled_data)

In [32]:
# Show the predicted yearly spend next to each feature vector.
predictions_un.show()

+--------------------+------------------+
|            features|        prediction|
+--------------------+------------------+
|[30.8794843441274...|493.11990165793304|
|[31.1695067987115...|414.37728908568465|
|[31.2606468698795...|420.09105108101994|
|[31.3584771924370...| 490.4692957522234|
|[31.3895854806643...|409.62618588795567|
|[31.4252268808548...|  534.412803283007|
|[31.4459724827577...| 481.4549350572088|
|[31.5316044825729...|431.23102805491476|
|[31.5702008293202...| 563.3307906912632|
|[31.6005122003032...| 460.4497549323969|
|[31.6098395733896...| 425.7047878767794|
|[31.6610498227460...|  416.701664309114|
|[31.7207699002873...| 547.3308222982062|
|[31.7216523605090...| 347.6268811213288|
|[31.8093003166791...|   549.02966047481|
|[31.8512531286083...| 463.6894078986388|
|[31.8648325480987...|448.29131482295065|
|[31.9048571310136...| 490.9266087032056|
|[31.9549038566348...|431.89365204550654|
|[31.9673209478824...|450.55262959151355|
+--------------------+------------