# Setup Environment

In [None]:
# install Java8
!apt-get -q install openjdk-8-jdk-headless -qq > /dev/null

# download spark3.1.1
!wget -q https://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop2.7.tgz

# unzip it
!tar xf spark-3.1.1-bin-hadoop2.7.tgz

# install findspark 
!pip install -q findspark


import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]
os.environ["SPARK_HOME"] = "/content/spark-3.1.1-bin-hadoop2.7"


import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()
#spark = SparkSession.builder.appName('lr').getOrCreate()

# Download and Read the Data

In [None]:
!wget -q https://raw.githubusercontent.com/muhammetsnts/SPARK/main/data/Ecommerce_Customers.csv

In [None]:
# Load the CSV, treating the first line as the header and letting Spark infer column types.
data = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("Ecommerce_Customers.csv")
)

In [None]:
# Print the column names and the types Spark inferred from the CSV.
data.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)



In [None]:
# Preview the first rows of the raw data (long values are truncated in the display).
data.show()

+--------------------+--------------------+----------------+------------------+------------------+------------------+--------------------+-------------------+
|               Email|             Address|          Avatar|Avg Session Length|       Time on App|   Time on Website|Length of Membership|Yearly Amount Spent|
+--------------------+--------------------+----------------+------------------+------------------+------------------+--------------------+-------------------+
|mstephenson@ferna...|835 Frank TunnelW...|          Violet| 34.49726772511229| 12.65565114916675| 39.57766801952616|  4.0826206329529615|  587.9510539684005|
|   hduke@hotmail.com|4547 Archer Commo...|       DarkGreen| 31.92627202636016|11.109460728682564|37.268958868297744|    2.66403418213262|  392.2049334443264|
|    pallen@yahoo.com|24645 Valerie Uni...|          Bisque|33.000914755642675|11.330278057777512|37.110597442120856|   4.104543202376424| 487.54750486747207|
|riverarebecca@gma...|1414 David Throug...|   

In [None]:
# Print every field of the first row on its own line.
first_row = data.head(1)[0]
print(*first_row, sep="\n")

mstephenson@fernandez.com
835 Frank TunnelWrightmouth, MI 82180-9605
Violet
34.49726772511229
12.65565114916675
39.57766801952616
4.0826206329529615
587.9510539684005


In [None]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

We will work with the numerical columns only, and we want to predict the `Yearly Amount Spent` column. We will use the `VectorAssembler` to combine the input columns into a single vector column, which we will name `features`.

In [None]:
# List the column names so the numeric feature columns can be picked out below.
data.columns

['Email',
 'Address',
 'Avatar',
 'Avg Session Length',
 'Time on App',
 'Time on Website',
 'Length of Membership',
 'Yearly Amount Spent']

In [None]:
# Numeric input columns that are packed into a single `features` vector column.
feature_columns = [
    'Avg Session Length',
    'Time on App',
    'Time on Website',
    'Length of Membership',
]
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

We will transform the whole dataset before the train-test split.

In [None]:
# Apply the assembler: keeps every original column and appends the `features` vector column.

output = assembler.transform(data)

In [None]:
# Confirm that the schema now contains the new `features` vector column.

output.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)
 |-- features: vector (nullable = true)



In [None]:
# Inspect the first row, including its assembled `features` vector.
output.head(1)

[Row(Email='mstephenson@fernandez.com', Address='835 Frank TunnelWrightmouth, MI 82180-9605', Avatar='Violet', Avg Session Length=34.49726772511229, Time on App=12.65565114916675, Time on Website=39.57766801952616, Length of Membership=4.0826206329529615, Yearly Amount Spent=587.9510539684005, features=DenseVector([34.4973, 12.6557, 39.5777, 4.0826]))]

In [None]:
# Keep only what the model needs: the feature vector and the label column.
final_data = output.select("features", "Yearly Amount Spent")
final_data.show()

+--------------------+-------------------+
|            features|Yearly Amount Spent|
+--------------------+-------------------+
|[34.4972677251122...|  587.9510539684005|
|[31.9262720263601...|  392.2049334443264|
|[33.0009147556426...| 487.54750486747207|
|[34.3055566297555...|  581.8523440352177|
|[33.3306725236463...|  599.4060920457634|
|[33.8710378793419...|   637.102447915074|
|[32.0215955013870...|  521.5721747578274|
|[32.7391429383803...|  549.9041461052942|
|[33.9877728956856...|  570.2004089636196|
|[31.9365486184489...|  427.1993848953282|
|[33.9925727749537...|  492.6060127179966|
|[33.8793608248049...|  522.3374046069357|
|[29.5324289670579...|  408.6403510726275|
|[33.1903340437226...|  573.4158673313865|
|[32.3879758531538...|  470.4527333009554|
|[30.7377203726281...|  461.7807421962299|
|[32.1253868972878...| 457.84769594494855|
|[32.3388993230671...| 407.70454754954415|
|[32.1878120459321...|  452.3156754800354|
|[32.6178560628234...|   605.061038804892|
+----------

## Train-Test Split

In [None]:
# Hold out roughly 30% of the rows for testing. The fixed seed makes the split
# repeatable when the notebook is re-run on the same data. The outputs
# recorded in this notebook came from an unseeded run, so the exact numbers
# will differ.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)

In [None]:
# Summary statistics (count, mean, stddev, min, max) of the label in the training split.
train_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                360|
|   mean| 497.07698395042723|
| stddev|  82.10089106218331|
|    min| 256.67058229005585|
|    max|  765.5184619388373|
+-------+-------------------+



In [None]:
# The same summary statistics for the test split, for comparison with the training split.
test_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                140|
|   mean|  505.0664636224398|
| stddev|  71.61208908607843|
|    min|  304.1355915788555|
|    max|  700.9170916173961|
+-------+-------------------+



## Create LR Model

In [None]:
from pyspark.ml.regression import LinearRegression

In [None]:
# Linear regression estimator: inputs come from `features`, the target is the yearly spend.
lr = LinearRegression(
    featuresCol='features',
    labelCol='Yearly Amount Spent',
)

In [None]:
# Fit the linear regression on the training split only.
lr_model = lr.fit(train_data)

## Evaluate Model

In [None]:
# Evaluate the fitted model on the held-out test split; the returned summary
# exposes the residuals and the metrics inspected in the following cells.
test_results = lr_model.evaluate(test_data)

In [None]:
# residuals holds, for each test row, the actual label minus the predicted value

test_results.residuals.show()

+-------------------+
|          residuals|
+-------------------+
|  0.274132954258846|
|-6.3856027805509825|
| -22.07052082488792|
|   -4.6967026202135|
| 19.739400513402472|
|  3.533434427863426|
| 0.8103254793752512|
| -7.742201099352428|
| 16.712083423533215|
|-26.411709143245275|
|  1.130990120183526|
| -11.98225544959223|
|  -3.41307405005972|
|  -2.38501149444528|
| -5.579456031794621|
|  5.792247742635595|
|-3.5215118449870033|
| -9.347277947264047|
|  5.870965793483151|
|  5.582262602704418|
+-------------------+
only showing top 20 rows



## Regression Evaluation Metrics

In [None]:
# Root mean squared error on the test split; about $10 in the run recorded below

test_results.rootMeanSquaredError

10.049945736306762

In [None]:
# Coefficient of determination (R^2) on the test split.
test_results.r2

0.9801633658989495

In [None]:
# Summary statistics of the label over the full dataset, to put the RMSE into context.
final_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                500|
|   mean|  499.3140382585909|
| stddev|   79.3147815497068|
|    min| 256.67058229005585|
|    max|  765.5184619388373|
+-------+-------------------+



## Revealing The Results

As the results show, the rootMeanSquaredError and r2 values do not mean much on their own; they need to be read against the scale of the data. Compared with the full dataset, an error of about \$10 is pretty good, because the mean of the `Yearly Amount Spent` column is about \$499.

The r2 value also tells us that our model explains about 98% of the variance in the test data.

## Deploying Model

Let's assume we have the features of some customers but do not know how much they will spend in a year, and we want to predict that amount.

In [None]:
# Drop the label column to mimic new customers whose yearly spend is unknown.
unlabeled_data = test_data.select('features')

In [None]:
# Preview the feature-only rows that will be scored.
unlabeled_data.show()

+--------------------+
|            features|
+--------------------+
|[30.5743636841713...|
|[31.0613251567161...|
|[31.1239743499119...|
|[31.2681042107507...|
|[31.3123495994443...|
|[31.3662121671876...|
|[31.3895854806643...|
|[31.4474464941278...|
|[31.6098395733896...|
|[31.6739155032749...|
|[31.7366356860502...|
|[31.8093003166791...|
|[31.8186165667690...|
|[31.8530748017465...|
|[31.9453957483445...|
|[31.9480174211613...|
|[31.9673209478824...|
|[32.0085045178551...|
|[32.0305497162129...|
|[32.0478009788678...|
+--------------------+
only showing top 20 rows



In [None]:
# Score the unlabeled rows; transform() appends a `prediction` column.
predictions = lr_model.transform(unlabeled_data)

In [None]:
# Show the feature vectors next to their predicted yearly spend.
predictions.show()

+--------------------+------------------+
|            features|        prediction|
+--------------------+------------------+
|[30.5743636841713...| 441.7902808038068|
|[31.0613251567161...| 493.9410608384526|
|[31.1239743499119...| 509.0175746646537|
|[31.2681042107507...| 428.1672357940374|
|[31.3123495994443...|443.85201751453815|
|[31.3662121671876...| 427.0554481286215|
|[31.3895854806643...|409.25928558060764|
|[31.4474464941278...|426.34494319457644|
|[31.6098395733896...|427.83346622757495|
|[31.6739155032749...| 502.1367770531265|
|[31.7366356860502...| 495.8024561353484|
|[31.8093003166791...| 548.7541548124334|
|[31.8186165667690...|449.83174742019537|
|[31.8530748017465...|461.67013495679726|
|[31.9453957483445...| 662.5993799694465|
|[31.9480174211613...| 456.1286291502622|
|[31.9673209478824...|449.27135308463926|
|[32.0085045178551...|452.54449897601944|
|[32.0305497162129...| 588.4035176251286|
|[32.0478009788678...|507.86830858339204|
+--------------------+------------

We predict that the first customer will spend about \$442 per year and the second about \$494.

In [None]:
# Score the labelled test rows directly so that every prediction sits next to
# its true label. This avoids joining `predictions` back to `test_data` on the
# `features` vector column: that join needs a shuffle and would multiply rows
# whenever two customers share an identical feature vector.
pred_results = lr_model.transform(test_data).select('prediction', 'Yearly Amount Spent')
pred_results.show()

+------------------+-------------------+
|        prediction|Yearly Amount Spent|
+------------------+-------------------+
| 441.7902808038068| 442.06441375806565|
| 493.9410608384526|  487.5554580579016|
| 509.0175746646537|  486.9470538397658|
| 428.1672357940374|  423.4705331738239|
|443.85201751453815|  463.5914180279406|
| 427.0554481286215|  430.5888825564849|
|409.25928558060764|  410.0696110599829|
|426.34494319457644|   418.602742095224|
|427.83346622757495| 444.54554965110816|
| 502.1367770531265|  475.7250679098812|
| 495.8024561353484|  496.9334462555319|
| 548.7541548124334|  536.7718993628412|
|449.83174742019537| 446.41867337013565|
|461.67013495679726|   459.285123462352|
| 662.5993799694465|  657.0199239376519|
| 456.1286291502622|  461.9208768928978|
|449.27135308463926| 445.74984123965226|
|452.54449897601944|  443.1972210287554|
| 588.4035176251286|  594.2744834186118|
|507.86830858339204|  513.4505711860965|
+------------------+-------------------+
only showing top

In [None]:
# Signed error per row: predicted yearly spend minus the actual yearly spend.
prediction_error = pred_results['prediction'] - pred_results['Yearly Amount Spent']
pred_results = pred_results.withColumn('difference', prediction_error)
pred_results.show()

+------------------+-------------------+-------------------+
|        prediction|Yearly Amount Spent|         difference|
+------------------+-------------------+-------------------+
| 441.7902808038068| 442.06441375806565| -0.274132954258846|
| 493.9410608384526|  487.5554580579016| 6.3856027805509825|
| 509.0175746646537|  486.9470538397658|  22.07052082488792|
| 428.1672357940374|  423.4705331738239|    4.6967026202135|
|443.85201751453815|  463.5914180279406|-19.739400513402472|
| 427.0554481286215|  430.5888825564849| -3.533434427863426|
|409.25928558060764|  410.0696110599829|-0.8103254793752512|
|426.34494319457644|   418.602742095224|  7.742201099352428|
|427.83346622757495| 444.54554965110816|-16.712083423533215|
| 502.1367770531265|  475.7250679098812| 26.411709143245275|
| 495.8024561353484|  496.9334462555319| -1.130990120183526|
| 548.7541548124334|  536.7718993628412|  11.98225544959223|
|449.83174742019537| 446.41867337013565|   3.41307405005972|
|461.67013495679726|   4

In [None]:
# Summary statistics of the predictions, the actual values and their difference.
pred_results.describe().show()

+-------+------------------+-------------------+-------------------+
|summary|        prediction|Yearly Amount Spent|         difference|
+-------+------------------+-------------------+-------------------+
|  count|               140|                140|                140|
|   mean| 507.0939265007919|  505.0664636224398| 2.0274628783518613|
| stddev| 70.58574827888118|  71.61208908607843|  9.878656777137072|
|    min|315.16673276205256|  304.1355915788555|-24.256367823362552|
|    max| 703.7530108495523|  700.9170916173961|  27.37718634776138|
+-------+------------------+-------------------+-------------------+

