In [79]:
# Locate the Spark installation so that the `pyspark` imports in later cells
# resolve.
# SPARK_HOME, when set and non-empty, takes precedence so the notebook is not
# tied to one machine's directory layout; the original hard-coded location is
# kept as the fallback, so behaviour is unchanged wherever SPARK_HOME is unset.
import os

import findspark

findspark.init(os.environ.get('SPARK_HOME') or '/home/nick/spark-3.0.1-bin-hadoop2.7')

In [80]:
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression
from pyspark.sql.functions import format_number

In [81]:
import numpy as np
# suppress=True makes NumPy print floats in fixed-point notation instead of
# scientific notation.
# NOTE(review): no cell visible in this notebook prints a NumPy array —
# confirm this setting (and the import) is still needed.
np.set_printoptions(suppress=True)

In [82]:
# Start (or reuse) the Spark session that every later cell works through.
spark = (
    SparkSession.builder
    .appName("E Commerce - Customer Data")
    .getOrCreate()
)

In [83]:
# Load the customer CSV: the first line supplies the column names and Spark
# infers each column's type from the values.
data = spark.read.csv(
    'Linear_Regression/Ecommerce_Customers.csv',
    header=True,
    inferSchema=True,
)

In [84]:
# Preview the first three rows to sanity-check the load.
data.show(3)

+--------------------+--------------------+---------+------------------+------------------+------------------+--------------------+-------------------+
|               Email|             Address|   Avatar|Avg Session Length|       Time on App|   Time on Website|Length of Membership|Yearly Amount Spent|
+--------------------+--------------------+---------+------------------+------------------+------------------+--------------------+-------------------+
|mstephenson@ferna...|835 Frank TunnelW...|   Violet| 34.49726772511229| 12.65565114916675| 39.57766801952616|  4.0826206329529615|  587.9510539684005|
|   hduke@hotmail.com|4547 Archer Commo...|DarkGreen| 31.92627202636016|11.109460728682564|37.268958868297744|    2.66403418213262|  392.2049334443264|
|    pallen@yahoo.com|24645 Valerie Uni...|   Bisque|33.000914755642675|11.330278057777512|37.110597442120856|   4.104543202376424| 487.54750486747207|
+--------------------+--------------------+---------+------------------+------------------+------------------+--------------------+-------------------+

In [85]:
# Check the column types chosen by inferSchema (three string columns, five doubles).
data.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)



In [86]:
# Print every field of the second row, one value per line.
second_row = data.head(2)[1]
for field in second_row:
    print(field)

hduke@hotmail.com
4547 Archer CommonDiazchester, CA 06566-8576
DarkGreen
31.92627202636016
11.109460728682564
37.268958868297744
2.66403418213262
392.2049334443264


In [87]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [88]:
# List the column names to pick the model inputs from.
data.columns

['Email',
 'Address',
 'Avatar',
 'Avg Session Length',
 'Time on App',
 'Time on Website',
 'Length of Membership',
 'Yearly Amount Spent']

In [89]:
# Combine the four numeric behaviour columns into a single vector column
# named 'features', which is the column the regression reads as its input.
feature_columns = [
    'Avg Session Length',
    'Time on App',
    'Time on Website',
    'Length of Membership',
]
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

In [90]:
# Apply the assembler: the result keeps every original column and adds the
# 'features' vector column.
output = assembler.transform(data)

In [91]:
# Verify that the 'features' vector column was appended to the schema.
output.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)
 |-- features: vector (nullable = true)



In [92]:
# Inspect one assembled row: 'features' holds the four inputs as a DenseVector.
output.head(1)

[Row(Email='mstephenson@fernandez.com', Address='835 Frank TunnelWrightmouth, MI 82180-9605', Avatar='Violet', Avg Session Length=34.49726772511229, Time on App=12.65565114916675, Time on Website=39.57766801952616, Length of Membership=4.0826206329529615, Yearly Amount Spent=587.9510539684005, features=DenseVector([34.4973, 12.6557, 39.5777, 4.0826]))]

In [93]:
# Keep only what the model needs: the assembled inputs and the label.
model_columns = ['features', 'Yearly Amount Spent']
final_data = output.select(*model_columns)

In [94]:
# Preview the two-column modelling frame.
final_data.show(5)

+--------------------+-------------------+
|            features|Yearly Amount Spent|
+--------------------+-------------------+
|[34.4972677251122...|  587.9510539684005|
|[31.9262720263601...|  392.2049334443264|
|[33.0009147556426...| 487.54750486747207|
|[34.3055566297555...|  581.8523440352177|
|[33.3306725236463...|  599.4060920457634|
+--------------------+-------------------+
only showing top 5 rows



In [95]:
# Hold out roughly 30% of the rows for evaluation and train on the rest.
# The seed pins the random split: without it every run trains and evaluates
# on different rows, so the row counts, RMSE and r2 shown in later cells drift
# from one "Restart & Run All" to the next.
# NOTE: the outputs currently saved in this notebook came from an unseeded
# run, so they will differ slightly after the first re-execution.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)

In [96]:
# Summary statistics of the label in the training split.
train_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                365|
|   mean| 499.35078009144166|
| stddev|  76.95461369963719|
|    min|   266.086340948469|
|    max|  765.5184619388373|
+-------+-------------------+



In [97]:
# Summary statistics of the label in the test split, for comparison with the
# training split.
test_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                135|
|   mean| 499.21469922903105|
| stddev|  85.67171162986604|
|    min| 256.67058229005585|
|    max|  725.5848140556806|
+-------+-------------------+



In [98]:
# Configure the regression: inputs come from the assembled 'features' column
# and the target is the yearly spend.
lr = LinearRegression(
    featuresCol='features',
    labelCol='Yearly Amount Spent',
)

In [99]:
# Fit the model on the training split only; the test split stays unseen.
lr_model = lr.fit(train_data)

In [100]:
# Score the fitted model on the held-out split; the returned summary is what
# the following cells read residuals, RMSE and r2 from.
test_results = lr_model.evaluate(test_data)

In [101]:
# Show the first 20 per-row residuals on the test split.
test_results.residuals.show()

+-------------------+
|          residuals|
+-------------------+
|  11.48060819116563|
| 12.123122282175586|
|  7.909290009914457|
| 0.5384559456755369|
|-2.5721265191468774|
| 23.505117022231843|
|  4.025628236027956|
|  19.30124050510659|
| -2.733542636397715|
| -8.482851720372935|
|-0.9730851054615641|
|  18.31649733281148|
|-1.0218251313723954|
| 3.0090319682427094|
|  9.364738276893206|
| -8.630960243731977|
| -4.335822782811761|
|  -7.90609277902422|
|  17.81691464652215|
| 1.3017699829117646|
+-------------------+
only showing top 20 rows



In [102]:
# Root mean squared error on the test split, in the label's units (dollars).
test_results.rootMeanSquaredError

9.826585627083926

In [103]:
# r2 on the test split: the model explains a lot of the variance
# (about 98.7% in the recorded run).
test_results.r2

0.9867456215000625

In [104]:
# Put the error in context: an RMSE of about $9.83 is small next to a mean
# yearly spend of about $499 and a standard deviation of about $79.
final_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                500|
|   mean|  499.3140382585909|
| stddev|   79.3147815497068|
|    min| 256.67058229005585|
|    max|  765.5184619388373|
+-------+-------------------+



In [105]:
# Drop the label to mimic new, unlabeled customers the model would score.
unlabeled_data = test_data.select('features')

In [106]:
# Inspect one unlabeled row: only the 'features' vector remains.
unlabeled_data.head(1)[0]

Row(features=DenseVector([29.5324, 10.9613, 37.4202, 4.0464]))

In [107]:
# Score the unlabeled rows; the result adds a 'prediction' column next to
# 'features'.
predictions = lr_model.transform(unlabeled_data)

In [108]:
# Show the first 20 predicted yearly amounts.
predictions.show()

+--------------------+------------------+
|            features|        prediction|
+--------------------+------------------+
|[29.5324289670579...|397.15974288146185|
|[30.7377203726281...| 449.6576199140543|
|[30.9716756438877...|486.72931974697826|
|[31.2606468698795...|420.78817531127584|
|[31.2681042107507...| 426.0426596929708|
|[31.2834474760581...| 568.2759724034356|
|[31.3091926408918...|428.69508960390567|
|[31.3123495994443...|444.29017752283403|
|[31.4252268808548...| 533.5002612911596|
|[31.5261978982398...|417.57737791271074|
|[31.5761319713222...| 542.1996690947899|
|[31.6098395733896...| 426.2290523182967|
|[31.7216523605090...|348.79875176324504|
|[31.7366356860502...| 493.9244142872892|
|[31.8512531286083...| 463.6275083899052|
|[31.8854062999117...| 398.7342332162075|
|[31.9673209478824...|  450.085664022464|
|[32.0085045178551...| 451.1033138077796|
|[32.0609143984100...| 609.7864040664929|
|[32.0705462209254...| 531.4500175989247|
+--------------------+------------------+

In [109]:
# Shut down the Spark session; cells above must be re-run from the session
# creation cell before any further Spark work.
spark.stop()