# Start the Spark session

In [3]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.1.1.tar.gz (212.3 MB)
Collecting py4j==0.10.9
  Downloading py4j-0.10.9-py2.py3-none-any.whl (198 kB)
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py): started
  Building wheel for pyspark (setup.py): finished with status 'done'
  Created wheel for pyspark: filename=pyspark-3.1.1-py2.py3-none-any.whl size=212767612 sha256=dad39467a2c129f384a344a1c23a096401da792d148cac9a6224bf9726d7bcab
  Stored in directory: c:\users\psubr\appdata\local\pip\cache\wheels\b3\0e\81\264aeed961e43b9f6ba9ec81c8c540d2d7dccc52c6b51cbf22
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9 pyspark-3.1.1


In [1]:
from pyspark.sql import SparkSession


In [2]:
# Obtain the Spark session for this notebook, registered under the app name 'Customer'.
spark = (
    SparkSession.builder
    .appName('Customer')
    .getOrCreate()
)


In [3]:
from pyspark.ml.regression import LinearRegression

In [4]:
# Read the customer CSV: first row supplies the column names, column types are inferred.
data = spark.read.csv(
    'ecommerse.csv',
    header=True,
    inferSchema=True,
)

In [5]:
# Bare expression: the notebook echoes the DataFrame repr (column names and their types).
data

DataFrame[Email: string, Address: string, Avg Session Length: double, Time on App: double, Time on Website: double, Length of Membership: double, Yearly Amount Spent: double]

# Display the DataFrame

In [6]:
# Print the leading rows as a text table; long string values are cut off with '...'.
data.show()

+--------------------+--------------------+------------------+-----------+---------------+--------------------+-------------------+
|               Email|             Address|Avg Session Length|Time on App|Time on Website|Length of Membership|Yearly Amount Spent|
+--------------------+--------------------+------------------+-----------+---------------+--------------------+-------------------+
|mstephenson@ferna...|835 Frank TunnelW...|       34.49726773|12.65565115|    39.57766802|         4.082620633|         587.951054|
|   hduke@hotmail.com|4547 Archer Commo...|       31.92627203|11.10946073|    37.26895887|         2.664034182|        392.2049334|
|    pallen@yahoo.com|24645 Valerie Uni...|       33.00091476|11.33027806|    37.11059744|         4.104543202|        487.5475049|
|riverarebecca@gma...|1414 David Throug...|       34.30555663|13.71751367|    36.72128268|         3.120178783|         581.852344|
|mstephens@davidso...|14023 Rodriguez P...|       33.33067252|12.79518855|  

# Display the schema (all columns and their types)

In [7]:
# Print every column's name, type, and nullability as a tree.
data.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)



In [8]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [9]:
# Combine the four numeric predictor columns into a single vector column,
# "Independent Features", which the regression below uses as its feature input.
featureassembler = VectorAssembler(
    inputCols=[
        "Avg Session Length",
        "Time on App",
        "Time on Website",
        "Length of Membership",
    ],
    outputCol="Independent Features",
)

In [11]:
# Apply the assembler: the result keeps the original columns and adds "Independent Features".
output=featureassembler.transform(data)

In [13]:
# Inspect the assembled frame, including the new vector column.
output.show()

+--------------------+--------------------+------------------+-----------+---------------+--------------------+-------------------+--------------------+
|               Email|             Address|Avg Session Length|Time on App|Time on Website|Length of Membership|Yearly Amount Spent|Independent Features|
+--------------------+--------------------+------------------+-----------+---------------+--------------------+-------------------+--------------------+
|mstephenson@ferna...|835 Frank TunnelW...|       34.49726773|12.65565115|    39.57766802|         4.082620633|         587.951054|[34.49726773,12.6...|
|   hduke@hotmail.com|4547 Archer Commo...|       31.92627203|11.10946073|    37.26895887|         2.664034182|        392.2049334|[31.92627203,11.1...|
|    pallen@yahoo.com|24645 Valerie Uni...|       33.00091476|11.33027806|    37.11059744|         4.104543202|        487.5475049|[33.00091476,11.3...|
|riverarebecca@gma...|1414 David Throug...|       34.30555663|13.71751367|    36.7

In [14]:
# Look at the assembled feature vectors on their own.
output.select("Independent Features").show()

+--------------------+
|Independent Features|
+--------------------+
|[34.49726773,12.6...|
|[31.92627203,11.1...|
|[33.00091476,11.3...|
|[34.30555663,13.7...|
|[33.33067252,12.7...|
|[33.87103788,12.0...|
|[32.0215955,11.36...|
|[32.73914294,12.3...|
|[33.9877729,13.38...|
|[31.93654862,11.8...|
|[33.99257277,13.3...|
|[33.87936082,11.5...|
|[29.53242897,10.9...|
|[33.19033404,12.9...|
|[32.38797585,13.1...|
|[30.73772037,12.6...|
|[32.1253869,11.73...|
|[32.33889932,12.0...|
|[32.18781205,14.7...|
|[32.61785606,13.9...|
+--------------------+
only showing top 20 rows



In [15]:

# List the column names available after assembling the features.
output.columns

['Email',
 'Address',
 'Avg Session Length',
 'Time on App',
 'Time on Website',
 'Length of Membership',
 'Yearly Amount Spent',
 'Independent Features']

In [16]:
# Keep only what the model needs: the feature vector and the target column.
All_data = output.select(
    "Independent Features",
    "Yearly Amount Spent",
)



In [18]:
# Preview the two-column modelling frame (features, target).
All_data.show()

+--------------------+-------------------+
|Independent Features|Yearly Amount Spent|
+--------------------+-------------------+
|[34.49726773,12.6...|         587.951054|
|[31.92627203,11.1...|        392.2049334|
|[33.00091476,11.3...|        487.5475049|
|[34.30555663,13.7...|         581.852344|
|[33.33067252,12.7...|         599.406092|
|[33.87103788,12.0...|        637.1024479|
|[32.0215955,11.36...|        521.5721748|
|[32.73914294,12.3...|        549.9041461|
|[33.9877729,13.38...|         570.200409|
|[31.93654862,11.8...|        427.1993849|
|[33.99257277,13.3...|        492.6060127|
|[33.87936082,11.5...|        522.3374046|
|[29.53242897,10.9...|        408.6403511|
|[33.19033404,12.9...|        573.4158673|
|[32.38797585,13.1...|        470.4527333|
|[30.73772037,12.6...|        461.7807422|
|[32.1253869,11.73...|        457.8476959|
|[32.33889932,12.0...|        407.7045475|
|[32.18781205,14.7...|        452.3156755|
|[32.61785606,13.9...|        605.0610388|
+----------

In [21]:
# Random 75/25 train/test split. The seed is fixed so that re-running the notebook
# yields the same split, and therefore the same fitted coefficients and predictions.
train_data, test_data = All_data.randomSplit([0.75, 0.25], seed=42)

In [32]:
# Preview the training split.
train_data.show()

+--------------------+-------------------+
|Independent Features|Yearly Amount Spent|
+--------------------+-------------------+
|[29.53242897,10.9...|        408.6403511|
|[30.39318454,11.8...|        319.9288698|
|[30.57436368,11.3...|        442.0644138|
|[30.73772037,12.6...|        461.7807422|
|[30.87948434,13.2...|           490.2066|
|[31.04722214,11.1...|        392.4973992|
|[31.06621816,11.7...|        448.9332932|
|[31.12397435,12.3...|        486.9470538|
|[31.26064687,13.2...|        421.3266313|
|[31.26810421,12.1...|        423.4705332|
|[31.30919264,11.9...|        432.7207178|
|[31.3123496,11.68...|         463.591418|
|[31.36621217,11.1...|        430.5888826|
|[31.42522688,13.2...|        530.7667187|
|[31.44597248,12.8...|        484.8769649|
|[31.44744649,10.1...|        418.6027421|
|[31.51473786,12.5...|         489.812488|
|[31.52575242,11.3...|        443.9656268|
|[31.5261979,12.04...|        409.0945262|
|[31.57413802,12.9...|        544.4092722|
+----------

# Fit a linear regression model to the training data

In [25]:

# Configure the estimator and fit it on the training split in one expression;
# `regressor` ends up bound to the fitted model, which the following cells inspect.
regressor = LinearRegression(
    featuresCol='Independent Features',
    labelCol='Yearly Amount Spent',
).fit(train_data)

In [26]:
# Echo the fitted model's repr (uid and number of features).
regressor

LinearRegressionModel: uid=LinearRegression_b5fb50ba107e, numFeatures=4

In [27]:
# Fitted coefficients as a vector with one entry per feature.
# NOTE(review): presumably in the same order as the assembler's inputCols; confirm.
regressor.coefficients

DenseVector([25.7202, 38.4509, 0.7253, 61.6194])

In [28]:
# Fitted intercept term of the linear model.
regressor.intercept

-1059.0800709288076

In [29]:
# Evaluate the fitted model on the held-out test split; the result is a summary object
# whose `predictions` attribute holds the scored rows.
pred_results=regressor.evaluate(test_data)

In [30]:
# Echo the summary object's repr (only its type and address, no metrics).
pred_results

<pyspark.ml.regression.LinearRegressionSummary at 0x1d909ed9c10>

# Display the predicted results

In [31]:

# Show up to 40 test rows with the actual target next to the model's `prediction` column.
pred_results.predictions.show(40)

+--------------------+-------------------+------------------+
|Independent Features|Yearly Amount Spent|        prediction|
+--------------------+-------------------+------------------+
|[30.4925367,11.56...|        282.4712457|287.19000827988657|
|[30.81620065,11.8...|        266.0863409|282.82947369819635|
|[30.83643267,13.1...|        467.5019004|  470.935044533931|
|[30.97167564,11.7...|        494.6386098| 487.5143245803199|
|[31.06132516,12.3...|        487.5554581|493.19950601119467|
|[31.12809005,13.2...|        557.2526867| 564.3017826786438|
|[31.1695068,13.97...|        427.3565308| 416.3733897090847|
|[31.28344748,12.7...|        591.7810894| 569.0485031490464|
|[31.35847719,12.8...|        495.1759504|490.68067877307226|
|[31.38958548,10.9...|        410.0696111| 409.9043030327905|
|[31.5171218,10.74...|        275.9184207| 280.8878562182533|
|[31.53160448,13.3...|        436.5156057|432.77582929985533|
|[31.57020083,13.3...|        545.9454921|  562.904382526194|
|[31.625