In [1]:
pip install pyspark

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pyspark

In [3]:
# Import the LinearRegression estimator from PySpark's ML regression module.
from pyspark.ml.regression import LinearRegression

In [4]:
from pyspark.sql import SparkSession

# Get the active Spark session, or create one under the application
# name 'Customers' if none exists yet.
spark = (
    SparkSession.builder
    .appName('Customers')
    .getOrCreate()
)

In [5]:
# Load the customer CSV: the first row supplies the column names and the
# column types are inferred from the data.
dataset = spark.read.csv(
    "e-commerce.csv",
    header=True,
    inferSchema=True,
)

In [6]:
# Evaluating the bare name displays the DataFrame's repr (column names and types).
dataset

DataFrame[Email: string, Address: string, Avg Session Length: double, Time on App: double, Time on Website: double, Length of Membership: double, Yearly Amount Spent: double]

In [7]:
# Print a preview of the loaded rows as a text table.
dataset.show()

+--------------------+--------------------+------------------+-----------+---------------+--------------------+-------------------+
|               Email|             Address|Avg Session Length|Time on App|Time on Website|Length of Membership|Yearly Amount Spent|
+--------------------+--------------------+------------------+-----------+---------------+--------------------+-------------------+
|mstephenson@ferna...|835 Frank TunnelW...|       34.49726773|12.65565115|    39.57766802|         4.082620633|         587.951054|
|   hduke@hotmail.com|4547 Archer Commo...|       31.92627203|11.10946073|    37.26895887|         2.664034182|        392.2049334|
|    pallen@yahoo.com|24645 Valerie Uni...|       33.00091476|11.33027806|    37.11059744|         4.104543202|        487.5475049|
|riverarebecca@gma...|1414 David Throug...|       34.30555663|13.71751367|    36.72128268|         3.120178783|         581.852344|
|mstephens@davidso...|14023 Rodriguez P...|       33.33067252|12.79518855|  

In [8]:
# Print each column's name, inferred type and nullability as a tree.
dataset.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)



In [9]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [10]:
# Combine the four numeric predictor columns into a single vector column
# named "Independent Features".
featureassembler = VectorAssembler(
    inputCols=[
        "Avg Session Length",
        "Time on App",
        "Time on Website",
        "Length of Membership",
    ],
    outputCol="Independent Features",
)

In [11]:
# Apply the assembler: the result keeps the original columns and adds the
# "Independent Features" vector column.
output = featureassembler.transform(dataset)

In [12]:
# Preview the assembled frame, which now carries the extra
# "Independent Features" column next to the original ones.
output.show()

+--------------------+--------------------+------------------+-----------+---------------+--------------------+-------------------+--------------------+
|               Email|             Address|Avg Session Length|Time on App|Time on Website|Length of Membership|Yearly Amount Spent|Independent Features|
+--------------------+--------------------+------------------+-----------+---------------+--------------------+-------------------+--------------------+
|mstephenson@ferna...|835 Frank TunnelW...|       34.49726773|12.65565115|    39.57766802|         4.082620633|         587.951054|[34.49726773,12.6...|
|   hduke@hotmail.com|4547 Archer Commo...|       31.92627203|11.10946073|    37.26895887|         2.664034182|        392.2049334|[31.92627203,11.1...|
|    pallen@yahoo.com|24645 Valerie Uni...|       33.00091476|11.33027806|    37.11059744|         4.104543202|        487.5475049|[33.00091476,11.3...|
|riverarebecca@gma...|1414 David Throug...|       34.30555663|13.71751367|    36.7

In [13]:
# Show only the assembled feature-vector column.
output.select("Independent Features").show()

+--------------------+
|Independent Features|
+--------------------+
|[34.49726773,12.6...|
|[31.92627203,11.1...|
|[33.00091476,11.3...|
|[34.30555663,13.7...|
|[33.33067252,12.7...|
|[33.87103788,12.0...|
|[32.0215955,11.36...|
|[32.73914294,12.3...|
|[33.9877729,13.38...|
|[31.93654862,11.8...|
|[33.99257277,13.3...|
|[33.87936082,11.5...|
|[29.53242897,10.9...|
|[33.19033404,12.9...|
|[32.38797585,13.1...|
|[30.73772037,12.6...|
|[32.1253869,11.73...|
|[32.33889932,12.0...|
|[32.18781205,14.7...|
|[32.61785606,13.9...|
+--------------------+
only showing top 20 rows



In [14]:
# List the column names of the assembled frame.
output.columns

['Email',
 'Address',
 'Avg Session Length',
 'Time on App',
 'Time on Website',
 'Length of Membership',
 'Yearly Amount Spent',
 'Independent Features']

In [15]:
# Keep just what the model needs: the customer identifier (Email), the
# assembled feature vector, and the regression target.
finalized_data = output.select(
    "Email",
    "Independent Features",
    "Yearly Amount Spent",
)

In [16]:
# Preview the identifier / feature-vector / target frame used for modelling.
finalized_data.show()

+--------------------+--------------------+-------------------+
|               Email|Independent Features|Yearly Amount Spent|
+--------------------+--------------------+-------------------+
|mstephenson@ferna...|[34.49726773,12.6...|         587.951054|
|   hduke@hotmail.com|[31.92627203,11.1...|        392.2049334|
|    pallen@yahoo.com|[33.00091476,11.3...|        487.5475049|
|riverarebecca@gma...|[34.30555663,13.7...|         581.852344|
|mstephens@davidso...|[33.33067252,12.7...|         599.406092|
|alvareznancy@luca...|[33.87103788,12.0...|        637.1024479|
|katherine20@yahoo...|[32.0215955,11.36...|        521.5721748|
|  awatkins@yahoo.com|[32.73914294,12.3...|        549.9041461|
|vchurch@walter-ma...|[33.9877729,13.38...|         570.200409|
|    bonnie69@lin.biz|[31.93654862,11.8...|        427.1993849|
|andrew06@peterson...|[33.99257277,13.3...|        492.6060127|
|ryanwerner@freema...|[33.87936082,11.5...|        522.3374046|
|   knelson@gmail.com|[29.53242897,10.9.

In [17]:
# Hold out about a quarter of the rows for testing (weights 0.75 / 0.25).
# A fixed seed is passed so the split, and every coefficient and metric
# derived from it, does not change each time the notebook is re-run.
# Note: outputs saved from an earlier unseeded run will differ slightly from
# what this seeded split produces.
train_data, test_data = finalized_data.randomSplit([0.75, 0.25], seed=42)

In [18]:
# Configure a linear regression on the assembled feature vector and fit it
# to the training split in one expression; `regressor` is the fitted model.
regressor = LinearRegression(
    featuresCol='Independent Features',
    labelCol='Yearly Amount Spent',
).fit(train_data)

In [19]:
# Display the fitted coefficients (four values for the four assembled inputs;
# presumably in the same order as the assembler's inputCols — confirm).
regressor.coefficients

DenseVector([26.0568, 38.6429, 0.5347, 61.2608])

In [20]:
# Display the intercept term of the fitted model.
regressor.intercept

-1063.8242821976241

In [21]:
# Evaluate the fitted model on the held-out split; the returned summary
# exposes the scored rows through its `.predictions` DataFrame.
pred_results = regressor.evaluate(test_data)

In [22]:
# Show up to 40 test rows with the actual target next to the model's prediction.
pred_results.predictions.show(40)

+--------------------+--------------------+-------------------+------------------+
|               Email|Independent Features|Yearly Amount Spent|        prediction|
+--------------------+--------------------+-------------------+------------------+
|   aaron89@gmail.com|[31.44744649,10.1...|        418.6027421| 425.9329544606769|
|alejandro75@hotma...|[32.18781205,14.7...|        452.3156755|456.88574416772644|
|alvaradoadam@jone...|[31.95490386,10.9...|        439.9978799|431.65210228731075|
|  amanda03@yahoo.com|[34.18818406,13.1...|         583.977802| 585.5464481093604|
|amandastanley@yah...|[32.09610899,10.8...|        375.3984554|375.38824047493176|
| angela25@walker.com|[34.50141785,12.4...|         584.105885| 581.8064784744436|
|annstone@hotmail.com|[34.60624245,11.7...|        402.1671222|424.34516317342286|
| arice@reynolds.info|[33.56647439,12.2...|        466.4211988|   458.67701727545|
|aschmidt@johnson.biz|[32.21292383,11.7...|        513.1531119| 513.3628582958729|
| ba

In [23]:
# RegressionEvaluator computes one metric at a time over a predictions DataFrame.
from pyspark.ml.evaluation import RegressionEvaluator

# Start with RMSE, comparing the true target column against the model's
# "prediction" column.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="Yearly Amount Spent",
    predictionCol="prediction",
)

# RMSE on the held-out predictions.
rmse = evaluator.evaluate(pred_results.predictions)
print("Root Mean Squared Error (RMSE):", rmse)

# Switch the same evaluator to R-squared and score the same predictions again.
evaluator.setMetricName("r2")
r2 = evaluator.evaluate(pred_results.predictions)
print("R-squared (R2):", r2)

Root Mean Squared Error (RMSE): 9.937363775412575
R-squared (R2): 0.9858223151933903


In [25]:
# Store the fitted model's coefficient vector under a second name and print it
# (the same values that `regressor.coefficients` displays).
# NOTE(review): these are raw regression coefficients, so their magnitudes
# presumably depend on each feature's scale — confirm before ranking features by them.
feature_importance = regressor.coefficients
print("Feature Importance:", feature_importance)

Feature Importance: [26.056759626690894,38.6428635875233,0.5346866475961004,61.26078730739352]
