In [1]:
from pyspark.sql import SparkSession
# Build (or reuse) the session from the SparkSession class itself. The previous
# form read `spark` before assigning it, which only works on platforms that
# pre-inject a `spark` global and raises NameError on a fresh kernel elsewhere.
spark = SparkSession.builder.appName('Expenditure').getOrCreate()

In [2]:
from pyspark.ml.regression import LinearRegression

In [3]:
# Load the e-commerce customers CSV: the first line is treated as the header
# and column types are inferred from the values.
data = spark.read.csv(
    '/FileStore/tables/Ecommerce_Customers.csv',
    header=True,
    inferSchema=True,
)

In [4]:
# Display a preview of the loaded data.
data.show()

In [5]:
# Print the schema of `data` (types were inferred at load time via inferSchema=True).
data.printSchema()

In [6]:
# Print every field of the first record on its own line.
first_row = data.head(1)[0]
for value in first_row:
    print(value)

In [7]:
# Set up the DataFrame for machine learning
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler


In [8]:
# Column names available in `data`.
data.columns

In [9]:
# Only the numerical columns go into the feature vector; categorical data will
# not work here, so it is left out.
feature_columns = [
    'Avg Session Length',
    'Time on App',
    'Time on Website',
    'Length of Membership',
]
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

In [10]:
# Run the assembler over `data`; its configured output column is 'features'.
output = assembler.transform(data)

In [11]:
# Inspect the schema of the assembled DataFrame.
output.printSchema()

In [12]:
# Peek at the first row of `output`.
output.head(1)

In [13]:
# Keep only the feature vector and the 'Yearly Amount Spent' column, which is
# used as the label when the regression is configured.
final_data = output.select('features','Yearly Amount Spent')

In [14]:
# Preview the two-column modelling DataFrame.
final_data.show()

In [15]:
# Split the rows roughly 70/30 into training and test sets. A fixed seed keeps
# the split (and therefore every metric computed below) repeatable when the
# notebook is re-run on the same data; without it each run samples differently.
SPLIT_SEED = 42
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=SPLIT_SEED)

In [16]:
# Summary statistics for the training split.
train_data.describe().show()

In [17]:
# Summary statistics for the test split.
test_data.describe().show()

In [18]:
# Configure the linear regression estimator: which column holds the feature
# vector, which holds the label, and where predictions are written.
lr = LinearRegression(
    featuresCol='features',
    labelCol='Yearly Amount Spent',
    predictionCol='predictions',
)

In [19]:
# Fit the linear regression on the training split.
lr_model = lr.fit(train_data)

In [20]:
# Evaluate the fitted model on the held-out test split.
test_results = lr_model.evaluate(test_data)

In [21]:
# Residuals: the difference between the actual value and the predicted value
# for each row of the test data.
test_results.residuals.show()

In [22]:
# Root mean squared error of the model on the test split.
test_results.rootMeanSquaredError

In [23]:
# R-squared of the model on the test split.
test_results.r2

In [24]:
# Summary statistics for `final_data`.
final_data.describe().show()

In [25]:
# Deploy the model.
# Simulate unseen data: keep only the feature vector from the test split, so
# the yearly expenditure (the label) is not available to the model.
unlabeled_data = test_data.select('features')
unlabeled_data.show()

In [26]:
# Score the unlabeled rows with the fitted model and display the result
# (the estimator was configured with predictionCol='predictions').
predictions = lr_model.transform(unlabeled_data)
predictions.show()