In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression
import os

# Relative path to the directory that holds the input CSV for this notebook.
data_dir = '../../data/Spark_ML/Linear_Regression/'

In [None]:
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('lr_example')
    .getOrCreate()
)

# Load the customers CSV: the first row supplies the column names and the
# column types are inferred from the data.
data = (
    spark.read
    .option('header', True)
    .option('inferSchema', True)
    .csv(os.path.join(data_dir, 'Ecommerce_Customers.csv'))
)

In [None]:
# Print the column names and the types that were inferred when the CSV was read.
data.printSchema()

In [None]:
# Peek at the first record to sanity-check the parsed values.
data.head(1)

## Transform the CSV data into the MLlib data format

In [None]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

# Pack the four numeric predictor columns into a single vector column named
# 'features', which is the input column the regression below is fitted on.
assembler = VectorAssembler(
    outputCol='features',
    inputCols=[
        'Avg Session Length',
        'Time on App',
        'Time on Website',
        'Length of Membership',
    ],
)

# `output` keeps every original column and adds the assembled 'features' column.
output = assembler.transform(data)

# Keep only what the model needs: the feature vector and the label column.
final_data = output.select('features', 'Yearly Amount Spent')

In [None]:
# 70/30 train/test split. The seed is fixed so that re-running the notebook on
# the same input gives the same split, and therefore comparable metrics below;
# without it every run samples a different partition of the rows.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)

## Train and Evaluate the model

In [None]:
# Configure the estimator: predict 'Yearly Amount Spent' from the assembled
# 'features' vector (spelled out here; it is also the estimator's default).
lr = LinearRegression(
    featuresCol='features',
    labelCol='Yearly Amount Spent',
)

# Fit on the training split only; the test split is held out for evaluation.
lr_model = lr.fit(train_data)

In [None]:
# Score the fitted model against the held-out test split.
test_results = lr_model.evaluate(test_data)

# Show the first 20 residuals for the test rows.
test_results.residuals.show(n=20)

In [None]:
# Summary metrics on the test split, one per line: RMSE first, then R^2.
print(
    test_results.rootMeanSquaredError,
    test_results.r2,
    sep='\n',
)

In [None]:
# Drop the label column to mimic unseen data that only has features.
unlabelled_data = test_data.select(['features'])

# Apply the fitted model to the feature-only rows.
predictions = lr_model.transform(unlabelled_data)

# Display the first 20 predicted rows.
predictions.show(n=20)