In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression
import os

# Relative path to the directory that contains the input CSV file.
data_dir = "../../data/Spark_ML/Linear_Regression/"

In [None]:
# Create the Spark session for this notebook (or reuse one that already exists).
spark = (
    SparkSession.builder
    .appName('lr_project')
    .getOrCreate()
)

# Load the cruise ship CSV: the first row supplies the column names and
# Spark infers each column's type from the data.
data = spark.read.csv(
    os.path.join(data_dir, 'cruise_ship_info.csv'),
    header=True,
    inferSchema=True,
)

# Print the resulting column names and inferred types.
data.printSchema()

In [None]:
from pyspark.ml.feature import StringIndexer

# Encode the string column "Cruise_line" as a numeric index column
# "Cruise_category" so it can be used as a model feature.
str_indexer = StringIndexer(
    inputCol="Cruise_line",
    outputCol="Cruise_category",
)

# Fit learns the label -> index mapping from the data; transform appends the new column.
str_indexer_model = str_indexer.fit(data)
new_data = str_indexer_model.transform(data)

# Confirm the new column is present in the schema.
new_data.printSchema()

# Display the first two rows as the cell output.
new_data.head(2)

## Transform the CSV data into the MLlib feature-vector format

In [None]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

# Combine the individual input columns into a single vector column named
# 'features'.
assembler = VectorAssembler(
    inputCols=[
        'Age',
        'Tonnage',
        'passengers',
        'length',
        'cabins',
        'Cruise_category',
    ],
    outputCol='features',
)
output = assembler.transform(new_data)

# Keep only the assembled feature vector and the 'crew' column used as the label.
final_data = output.select('features', 'crew')

In [None]:
# 70/30 train/test split. A fixed seed makes the split (and therefore every
# metric computed below) repeatable across Restart & Run All; without it each
# run samples a different partition of the rows.
# NOTE(review): repeatability also assumes the input rows are read in the same
# order/partitioning on each run — expected for a single local CSV, confirm otherwise.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)

## Train and evaluate

In [None]:
# Linear regression predicting 'crew' from the assembled 'features' vector.
# featuresCol is spelled out for readability; 'features' is also the default.
lr = LinearRegression(featuresCol='features', labelCol='crew')

# Fit the model on the training split only.
lr_model = lr.fit(train_data)

In [None]:
# Evaluate the fitted model on the held-out test split; the returned summary
# object exposes the error metrics and the residuals.
test_results = lr_model.evaluate(test_data)

# Show the residuals for the test rows.
test_results.residuals.show()

In [None]:
# Report the test-set error metrics, one per line.
print(
    'RMSE: {}'.format(test_results.rootMeanSquaredError),
    'MAE: {}'.format(test_results.meanAbsoluteError),
    'R2: {}'.format(test_results.r2),
    sep='\n',
)

# Summary statistics of 'crew' over the full dataset, shown so the error
# values above can be compared against the scale of the label.
(
    data.describe()
    .select("summary", "crew")
    .show()
)

In [None]:
# Drop the label column to mimic scoring data for which 'crew' is unknown.
unlabelled_data = test_data.select('features')

# transform() appends the model's prediction column to the input rows.
predictions = lr_model.transform(unlabelled_data)
predictions.show()