In [1]:
from pyspark.sql import SparkSession

In [2]:
# Obtain the Spark session (created on first use, reused otherwise) that
# backs every DataFrame in this notebook.
spark = (
    SparkSession.builder
    .appName('cruise')
    .getOrCreate()
)

In [3]:
from pyspark.ml.regression import LinearRegression

In [4]:
# Load the cruise-ship CSV. The first row supplies the column names and the
# column types are inferred from the values instead of defaulting to strings.
data = spark.read.csv(
    '/FileStore/tables/cruise_ship_info.csv',
    header=True,
    inferSchema=True,
)

In [5]:
# Check the inferred column names and types before building features.
data.printSchema()

In [6]:
# Preview the first rows of the raw dataset.
data.show()

In [7]:
# Count ships per cruise line to see how the categorical column is distributed.
(
    data
    .groupby('Cruise_line')
    .count()
    .show()
)

In [8]:
from pyspark.ml.feature import StringIndexer

In [9]:
# Encode the string-valued 'Cruise_line' column as numeric category indices
# in a new 'Cruise_cat' column so it can be fed to the model as a feature.
indexer = StringIndexer(outputCol='Cruise_cat', inputCol='Cruise_line')

In [10]:
# Fit the indexer on the dataset, then apply the fitted model to the same
# data to add the 'Cruise_cat' column.
indexed = (
    indexer
    .fit(data)
    .transform(data)
)

In [11]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [12]:
# List the available columns to choose the assembler inputs from.
indexed.columns

In [13]:
# Pack the numeric predictors plus the indexed cruise line into a single
# vector column named 'features' for the regression below.
# NOTE(review): 'Cruise_cat' is a category index; a linear model will treat it
# as an ordered numeric value -- consider one-hot encoding it. TODO confirm.
assembler = VectorAssembler(
    outputCol='features',
    inputCols=[
        'Age', 'Tonnage', 'passengers', 'length',
        'cabins', 'passenger_density', 'Cruise_cat',
    ],
)

In [14]:
# Apply the assembler to add the 'features' vector column to the indexed data.
output = assembler.transform(indexed)

In [15]:
# Preview the assembled feature vector next to the 'crew' label.
output.select(['features', 'crew']).show()

In [16]:
# Keep only the two columns the model needs: the feature vector and the label.
final_data = output.select('features', 'crew')

In [17]:
# Preview the modelling dataset (feature vector + 'crew' label).
final_data.show()

In [18]:
# Split into roughly 70% training / 30% test rows. The fixed seed keeps the
# split repeatable when the notebook is re-run on the same data, so the
# metrics reported below can be compared between runs.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)

In [19]:
# Summary statistics for the training split (sanity-check its size and range).
train_data.describe().show()

In [20]:
# Summary statistics for the test split, for comparison with the training split.
test_data.describe().show()

In [21]:
# Linear regression predicting 'crew' from the assembled 'features' vector
# (the feature column is spelled out here; it matches the estimator default).
lr = LinearRegression(featuresCol='features', labelCol='crew')

In [22]:
# Fit the regression on the training split only.
lr_model = lr.fit(train_data)

In [23]:
# Evaluate the fitted model on the held-out test split; the returned summary
# exposes the residuals, RMSE and R^2 inspected in the following cells.
test_results = lr_model.evaluate(test_data)

In [24]:
test_results.residuals.show()  # residual = label - prediction (actual minus predicted)

In [25]:
# Root-mean-squared error on the test split, in the same units as 'crew'.
test_results.rootMeanSquaredError

In [26]:
# R^2 (coefficient of determination) on the test split.
test_results.r2


In [27]:
 final_data.describe().show()

In [28]:
# Drop the label to mimic scoring data for which 'crew' is unknown.
unlabeled_data = test_data.select('features')

In [29]:
# Preview the feature-only rows that will be scored.
unlabeled_data.show()

In [30]:
# Score the unlabeled rows with the fitted model.
predictions = lr_model.transform(unlabeled_data)

In [31]:
# Inspect the model output for the unlabeled rows.
predictions.show()

In [32]:
from pyspark.sql.functions import corr

In [33]:
# Correlation between crew size and passenger count in the raw data.
data.select(corr('crew','passengers')).show()

In [34]:
# Correlation between crew size and number of cabins in the raw data.
data.select(corr('crew','cabins')).show()