In [1]:
from pyspark.sql import SparkSession

In [2]:
# Start (or reuse) the Spark session that every later cell works through.
spark = (
    SparkSession.builder
    .appName('consult_ship')
    .getOrCreate()
)

## Import cruise ship data

In [3]:
# Load the cruise ship CSV; the first row supplies the column names and
# Spark infers each column's type from the data.
ship_info = spark.read.csv(
    'cruise_ship_info.csv',
    inferSchema=True,
    header=True,
)

In [4]:
# Sanity check: number of rows read from the CSV.
ship_info.count()

158

In [5]:
# Preview the first 5 rows to check column names and parsed values.
ship_info.show(5)

+-----------+-----------+---+------------------+----------+------+------+-----------------+----+
|  Ship_name|Cruise_line|Age|           Tonnage|passengers|length|cabins|passenger_density|crew|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+
|    Journey|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|
|      Quest|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|
|Celebration|   Carnival| 26|            47.262|     14.86|  7.22|  7.43|             31.8| 6.7|
|   Conquest|   Carnival| 11|             110.0|     29.74|  9.53| 14.88|            36.99|19.1|
|    Destiny|   Carnival| 17|           101.353|     26.42|  8.92| 13.21|            38.36|10.0|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+
only showing top 5 rows



## Create data set with necessary features

In [6]:
from pyspark.ml.feature import StringIndexer

# Encode the string column Cruise_line as a numeric index column so it can be
# placed in the feature vector. The index is an arbitrary label id, not a
# meaningful quantity (by default StringIndexer orders labels by frequency --
# confirm against the StringIndexer docs for the Spark version in use).
indexer = StringIndexer(
    inputCol='Cruise_line',
    outputCol='CruiseLineInd',
)
indexer_model = indexer.fit(ship_info)
ship_info_CruiseLineInd = indexer_model.transform(ship_info)

In [7]:
# Preview the first 5 rows, now including the added CruiseLineInd column.
ship_info_CruiseLineInd.show(5)

+-----------+-----------+---+------------------+----------+------+------+-----------------+----+-------------+
|  Ship_name|Cruise_line|Age|           Tonnage|passengers|length|cabins|passenger_density|crew|CruiseLineInd|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+-------------+
|    Journey|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|         16.0|
|      Quest|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|         16.0|
|Celebration|   Carnival| 26|            47.262|     14.86|  7.22|  7.43|             31.8| 6.7|          1.0|
|   Conquest|   Carnival| 11|             110.0|     29.74|  9.53| 14.88|            36.99|19.1|          1.0|
|    Destiny|   Carnival| 17|           101.353|     26.42|  8.92| 13.21|            38.36|10.0|          1.0|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+-------------+
only showing top 5 rows

In [8]:
# Count how many different cruise lines appear in the data set.
distinct_cruise_lines = ship_info.select('Cruise_line').distinct()
distinct_cruise_lines.count()

20

## Group necessary features into single column 

In [9]:
from pyspark.ml.feature import VectorAssembler

# Input columns for the model; 'crew' is left out because it is the label.
cols = [
    'CruiseLineInd',
    'Age',
    'Tonnage',
    'passengers',
    'length',
    'cabins',
    'passenger_density',
]
# Packs the columns above into a single vector column named 'features'.
assembler = VectorAssembler(
    inputCols=cols,
    outputCol='features',
)

In [10]:
# Build the feature vector, then keep only the label ('crew') and the
# assembled 'features' column for modelling.
data = (
    assembler
    .transform(ship_info_CruiseLineInd)
    .select('crew', 'features')
)

In [11]:
# Preview the first 5 rows of the label / feature-vector pairs.
data.show(5)

+----+--------------------+
|crew|            features|
+----+--------------------+
|3.55|[16.0,6.0,30.2769...|
|3.55|[16.0,6.0,30.2769...|
| 6.7|[1.0,26.0,47.262,...|
|19.1|[1.0,11.0,110.0,2...|
|10.0|[1.0,17.0,101.353...|
+----+--------------------+
only showing top 5 rows



## Split the data

In [14]:
# Split roughly 70% / 30% into training and test sets.
# A fixed seed makes the split -- and therefore every metric computed from
# it -- repeatable across kernel restarts; without it each run trains and
# evaluates on different rows. Outputs recorded from an earlier unseeded run
# will not match a fresh run exactly.
# The split is purely random (not stratified by cruise line).
train_data, test_data = data.randomSplit([0.7, 0.3], seed=42)

## Build a model

In [13]:
from pyspark.ml.regression import LinearRegression

In [15]:
# Linear regression predicting 'crew' from the assembled 'features' vector,
# fitted on the training split only.
lr = LinearRegression(
    featuresCol='features',
    labelCol='crew',
)
lr_model = lr.fit(train_data)

In [23]:
# Report fit quality on the training data, then show a few predictions and
# summary statistics of the label next to the prediction.
train_summary = lr_model.summary
train_predictions = train_summary.predictions

print('Train RMSE: {}'.format(train_summary.rootMeanSquaredError))
print('Train R2: {}'.format(train_summary.r2))
print('\n')
train_predictions.show(5)
train_predictions.describe().show()

Train RMSE: 1.0145250239232344
Train R2: 0.9246723625491197


+----+--------------------+-------------------+
|crew|            features|         prediction|
+----+--------------------+-------------------+
|0.59|[8.0,22.0,3.341,0...| 0.4843875118277927|
|0.59|[8.0,22.0,3.341,0...|0.48848073876214637|
| 0.6|[6.0,12.0,2.329,0...|  0.459919640469046|
|0.88|[14.0,25.0,5.35,1...| 1.4365964070407093|
|0.88|[14.0,27.0,5.35,1...| 1.3831875854900364|
+----+--------------------+-------------------+
only showing top 5 rows

+-------+-----------------+------------------+
|summary|             crew|        prediction|
+-------+-----------------+------------------+
|  count|              115|               115|
|   mean|7.620608695652175| 7.620608695652179|
| stddev|3.712633549362711|3.5700641790046306|
|    min|             0.59| 0.459919640469046|
|    max|             21.0|20.947382328835296|
+-------+-----------------+------------------+



## Evaluate the model

In [20]:
# Score the fitted model on the held-out test split.
test_results = lr_model.evaluate(test_data)

In [29]:
# Report the same metrics on the test split, for comparison with the
# training figures, followed by sample predictions and summary statistics.
test_predictions = test_results.predictions

print('Test RMSE: {}'.format(test_results.rootMeanSquaredError))
print('Test R2: {}'.format(test_results.r2))
print('\n')
test_predictions.show(5)
test_predictions.describe().show()

Test RMSE: 0.7815838419952341
Test R2: 0.9234221866694585


+----+--------------------+------------------+
|crew|            features|        prediction|
+----+--------------------+------------------+
| 1.8|[14.0,23.0,14.745...|2.8336980559223157|
| 2.1|[11.0,19.0,16.8,2...| 2.359252085704875|
|2.95|[11.0,13.0,25.0,3...|3.1275739587109967|
|3.73|[2.0,14.0,30.2769...|3.5529794388330656|
|3.85|[5.0,23.0,25.0,7....| 3.874175365633352|
+----+--------------------+------------------+
only showing top 5 rows

+-------+-----------------+------------------+
|summary|             crew|        prediction|
+-------+-----------------+------------------+
|  count|               43|                43|
|   mean|8.258372093023254| 8.485329564488934|
| stddev| 2.85781212537923|2.8345490577329913|
|    min|              1.8| 2.359252085704875|
|    max|             13.6|13.999458906457592|
+-------+-----------------+------------------+



## Note: the train_data / test_data split is purely random and does not preserve stratification of the data

## The model is ready to operate on unlabeled data!