In [7]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('linear-regression-project').getOrCreate()


In [8]:
from pyspark.ml.regression import LinearRegression

In [4]:
df = spark.read.csv('cruise_ship_info.csv', inferSchema=True, header=True)

In [5]:
df.show()

+-----------+-----------+---+------------------+----------+------+------+-----------------+----+
|  Ship_name|Cruise_line|Age|           Tonnage|passengers|length|cabins|passenger_density|crew|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+
|    Journey|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|
|      Quest|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|
|Celebration|   Carnival| 26|            47.262|     14.86|  7.22|  7.43|             31.8| 6.7|
|   Conquest|   Carnival| 11|             110.0|     29.74|  9.53| 14.88|            36.99|19.1|
|    Destiny|   Carnival| 17|           101.353|     26.42|  8.92| 13.21|            38.36|10.0|
|    Ecstasy|   Carnival| 22|            70.367|     20.52|  8.55|  10.2|            34.29| 9.2|
|    Elation|   Carnival| 15|            70.367|     20.52|  8.55|  10.2|            34.29| 9.2|
|    Fantasy|   Carnival| 23| 

In [6]:
df.printSchema()

root
 |-- Ship_name: string (nullable = true)
 |-- Cruise_line: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tonnage: double (nullable = true)
 |-- passengers: double (nullable = true)
 |-- length: double (nullable = true)
 |-- cabins: double (nullable = true)
 |-- passenger_density: double (nullable = true)
 |-- crew: double (nullable = true)



In [13]:
from pyspark.ml.feature import StringIndexer

In [16]:
indexer = StringIndexer(inputCol='Cruise_line', outputCol='Cruise_index').fit(df)

In [18]:
new_df = indexer.transform(df)

In [19]:
new_df.show()

+-----------+-----------+---+------------------+----------+------+------+-----------------+----+------------+
|  Ship_name|Cruise_line|Age|           Tonnage|passengers|length|cabins|passenger_density|crew|Cruise_index|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+------------+
|    Journey|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|        16.0|
|      Quest|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|        16.0|
|Celebration|   Carnival| 26|            47.262|     14.86|  7.22|  7.43|             31.8| 6.7|         1.0|
|   Conquest|   Carnival| 11|             110.0|     29.74|  9.53| 14.88|            36.99|19.1|         1.0|
|    Destiny|   Carnival| 17|           101.353|     26.42|  8.92| 13.21|            38.36|10.0|         1.0|
|    Ecstasy|   Carnival| 22|            70.367|     20.52|  8.55|  10.2|            34.29| 9.2|         1.0|
|    Elati

In [20]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [22]:
new_df.columns

['Ship_name',
 'Cruise_line',
 'Age',
 'Tonnage',
 'passengers',
 'length',
 'cabins',
 'passenger_density',
 'crew',
 'Cruise_index']

In [23]:
features_list = ['Cruise_index','Age','Tonnage','passengers','length','cabins','passenger_density']

In [27]:
assembler = VectorAssembler(inputCols=features_list, outputCol='features')

In [28]:
output = assembler.transform(new_df)

In [30]:
output.select('features').show()

+--------------------+
|            features|
+--------------------+
|[16.0,6.0,30.2769...|
|[16.0,6.0,30.2769...|
|[1.0,26.0,47.262,...|
|[1.0,11.0,110.0,2...|
|[1.0,17.0,101.353...|
|[1.0,22.0,70.367,...|
|[1.0,15.0,70.367,...|
|[1.0,23.0,70.367,...|
|[1.0,19.0,70.367,...|
|[1.0,6.0,110.2389...|
|[1.0,10.0,110.0,2...|
|[1.0,28.0,46.052,...|
|[1.0,18.0,70.367,...|
|[1.0,17.0,70.367,...|
|[1.0,11.0,86.0,21...|
|[1.0,8.0,110.0,29...|
|[1.0,9.0,88.5,21....|
|[1.0,15.0,70.367,...|
|[1.0,12.0,88.5,21...|
|[1.0,20.0,70.367,...|
+--------------------+
only showing top 20 rows



In [31]:
final_data = output.select(['features','crew'])

In [32]:
final_data.show()

+--------------------+----+
|            features|crew|
+--------------------+----+
|[16.0,6.0,30.2769...|3.55|
|[16.0,6.0,30.2769...|3.55|
|[1.0,26.0,47.262,...| 6.7|
|[1.0,11.0,110.0,2...|19.1|
|[1.0,17.0,101.353...|10.0|
|[1.0,22.0,70.367,...| 9.2|
|[1.0,15.0,70.367,...| 9.2|
|[1.0,23.0,70.367,...| 9.2|
|[1.0,19.0,70.367,...| 9.2|
|[1.0,6.0,110.2389...|11.5|
|[1.0,10.0,110.0,2...|11.6|
|[1.0,28.0,46.052,...| 6.6|
|[1.0,18.0,70.367,...| 9.2|
|[1.0,17.0,70.367,...| 9.2|
|[1.0,11.0,86.0,21...| 9.3|
|[1.0,8.0,110.0,29...|11.6|
|[1.0,9.0,88.5,21....|10.3|
|[1.0,15.0,70.367,...| 9.2|
|[1.0,12.0,88.5,21...| 9.3|
|[1.0,20.0,70.367,...| 9.2|
+--------------------+----+
only showing top 20 rows



In [33]:
train_df,test_df = final_data.randomSplit([0.7,0.3])

In [34]:
train_df.describe().show()
test_df.describe().show()

+-------+------------------+
|summary|              crew|
+-------+------------------+
|  count|               110|
|   mean| 7.451545454545459|
| stddev|3.4498408915215237|
|    min|               0.6|
|    max|              19.1|
+-------+------------------+

+-------+------------------+
|summary|              crew|
+-------+------------------+
|  count|                48|
|   mean| 8.579374999999997|
| stddev|3.5350027272853546|
|    min|              0.59|
|    max|              21.0|
+-------+------------------+



In [35]:
lr = LinearRegression(labelCol='crew')

In [36]:
lrModel = lr.fit(train_df)

In [57]:
lrModel.intercept

-0.6558868944107011

In [38]:
lrModel.coefficients

DenseVector([0.0283, -0.014, 0.0033, -0.1215, 0.3429, 0.8623, 0.0025])

In [58]:
test_results = lrModel.evaluate(test_df)


In [60]:
test_results.rootMeanSquaredError

1.0118776098629083

In [64]:
train_df.describe().show()

+-------+------------------+
|summary|              crew|
+-------+------------------+
|  count|               110|
|   mean| 7.451545454545459|
| stddev|3.4498408915215237|
|    min|               0.6|
|    max|              19.1|
+-------+------------------+



In [44]:
test_results.residuals.show()

+--------------------+
|           residuals|
+--------------------+
| 0.11904806165468784|
| -1.0556943008734176|
| -0.4086919418087849|
|  -1.036994480442722|
| -0.6091555888177176|
|  -0.765322058993827|
| -0.9543071350735843|
| -0.5819278505411081|
| -0.5713648402808946|
| 0.28379663273029543|
|  0.4876076954491264|
| -0.8011150261766922|
|  0.5435748914489569|
|  0.5855502884488288|
| -0.6953599819847014|
|-0.09493928384953776|
| 0.19302191194964635|
| 0.15085870909550803|
|  0.5247316619000912|
|-0.14872796968935997|
+--------------------+
only showing top 20 rows



In [45]:
test_results.rootMeanSquaredError

1.0118776098629083

In [46]:
test_results.r2

0.91632035183097

In [47]:
final_data.show()

+--------------------+----+
|            features|crew|
+--------------------+----+
|[16.0,6.0,30.2769...|3.55|
|[16.0,6.0,30.2769...|3.55|
|[1.0,26.0,47.262,...| 6.7|
|[1.0,11.0,110.0,2...|19.1|
|[1.0,17.0,101.353...|10.0|
|[1.0,22.0,70.367,...| 9.2|
|[1.0,15.0,70.367,...| 9.2|
|[1.0,23.0,70.367,...| 9.2|
|[1.0,19.0,70.367,...| 9.2|
|[1.0,6.0,110.2389...|11.5|
|[1.0,10.0,110.0,2...|11.6|
|[1.0,28.0,46.052,...| 6.6|
|[1.0,18.0,70.367,...| 9.2|
|[1.0,17.0,70.367,...| 9.2|
|[1.0,11.0,86.0,21...| 9.3|
|[1.0,8.0,110.0,29...|11.6|
|[1.0,9.0,88.5,21....|10.3|
|[1.0,15.0,70.367,...| 9.2|
|[1.0,12.0,88.5,21...| 9.3|
|[1.0,20.0,70.367,...| 9.2|
+--------------------+----+
only showing top 20 rows



In [50]:
prediction_data = lrModel.transform(final_data).show()

+--------------------+----+------------------+
|            features|crew|        prediction|
+--------------------+----+------------------+
|[16.0,6.0,30.2769...|3.55| 4.174986181801763|
|[16.0,6.0,30.2769...|3.55| 4.174986181801763|
|[1.0,26.0,47.262,...| 6.7| 6.320128124453644|
|[1.0,11.0,110.0,2...|19.1|12.155432607182151|
|[1.0,17.0,101.353...|10.0|10.801115026176692|
|[1.0,22.0,70.367,...| 9.2|  8.61444971155117|
|[1.0,15.0,70.367,...| 9.2| 8.712392304550873|
|[1.0,23.0,70.367,...| 9.2|  8.61268891400176|
|[1.0,19.0,70.367,...| 9.2| 8.656425108551042|
|[1.0,6.0,110.2389...|11.5|11.310029068753321|
|[1.0,10.0,110.0,2...|11.6|12.153944252541192|
|[1.0,28.0,46.052,...| 6.6| 6.199873098690966|
|[1.0,18.0,70.367,...| 9.2| 8.670416907551001|
|[1.0,17.0,70.367,...| 9.2| 8.684408706550958|
|[1.0,11.0,86.0,21...| 9.3| 9.480125700156245|
|[1.0,8.0,110.0,29...|11.6|12.181927850541108|
|[1.0,9.0,88.5,21....|10.3| 9.519274033570783|
|[1.0,15.0,70.367,...| 9.2| 8.712392304550873|
|[1.0,12.0,88

In [66]:
from pyspark.sql.functions import corr

In [69]:
for column in df.columns:
    df.select(corr('crew',column)).show()

+---------------------+
|corr(crew, Ship_name)|
+---------------------+
|                  NaN|
+---------------------+

+-----------------------+
|corr(crew, Cruise_line)|
+-----------------------+
|                   null|
+-----------------------+

+-------------------+
|    corr(crew, Age)|
+-------------------+
|-0.5306565039638852|
+-------------------+

+-------------------+
|corr(crew, Tonnage)|
+-------------------+
|  0.927568811544939|
+-------------------+

+----------------------+
|corr(crew, passengers)|
+----------------------+
|    0.9152341306065384|
+----------------------+

+------------------+
|corr(crew, length)|
+------------------+
|0.8958566271016579|
+------------------+

+------------------+
|corr(crew, cabins)|
+------------------+
|0.9508226063578497|
+------------------+

+-----------------------------+
|corr(crew, passenger_density)|
+-----------------------------+
|         -0.15550928421699717|
+-----------------------------+

+----------------+
|corr(cr