In [2]:
import findspark
findspark.init() 
from pyspark.sql import SparkSession

In [3]:
spark = SparkSession.builder.appName('lr').getOrCreate()

In [11]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler, StringIndexer

In [5]:
cruise_ship = spark.read.csv('file:///home/manohar_l/Manohar_Personal_Projects/cruise_ship_info.csv', inferSchema=True,header=True)

In [6]:
cruise_ship.printSchema()

root
 |-- Ship_name: string (nullable = true)
 |-- Cruise_line: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tonnage: double (nullable = true)
 |-- passengers: double (nullable = true)
 |-- length: double (nullable = true)
 |-- cabins: double (nullable = true)
 |-- passenger_density: double (nullable = true)
 |-- crew: double (nullable = true)



In [14]:
indexer = StringIndexer(inputCol='Cruise_line', outputCol='cruise_indexer')
cruise_indexed = indexer.fit(cruise_ship).transform(cruise_ship)
cruise_indexed.show()

+-----------+-----------+---+------------------+----------+------+------+-----------------+----+--------------+
|  Ship_name|Cruise_line|Age|           Tonnage|passengers|length|cabins|passenger_density|crew|cruise_indexer|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+--------------+
|    Journey|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|          16.0|
|      Quest|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|          16.0|
|Celebration|   Carnival| 26|            47.262|     14.86|  7.22|  7.43|             31.8| 6.7|           1.0|
|   Conquest|   Carnival| 11|             110.0|     29.74|  9.53| 14.88|            36.99|19.1|           1.0|
|    Destiny|   Carnival| 17|           101.353|     26.42|  8.92| 13.21|            38.36|10.0|           1.0|
|    Ecstasy|   Carnival| 22|            70.367|     20.52|  8.55|  10.2|            34.29| 9.2|        

In [15]:
cruise_indexed.head(1)

[Row(Ship_name='Journey', Cruise_line='Azamara', Age=6, Tonnage=30.276999999999997, passengers=6.94, length=5.94, cabins=3.55, passenger_density=42.64, crew=3.55, cruise_indexer=16.0)]

In [55]:
cassembler = VectorAssembler(inputCols=['Tonnage','passengers','length','cabins','passenger_density','cruise_indexer'],
                            outputCol='features')

In [56]:
cruise = cassembler.transform(cruise_indexed)

In [57]:
cruise.head(1)

[Row(Ship_name='Journey', Cruise_line='Azamara', Age=6, Tonnage=30.276999999999997, passengers=6.94, length=5.94, cabins=3.55, passenger_density=42.64, crew=3.55, cruise_indexer=16.0, features=DenseVector([30.277, 6.94, 5.94, 3.55, 42.64, 16.0]))]

In [58]:
cruise_final = cruise.select('features','crew')

In [32]:
cruise_final.show(truncate = False)

+----------------------------------------------+----+
|features                                      |crew|
+----------------------------------------------+----+
|[30.276999999999997,6.94,5.94,3.55,42.64,16.0]|3.55|
|[30.276999999999997,6.94,5.94,3.55,42.64,16.0]|3.55|
|[47.262,14.86,7.22,7.43,31.8,1.0]             |6.7 |
|[110.0,29.74,9.53,14.88,36.99,1.0]            |19.1|
|[101.353,26.42,8.92,13.21,38.36,1.0]          |10.0|
|[70.367,20.52,8.55,10.2,34.29,1.0]            |9.2 |
|[70.367,20.52,8.55,10.2,34.29,1.0]            |9.2 |
|[70.367,20.56,8.55,10.22,34.23,1.0]           |9.2 |
|[70.367,20.52,8.55,10.2,34.29,1.0]            |9.2 |
|[110.23899999999999,37.0,9.51,14.87,29.79,1.0]|11.5|
|[110.0,29.74,9.51,14.87,36.99,1.0]            |11.6|
|[46.052,14.52,7.27,7.26,31.72,1.0]            |6.6 |
|[70.367,20.52,8.55,10.2,34.29,1.0]            |9.2 |
|[70.367,20.52,8.55,10.2,34.29,1.0]            |9.2 |
|[86.0,21.24,9.63,10.62,40.49,1.0]             |9.3 |
|[110.0,29.74,9.51,14.87,36.

In [66]:
train_data, test_data = cruise_final.randomSplit([0.7,0.3])

In [60]:
train_data.describe().show()
test_data.describe().show()

+-------+------------------+
|summary|              crew|
+-------+------------------+
|  count|               113|
|   mean| 7.440000000000001|
| stddev|3.6000243054735055|
|    min|              0.59|
|    max|              21.0|
+-------+------------------+

+-------+------------------+
|summary|              crew|
+-------+------------------+
|  count|                45|
|   mean| 8.683555555555555|
| stddev|3.1107381143647586|
|    min|              3.24|
|    max|              19.1|
+-------+------------------+



In [61]:
lr = LinearRegression(labelCol='crew', featuresCol='features')

In [67]:
lr_cruise = lr.fit(train_data)

In [68]:
cruise_test = lr_cruise.evaluate(test_data)

In [69]:
cruise_test.rootMeanSquaredError

0.6401610529587018

In [70]:
cruise_test.r2

0.9605099691487997

In [40]:
from pyspark.sql.functions import corr

In [43]:
cruise_ship.select(corr('crew','Age')).show()
cruise_ship.select(corr('crew','Tonnage')).show()
cruise_ship.select(corr('crew','passengers')).show()
cruise_ship.select(corr('crew','cabins')).show()
cruise_ship.select(corr('crew','length')).show()
cruise_ship.select(corr('crew','passenger_density')).show()

+-------------------+
|    corr(crew, Age)|
+-------------------+
|-0.5306565039638852|
+-------------------+

+-------------------+
|corr(crew, Tonnage)|
+-------------------+
|  0.927568811544939|
+-------------------+

+----------------------+
|corr(crew, passengers)|
+----------------------+
|    0.9152341306065384|
+----------------------+

+------------------+
|corr(crew, cabins)|
+------------------+
|0.9508226063578497|
+------------------+

+------------------+
|corr(crew, length)|
+------------------+
|0.8958566271016579|
+------------------+

+-----------------------------+
|corr(crew, passenger_density)|
+-----------------------------+
|         -0.15550928421699717|
+-----------------------------+

