In [0]:
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression
spark = SparkSession.builder.appName('Lr2').getOrCreate()


In [0]:
data = spark.read.options(header=True, inferSchema=True).csv('/FileStore/tables/cruise_ship_info.csv')

In [0]:
data.show(n=5)

+-----------+-----------+---+------------------+----------+------+------+-----------------+----+
|  Ship_name|Cruise_line|Age|           Tonnage|passengers|length|cabins|passenger_density|crew|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+
|    Journey|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|
|      Quest|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|
|Celebration|   Carnival| 26|            47.262|     14.86|  7.22|  7.43|             31.8| 6.7|
|   Conquest|   Carnival| 11|             110.0|     29.74|  9.53| 14.88|            36.99|19.1|
|    Destiny|   Carnival| 17|           101.353|     26.42|  8.92| 13.21|            38.36|10.0|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+
only showing top 5 rows



In [0]:
data.printSchema()

root
 |-- Ship_name: string (nullable = true)
 |-- Cruise_line: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tonnage: double (nullable = true)
 |-- passengers: double (nullable = true)
 |-- length: double (nullable = true)
 |-- cabins: double (nullable = true)
 |-- passenger_density: double (nullable = true)
 |-- crew: double (nullable = true)



In [0]:
#converting cruise_line using StringIndexer
from pyspark.ml.feature import StringIndexer
index = StringIndexer(inputCol='Cruise_line',outputCol='cruise_line_indexed')
df = index.fit(data).transform(data)

In [0]:
df.columns

Out[10]: ['Ship_name',
 'Cruise_line',
 'Age',
 'Tonnage',
 'passengers',
 'length',
 'cabins',
 'passenger_density',
 'crew',
 'cruise_line_indexed']

In [0]:
df.show(n=5)

+-----------+-----------+---+------------------+----------+------+------+-----------------+----+-------------------+
|  Ship_name|Cruise_line|Age|           Tonnage|passengers|length|cabins|passenger_density|crew|cruise_line_indexed|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+-------------------+
|    Journey|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|               16.0|
|      Quest|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|               16.0|
|Celebration|   Carnival| 26|            47.262|     14.86|  7.22|  7.43|             31.8| 6.7|                1.0|
|   Conquest|   Carnival| 11|             110.0|     29.74|  9.53| 14.88|            36.99|19.1|                1.0|
|    Destiny|   Carnival| 17|           101.353|     26.42|  8.92| 13.21|            38.36|10.0|                1.0|
+-----------+-----------+---+------------------+----------+-----

In [0]:
#setting up the data for training 
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols = ['Age',
                                         'Tonnage',
                                         'passengers',
                                         'length',
                                         'cabins',
                                         'passenger_density',
                                         'cruise_line_indexed'],
                            outputCol='features')
output = assembler.transform(df)
output.show(n=5)

+-----------+-----------+---+------------------+----------+------+------+-----------------+----+-------------------+--------------------+
|  Ship_name|Cruise_line|Age|           Tonnage|passengers|length|cabins|passenger_density|crew|cruise_line_indexed|            features|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+-------------------+--------------------+
|    Journey|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|               16.0|[6.0,30.276999999...|
|      Quest|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|               16.0|[6.0,30.276999999...|
|Celebration|   Carnival| 26|            47.262|     14.86|  7.22|  7.43|             31.8| 6.7|                1.0|[26.0,47.262,14.8...|
|   Conquest|   Carnival| 11|             110.0|     29.74|  9.53| 14.88|            36.99|19.1|                1.0|[11.0,110.0,29.74...|
|    Destiny|   Carnival| 17|     

In [0]:
#preparing final data which includes feature vector and predicted column that is crew
final_data = output.select('features', 'crew')
final_data.show(n=5)

+--------------------+----+
|            features|crew|
+--------------------+----+
|[6.0,30.276999999...|3.55|
|[6.0,30.276999999...|3.55|
|[26.0,47.262,14.8...| 6.7|
|[11.0,110.0,29.74...|19.1|
|[17.0,101.353,26....|10.0|
+--------------------+----+
only showing top 5 rows

Exception ignored in: <function JavaWrapper.__del__ at 0x7ff21625d280>
Traceback (most recent call last):
  File "/databricks/spark/python/pyspark/ml/wrapper.py", line 39, in __del__
    if SparkContext._active_spark_context and self._java_obj is not None:
AttributeError: 'VectorAssembler' object has no attribute '_java_obj'
Exception ignored in: <function JavaWrapper.__del__ at 0x7ff21625d280>
Traceback (most recent call last):
  File "/databricks/spark/python/pyspark/ml/wrapper.py", line 39, in __del__
    if SparkContext._active_spark_context and self._java_obj is not None:
AttributeError: 'VectorAssembler' object has no attribute '_java_obj'
Exception ignored in: <function JavaWrapper.__del__ at 0x7ff21625d280

In [0]:
#data shape
print(final_data.count(), len(final_data.columns))

158 2


In [0]:
#split train and test
train_data, test_data = final_data.randomSplit([0.7, 0.3])

In [0]:
train_data.show(n=5)

+--------------------+-----+
|            features| crew|
+--------------------+-----+
|[5.0,115.0,35.74,...| 12.2|
|[5.0,133.5,39.59,...|13.13|
|[5.0,160.0,36.34,...| 13.6|
|[6.0,30.276999999...| 3.55|
|[6.0,30.276999999...| 3.55|
+--------------------+-----+
only showing top 5 rows



In [0]:
test_data.show(n=5)

+--------------------+-----+
|            features| crew|
+--------------------+-----+
|[4.0,220.0,54.0,1...| 21.0|
|[5.0,86.0,21.04,9...|  8.0|
|[5.0,122.0,28.5,1...|  6.7|
|[6.0,90.0,20.0,9....|  9.0|
|[6.0,93.0,23.94,9...|11.09|
+--------------------+-----+
only showing top 5 rows



In [0]:
#initiate the model 
lr = LinearRegression(labelCol='crew', featuresCol='features')

In [0]:
lr_model = lr.fit(train_data)

In [0]:
test_results = lr_model.evaluate(test_data)
test_results.residuals.show(n=5)

+--------------------+
|           residuals|
+--------------------+
|0.046443557309441275|
|  -1.236431397180521|
|    0.50133348249439|
| -0.9701838195914849|
|  0.5610102668936694|
+--------------------+
only showing top 5 rows



In [0]:
test_results.rootMeanSquaredError

Out[25]: 0.934727935392464

In [0]:
test_results.r2

Out[26]: 0.9312700020578704