In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import corr
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler, StringIndexer
spark = SparkSession.builder.appName("lrg").getOrCreate()
full_data = spark.read.options(header=True,inferSchema=True).csv("/FileStore/tables/cruise_ship_info.csv")
full_data.printSchema()
full_data.show()

root
 |-- Ship_name: string (nullable = true)
 |-- Cruise_line: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tonnage: double (nullable = true)
 |-- passengers: double (nullable = true)
 |-- length: double (nullable = true)
 |-- cabins: double (nullable = true)
 |-- passenger_density: double (nullable = true)
 |-- crew: double (nullable = true)

+-----------+-----------+---+------------------+----------+------+------+-----------------+----+
|  Ship_name|Cruise_line|Age|           Tonnage|passengers|length|cabins|passenger_density|crew|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+
|    Journey|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|
|      Quest|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|
|Celebration|   Carnival| 26|            47.262|     14.86|  7.22|  7.43|             31.8| 6.7|
|   Conquest|   Carnival| 11|             110.0|  

In [0]:
#Feature Engineering. Indexing the cruise line names pulling the features.
indexer = StringIndexer(inputCol='Cruise_line',outputCol='cruise_category')
full_data = indexer.fit(full_data).transform(full_data)
features = full_data.select('cruise_category','Age','Tonnage','passengers','length','cabins','passenger_density').columns

In [0]:
#Organizing data for the model
assembler = VectorAssembler(inputCols=features,outputCol='features')
full_data = assembler.transform(full_data)
full_data = full_data.select('features','crew')
full_data.show()

+--------------------+----+
|            features|crew|
+--------------------+----+
|[16.0,6.0,30.2769...|3.55|
|[16.0,6.0,30.2769...|3.55|
|[1.0,26.0,47.262,...| 6.7|
|[1.0,11.0,110.0,2...|19.1|
|[1.0,17.0,101.353...|10.0|
|[1.0,22.0,70.367,...| 9.2|
|[1.0,15.0,70.367,...| 9.2|
|[1.0,23.0,70.367,...| 9.2|
|[1.0,19.0,70.367,...| 9.2|
|[1.0,6.0,110.2389...|11.5|
|[1.0,10.0,110.0,2...|11.6|
|[1.0,28.0,46.052,...| 6.6|
|[1.0,18.0,70.367,...| 9.2|
|[1.0,17.0,70.367,...| 9.2|
|[1.0,11.0,86.0,21...| 9.3|
|[1.0,8.0,110.0,29...|11.6|
|[1.0,9.0,88.5,21....|10.3|
|[1.0,15.0,70.367,...| 9.2|
|[1.0,12.0,88.5,21...| 9.3|
|[1.0,20.0,70.367,...| 9.2|
+--------------------+----+
only showing top 20 rows



In [0]:
train_data,test_data = full_data.randomSplit([0.7,0.3])
train_data.printSchema()
train_data.describe().show()
test_data.printSchema()
test_data.describe().show()

root
 |-- features: vector (nullable = true)
 |-- crew: double (nullable = true)

+-------+------------------+
|summary|              crew|
+-------+------------------+
|  count|               112|
|   mean| 7.714821428571432|
| stddev|3.5264700422399677|
|    min|              0.59|
|    max|              21.0|
+-------+------------------+

root
 |-- features: vector (nullable = true)
 |-- crew: double (nullable = true)

+-------+------------------+
|summary|              crew|
+-------+------------------+
|  count|                46|
|   mean| 7.987391304347829|
| stddev|3.4777581762346768|
|    min|               0.6|
|    max|              13.6|
+-------+------------------+



In [0]:
lr = LinearRegression(featuresCol='features',labelCol='crew')
lr_model = lr.fit(train_data)
results = lr_model.evaluate(test_data)
results.residuals.show()
print("r2: ", results.r2, end="--> ")
print("root mean squared error: ", results.rootMeanSquaredError, end="--> ")

+--------------------+
|           residuals|
+--------------------+
|  -2.005850903750888|
|-0.48191630177634615|
| -1.5952166820418636|
| -1.5902459819695078|
|  -1.585275281897152|
| -1.6703045818247961|
|  -1.503876670805509|
| -0.7019427446620359|
|  0.9642377499981674|
| -0.7626854952894622|
|  0.6644907526019459|
| -0.7527440951447506|
|-0.27717058710647535|
|  0.7238470516546975|
|  0.5204712304799664|
|  0.5204712304799664|
|  0.3360514255301297|
|   0.741687327196237|
| -0.4930715626120161|
|  0.3745671684959788|
+--------------------+
only showing top 20 rows

r2:  0.9450491168804431--> root mean squared error:  0.8063323045996659--> 

In [0]:
unlabeled_data = test_data.select('features')
unlabeled_data.show()

+--------------------+
|            features|
+--------------------+
|[0.0,5.0,160.0,36...|
|[0.0,7.0,158.0,43...|
|[0.0,10.0,138.0,3...|
|[0.0,11.0,138.0,3...|
|[0.0,12.0,138.0,3...|
|[0.0,13.0,138.0,3...|
|[0.0,15.0,78.491,...|
|[0.0,22.0,73.941,...|
|[0.0,23.0,48.563,...|
|[1.0,8.0,110.0,29...|
|[1.0,9.0,88.5,21....|
|[1.0,10.0,110.0,2...|
|[1.0,11.0,86.0,21...|
|[1.0,12.0,88.5,21...|
|[1.0,15.0,70.367,...|
|[1.0,15.0,70.367,...|
|[2.0,6.0,113.0,37...|
|[2.0,9.0,113.0,26...|
|[2.0,9.0,116.0,26...|
|[2.0,18.0,77.499,...|
+--------------------+
only showing top 20 rows



In [0]:
predictions = lr_model.transform(unlabeled_data)
predictions.show()

+--------------------+------------------+
|            features|        prediction|
+--------------------+------------------+
|[0.0,5.0,160.0,36...|15.605850903750888|
|[0.0,7.0,158.0,43...|14.081916301776346|
|[0.0,10.0,138.0,3...|13.445216682041863|
|[0.0,11.0,138.0,3...|13.440245981969507|
|[0.0,12.0,138.0,3...|13.435275281897152|
|[0.0,13.0,138.0,3...|13.430304581824796|
|[0.0,15.0,78.491,...| 8.103876670805509|
|[0.0,22.0,73.941,...| 8.921942744662037|
|[0.0,23.0,48.563,...| 5.745762250001833|
|[1.0,8.0,110.0,29...|12.362685495289462|
|[1.0,9.0,88.5,21....| 9.635509247398055|
|[1.0,10.0,110.0,2...| 12.35274409514475|
|[1.0,11.0,86.0,21...| 9.577170587106476|
|[1.0,12.0,88.5,21...| 9.566152948345302|
|[1.0,15.0,70.367,...| 8.679528769520033|
|[1.0,15.0,70.367,...| 8.679528769520033|
|[2.0,6.0,113.0,37...| 11.66394857446987|
|[2.0,9.0,113.0,26...|11.638312672803764|
|[2.0,9.0,116.0,26...|11.493071562612016|
|[2.0,18.0,77.499,...| 8.625432831504021|
+--------------------+------------

[0;31m---------------------------------------------------------------------------[0m
[0;31mAnalysisException[0m                         Traceback (most recent call last)
[0;32m<command-3471915613616983>[0m in [0;36m<cell line: 1>[0;34m()[0m
[0;32m----> 1[0;31m [0mfull_data[0m[0;34m.[0m[0mselect[0m[0;34m([0m[0mcorr[0m[0;34m([0m[0;34m'crew'[0m[0;34m,[0m[0;34m'Cruise_line'[0m[0;34m)[0m[0;34m)[0m[0;34m.[0m[0mshow[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m
[0;32m/databricks/spark/python/pyspark/instrumentation_utils.py[0m in [0;36mwrapper[0;34m(*args, **kwargs)[0m
[1;32m     46[0m             [0mstart[0m [0;34m=[0m [0mtime[0m[0;34m.[0m[0mperf_counter[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[1;32m     47[0m             [0;32mtry[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0;32m---> 48[0;31m                 [0mres[0m [0;34m=[0m [0mfunc[0m[0;34m([0m[0;34m*[0m[0margs[0m[0;34m,[0m [0;34m**[0m[0