In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder.appName("Lab07").getOrCreate()

In [3]:
df = spark.read.csv("data.csv" , inferSchema = True , header = True)

In [5]:
df.columns

['Ship_name',
 'Cruise_line',
 'Age',
 'Tonnage',
 'passengers',
 'length',
 'cabins',
 'passenger_density',
 'crew']

In [6]:
from pyspark.ml.feature import StringIndexer

In [7]:
indexer = StringIndexer(inputCol = 'Cruise_line' , outputCol='cruise_cat')

In [8]:
indexed = indexer.fit(df).transform(df)

In [9]:
indexed.show()

+-----------+-----------+---+------------------+----------+------+------+-----------------+----+----------+
|  Ship_name|Cruise_line|Age|           Tonnage|passengers|length|cabins|passenger_density|crew|cruise_cat|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+----------+
|    Journey|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|      16.0|
|      Quest|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|      16.0|
|Celebration|   Carnival| 26|            47.262|     14.86|  7.22|  7.43|             31.8| 6.7|       1.0|
|   Conquest|   Carnival| 11|             110.0|     29.74|  9.53| 14.88|            36.99|19.1|       1.0|
|    Destiny|   Carnival| 17|           101.353|     26.42|  8.92| 13.21|            38.36|10.0|       1.0|
|    Ecstasy|   Carnival| 22|            70.367|     20.52|  8.55|  10.2|            34.29| 9.2|       1.0|
|    Elation|   Carnival| 15

In [10]:
from pyspark.ml.linalg import Vector

In [11]:
from pyspark.ml.feature import VectorAssembler

In [12]:
assembler = VectorAssembler(inputCols = ['Age' , 'Tonnage' , 'passengers' , 'length' , 'cabins' , 'passenger_density' , 'cruise_cat'] , outputCol='features')

In [13]:
output = assembler.transform(indexed)

In [15]:
output.select('features').show()

+--------------------+
|            features|
+--------------------+
|[6.0,30.276999999...|
|[6.0,30.276999999...|
|[26.0,47.262,14.8...|
|[11.0,110.0,29.74...|
|[17.0,101.353,26....|
|[22.0,70.367,20.5...|
|[15.0,70.367,20.5...|
|[23.0,70.367,20.5...|
|[19.0,70.367,20.5...|
|[6.0,110.23899999...|
|[10.0,110.0,29.74...|
|[28.0,46.052,14.5...|
|[18.0,70.367,20.5...|
|[17.0,70.367,20.5...|
|[11.0,86.0,21.24,...|
|[8.0,110.0,29.74,...|
|[9.0,88.5,21.24,9...|
|[15.0,70.367,20.5...|
|[12.0,88.5,21.24,...|
|[20.0,70.367,20.5...|
+--------------------+
only showing top 20 rows



In [16]:
final_data = output.select('features' , 'crew')

In [18]:
train_data , test_data = final_data.randomSplit([0.7 , 0.3])

In [20]:
from pyspark.ml.regression import LinearRegression


In [21]:
model = LinearRegression(featuresCol='features' , labelCol='crew')

In [22]:
trained_model = model.fit(train_data)

In [23]:
unlabeled_data = test_data.select('features')

In [24]:
pred = trained_model.transform(unlabeled_data)

In [25]:
pred.show()

+--------------------+------------------+
|            features|        prediction|
+--------------------+------------------+
|[5.0,160.0,36.34,...|15.315729500368137|
|[6.0,30.276999999...| 4.297786869468912|
|[6.0,93.0,23.94,9...|10.566973691533564|
|[6.0,112.0,38.0,9...| 11.46270270950132|
|[6.0,158.0,43.7,1...|14.094225345849447|
|[8.0,91.0,22.44,9...|10.059070901294442|
|[9.0,59.058,17.0,...| 7.617135848224838|
|[9.0,88.5,21.24,9...| 9.493317540204004|
|[9.0,113.0,26.74,...|11.451687785416006|
|[9.0,116.0,26.0,9...|11.257674982280781|
|[10.0,46.0,7.0,6....|2.6794117336306336|
|[10.0,77.0,20.16,...|  8.77780406857378|
|[10.0,105.0,27.2,...|11.430208727854147|
|[10.0,138.0,31.14...| 13.15188743394511|
|[10.0,151.4,26.2,...|11.141396304306905|
|[11.0,58.6,15.66,...|  7.34576908732067|
|[11.0,91.0,20.32,...| 9.099245378330815|
|[11.0,138.0,31.14...|13.137926831169095|
|[12.0,50.0,7.0,7....| 4.325657814570986|
|[12.0,77.104,20.0...| 8.792310483991956|
+--------------------+------------

In [26]:
results = trained_model.evaluate(train_data)

In [27]:
results.r2

0.9155080558014457