In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import corr

# Obtain the Spark session for this notebook (an existing one is reused if present).
session_builder = SparkSession.builder.appName('ConsProject')
spark = session_builder.getOrCreate()

In [2]:
# Load the cruise-ship CSV: the first row supplies the column names and Spark
# infers each column's type from the data.
csv_path = '/FileStore/tables/Machine_learning/cruise_ship_info.csv'
df = spark.read.csv(csv_path, header=True, inferSchema=True)

In [3]:
# Peek at the first five rows, each followed by two blank lines.
# (df.printSchema() and df.show() are alternative ways to inspect the frame.)
for row in df.head(5):
  print(row, end='\n\n\n')

In [4]:
# Number of rows recorded for each distinct value of 'Cruise_line'.
ships_per_line = df.groupBy('Cruise_line').count()
ships_per_line.show()

In [5]:
# Encode the string column 'Cruise_line' as a numeric index column named
# 'cruise_cat'; the fitted indexer is applied to the same frame it was fit on.
indexer = StringIndexer(inputCol='Cruise_line', outputCol='cruise_cat')
indexer_model = indexer.fit(df)
indexed = indexer_model.transform(df)

# Show the first row to confirm the new column is present.
indexed.head(1)

In [6]:
# List the column names available after indexing (used to pick model features).
indexed_column_names = indexed.columns
indexed_column_names

In [7]:
# Input columns that are packed, in this order, into the single 'features'
# vector column.
feature_columns = [
  'Age',
  'Tonnage',
  'passengers',
  'length',
  'cabins',
  'passenger_density',
  'cruise_cat',
]
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

In [8]:
# Append the assembled 'features' vector column to the indexed frame.
output = assembler.transform(dataset=indexed)

In [9]:
# Preview the assembled feature vectors next to the 'crew' column.
features_and_label = output.select('features', 'crew')
features_and_label.show()

In [10]:
# Keep only the two columns the regression uses: the feature vector and 'crew'.
model_columns = ('features', 'crew')
final_data = output.select(*model_columns)

In [11]:
# Split the rows roughly 70/30 into training and test sets. A fixed seed is
# passed so reruns over the same input produce the same split, which keeps the
# metrics computed from it comparable between runs.
# NOTE(review): reproducibility also assumes the input partitioning is stable
# between runs -- confirm on the target cluster.
SPLIT_SEED = 42
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=SPLIT_SEED)

# Summary statistics for each split, to check that they look comparable.
train_data.describe().show()
test_data.describe().show()

In [12]:
# Linear regression with 'crew' as the label; 'features' is the estimator's
# default feature column and is spelled out here for clarity.
shipLr = LinearRegression(featuresCol='features', labelCol='crew')

# Fit on the training split only.
trained_ship_model = shipLr.fit(train_data)

In [13]:
# Evaluate the fitted model on the held-out test split.
ship_results = trained_ship_model.evaluate(dataset=test_data)

In [14]:
# Report the test-set metrics from the evaluation summary.
print("root mean squared error", ship_results.rootMeanSquaredError)
print("r2", ship_results.r2)

# DataFrame.show() prints the table itself and returns None, so it is called
# directly; wrapping it in print() emitted a stray "None" line after the table.
ship_results.residuals.show()

print("mean absolute error", ship_results.meanAbsoluteError)
print("mean squared error", ship_results.meanSquaredError)

In [15]:
# Correlation of 'crew' with each of the listed columns, shown as one small
# table per column. show() already prints the table and returns None, so it is
# not wrapped in print() (which added a stray "None" after each table).
for feature_name in ('passengers', 'cabins'):
  df.select(corr('crew', feature_name)).show()