In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression
from pyspark.ml.linalg import Vector
from pyspark.ml.feature import VectorAssembler

In [2]:
# create spark session
spark = SparkSession.builder.appName('spork').getOrCreate()

# read in data file
data = spark.read.csv('data/cruise_ship_info.csv', header=True, inferSchema=True)

In [3]:
# show schema and display the data
data.printSchema()
data.show()

root
 |-- Ship_name: string (nullable = true)
 |-- Cruise_line: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tonnage: double (nullable = true)
 |-- passengers: double (nullable = true)
 |-- length: double (nullable = true)
 |-- cabins: double (nullable = true)
 |-- passenger_density: double (nullable = true)
 |-- crew: double (nullable = true)

+-----------+-----------+---+------------------+----------+------+------+-----------------+----+
|  Ship_name|Cruise_line|Age|           Tonnage|passengers|length|cabins|passenger_density|crew|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+
|    Journey|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|
|      Quest|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|
|Celebration|   Carnival| 26|            47.262|     14.86|  7.22|  7.43|             31.8| 6.7|
|   Conquest|   Carnival| 11|             110.0|  

In [6]:
# split the data into train and test sets
train_data, test_data = data.randomSplit([0.8, 0.2])

print("Total:\t", data.count())
print("Train:\t", train_data.count())
print("Test:\t", test_data.count())

Total:	 158
Train:	 134
Test:	 24


In [7]:
print(train_data.columns)

# can only assemble numerical vals
assembler = VectorAssembler(
    inputCols=['Age', 'Tonnage', 'passengers', 'length', 'cabins', 'passenger_density'],
    outputCol='features'
)

# build up a vector assembled features object for building model
train_data_with_features = assembler.transform(train_data)
test_data_with_features = assembler.transform(test_data)

# show new df cols
print(train_data_with_features.columns)
print(test_data_with_features.columns)

['Ship_name', 'Cruise_line', 'Age', 'Tonnage', 'passengers', 'length', 'cabins', 'passenger_density', 'crew']
['Ship_name', 'Cruise_line', 'Age', 'Tonnage', 'passengers', 'length', 'cabins', 'passenger_density', 'crew', 'features']
['Ship_name', 'Cruise_line', 'Age', 'Tonnage', 'passengers', 'length', 'cabins', 'passenger_density', 'crew', 'features']


In [9]:
# create a linearregression instance
lr = LinearRegression(labelCol='crew')

# build the model by fitting the training data with features col to the LR instance
lr_model = lr.fit(train_data_with_features)

In [12]:
# test the model against the training data with features col
results = lr_model.evaluate(train_data_with_features)

# show variance actual vs expected
results.residuals.show()

+--------------------+
|           residuals|
+--------------------+
| -1.2420662385651156|
|-0.12569009765590167|
|  -0.654454941021962|
| -0.4656661891687879|
| 0.13535735089372092|
|0.045754518356923235|
| -0.3717057524545808|
| -0.2523609714647037|
| 0.27992172312657715|
|  0.7753994383342997|
|-0.02400435878797...|
|   6.945162880043229|
|  0.8521085477140442|
|-0.32697804443130885|
|-0.03485257189962354|
|  -0.655508907639268|
|  0.9774058101225638|
| -0.8559496911794238|
|  0.9670754771871337|
| -1.2042603302815307|
+--------------------+
only showing top 20 rows

