In [1]:
import findspark
findspark.init()

In [2]:
# import libraries
from pyspark import SparkContext
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from datetime import datetime
from pyspark.sql.functions import mean, stddev, col, log
from pyspark.sql.functions import to_date, dayofweek, to_timestamp
from pyspark.sql import types
from pyspark.sql.functions import col, udf
from pyspark.sql.types import DateType
from pyspark.sql.functions import year, month
from pyspark.sql.functions import dayofmonth, weekofyear
from pyspark.sql.functions import split, explode
from pyspark.sql.functions import coalesce, first, lit
from pyspark.ml.feature import Binarizer
from pyspark.ml.feature import Bucketizer
from pyspark.ml.feature import OneHotEncoder, StringIndexer
from pyspark.sql.functions import regexp_extract, col
from pyspark.sql.functions import datediff
from pyspark.sql.functions import when

from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.regression import LinearRegressionModel

In [3]:
sc =SparkContext() 

In [4]:
spark = SparkSession(sc)

In [5]:
# Use Spark to read in the Ecommerce Customers csv file.
data = spark.read.csv("cruise_ship_info.csv", inferSchema=True,header=True)

In [6]:
# Print the Schema of the DataFrame
data.printSchema()

root
 |-- Ship_name: string (nullable = true)
 |-- Cruise_line: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tonnage: double (nullable = true)
 |-- passengers: double (nullable = true)
 |-- length: double (nullable = true)
 |-- cabins: double (nullable = true)
 |-- passenger_density: double (nullable = true)
 |-- crew: double (nullable = true)



In [7]:
print((data.count(), len(data.columns)))

(158, 9)


In [8]:
# Khi in bằng head thì định dạng hiển thị là row (khác với head ở pandas dataframe)
data.head(1)

[Row(Ship_name='Journey', Cruise_line='Azamara', Age=6, Tonnage=30.276999999999997, passengers=6.94, length=5.94, cabins=3.55, passenger_density=42.64, crew=3.55)]

In [9]:
data.show(3)

+-----------+-----------+---+------------------+----------+------+------+-----------------+----+
|  Ship_name|Cruise_line|Age|           Tonnage|passengers|length|cabins|passenger_density|crew|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+
|    Journey|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|
|      Quest|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|
|Celebration|   Carnival| 26|            47.262|     14.86|  7.22|  7.43|             31.8| 6.7|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+
only showing top 3 rows



In [10]:
for item in data.head():
    print(item)

Journey
Azamara
6
30.276999999999997
6.94
5.94
3.55
42.64
3.55


In [12]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder

In [13]:
#create Indexer
indexer = StringIndexer(inputCol='Cruise_line', outputCol='Cruise_line_inx')
indexer_model = indexer.fit(data)
data_indexed = indexer_model.transform(data)

In [15]:
# Create encoder
data_indexed = OneHotEncoder(inputCol='Cruise_line_inx', 
                             outputCol='Cruise_line_vec',
                            dropLast=True).fit(data_indexed).transform(data_indexed)

IllegalArgumentException: requirement failed: Column Cruise_line_vec already exists.

In [16]:
data_indexed.show(3)

+-----------+-----------+---+------------------+----------+------+------+-----------------+----+---------------+---------------+
|  Ship_name|Cruise_line|Age|           Tonnage|passengers|length|cabins|passenger_density|crew|Cruise_line_inx|Cruise_line_vec|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+---------------+---------------+
|    Journey|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|           16.0|(19,[16],[1.0])|
|      Quest|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|           16.0|(19,[16],[1.0])|
|Celebration|   Carnival| 26|            47.262|     14.86|  7.22|  7.43|             31.8| 6.7|            1.0| (19,[1],[1.0])|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+---------------+---------------+
only showing top 3 rows



In [17]:
data_indexed.columns

['Ship_name',
 'Cruise_line',
 'Age',
 'Tonnage',
 'passengers',
 'length',
 'cabins',
 'passenger_density',
 'crew',
 'Cruise_line_inx',
 'Cruise_line_vec']

In [18]:
assembler = VectorAssembler(
    inputCols=['Age', 'Tonnage', 'passengers', 'length', 'cabins', 'passenger_density', 'Cruise_line_vec'],
    outputCol='features'
)

In [19]:
data_pre = assembler.transform(data_indexed)

In [20]:
data_pre.show(2)

+---------+-----------+---+------------------+----------+------+------+-----------------+----+---------------+---------------+--------------------+
|Ship_name|Cruise_line|Age|           Tonnage|passengers|length|cabins|passenger_density|crew|Cruise_line_inx|Cruise_line_vec|            features|
+---------+-----------+---+------------------+----------+------+------+-----------------+----+---------------+---------------+--------------------+
|  Journey|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|           16.0|(19,[16],[1.0])|(25,[0,1,2,3,4,5,...|
|    Quest|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|           16.0|(19,[16],[1.0])|(25,[0,1,2,3,4,5,...|
+---------+-----------+---+------------------+----------+------+------+-----------------+----+---------------+---------------+--------------------+
only showing top 2 rows



In [22]:
final_data = data_pre.select('features','crew')

In [23]:
final_data.count()

158

In [24]:
final_data = final_data.na.drop()
final_data.count()

158

In [25]:
train_data, test_data = final_data.randomSplit([0.8,0.2])

In [26]:
lr = LinearRegression(featuresCol='features',
                     labelCol='crew',
                     predictionCol='prediction')

In [27]:
lrModel = lr.fit(train_data)

In [28]:
print('Coefficiets: {} Intercept: {}'.format(lrModel.coefficients, lrModel.intercept))

Coefficiets: [0.01193601099865659,0.0265785689999089,-0.06313323784047403,0.5848532017055212,0.5121674957223757,-0.012893168143103286,-0.7008259814104867,0.5633849797830629,0.5445237396189585,0.01370701991273748,1.0894763386411488,0.12114411137157945,1.0987455187128607,0.4448577675206112,0.4227814312866429,2.204152559493914,1.1144955405634698,0.9678427464129145,0.27857293970338676,1.1518717225208082,1.0796022201060052,0.3533635024564414,0.7201830977912582,0.840460278143703,1.006140693980459] Intercept: -2.3508317651038575


In [29]:
test_result =lrModel.evaluate(test_data)

In [30]:
test_result.residuals.show(5)

+--------------------+
|           residuals|
+--------------------+
|-0.19284620996974766|
| -0.1287513047466895|
|1.906031509104622E-4|
|   6.992305508732114|
|  0.4511499006874544|
+--------------------+
only showing top 5 rows



In [31]:
print('RMSE:', test_result.rootMeanSquaredError)
print('MSE:', test_result.meanSquaredError)
print('r2:', test_result.r2)

RMSE: 1.4670179548618123
MSE: 2.152141679886934
r2: 0.8771672048630315


In [32]:
test_model = lrModel.transform(test_data)

In [33]:
test_model.select('prediction','crew').show(5)

+------------------+----+
|        prediction|crew|
+------------------+----+
|13.792846209969747|13.6|
| 13.72875130474669|13.6|
|  8.21980939684909|8.22|
|12.107694491267887|19.1|
| 8.748850099312545| 9.2|
+------------------+----+
only showing top 5 rows



In [34]:
from pyspark.ml.evaluation import RegressionEvaluator

In [35]:
RegressionEvaluator(labelCol='crew').evaluate(test_model)

1.4670179548618123