In [1]:
# import modules
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession

# build spark session and spark context
spark = SparkSession.builder \
        .master("local[4]") \
        .appName("hotel") \
        .getOrCreate()
sc = spark.sparkContext

df = spark.read.csv('hotel_bookings.csv',  inferSchema=True, header = True)

In [2]:
# EDA

In [3]:
from pyspark.sql.functions import col

# replace the strings "NULL" and "NA" with null value
df_withNull = df.replace('NULL', None).replace('NA', None)

# replace null values in 'children' to 0 since there are only 4
df2 = df_withNull.fillna({'children':0})

# replace 'children' datatype to int
df2 = df2.withColumn('children', col('children').cast("Int"))
df2 = df2.withColumn('arrival_date_year', col('arrival_date_year').cast("Int")-2015)

# drop 'company' and 'agent' due to high null count
df2 = df2.drop('agent', 'company','country', 'arrival_date_week_number')

In [4]:
# addressing reservation_status_date, which gives the date at which the last 
# reservation status was set. I'm transforming it into number of days since reservation
# status was set, so day of arrival - reservation_status_date
# from pyspark.sql.types import DateType

# convert reservation_status_date into datetype dtype
# temp = df2.withColumn("reservation_status_date", df2["reservation_status_date"].cast(DateType()))

# need to combine arrival_date_year, arrival_date_month, and arrival_date_day_of_month
# into one column and cast it to DateType, then replacing reservation_status_date column
# with number of days since reservation.
# until then, dropping reservation_status_date
df2 = df2 .drop('reservation_status_date','reservation_status')

In [5]:
#numerically encode all columns of type string
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, OneHotEncoderModel, StringIndexer

# list of columns to numerically encode
col_string=['hotel', 'meal','market_segment',
            'distribution_channel','reserved_room_type','assigned_room_type',
            'deposit_type','customer_type','arrival_date_month']

# col_num = list of new column names being changed into numeric
col_num=[x+"_NUMERIC" for x in col_string]+['arrival_date_year']
# don't need to change dates into numeric col_num=col_num+['arrival_date_year','arrival_date_day_of_month']

# col_oh = list of columns being one-hot encoded
col_oh=[x+"_oh" for x in col_string]+['arrival_date_year_oh']
# dont need to ohencode dates col_oh=col_oh+['arrival_date_year_oh','arrival_date_day_of_month_oh']

In [6]:
indexers = [StringIndexer(inputCol=column, outputCol=column+"_NUMERIC").fit(df2) for column in col_string]
pipeline = Pipeline(stages=indexers)
df_indexed = pipeline.fit(df2).transform(df2)


In [7]:
df_indexed.dtypes

[('hotel', 'string'),
 ('is_canceled', 'int'),
 ('lead_time', 'int'),
 ('arrival_date_year', 'int'),
 ('arrival_date_month', 'string'),
 ('arrival_date_day_of_month', 'int'),
 ('stays_in_weekend_nights', 'int'),
 ('stays_in_week_nights', 'int'),
 ('adults', 'int'),
 ('children', 'int'),
 ('babies', 'int'),
 ('meal', 'string'),
 ('market_segment', 'string'),
 ('distribution_channel', 'string'),
 ('is_repeated_guest', 'int'),
 ('previous_cancellations', 'int'),
 ('previous_bookings_not_canceled', 'int'),
 ('reserved_room_type', 'string'),
 ('assigned_room_type', 'string'),
 ('booking_changes', 'int'),
 ('deposit_type', 'string'),
 ('days_in_waiting_list', 'int'),
 ('customer_type', 'string'),
 ('adr', 'double'),
 ('required_car_parking_spaces', 'int'),
 ('total_of_special_requests', 'int'),
 ('hotel_NUMERIC', 'double'),
 ('meal_NUMERIC', 'double'),
 ('market_segment_NUMERIC', 'double'),
 ('distribution_channel_NUMERIC', 'double'),
 ('reserved_room_type_NUMERIC', 'double'),
 ('assigned_ro

In [8]:
#one-hot encode all columns in col_num
ohe = OneHotEncoder(dropLast=False)
ohe.setInputCols(col_num)
ohe.setOutputCols(col_oh)
model = ohe.fit(df_indexed)

df_casted=model.transform(df_indexed)

In [9]:
# drop the original, non-ohencoded variables
df_encoded=df_casted.drop(*col_string)
df_encoded=df_encoded.drop(*col_num)

In [10]:
df_encoded.take(1)

[Row(is_canceled=0, lead_time=342, arrival_date_day_of_month=1, stays_in_weekend_nights=0, stays_in_week_nights=0, adults=2, children=0, babies=0, is_repeated_guest=0, previous_cancellations=0, previous_bookings_not_canceled=0, booking_changes=3, days_in_waiting_list=0, adr=0.0, required_car_parking_spaces=0, total_of_special_requests=0, distribution_channel_oh=SparseVector(5, {1: 1.0}), customer_type_oh=SparseVector(4, {0: 1.0}), market_segment_oh=SparseVector(8, {3: 1.0}), reserved_room_type_oh=SparseVector(10, {6: 1.0}), assigned_room_type_oh=SparseVector(12, {5: 1.0}), meal_oh=SparseVector(5, {0: 1.0}), hotel_oh=SparseVector(2, {1: 1.0}), deposit_type_oh=SparseVector(3, {0: 1.0}), arrival_date_year_oh=SparseVector(3, {0: 1.0}), arrival_date_month_oh=SparseVector(12, {1: 1.0}))]

In [11]:
#removed normalization step for now
from pyspark.ml.feature import VectorAssembler
from pyspark.mllib.linalg import Vectors
from pyspark.ml.feature import Normalizer

norm_to_columns = ['lead_time', 'stays_in_weekend_nights', 'stays_in_week_nights', 'adults', 'children', 'babies', 
                'previous_cancellations', 'previous_bookings_not_canceled', 'booking_changes', 'adr', 
                'required_car_parking_spaces', 'total_of_special_requests']

# combine all to-norm columns into one vector named "norm_features"
assembler = VectorAssembler(inputCols=norm_to_columns, outputCol="norm_features")
transformed = assembler.transform(df_encoded) 
transformed = transformed.drop(*norm_to_columns)

# in the end, new column named normFeatures combined all features that are normalized
normalizer = Normalizer(inputCol="norm_features", outputCol="normFeatures", p=1.0)
l1NormData = normalizer.transform(transformed)
l1NormData = l1NormData.drop(*norm_to_columns + ['norm_features'])
print("Normalized using L^1 norm")

Normalized using L^1 norm


In [12]:
# combine all feature columns (non-label columns) into one vector named "feature"
assembler = VectorAssembler(inputCols=df_encoded.columns[1:], outputCol="features")
all_transformed = assembler.transform(df_encoded)

# convert to rdd with 2 columns, label and features, where features is a DenseVector combination of all other features
dataRdd = all_transformed.select(col("is_canceled").alias("label"), col("features")).rdd.map(tuple)

In [13]:
dataRdd.take(1)

[(0,
  SparseVector(79, {0: 342.0, 1: 1.0, 4: 2.0, 10: 3.0, 16: 1.0, 20: 1.0, 27: 1.0, 38: 1.0, 47: 1.0, 54: 1.0, 60: 1.0, 61: 1.0, 64: 1.0, 68: 1.0}))]

In [14]:
from pyspark.mllib.regression import LabeledPoint

# map features as floats, then input all into a DenseVector, as well as mapping labels as floats
# then map into LabeledPoint object
lp = dataRdd.map(lambda row: (float(row[0]), Vectors.dense([float(c) for c in row[1]])))\
            .map(lambda row: LabeledPoint(row[0], row[1]*1.0))
lp.take(1)

[LabeledPoint(0.0, [342.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0])]

In [15]:
# 70/30 test train split
seed = 42
train, test = lp.randomSplit([0.7, 0.3], seed=seed)

In [16]:
# use these variables in place of test/train.count() bc those methods take a long time, instead just call once
test_count = test.count()
train_count = train.count()

In [17]:
from pyspark.mllib.classification import LogisticRegressionWithLBFGS


from pyspark.mllib.classification import SVMWithSGD
# train logistic regression model with training data

model_lr = LogisticRegressionWithLBFGS.train(train,intercept=True)
# model = SVMWithSGD.train(train,iterations=20)

In [18]:
# Evaluating the model on test data
labelsAndPreds_te1 = test.map(lambda p: (p.label, float(model_lr.predict(p.features))))
accuracy_te1 = 1.0 * labelsAndPreds_te1.filter(lambda pl: pl[0] == pl[1]).count() / test_count
print('model accuracy (test): {}'.format(accuracy_te1))

model accuracy (test): 0.8120925547690327


In [19]:
from pyspark.mllib.evaluation import MulticlassMetrics

metrix = MulticlassMetrics(labelsAndPreds_te1)
metrix.confusionMatrix().toArray()

array([[20854.,  5024.],
       [ 1692.,  8171.]])

In [20]:
labelsAndPreds_te1.take(10)

[(1.0, 0.0),
 (0.0, 0.0),
 (0.0, 0.0),
 (0.0, 0.0),
 (0.0, 0.0),
 (0.0, 0.0),
 (0.0, 0.0),
 (1.0, 0.0),
 (0.0, 0.0),
 (0.0, 0.0)]

In [21]:

model_svm = SVMWithSGD.train(train,iterations=10)

labelsAndPreds_te2 = test.map(lambda p: (p.label, float(model_svm.predict(p.features))))
accuracy_te2 = 1.0 * labelsAndPreds_te2.filter(lambda pl: pl[0] == pl[1]).count() / test_count
print('model accuracy (test): {}'.format(accuracy_te2))

model accuracy (test): 0.5898268095464593


In [22]:
metrix = MulticlassMetrics(labelsAndPreds_te2)
metrix.confusionMatrix().toArray()

array([[11488.,  3602.],
       [11058.,  9593.]])

In [26]:
from pyspark.mllib.tree import RandomForest

In [29]:
model_rf = RandomForest.trainClassifier(train, numClasses=2,numTrees=3,categoricalFeaturesInfo={})

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:34245)
Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/site-packages/py4j/java_gateway.py", line 977, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/site-packages/py4j/java_gateway.py", line 1115, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused


Py4JNetworkError: An error occurred while trying to connect to the Java server (127.0.0.1:34245)

In [None]:
predictions = model_rf.predict(test.map(lambda x: x.features))


In [None]:
labelsAndPreds_te3 = test.map(lambda lp: lp.label).zip(predictions)

In [None]:
labelsAndPreds_te3.take(1)

In [None]:
accuracy_te3 = labelsAndPreds_te3.filter(lambda lp: lp[0] != lp[1]).count() 
print('model accuracy (test): {}'.format(accuracy_te3))

In [None]:
metrix = MulticlassMetrics(labelsAndPreds_te3)
metrix.confusionMatrix().toArray()

In [23]:
from pyspark.mllib.classification import NaiveBayes

In [24]:
model_nb = NaiveBayes.train(train)

predictions = model_nb.predict(test.map(lambda x: x.features))



In [25]:
labelsAndPredictions = test.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(
    lambda lp: lp[0] != lp[1]).count() / float(testData.count())
print('Test Error = ' + str(testErr))

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 2 in stage 167.0 failed 1 times, most recent failure: Lost task 2.0 in stage 167.0 (TID 800, jupyter-fei-5fxu, executor driver): org.apache.spark.SparkException: Failed to execute user defined function(StringIndexerModel$$Lambda$3018/396280604: (string) => double)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:729)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at org.apache.spark.api.python.SerDeUtil$AutoBatchedPickler.next(SerDeUtil.scala:156)
	at org.apache.spark.api.python.SerDeUtil$AutoBatchedPickler.next(SerDeUtil.scala:148)
	at scala.collection.Iterator.foreach(Iterator.scala:941)
	at scala.collection.Iterator.foreach$(Iterator.scala:941)
	at org.apache.spark.api.python.SerDeUtil$AutoBatchedPickler.foreach(SerDeUtil.scala:148)
	at org.apache.spark.api.python.PythonRDD$.writeIteratorToStream(PythonRDD.scala:295)
	at org.apache.spark.api.python.PythonRunner$$anon$2.writeIteratorToStream(PythonRunner.scala:607)
	at org.apache.spark.api.python.BasePythonRunner$WriterThread.$anonfun$run$1(PythonRunner.scala:383)
	at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1932)
	at org.apache.spark.api.python.BasePythonRunner$WriterThread.run(PythonRunner.scala:218)
Caused by: org.apache.spark.SparkException: Unseen label: N. To handle unseen labels, set Param handleInvalid to keep.
	at org.apache.spark.ml.feature.StringIndexerModel.$anonfun$getIndexer$1(StringIndexer.scala:405)
	at org.apache.spark.ml.feature.StringIndexerModel.$anonfun$getIndexer$1$adapted(StringIndexer.scala:390)
	... 14 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2023)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:1972)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:1971)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1971)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:950)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:950)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:950)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2203)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2152)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2141)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:752)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2093)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2114)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2133)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2158)
	at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1004)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:388)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:1003)
	at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:168)
	at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.SparkException: Failed to execute user defined function(StringIndexerModel$$Lambda$3018/396280604: (string) => double)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:729)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at org.apache.spark.api.python.SerDeUtil$AutoBatchedPickler.next(SerDeUtil.scala:156)
	at org.apache.spark.api.python.SerDeUtil$AutoBatchedPickler.next(SerDeUtil.scala:148)
	at scala.collection.Iterator.foreach(Iterator.scala:941)
	at scala.collection.Iterator.foreach$(Iterator.scala:941)
	at org.apache.spark.api.python.SerDeUtil$AutoBatchedPickler.foreach(SerDeUtil.scala:148)
	at org.apache.spark.api.python.PythonRDD$.writeIteratorToStream(PythonRDD.scala:295)
	at org.apache.spark.api.python.PythonRunner$$anon$2.writeIteratorToStream(PythonRunner.scala:607)
	at org.apache.spark.api.python.BasePythonRunner$WriterThread.$anonfun$run$1(PythonRunner.scala:383)
	at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1932)
	at org.apache.spark.api.python.BasePythonRunner$WriterThread.run(PythonRunner.scala:218)
Caused by: org.apache.spark.SparkException: Unseen label: N. To handle unseen labels, set Param handleInvalid to keep.
	at org.apache.spark.ml.feature.StringIndexerModel.$anonfun$getIndexer$1(StringIndexer.scala:405)
	at org.apache.spark.ml.feature.StringIndexerModel.$anonfun$getIndexer$1$adapted(StringIndexer.scala:390)
	... 14 more


In [26]:
pred_list=predictions.collect()

In [29]:
label_list=test.map(lambda lp: lp.label).collect()