In [41]:
# import modules
import pandas as pd
import numpy as np
from pyspark.ml.feature import VectorAssembler
from pyspark.mllib.linalg import Vectors
from pyspark.sql import SparkSession

# Create (or reuse) a local Spark session with 4 worker threads, and keep a
# handle on its SparkContext.
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName("hotel")
    .getOrCreate()
)
sc = spark.sparkContext

# Load the bookings CSV; the first line is the header and column types are inferred.
df = spark.read.csv('hotel_bookings.csv', header=True, inferSchema=True)

In [42]:
# EDA

In [43]:
from pyspark.sql.functions import col

# Turn the literal strings "NULL" and "NA" into real null values.
df_withNull = (
    df
    .replace('NULL', None)
    .replace('NA', None)
)

# Clean-up steps, applied in order:
#   1. fill missing 'children' with 0 (the original author notes only 4 rows are affected)
#   2. cast 'children' to an integer column
#   3. drop 'agent' and 'company' (high null count), together with 'country'
#      and 'arrival_date_week_number'
df2 = (
    df_withNull
    .fillna({'children': 0})
    .withColumn('children', col('children').cast("Int"))
    .drop('agent', 'company', 'country', 'arrival_date_week_number')
)

In [44]:
# reservation_status_date holds the date on which the last reservation status was
# set. The plan is to turn it into a day count (day of arrival minus
# reservation_status_date).
#
# TODO: cast reservation_status_date to DateType (pyspark.sql.types), combine
# arrival_date_year, arrival_date_month and arrival_date_day_of_month into a single
# DateType column, and replace reservation_status_date with the day difference.
# Until that is implemented the column is simply dropped.
df2 = df2.drop('reservation_status_date')

In [45]:
#numerically encode all columns of type string
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, OneHotEncoderModel, StringIndexer

# Categorical (string) columns that get both indexed and one-hot encoded.
col_string = [
    'hotel',
    'meal',
    'market_segment',
    'distribution_channel',
    'reserved_room_type',
    'assigned_room_type',
    'deposit_type',
    'customer_type',
    'reservation_status',
]
# Same list plus the month name, which is indexed but not one-hot encoded.
col_stringwmonth = [*col_string, 'arrival_date_month']

# Names of the indexed (numeric) columns, one per entry of col_string.
# The year and day-of-month columns are already numeric, so they are not listed.
col_num = [f"{name}_NUMERIC" for name in col_string]

# Names of the one-hot encoded output columns, one per entry of col_string.
# The date columns are not one-hot encoded.
col_oh = [f"{name}_oh" for name in col_string]

In [46]:
# Fit one StringIndexer per categorical column (including the month name); each
# writes its indices to "<column>_NUMERIC".
indexers = [
    StringIndexer(inputCol=name, outputCol=f"{name}_NUMERIC").fit(df2)
    for name in col_stringwmonth
]

# Chain the already-fitted indexer models into a pipeline and apply it to df2.
pipeline = Pipeline(stages=indexers)
indexer_pipeline_model = pipeline.fit(df2)
df_indexed = indexer_pipeline_model.transform(df2)

In [47]:
# One-hot encode every indexed column listed in col_num into the matching col_oh
# column. dropLast=False keeps a slot for every category.
ohe = OneHotEncoder(inputCols=col_num, outputCols=col_oh, dropLast=False)
model = ohe.fit(df_indexed)

df_casted = model.transform(df_indexed)

In [48]:
# Remove the raw string columns and the intermediate indexed columns, leaving only
# the one-hot vectors (arrival_date_month_NUMERIC is not in col_num, so it stays).
df_encoded = df_casted.drop(*col_stringwmonth).drop(*col_num)

In [49]:
# Peek at the first row to sanity-check the encoded columns.
df_encoded.take(1)

[Row(is_canceled=0, lead_time=342, arrival_date_year=2015, arrival_date_day_of_month=1, stays_in_weekend_nights=0, stays_in_week_nights=0, adults=2, children=0, babies=0, is_repeated_guest=0, previous_cancellations=0, previous_bookings_not_canceled=0, booking_changes=3, days_in_waiting_list=0, adr=0.0, required_car_parking_spaces=0, total_of_special_requests=0, arrival_date_month_NUMERIC=1.0, distribution_channel_oh=SparseVector(5, {1: 1.0}), customer_type_oh=SparseVector(4, {0: 1.0}), reservation_status_oh=SparseVector(3, {0: 1.0}), market_segment_oh=SparseVector(8, {3: 1.0}), reserved_room_type_oh=SparseVector(10, {6: 1.0}), assigned_room_type_oh=SparseVector(12, {5: 1.0}), meal_oh=SparseVector(5, {0: 1.0}), hotel_oh=SparseVector(2, {1: 1.0}), deposit_type_oh=SparseVector(3, {0: 1.0}))]

In [94]:
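# Normalization step is disabled for now (kept commented out for reference).
# The output shown under this cell is stale: it comes from an earlier run when the code was active.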
# from pyspark.ml.feature import VectorAssembler
# from pyspark.mllib.linalg import Vectors
# from pyspark.ml.feature import Normalizer

# norm_to_columns = ['lead_time', 'stays_in_weekend_nights', 'stays_in_week_nights', 'adults', 'children', 'babies', 
#                 'previous_cancellations', 'previous_bookings_not_canceled', 'booking_changes', 'adr', 
#                 'required_car_parking_spaces', 'total_of_special_requests']

# # combine all to-norm columns into one vector named "norm_features"
# assembler = VectorAssembler(inputCols=norm_to_columns, outputCol="norm_features")
# transformed = assembler.transform(df_encoded) 
# transformed = transformed.drop(*norm_to_columns)

# # in the end, new column named normFeatures combined all features that are normalized
# normalizer = Normalizer(inputCol="norm_features", outputCol="normFeatures", p=1.0)
# l1NormData = normalizer.transform(transformed)
# l1NormData = l1NormData.drop(*norm_to_columns + ['norm_features'])
# print("Normalized using L^1 norm")

Normalized using L^1 norm


['is_canceled',
 'arrival_date_year',
 'arrival_date_day_of_month',
 'is_repeated_guest',
 'days_in_waiting_list',
 'arrival_date_month_NUMERIC',
 'distribution_channel_oh',
 'customer_type_oh',
 'reservation_status_oh',
 'market_segment_oh',
 'reserved_room_type_oh',
 'assigned_room_type_oh',
 'meal_oh',
 'hotel_oh',
 'deposit_type_oh',
 'normFeatures']

In [115]:
# Combine every non-label column into a single vector column named "features".
# The label is excluded by name rather than by position, so this keeps working
# even if 'is_canceled' is not the first column of df_encoded.
# NOTE(review): 'reservation_status_oh' is part of the features; if that column
# records the final outcome of the booking it leaks the 'is_canceled' label —
# confirm before trusting the accuracy figures.
feature_columns = [name for name in df_encoded.columns if name != "is_canceled"]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
all_transformed = assembler.transform(df_encoded)

# Convert to an RDD of (label, features) tuples. The assembled vector may be
# sparse or dense; it is densified later when LabeledPoints are built.
dataRdd = all_transformed.select(col("is_canceled").alias("label"), col("features")).rdd.map(tuple)

In [116]:
# Inspect the first (label, features) tuple.
dataRdd.take(1)

[(0,
  SparseVector(69, {0: 342.0, 1: 2015.0, 2: 1.0, 5: 2.0, 11: 3.0, 16: 1.0, 18: 1.0, 22: 1.0, 26: 1.0, 32: 1.0, 43: 1.0, 52: 1.0, 59: 1.0, 65: 1.0, 66: 1.0}))]

In [117]:
from pyspark.mllib.regression import LabeledPoint

# Turn each (label, features) tuple into an mllib LabeledPoint with a float label
# and a dense float feature vector. toArray() densifies the assembled vector in a
# single vectorised call instead of indexing it element by element in Python.
lp = dataRdd.map(
    lambda row: LabeledPoint(float(row[0]), Vectors.dense(row[1].toArray()))
)
lp.take(1)

[LabeledPoint(0.0, [342.0,2015.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0])]

In [121]:
# 70/30 train/test split with a fixed seed for reproducibility.
seed = 42
# Persist the LabeledPoint RDD before splitting: otherwise every later action on
# train/test (counting, fitting, scoring) re-runs the whole upstream lineage.
train, test = lp.cache().randomSplit([0.7, 0.3], seed=seed)

KeyboardInterrupt: 

In [None]:
# Counting is slow, so count each split once here and reuse the numbers later
# instead of calling test.count() / train.count() again.
test_count, train_count = test.count(), train.count()

In [119]:
from pyspark.mllib.classification import LogisticRegressionWithLBFGS

# Fit a logistic regression classifier (L-BFGS optimiser) on the training split,
# using the library's default settings.
# Note: this rebinds `model`, which an earlier cell used for the fitted OneHotEncoder.
model = LogisticRegressionWithLBFGS.train(train)

In [None]:
# Training accuracy: pair each true label with the model's prediction, count the
# matches, and divide by the cached training-set size.
labelsAndPreds_tr = train.map(
    lambda point: (point.label, model.predict(point.features))
)
correct_tr = labelsAndPreds_tr.filter(lambda pair: pair[0] == pair[1]).count()
accuracy_tr = 1.0 * correct_tr / train_count
print('model accuracy (train): {}'.format(accuracy_tr))

In [None]:
# Test accuracy: pair each true label with the model's prediction, count the
# matches, and divide by the cached test-set size.
labelsAndPreds_te = test.map(
    lambda point: (point.label, model.predict(point.features))
)
correct_te = labelsAndPreds_te.filter(lambda pair: pair[0] == pair[1]).count()
accuracy_te = 1.0 * correct_te / test_count
print('model accuracy (test): {}'.format(accuracy_te))