In [1]:
# import modules
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession

# build spark session and spark context
spark = SparkSession.builder \
        .master("local[4]") \
        .appName("hotel") \
        .getOrCreate()
sc = spark.sparkContext

df = spark.read.csv('hotel_bookings.csv',  inferSchema=True, header = True)

In [42]:
# EDA

In [2]:
df.groupBy('is_canceled').count().show()

+-----------+-----+
|is_canceled|count|
+-----------+-----+
|          1|44224|
|          0|75166|
+-----------+-----+



In [3]:
#Parameters for defining code below
target = 'is_canceled'
cancel_label = 1
noncancel_label = 0

In [4]:
#Balancing a DataFrame with Downsampling

def downsample(df, target, cancel_label, noncancel_label):
    """
    df               spark dataframe
    target           str, target variable
    cancel_label     int, value of canceled booking
    noncancel_label  int, value of non-canceled booking
    
    """

    ### ENTER CODE HERE
    
    from pyspark.sql.functions import col
    
    #count of canceled and non-canceled labels
    cancel_n = df.filter(col(target) == cancel_label).count()
    noncancel_n = df.filter(col(target) == noncancel_label).count()
    
    #df split by having either the poitive or negative labels
    df_cancel = df.filter(col(target) == cancel_label)
    df_noncancel = df.filter(col(target) == noncancel_label)
    
    
    if cancel_n > noncancel_n:
        #amount to sample from is fraction of low noncancel/full cancel
        df_a = df_cancel.sample(fraction = (noncancel_n/cancel_n))
        #combine df_cancel sample with full df_noncancel
        df_b = df_noncancel.union(df_a)
    elif noncancel_n > cancel_n:
        #amount to sample from is fraction of low cancel/full non-cancel
        df_a = df_noncancel.sample(fraction = (cancel_n/noncancel_n))
        #combine df_noncancel sample with full df_cancel
        df_b = df_cancel.union(df_a)
    else:
        #if count of df_cancel = df_noncancel, then just use original df
        df_b = df

    return df_b

In [5]:
# Call your downsample function here, and show the count by label
df_downsample = downsample(df, target, cancel_label, noncancel_label)
df_downsample.groupBy(target).count().show()

+-----------+-----+
|is_canceled|count|
+-----------+-----+
|          1|44224|
|          0|44296|
+-----------+-----+



In [6]:
from pyspark.sql.functions import col

# replace the strings "NULL" and "NA" with null value
df_withNull = df_downsample.replace('NULL', None).replace('NA', None)
#df_withNull = df.replace('NULL', None).replace('NA', None)

# replace null values in 'children' to 0 since there are only 4
df2 = df_withNull.fillna({'children':0})

# replace 'children' datatype to int
df2 = df2.withColumn('children', col('children').cast("Int"))

# drop 'company' and 'agent' due to high null count
df2 = df2.drop('agent', 'company','country', 'arrival_date_week_number')

In [7]:
# addressing reservation_status_date, which gives the date at which the last 
# reservation status was set. I'm transforming it into number of days since reservation
# status was set, so day of arrival - reservation_status_date
# from pyspark.sql.types import DateType

# convert reservation_status_date into datetype dtype
# temp = df2.withColumn("reservation_status_date", df2["reservation_status_date"].cast(DateType()))

# need to combine arrival_date_year, arrival_date_month, and arrival_date_day_of_month
# into one column and cast it to DateType, then replacing reservation_status_date column
# with number of days since reservation.
# until then, dropping reservation_status_date
df2 = df2 .drop('reservation_status_date')

In [8]:
#numerically encode all columns of type string
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, OneHotEncoderModel, StringIndexer

# list of columns to numerically encode
col_string=['hotel', 'meal','market_segment',
            'distribution_channel','reserved_room_type','assigned_room_type',
            'deposit_type','customer_type','reservation_status']
col_stringwmonth = col_string+['arrival_date_month']


# col_num = list of new column names being changed into numeric
col_num=[x+"_NUMERIC" for x in col_string]
# don't need to change dates into numeric col_num=col_num+['arrival_date_year','arrival_date_day_of_month']

# col_oh = list of columns being one-hot encoded
col_oh=[x+"_oh" for x in col_string]
# dont need to ohencode dates col_oh=col_oh+['arrival_date_year_oh','arrival_date_day_of_month_oh']

In [9]:
indexers = [StringIndexer(inputCol=column, outputCol=column+"_NUMERIC").fit(df2) for column in col_stringwmonth]
pipeline = Pipeline(stages=indexers)
df_indexed = pipeline.fit(df2).transform(df2)

In [10]:
#one-hot encode all columns in col_num
ohe = OneHotEncoder(dropLast=False)
ohe.setInputCols(col_num)
ohe.setOutputCols(col_oh)
model = ohe.fit(df_indexed)

df_casted=model.transform(df_indexed)

In [13]:
# drop the original, non-ohencoded variables
df_encoded=df_casted.drop(*col_stringwmonth)
df_encoded=df_encoded.drop(*col_num)

In [14]:
df_encoded.take(1)

[Row(is_canceled=1, lead_time=85, arrival_date_year=2015, arrival_date_day_of_month=1, stays_in_weekend_nights=0, stays_in_week_nights=3, adults=2, children=0, babies=0, is_repeated_guest=0, previous_cancellations=0, previous_bookings_not_canceled=0, booking_changes=0, days_in_waiting_list=0, adr=82.0, required_car_parking_spaces=0, total_of_special_requests=1, arrival_date_month_NUMERIC=1.0, distribution_channel_oh=SparseVector(5, {0: 1.0}), customer_type_oh=SparseVector(4, {0: 1.0}), reservation_status_oh=SparseVector(3, {1: 1.0}), market_segment_oh=SparseVector(8, {0: 1.0}), reserved_room_type_oh=SparseVector(10, {0: 1.0}), assigned_room_type_oh=SparseVector(12, {0: 1.0}), meal_oh=SparseVector(5, {0: 1.0}), hotel_oh=SparseVector(2, {1: 1.0}), deposit_type_oh=SparseVector(3, {0: 1.0}))]

In [16]:
df_encoded.printSchema()

root
 |-- is_canceled: integer (nullable = true)
 |-- lead_time: integer (nullable = true)
 |-- arrival_date_year: integer (nullable = true)
 |-- arrival_date_day_of_month: integer (nullable = true)
 |-- stays_in_weekend_nights: integer (nullable = true)
 |-- stays_in_week_nights: integer (nullable = true)
 |-- adults: integer (nullable = true)
 |-- children: integer (nullable = true)
 |-- babies: integer (nullable = true)
 |-- is_repeated_guest: integer (nullable = true)
 |-- previous_cancellations: integer (nullable = true)
 |-- previous_bookings_not_canceled: integer (nullable = true)
 |-- booking_changes: integer (nullable = true)
 |-- days_in_waiting_list: integer (nullable = true)
 |-- adr: double (nullable = true)
 |-- required_car_parking_spaces: integer (nullable = true)
 |-- total_of_special_requests: integer (nullable = true)
 |-- arrival_date_month_NUMERIC: double (nullable = false)
 |-- distribution_channel_oh: vector (nullable = true)
 |-- customer_type_oh: vector (nullab

In [17]:
df_encoded.select(['is_canceled', 'lead_time','arrival_date_year','arrival_date_day_of_month']).show(2)

+-----------+---------+-----------------+-------------------------+
|is_canceled|lead_time|arrival_date_year|arrival_date_day_of_month|
+-----------+---------+-----------------+-------------------------+
|          1|       85|             2015|                        1|
|          1|       75|             2015|                        1|
+-----------+---------+-----------------+-------------------------+
only showing top 2 rows



In [18]:
# removed normalization step for now
from pyspark.ml.feature import VectorAssembler
from pyspark.mllib.linalg import Vectors
from pyspark.ml.feature import Normalizer

# norm_to_columns = ['lead_time', 'stays_in_weekend_nights', 'stays_in_week_nights', 'adults', 'children', 'babies', 
#                 'previous_cancellations', 'previous_bookings_not_canceled', 'booking_changes', 'adr', 
#                 'required_car_parking_spaces', 'total_of_special_requests']

# # combine all to-norm columns into one vector named "norm_features"
# assembler = VectorAssembler(inputCols=norm_to_columns, outputCol="norm_features")
# transformed = assembler.transform(df_encoded) 
# transformed = transformed.drop(*norm_to_columns)

# # in the end, new column named normFeatures combined all features that are normalized
# normalizer = Normalizer(inputCol="norm_features", outputCol="normFeatures", p=1.0)
# l1NormData = normalizer.transform(transformed)
# l1NormData = l1NormData.drop(*norm_to_columns + ['norm_features'])
# print("Normalized using L^1 norm")

In [19]:
# combine all feature columns (non-label columns) into one vector named "feature"
assembler = VectorAssembler(inputCols=df_encoded.columns[1:], outputCol="features")
all_transformed = assembler.transform(df_encoded)

# convert to rdd with 2 columns, label and features, where features is a DenseVector combination of all other features
dataRdd = all_transformed.select(col("is_canceled").alias("label"), col("features")).rdd.map(tuple)

In [20]:
dataRdd.take(1)

[(1,
  SparseVector(69, {0: 85.0, 1: 2015.0, 2: 1.0, 4: 3.0, 5: 2.0, 13: 82.0, 15: 1.0, 16: 1.0, 17: 1.0, 22: 1.0, 27: 1.0, 29: 1.0, 37: 1.0, 47: 1.0, 59: 1.0, 65: 1.0, 66: 1.0}))]

In [21]:
from pyspark.mllib.regression import LabeledPoint

# map features as floats, then input all into a DenseVector, as well as mapping labels as floats
# then map into LabeledPoint object
lp = dataRdd.map(lambda row: (float(row[0]), Vectors.dense([float(c) for c in row[1]])))\
            .map(lambda row: LabeledPoint(row[0], row[1]))
lp.take(2)

[LabeledPoint(1.0, [85.0,2015.0,1.0,0.0,3.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,82.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0]),
 LabeledPoint(1.0, [75.0,2015.0,1.0,0.0,3.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,105.5,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0])]

In [22]:
# 70/30 test train split
seed = 42
train, test = lp.randomSplit([0.7, 0.3], seed=seed)

In [23]:
train.take(2)

[LabeledPoint(1.0, [85.0,2015.0,1.0,0.0,3.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,82.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0]),
 LabeledPoint(1.0, [75.0,2015.0,1.0,0.0,3.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,105.5,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0])]

In [24]:
test.take(2)

[LabeledPoint(1.0, [47.0,2015.0,2.0,2.0,5.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,153.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0]),
 LabeledPoint(1.0, [71.0,2015.0,3.0,0.0,2.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,110.3,0.0,2.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0])]

In [25]:
# use these variables in place of test/train.count() bc those methods take a long time, instead just call once
test_count = test.count()
train_count = train.count()

In [26]:
print("Training Dataset Count: " + str(train_count))
print("Test Dataset Count: " + str(test_count))

Training Dataset Count: 62007
Test Dataset Count: 26513


In [27]:
from pyspark.mllib.classification import LogisticRegressionWithLBFGS

# train logistic regression model with training data
model = LogisticRegressionWithLBFGS.train(train)

In [28]:
# Evaluating the model on training data
labelsAndPreds_tr = train.map(lambda p: (p.label, model.predict(p.features)))
accuracy_tr = 1.0 * labelsAndPreds_tr.filter(lambda pl: pl[0] == pl[1]).count() / train_count
print('model accuracy (train): {}'.format(accuracy_tr))

model accuracy (train): 1.0


In [29]:
# Evaluating the model on test data
labelsAndPreds_te = test.map(lambda p: (p.label, model.predict(p.features)))
accuracy_te = 1.0 * labelsAndPreds_te.filter(lambda pl: pl[0] == pl[1]).count() / test_count
print('model accuracy (test): {}'.format(accuracy_te))

model accuracy (test): 1.0


In [30]:
# Save notebook as PDF document
!jupyter nbconvert --to pdf `pwd`/modeling.ipynb

[NbConvertApp] Converting notebook /home/jovyan/assignments/facebook.ipynb to pdf


----------------------------------------
Exception happened during processing of request from ('127.0.0.1', 53326)
Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/socketserver.py", line 316, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/opt/conda/lib/python3.7/socketserver.py", line 347, in process_request
    self.finish_request(request, client_address)
  File "/opt/conda/lib/python3.7/socketserver.py", line 360, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/opt/conda/lib/python3.7/socketserver.py", line 720, in __init__
    self.handle()
  File "/opt/conda/lib/python3.7/site-packages/pyspark/accumulators.py", line 268, in handle
    poll(accum_updates)
  File "/opt/conda/lib/python3.7/site-packages/pyspark/accumulators.py", line 241, in poll
    if func():
  File "/opt/conda/lib/python3.7/site-packages/pyspark/accumulators.py", line 245, in accum_updates
    num_updates = read_int(sel

[NbConvertApp] Writing 73122 bytes to ./notebook.tex
[NbConvertApp] Building PDF
[NbConvertApp] Running xelatex 3 times: ['xelatex', './notebook.tex', '-quiet']
[NbConvertApp] Running bibtex 1 time: ['bibtex', './notebook']
[NbConvertApp] PDF successfully created
[NbConvertApp] Writing 71713 bytes to /home/jovyan/assignments/facebook.pdf
[NbConvertApp] Converting notebook /home/jovyan/assignments/hotel_booking.ipynb to pdf
[NbConvertApp] Support files will be in hotel_booking_files/
[NbConvertApp] Making directory ./hotel_booking_files
[NbConvertApp] Making directory ./hotel_booking_files
[NbConvertApp] Making directory ./hotel_booking_files
[NbConvertApp] Making directory ./hotel_booking_files
[NbConvertApp] Writing 127242 bytes to ./notebook.tex
[NbConvertApp] Building PDF
[NbConvertApp] Running xelatex 3 times: ['xelatex', './notebook.tex', '-quiet']
[NbConvertApp] Running bibtex 1 time: ['bibtex', './notebook']
[NbConvertApp] PDF successfully created
[NbConvertApp] Writing 368945 b