In [1]:
# import modules
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession

# build spark session and spark context
spark = SparkSession.builder \
        .master("local[4]") \
        .appName("hotel") \
        .getOrCreate()
sc = spark.sparkContext

df = spark.read.csv('hotel_bookings.csv',  inferSchema=True, header = True)

In [25]:
# load modules
from pyspark.sql import SparkSession
from pyspark.mllib.classification import LogisticRegressionWithLBFGS, LogisticRegressionModel
from pyspark.mllib.regression import LabeledPoint
from pyspark.ml.feature import VectorAssembler 
from pyspark.mllib.linalg import Vectors
from pyspark.sql.functions import col 
from pyspark.mllib.evaluation import MulticlassMetrics

import os


In [2]:
#Parameters for defining code below
target = 'is_canceled'
cancel_label = 1
noncancel_label = 0

In [3]:
#Balancing a DataFrame with Downsampling

def downsample(df, target, cancel_label, noncancel_label):
    """
    df               spark dataframe
    target           str, target variable
    cancel_label     int, value of canceled booking
    noncancel_label  int, value of non-canceled booking
    
    """

    ### ENTER CODE HERE
    
    from pyspark.sql.functions import col
    
    #count of canceled and non-canceled labels
    cancel_n = df.filter(col(target) == cancel_label).count()
    noncancel_n = df.filter(col(target) == noncancel_label).count()
    
    #df split by having either the poitive or negative labels
    df_cancel = df.filter(col(target) == cancel_label)
    df_noncancel = df.filter(col(target) == noncancel_label)
    
    
    if cancel_n > noncancel_n:
        #amount to sample from is fraction of low noncancel/full cancel
        df_a = df_cancel.sample(fraction = (noncancel_n/cancel_n))
        #combine df_cancel sample with full df_noncancel
        df_b = df_noncancel.union(df_a)
    elif noncancel_n > cancel_n:
        #amount to sample from is fraction of low cancel/full non-cancel
        df_a = df_noncancel.sample(fraction = (cancel_n/noncancel_n))
        #combine df_noncancel sample with full df_cancel
        df_b = df_cancel.union(df_a)
    else:
        #if count of df_cancel = df_noncancel, then just use original df
        df_b = df

    return df_b

In [4]:
# Call your downsample function here, and show the count by label
df_downsample = downsample(df, target, cancel_label, noncancel_label)
df_downsample.groupBy(target).count().show()

+-----------+-----+
|is_canceled|count|
+-----------+-----+
|          1|44224|
|          0|44125|
+-----------+-----+



In [5]:
from pyspark.sql.functions import col

# replace the strings "NULL" and "NA" with null value
df_withNull = df_downsample.replace('NULL', None).replace('NA', None)

# replace null values in 'children' to 0 since there are only 4
df2 = df_withNull.fillna({'children':0})

# replace 'children' datatype to int
df2 = df2.withColumn('children', col('children').cast("Int"))

# drop 'company' and 'agent' due to high null count
df2 = df2.drop('agent', 'company','country', 'arrival_date_week_number', 'reservation_status')

In [6]:
# addressing reservation_status_date, which gives the date at which the last 
# reservation status was set. I'm transforming it into number of days since reservation
# status was set, so day of arrival - reservation_status_date
# from pyspark.sql.types import DateType

# convert reservation_status_date into datetype dtype
# temp = df2.withColumn("reservation_status_date", df2["reservation_status_date"].cast(DateType()))

# need to combine arrival_date_year, arrival_date_month, and arrival_date_day_of_month
# into one column and cast it to DateType, then replacing reservation_status_date column
# with number of days since reservation.
# until then, dropping reservation_status_date
df2 = df2 .drop('reservation_status_date')

In [7]:
#numerically encode all columns of type string
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, OneHotEncoderModel, StringIndexer

# list of columns to numerically encode
col_string=['hotel', 'meal','market_segment',
            'distribution_channel','reserved_room_type','assigned_room_type',
            'deposit_type','customer_type']
col_stringwmonth = col_string+['arrival_date_month']


# col_num = list of new column names being changed into numeric
col_num=[x+"_NUMERIC" for x in col_string]
# don't need to change dates into numeric col_num=col_num+['arrival_date_year','arrival_date_day_of_month']

# col_oh = list of columns being one-hot encoded
col_oh=[x+"_oh" for x in col_string]
# dont need to ohencode dates col_oh=col_oh+['arrival_date_year_oh','arrival_date_day_of_month_oh']

In [8]:
indexers = [StringIndexer(inputCol=column, outputCol=column+"_NUMERIC").fit(df2) for column in col_stringwmonth]
pipeline = Pipeline(stages=indexers)
df_indexed = pipeline.fit(df2).transform(df2)

In [9]:
#one-hot encode all columns in col_num
ohe = OneHotEncoder(dropLast=False)
ohe.setInputCols(col_num)
ohe.setOutputCols(col_oh)
model = ohe.fit(df_indexed)

df_casted=model.transform(df_indexed)

In [10]:
# drop the original, non-ohencoded variables
df_encoded=df_casted.drop(*col_stringwmonth)
df_encoded=df_encoded.drop(*col_num)

In [11]:
#removed normalization step for now
from pyspark.ml.feature import VectorAssembler
from pyspark.mllib.linalg import Vectors
from pyspark.ml.feature import Normalizer

norm_to_columns = ['lead_time', 'stays_in_weekend_nights', 'stays_in_week_nights', 'adults', 'children', 'babies', 
                'previous_cancellations', 'previous_bookings_not_canceled', 'booking_changes', 'adr', 
                'required_car_parking_spaces', 'total_of_special_requests']

# combine all to-norm columns into one vector named "norm_features"
assembler = VectorAssembler(inputCols=norm_to_columns, outputCol="norm_features")
transformed = assembler.transform(df_encoded) 
transformed = transformed.drop(*norm_to_columns)

# in the end, new column named normFeatures combined all features that are normalized
normalizer = Normalizer(inputCol="norm_features", outputCol="normFeatures", p=1.0)
l1NormData = normalizer.transform(transformed)
l1NormData = l1NormData.drop(*norm_to_columns + ['norm_features'])
print("Normalized using L^1 norm")

Normalized using L^1 norm


In [12]:
# combine all feature columns (non-label columns) into one vector named "feature"
assembler = VectorAssembler(inputCols=df_encoded.columns[1:], outputCol="features")
all_transformed = assembler.transform(df_encoded)

# convert to rdd with 2 columns, label and features, where features is a DenseVector combination of all other features
dataRdd = all_transformed.select(col("is_canceled").alias("label"), col("features")).rdd.map(tuple)

In [13]:
from pyspark.mllib.regression import LabeledPoint

# map features as floats, then input all into a DenseVector, as well as mapping labels as floats
# then map into LabeledPoint object
lp = dataRdd.map(lambda row: (float(row[0]), Vectors.dense([float(c) for c in row[1]])))\
            .map(lambda row: LabeledPoint(row[0], row[1]))
lp.take(1)

[LabeledPoint(1.0, [85.0,2015.0,1.0,0.0,3.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,82.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0])]

In [14]:
# 70/30 test train split
seed = 314
train, test = lp.randomSplit([0.7, 0.3], seed=seed)

In [15]:
# use these variables in place of test/train.count() bc those methods take a long time, instead just call once
test_count = test.count()
train_count = train.count()

### Logistic Regression

In [16]:
from pyspark.mllib.classification import LogisticRegressionWithLBFGS

# train logistic regression model with training data
model = LogisticRegressionWithLBFGS.train(train)

In [17]:
# Evaluating the model on training data
labelsAndPreds_tr = train.map(lambda p: (p.label, model.predict(p.features)))
accuracy_tr = 1.0 * labelsAndPreds_tr.filter(lambda pl: pl[0] == pl[1]).count() / train_count
print('model accuracy (train): {}'.format(accuracy_tr))

model accuracy (train): 0.7672086063253256


In [18]:
# Evaluating the model on test data
labelsAndPreds_te = test.map(lambda p: (p.label, model.predict(p.features)))
accuracy_te = 1.0 * labelsAndPreds_te.filter(lambda pl: pl[0] == pl[1]).count() / test_count
print('model accuracy (test): {}'.format(accuracy_te))

model accuracy (test): 0.7655460918067385


In [None]:
# Part 5: Model Evaluation

In [38]:
# Part 5: Model Evaluation


from pyspark.mllib.classification import LogisticRegressionWithLBFGS
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.mllib.regression import LabeledPoint

# Compute raw scores on the test set
predictionAndLabels_te = test.map(lambda lp: (float(model.predict(lp.features)), lp.label))

# Instantiate metrics object
metrics_te = BinaryClassificationMetrics(predictionAndLabels_te)

In [39]:
# Area under precision-recall curve for test data
print("Area under PR for testing = %s" % metrics_te.areaUnderPR)

# Area under ROC curve for test data
print("Area under ROC for testing = %s" % metrics_te.areaUnderROC)

Area under PR for testing = 0.7402845326306933
Area under ROC for testing = 0.7654418356937008


In [40]:
# Calculating the confusion matrix for test data
metrics_matrix_te = MulticlassMetrics(predictionAndLabels_te)
labelsAndPreds_te.take(3)
print("Confusion Matrix:\n{}".format(metrics_matrix_te.confusionMatrix().toArray()))

Confusion Matrix:
[[10496.  2834.]
 [ 3387.  9817.]]


In [42]:
from pyspark.mllib.classification import LogisticRegressionWithLBFGS
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.mllib.regression import LabeledPoint

# Compute raw scores on the training set
predictionAndLabels_tr = train.map(lambda lp: (float(model.predict(lp.features)), lp.label))

# Instantiate metrics object
metrics_tr = BinaryClassificationMetrics(predictionAndLabels_tr)

In [43]:
# Area under precision-recall curve for training data
print("Area under PR for training set = %s" % metrics_tr.areaUnderPR)

# Area under ROC curve for training data
print("Area under ROC for training set = %s" % metrics_tr.areaUnderROC)

Area under PR for training set = 0.7455078105853685
Area under ROC for training set = 0.7672906689680274


In [44]:
# Calculating the confusion matrix for training data
metrics_matrix_tr = MulticlassMetrics(predictionAndLabels_tr)
labelsAndPreds_tr.take(3)
print("Confusion Matrix:\n{}".format(metrics_matrix_tr.confusionMatrix().toArray()))

Confusion Matrix:
[[24323.  6472.]
 [ 7918. 23102.]]
