In [5]:
# import modules
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession

# Start (or reuse) a local Spark session named "hotel" with master "local[4]",
# and keep a handle on its SparkContext for RDD work later on.
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName("hotel")
    .getOrCreate()
)
sc = spark.sparkContext

# Load the bookings CSV: first row is the header, column types are inferred.
df = spark.read.csv('hotel_bookings.csv', header=True, inferSchema=True)

In [6]:
from pyspark.sql.functions import col

# Turn the literal strings "NULL" and "NA" into real null values.
df_withNull = (
    df
    .replace('NULL', None)
    .replace('NA', None)
)

# Clean-up chain:
#   1. fill nulls in 'children' with 0 (the original author notes there are only 4)
#   2. cast 'children' to an integer column
#   3. drop 'agent' and 'company' (high null count, per the original author),
#      and also drop 'country' and 'arrival_date_week_number'
df2 = (
    df_withNull
    .fillna({'children': 0})
    .withColumn('children', col('children').cast("Int"))
    .drop('agent', 'company', 'country', 'arrival_date_week_number')
)

In [7]:
# 'reservation_status_date' holds the date on which the last reservation status
# was set.
#
# TODO: replace it with a numeric feature, "arrival date minus
# reservation_status_date" in days. That requires:
#   - casting reservation_status_date to DateType (pyspark.sql.types.DateType)
#   - combining arrival_date_year, arrival_date_month and
#     arrival_date_day_of_month into a single column and casting it to DateType
#
# Until that is implemented the column is simply dropped.
df2 = df2.drop('reservation_status_date')

In [8]:
#numerically encode all columns of type string
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, OneHotEncoderModel, StringIndexer

# String-typed columns that get indexed AND one-hot encoded.
col_string = [
    'hotel',
    'meal',
    'market_segment',
    'distribution_channel',
    'reserved_room_type',
    'assigned_room_type',
    'deposit_type',
    'customer_type',
    'reservation_status',
]
# NOTE(review): 'reservation_status' looks like it may encode the cancellation
# outcome directly -- confirm it is acceptable as a predictor of is_canceled.

# Same list plus the month name, which is indexed but (by design of the
# original author) not one-hot encoded.
col_stringwmonth = list(col_string)
col_stringwmonth.append('arrival_date_month')

# Names of the indexed ("<col>_NUMERIC") and one-hot ("<col>_oh") columns,
# one per entry of col_string. The date columns are deliberately left out.
col_num = ["{}_NUMERIC".format(name) for name in col_string]
col_oh = ["{}_oh".format(name) for name in col_string]

In [9]:
def _fit_indexer(name):
    """Fit a StringIndexer on df2 that maps column `name` to `name + "_NUMERIC"`."""
    return StringIndexer(inputCol=name, outputCol=name + "_NUMERIC").fit(df2)


# One fitted indexer per string column (including the month name).
indexers = list(map(_fit_indexer, col_stringwmonth))

# Chain the fitted indexers and apply them all to df2.
pipeline = Pipeline(stages=indexers)
indexing_model = pipeline.fit(df2)
df_indexed = indexing_model.transform(df2)

In [10]:
# One-hot encode every indexed column in col_num into the matching col_oh
# column. dropLast=False is passed so the last category is not dropped.
# Note: the name `model` is reassigned later in the notebook to the
# logistic-regression model.
ohe = OneHotEncoder(inputCols=col_num, outputCols=col_oh, dropLast=False)
model = ohe.fit(df_indexed)
df_casted = model.transform(df_indexed)

In [11]:
# Remove the original string columns and their intermediate "_NUMERIC" indices,
# leaving only the one-hot versions. arrival_date_month_NUMERIC is not in
# col_num, so it is kept.
df_encoded = df_casted.drop(*(col_stringwmonth + col_num))

Quick summary of the cells below: we start with `df_encoded`, a DataFrame in which every variable is numerically encoded and the categorical variables are one-hot encoded. `VectorAssembler` combines all feature columns into a single vector column named "features", and the result is converted to an RDD of (label, features) pairs. That RDD is then converted back to a DataFrame so the features can be normalized; the intent is that this should only change the continuous variables, since normalizing the categorical variables should do nothing (note, however, that `Normalizer` rescales each row's whole feature vector, so this assumption should be verified). Next, the normalized DataFrame is converted back to an RDD so that it can be mapped into `LabeledPoint` objects. Finally, `lp` is the RDD of `LabeledPoint` objects, each holding a label and its features; from here on, use `lp` for modeling.

In [12]:
from pyspark.ml.feature import VectorAssembler
from pyspark.mllib.linalg import Vectors

# Target column. Every other column of df_encoded is used as a feature.
# Selecting by name (instead of the positional `columns[1:]`) keeps this correct
# even if the label is not the first column of df_encoded.
label_col = "is_canceled"
feature_cols = [c for c in df_encoded.columns if c != label_col]

# NOTE(review): feature_cols includes reservation_status_oh, derived from
# 'reservation_status'; that column may reveal the label -- confirm it belongs here.

# Combine all feature columns into a single vector column named "features".
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
transformed = assembler.transform(df_encoded)

# Convert to an RDD of (label, features) tuples, where `features` is the
# assembled vector of all non-label columns.
dataRdd = (
    transformed
    .select(col(label_col).alias("label"), col("features"))
    .rdd
    .map(tuple)
)

In [13]:
from pyspark.ml.feature import Normalizer

# Rebuild a DataFrame from the (label, features) RDD so Normalizer can be applied.
df_transformed = dataRdd.toDF(['label', 'features'])

# Normalize the "features" vectors with p=1.0 (L^1 norm) into "normFeatures".
# NOTE(review): Normalizer scales each row's vector to unit norm; it is not a
# per-feature (column-wise) scaler -- confirm this is the intended transform.
normalizer = Normalizer(p=1.0, inputCol="features", outputCol="normFeatures")
l1NormData = normalizer.transform(df_transformed)

print("Normalized using L^1 norm")
l1NormData.show()

# Back to an RDD of (label, normFeatures) tuples for the mllib API.
normRdd = l1NormData.select("label", "normFeatures").rdd.map(tuple)

Normalized using L^1 norm
+-----+--------------------+--------------------+
|label|            features|        normFeatures|
+-----+--------------------+--------------------+
|    0|(69,[0,1,2,5,11,1...|(69,[0,1,2,5,11,1...|
|    0|(69,[0,1,2,5,11,1...|(69,[0,1,2,5,11,1...|
|    0|(69,[0,1,2,4,5,13...|(69,[0,1,2,4,5,13...|
|    0|(69,[0,1,2,4,5,13...|(69,[0,1,2,4,5,13...|
|    0|(69,[0,1,2,4,5,13...|(69,[0,1,2,4,5,13...|
|    0|(69,[0,1,2,4,5,13...|(69,[0,1,2,4,5,13...|
|    0|(69,[1,2,4,5,13,1...|(69,[1,2,4,5,13,1...|
|    0|(69,[0,1,2,4,5,13...|(69,[0,1,2,4,5,13...|
|    1|(69,[0,1,2,4,5,13...|(69,[0,1,2,4,5,13...|
|    1|(69,[0,1,2,4,5,13...|(69,[0,1,2,4,5,13...|
|    1|(69,[0,1,2,4,5,13...|(69,[0,1,2,4,5,13...|
|    0|(69,[0,1,2,4,5,13...|(69,[0,1,2,4,5,13...|
|    0|(69,[0,1,2,4,5,13...|(69,[0,1,2,4,5,13...|
|    0|(69,[0,1,2,4,5,6,...|(69,[0,1,2,4,5,6,...|
|    0|(69,[0,1,2,4,5,13...|(69,[0,1,2,4,5,13...|
|    0|(69,[0,1,2,4,5,13...|(69,[0,1,2,4,5,13...|
|    0|(69,[0,1,2,4,5,13

In [15]:
from pyspark.mllib.regression import LabeledPoint

def _to_labeled_point(row):
    """Convert a (label, feature-vector) tuple into an mllib LabeledPoint.

    The label and every feature value are cast to float, and the feature
    values are packed into a dense vector.
    """
    dense_features = Vectors.dense([float(value) for value in row[1]])
    return LabeledPoint(float(row[0]), dense_features)


lp = normRdd.map(_to_labeled_point)

# The many zeros in the output are expected: show() above prints the vectors in
# sparse form (size, indices, ...), whereas Vectors.dense writes out every slot.
# NOTE(review): the printed values (e.g. 342.0, 2015.0) do not look normalized;
# check that normRdd was up to date when this cell was last executed.
lp.take(1)

[LabeledPoint(0.0, [342.0,2015.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0])]

In [16]:
# Random split with weights 0.7 (train) / 0.3 (test); the seed is fixed so the
# split is repeatable.
seed = 42
train, test = lp.randomSplit(weights=[0.7, 0.3], seed=seed)

# Row count of the training split (slow to compute, so left disabled):
# train.count()

83649

In [None]:
from pyspark.mllib.classification import LogisticRegressionWithLBFGS

# Fit a logistic regression model (L-BFGS) on the training split.
# Note: this rebinds `model`, which earlier held the fitted one-hot encoder.
model = LogisticRegressionWithLBFGS.train(train)

# Pair each true label with the model's prediction for the same training row.
labelsAndPreds = train.map(lambda p: (p.label, model.predict(p.features)))

# Training error = fraction of training rows whose prediction differs from the label.
# Fix: RDD.count() takes no arguments; the original `train.count(())` raised a TypeError.
# The lambda parameter is named `pair` so it no longer shadows the global `lp` RDD.
n_train = train.count()
n_wrong = labelsAndPreds.filter(lambda pair: pair[0] != pair[1]).count()
trainErr = n_wrong / float(n_train)