In [1]:
import numpy as np
import pandas as pd 

In [2]:
# Location of the raw bookings export, relative to the notebook's directory.
data_path = "../data/hotel_bookings.csv"

# Read the whole CSV into the working frame used by every later cell.
df = pd.read_csv(filepath_or_buffer=data_path)

In [3]:
# Replace missing values:
# agent: If no agency is given, booking was most likely made without one.
# company: If none given, it was most likely private.
# children / country: a missing count becomes 0.0, a missing country "Unknown".
# NOTE: DataFrame.fillna silently skips dict keys that are not column names,
# so every key below must match its column name exactly.
nan_replacements = {"children": 0.0, "country": "Unknown", "agent": 0, "company": 0}
df = df.fillna(nan_replacements)

In [4]:
# "meal" contains values "Undefined", which is equal to SC.
# Assign the result back instead of calling replace(..., inplace=True) on the
# df["meal"] selection: that form is chained in-place mutation and does not
# update df when pandas Copy-on-Write is active.
df["meal"] = df["meal"].replace("Undefined", "SC")

In [5]:
# Some rows contain entries with 0 adults, 0 children and 0 babies.
# I'm dropping these entries with no guests.
# The rows are removed with a boolean mask; index labels are not reused as
# positions, which would only be correct while labels equal row positions.
# Rows whose guest total is NaN do not compare equal to 0 and are kept.
no_guests_mask = (df["adults"] + df["children"] + df["babies"]) == 0
zero_guests = list(df.loc[no_guests_mask].index)  # labels of the removed rows
# .copy() gives an independent frame so later column assignments do not warn.
df = df.loc[~no_guests_mask].copy()

In [6]:
# Hand-picked model inputs.
# Deliberately left out, to keep the model general and to prevent leakage:
#   arrival_date_year, assigned_room_type, booking_changes, reservation_status,
#   country, days_in_waiting_list, hotel
# Including the country would increase accuracy,
# but it may also make the model less general and not fair.

# Numeric columns (two of them, total_nights and adr_pp, are engineered columns).
num_features = [
    "lead_time",
    "arrival_date_week_number",
    "arrival_date_day_of_month",
    "stays_in_weekend_nights",
    "stays_in_week_nights",
    "total_nights",
    "adults",
    "children",
    "babies",
    "is_repeated_guest",
    "previous_cancellations",
    "previous_bookings_not_canceled",
    "agent",
    "company",
    "required_car_parking_spaces",
    "total_of_special_requests",
    "adr",
    "adr_pp",
]

# Categorical columns.
cat_features = [
    "arrival_date_month",
    "meal",
    "market_segment",
    "distribution_channel",
    "reserved_room_type",
    "deposit_type",
    "customer_type",
]

# All input columns, numeric first; the predicted value is not part of this list.
features = [*num_features, *cat_features]

In [7]:
# create new features
# adr_pp: adr divided by the number of adults and children (babies not counted).
# A booking can have babies but no adults or children, which makes the divisor 0;
# masking non-positive divisors yields NaN for those rows instead of +/-inf.
adults_and_children = df["adults"] + df["children"]
df["adr_pp"] = df["adr"] / adults_and_children.where(adults_and_children > 0)
# total_nights: weekend nights plus week nights of the stay.
df["total_nights"] = df["stays_in_weekend_nights"] + df["stays_in_week_nights"]

In [8]:
# Keep only the chosen input columns, followed by the is_canceled column.
df = df.loc[:, [*features, "is_canceled"]]

In [9]:
# Display (rows, columns) of the frame after column selection.
df.shape

(119210, 26)

In [11]:
# Persist the preprocessed frame; the row index is not written to the file.
df.to_csv(path_or_buf="../data/preprocessed_data.csv", index=False)
