# Pre-Processing

## Import dataset and library

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv('/content/drive/MyDrive/hotel_bookings_data.csv')

In [2]:
pd.set_option('display.max_columns', None)

## Handling Duplicated Value

In [3]:
df.duplicated().sum()

33261

In [4]:
df = df.drop_duplicates()
df.duplicated().sum()

0

## Handling NULL Values

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 86129 entries, 0 to 119389
Data columns (total 29 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   hotel                           86129 non-null  object 
 1   is_canceled                     86129 non-null  int64  
 2   lead_time                       86129 non-null  int64  
 3   arrival_date_year               86129 non-null  int64  
 4   arrival_date_month              86129 non-null  object 
 5   arrival_date_week_number        86129 non-null  int64  
 6   arrival_date_day_of_month       86129 non-null  int64  
 7   stays_in_weekend_nights         86129 non-null  int64  
 8   stays_in_weekdays_nights        86129 non-null  int64  
 9   adults                          86129 non-null  int64  
 10  children                        86125 non-null  float64
 11  babies                          86129 non-null  int64  
 12  meal                           

Columns that have null values
- City
- Agent
- Company

In [14]:
df[df.city.isnull()].head()
#City Column, which has a null value, will be ignored because the proportion is small and will be visualized.

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_weekdays_nights,adults,children,babies,meal,city,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,booking_changes,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status
30,Resort Hotel,0,118,2017,September,27,1,4,10,1,0.0,0,Breakfast,,Direct,Direct,0,0,0,2,No Deposit,,,0,Personal,62.0,0,2,Check-Out
4127,Resort Hotel,1,0,2018,April,8,15,0,0,0,0.0,0,No Meal,,Offline TA/TO,TA/TO,0,0,0,0,No Deposit,,383.0,0,Personal,0.0,0,0,Canceled
7092,Resort Hotel,1,8,2018,September,30,21,0,1,1,0.0,0,Breakfast,,Corporate,Corporate,0,0,0,0,No Deposit,,204.0,0,Personal,73.0,0,2,Canceled
7860,Resort Hotel,1,39,2018,October,36,30,0,5,2,0.0,0,Dinner,,Direct,Direct,0,0,0,0,No Deposit,,,0,Personal,159.0,0,5,Canceled
8779,Resort Hotel,1,0,2018,December,42,13,0,1,1,0.0,0,Breakfast,,Corporate,Corporate,0,0,0,0,No Deposit,,457.0,0,Personal,50.0,0,0,Canceled


In [10]:
df[df.agent.isnull()].head()
#Agent column, which is null, will be replaced with 0, assuming the hotel booking is not through an agent.

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_weekdays_nights,adults,children,babies,meal,city,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,booking_changes,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status
0,Resort Hotel,0,342,2017,September,27,1,0,0,2,0.0,0,Breakfast,Kota Denpasar,Direct,Direct,0,0,0,3,No Deposit,,,0,Personal,0.0,0,0,Check-Out
1,Resort Hotel,0,737,2017,September,27,1,0,0,2,0.0,0,Breakfast,Kota Denpasar,Direct,Direct,0,0,0,4,No Deposit,,,0,Personal,0.0,0,0,Check-Out
2,Resort Hotel,0,7,2017,September,27,1,0,1,1,0.0,0,Breakfast,Kabupaten Bangka,Direct,Direct,0,0,0,0,No Deposit,,,0,Personal,75.0,0,0,Check-Out
6,Resort Hotel,0,0,2017,September,27,1,0,2,2,0.0,0,Breakfast,Kota Denpasar,Direct,Direct,0,0,0,0,No Deposit,,,0,Personal,107.0,0,0,Check-Out
18,Resort Hotel,0,0,2017,September,27,1,0,1,2,0.0,0,Breakfast,Kota Yogyakarta,Corporate,Corporate,0,0,0,0,No Deposit,,110.0,0,Personal,107.42,0,0,Check-Out


In [11]:
df[df.company.isnull()].head()
#The null Company column will be replaced with 0, assuming the hotel booking is not on behalf of the company.

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_weekdays_nights,adults,children,babies,meal,city,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,booking_changes,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status
0,Resort Hotel,0,342,2017,September,27,1,0,0,2,0.0,0,Breakfast,Kota Denpasar,Direct,Direct,0,0,0,3,No Deposit,,,0,Personal,0.0,0,0,Check-Out
1,Resort Hotel,0,737,2017,September,27,1,0,0,2,0.0,0,Breakfast,Kota Denpasar,Direct,Direct,0,0,0,4,No Deposit,,,0,Personal,0.0,0,0,Check-Out
2,Resort Hotel,0,7,2017,September,27,1,0,1,1,0.0,0,Breakfast,Kabupaten Bangka,Direct,Direct,0,0,0,0,No Deposit,,,0,Personal,75.0,0,0,Check-Out
3,Resort Hotel,0,13,2017,September,27,1,0,1,1,0.0,0,Breakfast,Kabupaten Bangka,Corporate,Corporate,0,0,0,0,No Deposit,304.0,,0,Personal,75.0,0,0,Check-Out
4,Resort Hotel,0,14,2017,September,27,1,0,2,2,0.0,0,Breakfast,Kabupaten Bangka,Online TA,TA/TO,0,0,0,0,No Deposit,240.0,,0,Personal,98.0,0,1,Check-Out


In [15]:
#Fill Null Values

df['agent'] = df['agent'].fillna(0)
df['company'] = df['company'].fillna(0)

## Extract Columns

In [27]:
# Total Guest = Adults + Childrens + Babies
df['total_guest'] = df['adults'] + df['children'] + df['babies']
df.drop(['adults', 'children', 'babies'], axis=1, inplace=True)

## Checking Descriptive Statistics

In [29]:
#Numeric
df.describe()

Unnamed: 0,is_canceled,lead_time,arrival_date_year,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_weekdays_nights,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,booking_changes,agent,company,days_in_waiting_list,adr,required_car_parking_spaces,total_of_special_requests,total_guest
count,86129.0,86129.0,86129.0,86129.0,86129.0,86129.0,86129.0,86129.0,86129.0,86129.0,86129.0,86129.0,86129.0,86129.0,86129.0,86129.0,86129.0,86125.0
mean,0.275587,79.246224,2018.215467,26.796433,15.806511,1.009753,2.633225,0.039499,0.03057,0.186569,0.271987,81.199166,10.800021,0.645114,106.634109,0.085233,0.705059,2.029806
std,0.446812,85.498774,0.683948,13.67513,8.840436,1.033402,2.058316,0.19478,0.371465,1.744405,0.729529,109.964005,53.484435,9.282699,55.175319,0.28307,0.833525,0.796371
min,0.0,0.0,2017.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-6.38,0.0,0.0,0.0
25%,0.0,11.0,2018.0,16.0,8.0,0.0,1.0,0.0,0.0,0.0,0.0,9.0,0.0,0.0,72.25,0.0,0.0,2.0
50%,0.0,49.0,2018.0,27.0,16.0,1.0,2.0,0.0,0.0,0.0,0.0,9.0,0.0,0.0,98.75,0.0,1.0,2.0
75%,1.0,124.0,2019.0,37.0,23.0,2.0,4.0,0.0,0.0,0.0,0.0,240.0,0.0,0.0,134.51,0.0,1.0,2.0
max,1.0,737.0,2019.0,53.0,31.0,19.0,50.0,1.0,26.0,72.0,21.0,535.0,543.0,391.0,5400.0,8.0,5.0,55.0


In [31]:
#Average Daily Rate (ADR) has a negative value which may be an input error, so the value is changed to positive.
df[df['adr'] < 0]

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_weekdays_nights,meal,city,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,booking_changes,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,total_guest
14969,Resort Hotel,0,195,2019,May,10,5,4,6,Breakfast,Kabupaten Bangka,Groups,Direct,1,0,2,2,No Deposit,273.0,0.0,0,Family,-6.38,0,0,Check-Out,2.0


In [34]:
df['adr'] = df.adr.replace(-6.38, 6.38)

In [37]:
#Object
df.describe(include='object')

Unnamed: 0,hotel,arrival_date_month,meal,city,market_segment,distribution_channel,deposit_type,customer_type,reservation_status
count,86129,86129,86129,85679,86129,86129,86129,86129,86129
unique,2,12,5,177,8,5,3,4,3
top,City Hotel,October,Breakfast,Kota Denpasar,Online TA,TA/TO,No Deposit,Personal,Check-Out
freq,52575,11150,67088,26903,51393,68141,85024,71648,62393


In [36]:
categorical = ['hotel','arrival_date_year', 'arrival_date_month', 'meal','city', 'market_segment', 'distribution_channel',
           'deposit_type', 'customer_type', 'reservation_status', 'agent', 'company']

In [38]:
for i in categorical:
  print(df[i].value_counts())
  print(('*')*100)

City Hotel      52575
Resort Hotel    33554
Name: hotel, dtype: int64
****************************************************************************************************
2018    41841
2019    31423
2017    12865
Name: arrival_date_year, dtype: int64
****************************************************************************************************
October      11150
September     9986
July          8242
June          7811
August        7657
May           7435
December      6719
November      6457
April         6035
February      5082
January       4923
March         4632
Name: arrival_date_month, dtype: int64
****************************************************************************************************
Breakfast     67088
No Meal        9442
Dinner         8798
Undefined       454
Full Board      347
Name: meal, dtype: int64
****************************************************************************************************
Kota Denpasar              26903
Kabupaten Bangka     

In [39]:
#Rename columns and value

df.rename(columns = {'meal':'meal_type'}, inplace = True)
df['meal_type'] = df['meal_type'].replace('Undefined', 'Lunch')

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_weekdays_nights,meal_type,city,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,booking_changes,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,total_guest
0,Resort Hotel,0,342,2017,September,27,1,0,0,Breakfast,Kota Denpasar,Direct,Direct,0,0,0,3,No Deposit,0.0,0.0,0,Personal,0.0,0,0,Check-Out,2.0
1,Resort Hotel,0,737,2017,September,27,1,0,0,Breakfast,Kota Denpasar,Direct,Direct,0,0,0,4,No Deposit,0.0,0.0,0,Personal,0.0,0,0,Check-Out,2.0
2,Resort Hotel,0,7,2017,September,27,1,0,1,Breakfast,Kabupaten Bangka,Direct,Direct,0,0,0,0,No Deposit,0.0,0.0,0,Personal,75.0,0,0,Check-Out,1.0
3,Resort Hotel,0,13,2017,September,27,1,0,1,Breakfast,Kabupaten Bangka,Corporate,Corporate,0,0,0,0,No Deposit,304.0,0.0,0,Personal,75.0,0,0,Check-Out,1.0
4,Resort Hotel,0,14,2017,September,27,1,0,2,Breakfast,Kabupaten Bangka,Online TA,TA/TO,0,0,0,0,No Deposit,240.0,0.0,0,Personal,98.0,0,1,Check-Out,2.0


In [41]:
df.sample(10)

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_weekdays_nights,meal_type,city,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,booking_changes,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,total_guest
92548,City Hotel,0,118,2018,September,27,1,2,5,Breakfast,Kota Jakarta Pusat,Online TA,TA/TO,0,0,0,0,No Deposit,9.0,0.0,0,Family,107.95,0,1,Check-Out,2.0
31312,Resort Hotel,0,71,2018,February,52,21,0,2,Breakfast,Kota Denpasar,Online TA,TA/TO,0,0,0,0,No Deposit,242.0,0.0,0,Personal,37.8,0,1,Check-Out,2.0
78924,City Hotel,0,26,2017,December,42,14,1,4,Breakfast,Kota Yogyakarta,Offline TA/TO,TA/TO,0,0,0,0,No Deposit,28.0,0.0,0,Personal,75.0,0,0,Check-Out,2.0
35208,Resort Hotel,0,134,2019,June,15,13,0,3,Dinner,Kota Denpasar,Offline TA/TO,TA/TO,0,0,0,2,No Deposit,3.0,0.0,0,Family,79.2,0,1,Check-Out,2.0
45152,City Hotel,1,48,2017,December,43,19,1,3,Breakfast,Kota Denpasar,Offline TA/TO,TA/TO,0,0,0,0,No Deposit,13.0,0.0,0,Family,72.36,0,1,Canceled,2.0
59194,City Hotel,1,145,2018,December,43,22,1,1,Dinner,Kabupaten Kepulauan Seribu,Online TA,TA/TO,0,0,0,0,No Deposit,9.0,0.0,0,Personal,153.0,0,0,Canceled,2.0
53763,City Hotel,1,136,2018,August,27,29,0,3,No Meal,Kabupaten Kepulauan Seribu,Online TA,TA/TO,0,0,0,0,No Deposit,9.0,0.0,0,Personal,85.5,0,0,Canceled,2.0
18485,Resort Hotel,0,1,2017,January,47,16,1,4,Breakfast,Kota Surabaya,Direct,Direct,0,0,0,0,No Deposit,0.0,0.0,0,Personal,68.0,0,0,Check-Out,2.0
98549,City Hotel,0,78,2018,November,40,29,0,3,Breakfast,Kabupaten Sleman,Offline TA/TO,TA/TO,0,0,0,0,No Deposit,138.0,0.0,0,Personal,95.0,0,1,Check-Out,2.0
83949,City Hotel,0,4,2018,April,7,13,2,1,Breakfast,Kota Denpasar,Online TA,TA/TO,0,0,0,0,No Deposit,9.0,0.0,0,Personal,91.0,0,0,Check-Out,2.0
