In [62]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer

In [63]:
# Load the hotel bookings dataset; the relative path is resolved against the working directory.
data = pd.read_csv("hotel_bookings.csv")

In [64]:
# Summarize each column's dtype and non-null count to spot object columns and missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Data columns (total 32 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   hotel                           119390 non-null  object 
 1   is_canceled                     119390 non-null  int64  
 2   lead_time                       119390 non-null  int64  
 3   arrival_date_year               119390 non-null  int64  
 4   arrival_date_month              119390 non-null  object 
 5   arrival_date_week_number        119390 non-null  int64  
 6   arrival_date_day_of_month       119390 non-null  int64  
 7   stays_in_weekend_nights         119390 non-null  int64  
 8   stays_in_week_nights            119390 non-null  int64  
 9   adults                          119390 non-null  int64  
 10  children                        119386 non-null  float64
 11  babies                          119390 non-null  int64  
 12  meal            

In [65]:
# EDA has already been performed on this dataset,
# so this notebook focuses only on data preprocessing.

In [68]:
# Count the missing values in each column.
data.isnull().sum()

hotel                                  0
is_canceled                            0
lead_time                              0
arrival_date_year                      0
arrival_date_month                     0
arrival_date_week_number               0
arrival_date_day_of_month              0
stays_in_weekend_nights                0
stays_in_week_nights                   0
adults                                 0
children                               4
babies                                 0
meal                                   0
country                              488
market_segment                         0
distribution_channel                   0
is_repeated_guest                      0
previous_cancellations                 0
previous_bookings_not_canceled         0
reserved_room_type                     0
assigned_room_type                     0
booking_changes                        0
deposit_type                           0
agent                              16340
company         

In [71]:
# Partition the column names by dtype: object ("O") columns are treated as
# categorical, every other dtype as numeric.
category = [name for name in data.columns if data[name].dtype == "O"]
numeric = [name for name in data.columns if not data[name].dtype == "O"]

In [72]:
# Preview the object-dtype (categorical) columns.
data[category]

Unnamed: 0,hotel,arrival_date_month,meal,country,market_segment,distribution_channel,reserved_room_type,assigned_room_type,deposit_type,customer_type,reservation_status,reservation_status_date
0,Resort Hotel,July,BB,PRT,Direct,Direct,C,C,No Deposit,Transient,Check-Out,2015-07-01
1,Resort Hotel,July,BB,PRT,Direct,Direct,C,C,No Deposit,Transient,Check-Out,2015-07-01
2,Resort Hotel,July,BB,GBR,Direct,Direct,A,C,No Deposit,Transient,Check-Out,2015-07-02
3,Resort Hotel,July,BB,GBR,Corporate,Corporate,A,A,No Deposit,Transient,Check-Out,2015-07-02
4,Resort Hotel,July,BB,GBR,Online TA,TA/TO,A,A,No Deposit,Transient,Check-Out,2015-07-03
...,...,...,...,...,...,...,...,...,...,...,...,...
119385,City Hotel,August,BB,BEL,Offline TA/TO,TA/TO,A,A,No Deposit,Transient,Check-Out,2017-09-06
119386,City Hotel,August,BB,FRA,Online TA,TA/TO,E,E,No Deposit,Transient,Check-Out,2017-09-07
119387,City Hotel,August,BB,DEU,Online TA,TA/TO,D,D,No Deposit,Transient,Check-Out,2017-09-07
119388,City Hotel,August,BB,GBR,Online TA,TA/TO,A,A,No Deposit,Transient,Check-Out,2017-09-07


### Scale Numeric Data Type

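We scale the numeric data types (int and float) onto a common scale using scaling techniques. We commonly use MinMaxScaler(), which maps each feature into the 0-1 range, and StandardScaler(), which standardizes each feature to zero mean and unit variance, to scale the numeric features in a dataset.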

In [73]:
# Initialize the scalers provided by scikit-learn.
# StandardScaler standardizes each feature to zero mean and unit variance;
# MinMaxScaler rescales each feature into a fixed range (0-1 by default).

numeric_feature_scale_std = StandardScaler()
# NOTE(review): despite the "std" in its name, this variable holds a MinMaxScaler;
# it is not used in the cells visible here.
numeric_feature_scale_std_minmax = MinMaxScaler()

### Encoding Category Data Types

Categorical columns are stored with the object data type, and machine learning models generally need to be trained on numeric values. Scikit-learn provides encoding techniques for this conversion, so let us initialize those encoders.

In [74]:
# Initialize the encoders for categorical (object) columns.
# LabelEncoder maps each distinct value to an integer code; OneHotEncoder expands
# a column into one indicator column per category.
# NOTE(review): scikit-learn documents LabelEncoder as intended for target labels (y);
# OrdinalEncoder is the feature-side equivalent — consider it for input columns.
category_feature_encode_le = LabelEncoder()
# Not used in the cells visible here.
category_feature_encode_one = OneHotEncoder()

#### Before Scaling

In [75]:
# Numeric columns before scaling, for comparison with the scaled values.
data[numeric]

Unnamed: 0,is_canceled,lead_time,arrival_date_year,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,booking_changes,agent,company,days_in_waiting_list,adr,required_car_parking_spaces,total_of_special_requests
0,0,342,2015,27,1,0,0,2,0.0,0,0,0,0,3,,,0,0.00,0,0
1,0,737,2015,27,1,0,0,2,0.0,0,0,0,0,4,,,0,0.00,0,0
2,0,7,2015,27,1,0,1,1,0.0,0,0,0,0,0,,,0,75.00,0,0
3,0,13,2015,27,1,0,1,1,0.0,0,0,0,0,0,304.0,,0,75.00,0,0
4,0,14,2015,27,1,0,2,2,0.0,0,0,0,0,0,240.0,,0,98.00,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119385,0,23,2017,35,30,2,5,2,0.0,0,0,0,0,0,394.0,,0,96.14,0,0
119386,0,102,2017,35,31,2,5,3,0.0,0,0,0,0,0,9.0,,0,225.43,0,2
119387,0,34,2017,35,31,2,5,2,0.0,0,0,0,0,0,9.0,,0,157.71,0,4
119388,0,109,2017,35,31,2,5,2,0.0,0,0,0,0,0,89.0,,0,104.40,0,0


In [76]:
# Standardize every numeric column in place (zero mean, unit variance per column).
# Missing values stay missing: NaN entries are still NaN after the transform.
# NOTE(review): `numeric` also contains the binary flags is_canceled and
# is_repeated_guest; is_canceled is presumably the prediction target — confirm
# whether it should be scaled at all.
# NOTE(review): the scaler is fit on the full dataset; if a train/test split is
# made later, fit on the training split only to avoid leakage — confirm.
data[numeric] = numeric_feature_scale_std.fit_transform(data[numeric])

#### After Scaling

In [77]:
# Spot-check ten random rows after scaling (no random_state is set, so the rows differ between runs).
data.sample(10)

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
100818,City Hotel,-0.76704,1.160261,-0.221286,October,1.310895,1.617366,1.073895,-0.786207,-1.478447,...,No Deposit,-0.701368,,-0.131924,Transient,-0.168814,-0.254873,-0.720694,Check-Out,2016-11-02
118337,City Hotel,-0.76704,-0.467997,1.192195,August,0.428871,-0.090908,-0.92889,0.785891,0.247897,...,No Deposit,-0.701368,,-0.131924,Transient,1.002638,-0.254873,3.063386,Check-Out,2017-08-19
91042,City Hotel,-0.76704,-0.898456,-0.221286,June,-0.159144,-0.318677,0.072502,-0.262174,-1.478447,...,No Deposit,-0.701368,,-0.131924,Transient,0.1286,-0.254873,0.540666,Check-Out,2016-06-16
54756,City Hotel,1.303712,-0.720658,-0.221286,July,0.281867,1.047942,0.072502,0.261858,1.974242,...,No Deposit,-0.701368,,-0.131924,Transient,1.592915,-0.254873,-0.720694,Canceled,2016-07-11
49563,City Hotel,1.303712,-0.730016,-0.221286,April,-0.820662,-0.090908,-0.92889,-0.262174,0.247897,...,No Deposit,-0.701368,,-0.131924,Transient,0.260586,-0.254873,-0.720694,Canceled,2016-03-20
105117,City Hotel,-0.76704,-0.870383,1.192195,January,-1.702685,0.820172,1.073895,1.309924,0.247897,...,No Deposit,-0.701368,,-0.131924,Transient,-0.077393,-0.254873,4.324746,Check-Out,2017-01-30
54060,City Hotel,1.303712,1.319344,-0.221286,July,0.061361,-1.115872,1.073895,0.785891,-1.478447,...,Non Refund,-0.611094,,-0.131924,Transient,0.161646,-0.254873,-0.720694,Canceled,2016-02-23
51913,City Hotel,-0.76704,1.983748,-0.221286,May,-0.37965,1.161827,-0.92889,0.261858,0.247897,...,No Deposit,-0.511793,,-0.131924,Transient-Party,0.161646,-0.254873,-0.720694,Check-Out,2016-05-29
38672,Resort Hotel,-0.76704,-0.299557,1.192195,July,0.134863,0.364632,1.073895,1.309924,0.247897,...,No Deposit,1.474232,,-0.131924,Transient,4.040102,3.821932,0.540666,Check-Out,2017-07-26
105023,City Hotel,-0.76704,-0.935887,1.192195,January,-1.702685,1.275712,-0.92889,-0.786207,-1.478447,...,No Deposit,-0.719423,,-0.131924,Transient,-0.521833,-0.254873,-0.720694,Check-Out,2017-01-28


#### Before Encoding

In [78]:
# Categorical columns before encoding, for comparison with the encoded values.
data[category]

Unnamed: 0,hotel,arrival_date_month,meal,country,market_segment,distribution_channel,reserved_room_type,assigned_room_type,deposit_type,customer_type,reservation_status,reservation_status_date
0,Resort Hotel,July,BB,PRT,Direct,Direct,C,C,No Deposit,Transient,Check-Out,2015-07-01
1,Resort Hotel,July,BB,PRT,Direct,Direct,C,C,No Deposit,Transient,Check-Out,2015-07-01
2,Resort Hotel,July,BB,GBR,Direct,Direct,A,C,No Deposit,Transient,Check-Out,2015-07-02
3,Resort Hotel,July,BB,GBR,Corporate,Corporate,A,A,No Deposit,Transient,Check-Out,2015-07-02
4,Resort Hotel,July,BB,GBR,Online TA,TA/TO,A,A,No Deposit,Transient,Check-Out,2015-07-03
...,...,...,...,...,...,...,...,...,...,...,...,...
119385,City Hotel,August,BB,BEL,Offline TA/TO,TA/TO,A,A,No Deposit,Transient,Check-Out,2017-09-06
119386,City Hotel,August,BB,FRA,Online TA,TA/TO,E,E,No Deposit,Transient,Check-Out,2017-09-07
119387,City Hotel,August,BB,DEU,Online TA,TA/TO,D,D,No Deposit,Transient,Check-Out,2017-09-07
119388,City Hotel,August,BB,GBR,Online TA,TA/TO,A,A,No Deposit,Transient,Check-Out,2017-09-07


In [79]:
# Label-encode every categorical column in place.
# The single shared LabelEncoder is re-fit on each column, which overwrites its
# classes_ every iteration; record each column's classes so the integer codes can
# still be decoded later (category_classes[col][code] -> original value).
# Caveats:
# - astype(str) turns missing values into the literal string "nan", which then
#   receives its own integer code, so these columns no longer report any nulls.
# - Re-running this cell is not safe: the integer codes would be cast to strings
#   and re-encoded in lexicographic order.
category_classes = {}
for col_name in category:
    data[col_name] = category_feature_encode_le.fit_transform(data[col_name].astype(str))
    category_classes[col_name] = category_feature_encode_le.classes_.copy()

#### After Encoding

In [80]:
# The same columns after label encoding: every value is now an integer code.
data[category]

Unnamed: 0,hotel,arrival_date_month,meal,country,market_segment,distribution_channel,reserved_room_type,assigned_room_type,deposit_type,customer_type,reservation_status,reservation_status_date
0,1,5,0,135,3,1,2,2,0,2,1,121
1,1,5,0,135,3,1,2,2,0,2,1,121
2,1,5,0,59,3,1,0,2,0,2,1,122
3,1,5,0,59,2,0,0,0,0,2,1,122
4,1,5,0,59,6,3,0,0,0,2,1,123
...,...,...,...,...,...,...,...,...,...,...,...,...
119385,0,1,0,15,5,3,0,0,0,2,1,919
119386,0,1,0,56,6,3,4,4,0,2,1,920
119387,0,1,0,43,6,3,3,3,0,2,1,920
119388,0,1,0,59,6,3,0,0,0,2,1,920


### Updated Dataset

In [81]:
# Spot-check ten random rows of the scaled and encoded dataset (unseeded, so rows differ between runs).
data.sample(10)

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
87538,0,-0.76704,-0.92653,-0.221286,0,-0.74716,0.592402,-0.92889,-0.786207,1.974242,...,0,,,-0.131924,2,0.972956,-0.254873,-0.720694,1,417
103479,0,-0.76704,0.870169,-0.221286,2,1.825408,0.592402,0.072502,0.785891,0.247897,...,0,-0.755533,,-0.131924,3,0.228925,-0.254873,-0.720694,1,665
79014,0,-0.76704,-0.954603,-0.221286,10,1.016887,-1.115872,-0.92889,-0.786207,-1.478447,...,0,,-1.095874,-0.131924,2,-2.015038,-0.254873,3.063386,1,585
47059,0,1.303712,-0.851667,-0.221286,3,-1.555681,-1.229757,1.073895,0.785891,0.247897,...,0,-0.701368,,-0.131924,2,-0.451783,-0.254873,-0.720694,0,329
42984,0,1.303712,-0.7113,-1.634768,11,0.722879,-0.432562,1.073895,-0.786207,0.247897,...,0,-0.701368,,-0.131924,2,0.300162,-0.254873,0.540666,0,168
51397,0,1.303712,-0.730016,-0.221286,8,-0.453152,0.364632,-0.92889,0.261858,0.247897,...,0,-0.701368,,-0.131924,2,2.358118,-0.254873,-0.720694,0,429
44449,0,-0.76704,0.065398,-1.634768,10,1.016887,-1.229757,0.072502,0.261858,0.247897,...,0,-0.538875,,-0.131924,2,-0.585352,-0.254873,-0.720694,1,221
101831,0,-0.76704,-0.62708,-0.221286,9,1.457898,0.136862,-0.92889,-0.262174,0.247897,...,0,-0.701368,,-0.131924,2,0.042918,-0.254873,1.802026,1,628
93045,0,-0.76704,-0.112401,-0.221286,5,0.134863,-0.432562,-0.92889,0.785891,0.247897,...,0,-0.701368,,-0.131924,2,-0.237083,-0.254873,1.802026,1,502
86993,0,-0.76704,-0.954603,-0.221286,8,-0.453152,-0.090908,1.073895,-0.786207,-1.478447,...,0,,-1.133855,-0.131924,3,-0.728816,-0.254873,-0.720694,1,443


In [82]:
# Re-check the column dtypes and non-null counts after scaling and encoding.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Data columns (total 32 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   hotel                           119390 non-null  int64  
 1   is_canceled                     119390 non-null  float64
 2   lead_time                       119390 non-null  float64
 3   arrival_date_year               119390 non-null  float64
 4   arrival_date_month              119390 non-null  int64  
 5   arrival_date_week_number        119390 non-null  float64
 6   arrival_date_day_of_month       119390 non-null  float64
 7   stays_in_weekend_nights         119390 non-null  float64
 8   stays_in_week_nights            119390 non-null  float64
 9   adults                          119390 non-null  float64
 10  children                        119386 non-null  float64
 11  babies                          119390 non-null  float64
 12  meal            

As you can see there is no object data type now

### Handling Missing Value

There are multiple ways to handle missing data; we have already seen the pandas `dropna` and `fillna` methods. Here we shall instead deal with missing data using imputation. With this technique, each missing value is filled in (imputed) with a suitable substitute, and there are multiple strategies for choosing that substitute:

- Replace with Mean
- Replace with Median
- Replace with Constant
- Replace with Most frequent

In [83]:
# Work on a copy so that the imputation steps cannot modify `data`.

work_data = data.copy()

In [84]:
# Replace every NaN with the mean of its column.
# fit_transform returns a new NumPy array and leaves work_data untouched, so the
# result is kept here instead of being discarded.
imputer = SimpleImputer(strategy="mean")
work_data_mean_values = imputer.fit_transform(work_data)

# Wrap the array back into a DataFrame so the column names and index survive.
# This relies on no column being entirely NaN (SimpleImputer drops such columns
# by default, which would break the column alignment).
work_data_mean = pd.DataFrame(
    work_data_mean_values, columns=work_data.columns, index=work_data.index
)

# Show the imputed values as the cell output.
work_data_mean_values

array([[ 1.00000000e+00, -7.67040492e-01,  2.22705112e+00, ...,
        -7.20694110e-01,  1.00000000e+00,  1.21000000e+02],
       [ 1.00000000e+00, -7.67040492e-01,  5.92338470e+00, ...,
        -7.20694110e-01,  1.00000000e+00,  1.21000000e+02],
       [ 1.00000000e+00, -7.67040492e-01, -9.07814066e-01, ...,
        -7.20694110e-01,  1.00000000e+00,  1.22000000e+02],
       ...,
       [ 0.00000000e+00, -7.67040492e-01, -6.55153290e-01, ...,
         4.32474574e+00,  1.00000000e+00,  9.20000000e+02],
       [ 0.00000000e+00, -7.67040492e-01,  4.66822005e-02, ...,
        -7.20694110e-01,  1.00000000e+00,  9.20000000e+02],
       [ 0.00000000e+00, -7.67040492e-01,  9.45031628e-01, ...,
         1.80202581e+00,  1.00000000e+00,  9.20000000e+02]])

In [86]:
# fit_transform returns a new array and does not modify its input, so work_data
# itself still contains NaN (e.g. in the agent and company columns).
work_data.sample(10)

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
41670,0,1.303712,0.027967,-1.634768,1,0.502373,0.592402,0.072502,-0.262174,0.247897,...,0,-0.701368,,-0.131924,2,-0.501253,-0.254873,0.540666,0,120
50130,0,1.303712,0.392921,-0.221286,0,-0.673658,0.934057,1.073895,-1.31024,0.247897,...,1,-0.547903,,2.70985,2,-0.135174,-0.254873,-0.720694,0,323
81170,0,-0.76704,-0.917172,-1.634768,2,1.604902,-1.457527,-0.92889,-0.262174,0.247897,...,0,0.580521,,-0.131924,3,-0.926696,-0.254873,-0.720694,1,278
166,1,-0.76704,-0.496071,-1.634768,5,0.061361,-1.115872,0.072502,0.261858,1.974242,...,0,1.392986,,-0.131924,2,0.316191,-0.254873,1.802026,1,130
96484,0,-0.76704,-0.552217,-0.221286,11,0.649377,-1.685297,-0.92889,-0.262174,0.247897,...,0,-0.701368,,-0.131924,2,0.335781,-0.254873,1.802026,1,551
14197,1,-0.76704,-0.879741,-0.221286,4,-1.776187,-0.204793,-0.92889,-0.786207,-1.478447,...,0,,-0.579335,-0.131924,2,-1.421397,-0.254873,-0.720694,1,319
102552,0,-0.76704,0.346132,-0.221286,9,1.604902,1.275712,1.073895,-0.262174,0.247897,...,0,-0.701368,,-0.131924,2,-0.333055,-0.254873,-0.720694,1,640
69807,0,1.303712,-0.599006,1.192195,6,-0.306148,-1.001987,-0.92889,-0.262174,-1.478447,...,0,-0.692341,,-0.131924,2,-0.135174,-0.254873,-0.720694,0,821
91864,0,-0.76704,1.5065,-0.221286,6,-0.085642,0.934057,-0.92889,-0.262174,-1.478447,...,0,-0.72845,,-0.131924,3,-0.234115,-0.254873,-0.720694,1,482
3058,1,-0.76704,-0.664511,-1.634768,9,1.384396,-0.432562,-0.92889,0.261858,-1.478447,...,0,2.232533,0.696821,-0.131924,3,-1.223517,-0.254873,-0.720694,1,258


In [89]:
# Replace every NaN with the most frequent value of its column.
# fit_transform returns a new NumPy array and leaves work_data untouched, so the
# result is kept here instead of being discarded.
most_freq_imp = SimpleImputer(strategy="most_frequent")
work_data_most_freq_values = most_freq_imp.fit_transform(work_data)

# Wrap the array back into a DataFrame so the column names and index survive.
# This relies on no column being entirely NaN (SimpleImputer drops such columns
# by default, which would break the column alignment).
work_data_most_freq = pd.DataFrame(
    work_data_most_freq_values, columns=work_data.columns, index=work_data.index
)

# Show the imputed values as the cell output.
work_data_most_freq_values

array([[ 1.00000000e+00, -7.67040492e-01,  2.22705112e+00, ...,
        -7.20694110e-01,  1.00000000e+00,  1.21000000e+02],
       [ 1.00000000e+00, -7.67040492e-01,  5.92338470e+00, ...,
        -7.20694110e-01,  1.00000000e+00,  1.21000000e+02],
       [ 1.00000000e+00, -7.67040492e-01, -9.07814066e-01, ...,
        -7.20694110e-01,  1.00000000e+00,  1.22000000e+02],
       ...,
       [ 0.00000000e+00, -7.67040492e-01, -6.55153290e-01, ...,
         4.32474574e+00,  1.00000000e+00,  9.20000000e+02],
       [ 0.00000000e+00, -7.67040492e-01,  4.66822005e-02, ...,
        -7.20694110e-01,  1.00000000e+00,  9.20000000e+02],
       [ 0.00000000e+00, -7.67040492e-01,  9.45031628e-01, ...,
         1.80202581e+00,  1.00000000e+00,  9.20000000e+02]])

#### We shall use MinMax and One Hot Encoder in different examples.

### Summary:

Before we train and predict our Machine Learning Models, here are the few steps you need to follow:
- EDA

#### Data Preprocess

- Scaling Numeric Data Type
- Encoding Category Data Type
- Handle Missing Data