In [17]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer

In [18]:
data = pd.read_csv("hotel_bookings.csv")

In [19]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Data columns (total 32 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   hotel                           119390 non-null  object 
 1   is_canceled                     119390 non-null  int64  
 2   lead_time                       119390 non-null  int64  
 3   arrival_date_year               119390 non-null  int64  
 4   arrival_date_month              119390 non-null  object 
 5   arrival_date_week_number        119390 non-null  int64  
 6   arrival_date_day_of_month       119390 non-null  int64  
 7   stays_in_weekend_nights         119390 non-null  int64  
 8   stays_in_week_nights            119390 non-null  int64  
 9   adults                          119390 non-null  int64  
 10  children                        119386 non-null  float64
 11  babies                          119390 non-null  int64  
 12  meal            

In [20]:
#we have already performed EDA on this
#in this dataset let us just focus on Data Preprocessing

In [21]:
data.isnull().sum()

hotel                                  0
is_canceled                            0
lead_time                              0
arrival_date_year                      0
arrival_date_month                     0
arrival_date_week_number               0
arrival_date_day_of_month              0
stays_in_weekend_nights                0
stays_in_week_nights                   0
adults                                 0
children                               4
babies                                 0
meal                                   0
country                              488
market_segment                         0
distribution_channel                   0
is_repeated_guest                      0
previous_cancellations                 0
previous_bookings_not_canceled         0
reserved_room_type                     0
assigned_room_type                     0
booking_changes                        0
deposit_type                           0
agent                              16340
company         

### Handling Missing Value

There are multiple ways using which we can handle Missing Data, we have already seen dropna and fillna Pandas method. But here we shall deal Missing Data using Imputer Technique. In this technique, the missing data is filled up or imputed by a suitable substitute and there are multiple strategies behind it. 

- Replace with Mean
- Replace with Median
- Replace with Constant
- Replace with Most frequent

In [22]:
imputer = SimpleImputer(strategy="most_frequent")
X_imputer = imputer.fit_transform(data)

In [23]:
numeric = []
category = []
for col in data.columns:
    if data[col].dtype == "O":
        category.append(col)
    else:
        numeric.append(col)

In [24]:
data[category]

Unnamed: 0,hotel,arrival_date_month,meal,country,market_segment,distribution_channel,reserved_room_type,assigned_room_type,deposit_type,customer_type,reservation_status,reservation_status_date
0,Resort Hotel,July,BB,PRT,Direct,Direct,C,C,No Deposit,Transient,Check-Out,2015-07-01
1,Resort Hotel,July,BB,PRT,Direct,Direct,C,C,No Deposit,Transient,Check-Out,2015-07-01
2,Resort Hotel,July,BB,GBR,Direct,Direct,A,C,No Deposit,Transient,Check-Out,2015-07-02
3,Resort Hotel,July,BB,GBR,Corporate,Corporate,A,A,No Deposit,Transient,Check-Out,2015-07-02
4,Resort Hotel,July,BB,GBR,Online TA,TA/TO,A,A,No Deposit,Transient,Check-Out,2015-07-03
...,...,...,...,...,...,...,...,...,...,...,...,...
119385,City Hotel,August,BB,BEL,Offline TA/TO,TA/TO,A,A,No Deposit,Transient,Check-Out,2017-09-06
119386,City Hotel,August,BB,FRA,Online TA,TA/TO,E,E,No Deposit,Transient,Check-Out,2017-09-07
119387,City Hotel,August,BB,DEU,Online TA,TA/TO,D,D,No Deposit,Transient,Check-Out,2017-09-07
119388,City Hotel,August,BB,GBR,Online TA,TA/TO,A,A,No Deposit,Transient,Check-Out,2017-09-07


[![scale](https://i.stack.imgur.com/Z7ATR.png)](https://stackoverflow.com/questions/40758562/can-anyone-explain-me-standardscaler#40767144)

### Scale Numeric Data Type

We basically scale the Numeric data types i.e., int and float in range of 0-1, using Scling Techniques. We commonly us MinMaxScaler() and StandardScaler to scale numeric features in dataset

In [25]:
#initialize the scale methods that is supported in Sklearn

numeric_feature_scale_std = StandardScaler()
numeric_feature_scale_std_minmax = MinMaxScaler()

### Encoding Category Data Types

Category data types are objects. And Machine models are preferred to be trained with nnumeric value. But we do have Encoding Techniques in Sklearn and let us initialize those methods

In [26]:
#for encoding category data types we basically use LabelEncoder and OneHotEncoder
category_feature_encode_le = LabelEncoder()
category_feature_encode_one = OneHotEncoder()

#### Before Scaling

In [27]:
data[numeric]

Unnamed: 0,is_canceled,lead_time,arrival_date_year,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,booking_changes,agent,company,days_in_waiting_list,adr,required_car_parking_spaces,total_of_special_requests
0,0,342,2015,27,1,0,0,2,0.0,0,0,0,0,3,,,0,0.00,0,0
1,0,737,2015,27,1,0,0,2,0.0,0,0,0,0,4,,,0,0.00,0,0
2,0,7,2015,27,1,0,1,1,0.0,0,0,0,0,0,,,0,75.00,0,0
3,0,13,2015,27,1,0,1,1,0.0,0,0,0,0,0,304.0,,0,75.00,0,0
4,0,14,2015,27,1,0,2,2,0.0,0,0,0,0,0,240.0,,0,98.00,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119385,0,23,2017,35,30,2,5,2,0.0,0,0,0,0,0,394.0,,0,96.14,0,0
119386,0,102,2017,35,31,2,5,3,0.0,0,0,0,0,0,9.0,,0,225.43,0,2
119387,0,34,2017,35,31,2,5,2,0.0,0,0,0,0,0,9.0,,0,157.71,0,4
119388,0,109,2017,35,31,2,5,2,0.0,0,0,0,0,0,89.0,,0,104.40,0,0


Standard Scaler helps to get standardized distribution, with a zero mean and standard deviation of one (unit variance). It standardizes features by subtracting the mean value from the feature and then dividing the result by feature standard deviation.

In [28]:
data[numeric] = numeric_feature_scale_std.fit_transform(data[numeric])

#### After Scaling

In [29]:
data.sample(10)

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
108619,City Hotel,-0.76704,-0.973319,1.192195,March,-1.11467,0.934057,1.073895,0.785891,0.247897,...,No Deposit,0.61663,,-0.131924,Transient,0.161646,-0.254873,0.540666,Check-Out,2017-03-30
2992,Resort Hotel,1.303712,-0.608364,-1.634768,November,1.384396,-0.660332,-0.92889,1.309924,-1.478447,...,No Deposit,1.754081,,-0.131924,Transient-Party,-1.223517,-0.254873,-0.720694,Canceled,2015-10-10
79006,City Hotel,1.303712,-0.935887,-1.634768,December,1.604902,-1.229757,-0.92889,-0.786207,0.247897,...,No Deposit,,-1.095874,-0.131924,Transient,-2.015038,-0.254873,1.802026,Canceled,2015-12-04
112598,City Hotel,-0.76704,-0.879741,1.192195,May,-0.453152,1.047942,-0.92889,-0.786207,0.247897,...,No Deposit,-0.701368,,-0.131924,Transient,1.151048,-0.254873,1.802026,Check-Out,2017-05-26
103433,City Hotel,-0.76704,-0.32763,-0.221286,December,1.825408,0.706287,-0.92889,-0.262174,1.974242,...,No Deposit,-0.584012,,-0.131924,Transient,-0.181676,-0.254873,-0.720694,Check-Out,2016-12-24
42593,City Hotel,-0.76704,-0.945245,-1.634768,September,0.722879,-1.001987,0.072502,0.785891,0.247897,...,No Deposit,-0.656231,,-0.131924,Transient,0.513873,-0.254873,-0.720694,Check-Out,2015-09-12
19022,Resort Hotel,-0.76704,-0.804878,-1.634768,December,1.604902,-1.229757,0.072502,-0.786207,1.974242,...,No Deposit,1.474232,,-0.131924,Transient,-0.174751,-0.254873,-0.720694,Check-Out,2015-12-07
9369,Resort Hotel,1.303712,-0.561575,-0.221286,November,1.457898,0.364632,0.072502,-0.786207,0.247897,...,No Deposit,1.383958,,-0.131924,Transient,-1.26705,-0.254873,-0.720694,Canceled,2016-10-07
68918,City Hotel,1.303712,1.178977,1.192195,May,-0.453152,0.706287,0.072502,1.309924,1.974242,...,No Deposit,-0.701368,,-0.131924,Transient,1.270766,-0.254873,1.802026,Canceled,2016-10-31
69795,City Hotel,1.303712,-0.945245,1.192195,June,-0.306148,-1.115872,1.073895,1.833957,-1.478447,...,No Deposit,-0.656231,,-0.131924,Transient,3.127477,-0.254873,-0.720694,Canceled,2017-06-06


#### Before Encoding

In [30]:
data[category]

Unnamed: 0,hotel,arrival_date_month,meal,country,market_segment,distribution_channel,reserved_room_type,assigned_room_type,deposit_type,customer_type,reservation_status,reservation_status_date
0,Resort Hotel,July,BB,PRT,Direct,Direct,C,C,No Deposit,Transient,Check-Out,2015-07-01
1,Resort Hotel,July,BB,PRT,Direct,Direct,C,C,No Deposit,Transient,Check-Out,2015-07-01
2,Resort Hotel,July,BB,GBR,Direct,Direct,A,C,No Deposit,Transient,Check-Out,2015-07-02
3,Resort Hotel,July,BB,GBR,Corporate,Corporate,A,A,No Deposit,Transient,Check-Out,2015-07-02
4,Resort Hotel,July,BB,GBR,Online TA,TA/TO,A,A,No Deposit,Transient,Check-Out,2015-07-03
...,...,...,...,...,...,...,...,...,...,...,...,...
119385,City Hotel,August,BB,BEL,Offline TA/TO,TA/TO,A,A,No Deposit,Transient,Check-Out,2017-09-06
119386,City Hotel,August,BB,FRA,Online TA,TA/TO,E,E,No Deposit,Transient,Check-Out,2017-09-07
119387,City Hotel,August,BB,DEU,Online TA,TA/TO,D,D,No Deposit,Transient,Check-Out,2017-09-07
119388,City Hotel,August,BB,GBR,Online TA,TA/TO,A,A,No Deposit,Transient,Check-Out,2017-09-07


In [31]:
for i in category:
    data[i] = category_feature_encode_le.fit_transform(data[i].astype(str))

#### After Encoding

In [32]:
data[category]

Unnamed: 0,hotel,arrival_date_month,meal,country,market_segment,distribution_channel,reserved_room_type,assigned_room_type,deposit_type,customer_type,reservation_status,reservation_status_date
0,1,5,0,135,3,1,2,2,0,2,1,121
1,1,5,0,135,3,1,2,2,0,2,1,121
2,1,5,0,59,3,1,0,2,0,2,1,122
3,1,5,0,59,2,0,0,0,0,2,1,122
4,1,5,0,59,6,3,0,0,0,2,1,123
...,...,...,...,...,...,...,...,...,...,...,...,...
119385,0,1,0,15,5,3,0,0,0,2,1,919
119386,0,1,0,56,6,3,4,4,0,2,1,920
119387,0,1,0,43,6,3,3,3,0,2,1,920
119388,0,1,0,59,6,3,0,0,0,2,1,920


### Updated Dataset

In [33]:
data.sample(10)

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
4520,1,-0.76704,0.336774,-0.221286,7,-1.041168,0.706287,1.073895,1.309924,0.247897,...,0,1.474232,,-0.131924,2,-0.972604,3.821932,-0.720694,1,393
27083,1,-0.76704,1.010536,-0.221286,1,0.428871,-0.318677,1.073895,1.309924,0.247897,...,0,1.501314,,-0.131924,2,0.215074,-0.254873,0.540666,1,537
85853,0,-0.76704,-0.973319,-0.221286,7,-1.041168,0.706287,-0.92889,-0.786207,0.247897,...,0,,,-0.131924,2,-0.056022,-0.254873,0.540666,1,387
55991,0,1.303712,-0.093685,-0.221286,1,0.575875,1.161827,0.072502,-0.262174,0.247897,...,0,-0.701368,,-0.131924,2,0.58511,-0.254873,0.540666,2,543
65735,0,1.303712,0.804665,1.192195,0,-0.967666,-1.001987,-0.92889,-0.262174,-1.478447,...,0,-0.701368,,-0.131924,2,0.157688,-0.254873,0.540666,0,662
68652,0,1.303712,0.074756,1.192195,8,-0.526654,0.364632,0.072502,-0.262174,0.247897,...,0,-0.701368,,-0.131924,2,0.478255,-0.254873,0.540666,0,717
8037,1,1.303712,1.001178,-0.221286,11,0.722879,-1.001987,-0.92889,0.261858,-1.478447,...,1,-0.773587,,-0.131924,2,-0.253903,-0.254873,-0.720694,0,344
69124,0,1.303712,0.870169,1.192195,8,-0.453152,1.047942,1.073895,0.785891,0.247897,...,0,-0.656231,,-0.131924,2,0.549492,-0.254873,0.540666,0,815
3435,1,1.303712,-0.215336,-1.634768,2,1.678404,-1.115872,1.073895,-0.786207,0.247897,...,0,1.383958,,-0.131924,2,-1.195813,-0.254873,1.802026,0,260
96931,0,-0.76704,-0.177905,-0.221286,11,0.722879,-0.888102,-0.92889,0.261858,0.247897,...,0,-0.701368,,-0.131924,2,0.941295,-0.254873,3.063386,1,559


In [34]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Data columns (total 32 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   hotel                           119390 non-null  int64  
 1   is_canceled                     119390 non-null  float64
 2   lead_time                       119390 non-null  float64
 3   arrival_date_year               119390 non-null  float64
 4   arrival_date_month              119390 non-null  int64  
 5   arrival_date_week_number        119390 non-null  float64
 6   arrival_date_day_of_month       119390 non-null  float64
 7   stays_in_weekend_nights         119390 non-null  float64
 8   stays_in_week_nights            119390 non-null  float64
 9   adults                          119390 non-null  float64
 10  children                        119386 non-null  float64
 11  babies                          119390 non-null  float64
 12  meal            

As you can see there is no object data type now

#### We shall use MinMax and One Hot Encoder in different examples.

In part two we shall see One Hot Encoder and MinMax along with Feature Selection and Data Splitting. 

### Summary:

Before we train and predict our Machine Learning Models, here are the few steps you need to follow:
- EDA

#### Data Preprocess

- Handle Missing Data
- Scaling Numeric Data Type
- Encoding Category Data Type