# Task 2: Predicting customer buying behaviour
## 3. Data transformation

## Import libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import QuantileTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MaxAbsScaler

## Load the data

In [2]:
cat = pd.read_pickle("cat_eda.pickle")
num = pd.read_pickle("num_eda.pickle")

## Separate the target variable

In [3]:
target = num[["booking_complete"]].copy().reset_index(drop=True)

## CATEGORICAL VARIABLES TRANSFORMATION

In [4]:
cat.head()

Unnamed: 0,sales_channel,trip_type,flight_day,route,booking_origin
0,Internet,RoundTrip,Sat,OTHERS,New Zealand
1,Internet,RoundTrip,Sat,OTHERS,New Zealand
2,Internet,RoundTrip,Wed,OTHERS,India
3,Internet,RoundTrip,Sat,OTHERS,New Zealand
4,Internet,RoundTrip,Wed,OTHERS,India


### One Hot Encoding

#### Variables to apply OHE

In [5]:
var_ohe = ["sales_channel","trip_type","route","booking_origin"]

#### Instantiate

In [6]:
ohe = OneHotEncoder(sparse_output = False, handle_unknown='ignore')

#### Train and apply

In [7]:
cat_ohe = ohe.fit_transform(cat[var_ohe])

#### Save as dataframe

In [8]:
cat_ohe = pd.DataFrame(cat_ohe, columns = ohe.get_feature_names_out())
cat_ohe.head()

Unnamed: 0,sales_channel_Internet,sales_channel_Mobile,trip_type_CircleTrip,trip_type_OneWay,trip_type_RoundTrip,route_AKLKUL,route_CGKHND,route_CGKICN,route_CGKKIX,route_CMBMEL,...,booking_origin_Malaysia,booking_origin_New Zealand,booking_origin_OTHERS,booking_origin_Philippines,booking_origin_Singapore,booking_origin_South Korea,booking_origin_Taiwan,booking_origin_Thailand,booking_origin_United States,booking_origin_Vietnam
0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Ordinal Encoding

#### Variables to apply OE

In [9]:
var_oe = ["flight_day"]

#### Order of those variables

In [10]:
order_flight_day = ["Mon","Tue","Wed","Thu","Fri","Sat","Sun"]

#### Instantiate

In [11]:
oe = OrdinalEncoder(categories = [order_flight_day],
                    handle_unknown = 'use_encoded_value',
                    unknown_value = 10)

#### Fit and transform

In [12]:
cat_oe = oe.fit_transform(cat[var_oe]) + 1

#### Save as a dataframe

In [13]:
#Add suffixes to variables' names
names_oe = [variable + '_oe' for variable in var_oe]

#Save as a dataframe
cat_oe = pd.DataFrame(cat_oe, columns = names_oe)
cat_oe.head()

Unnamed: 0,flight_day_oe
0,6.0
1,6.0
2,3.0
3,6.0
4,3.0


## NUMERICAL VARIABLES TRANSFORMATION

### Normalization (Gauss)

In [19]:
var_qt = ["num_passengers","purchase_lead","length_of_stay","flight_hour","flight_duration"]

##### Instantiate

In [20]:
qt = QuantileTransformer(output_distribution='normal')

##### Fit and transform

In [21]:
num_qt = qt.fit_transform(num[var_qt])

##### Save as a dataframe

In [22]:
#Add suffixes to variables' names
names_qt = [variable + '_qt' for variable in var_qt]

#Save as a dataframe
num_qt = pd.DataFrame(num_qt,columns = names_qt)

## COMBINE ALL DATASETS

First let's add the non-transformed data

In [49]:
variables = cat.columns.to_list() + num.iloc[:,:-1].columns.to_list()
var_trans = var_oe + var_ohe + var_qt
var_nt = [name for name in variables if name not in var_trans]
var_nt

['wants_extra_baggage', 'wants_preferred_seat', 'wants_in_flight_meals']

In [50]:
num_nt = num[var_nt]

In [51]:
dataframes = []
dataframes.extend(value for name, value in locals().items() if name.startswith('cat_') or name.startswith('num_'))

### Concatenate all the datasets

In [52]:
df = pd.concat(dataframes, axis = 1)
df.head()

Unnamed: 0,sales_channel_Internet,sales_channel_Mobile,trip_type_CircleTrip,trip_type_OneWay,trip_type_RoundTrip,route_AKLKUL,route_CGKHND,route_CGKICN,route_CGKKIX,route_CMBMEL,...,booking_origin_Vietnam,flight_day_oe,num_passengers_qt,purchase_lead_qt,length_of_stay_qt,flight_hour_qt,flight_duration_qt,wants_extra_baggage,wants_preferred_seat,wants_in_flight_meals
0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,6.0,0.708752,1.505747,0.181652,-0.293986,-1.059682,1.0,0.0,0.0
1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,6.0,-5.199338,0.653377,0.249721,-0.999986,-1.059682,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,3.0,0.708752,1.384628,0.400635,1.464186,-1.059682,1.0,1.0,0.0
3,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,6.0,-5.199338,0.527858,0.837338,-0.795322,-1.059682,0.0,0.0,1.0
4,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,3.0,0.708752,0.235509,0.400635,1.116369,-1.059682,1.0,0.0,1.0


## FEATURE SCALING

### Min-Max scaling (Normalization between 0 and 1)

In [53]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 49990 entries, 0 to 49999
Data columns (total 82 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   sales_channel_Internet        49281 non-null  float64
 1   sales_channel_Mobile          49281 non-null  float64
 2   trip_type_CircleTrip          49281 non-null  float64
 3   trip_type_OneWay              49281 non-null  float64
 4   trip_type_RoundTrip           49281 non-null  float64
 5   route_AKLKUL                  49281 non-null  float64
 6   route_CGKHND                  49281 non-null  float64
 7   route_CGKICN                  49281 non-null  float64
 8   route_CGKKIX                  49281 non-null  float64
 9   route_CMBMEL                  49281 non-null  float64
 10  route_CMBSYD                  49281 non-null  float64
 11  route_COKMEL                  49281 non-null  float64
 12  route_COKPER                  49281 non-null  float64
 13  r

In [54]:
var_mms = df.iloc[:,73:79].columns
var_mms

Index(['flight_day_oe', 'num_passengers_qt', 'purchase_lead_qt',
       'length_of_stay_qt', 'flight_hour_qt', 'flight_duration_qt'],
      dtype='object')

#### Instantiate

In [55]:
mms = MinMaxScaler()

#### Fit and transform

In [56]:
df_mms = mms.fit_transform(df[var_mms])

#### Save as a dataframe

In [57]:
#Add suffixes to variables' names
nombres_mms = [variable + '_mms' for variable in var_mms]

#Save as a dataframe
df_mms = pd.DataFrame(df_mms,columns = nombres_mms)

### COMBINE SCALED DATASETS

In [58]:
include = [df, df_mms, target]

In [59]:
df_final = pd.concat(include, axis=1)
df_final.head()

Unnamed: 0,sales_channel_Internet,sales_channel_Mobile,trip_type_CircleTrip,trip_type_OneWay,trip_type_RoundTrip,route_AKLKUL,route_CGKHND,route_CGKICN,route_CGKKIX,route_CMBMEL,...,wants_extra_baggage,wants_preferred_seat,wants_in_flight_meals,flight_day_oe_mms,num_passengers_qt_mms,purchase_lead_qt_mms,length_of_stay_qt_mms,flight_hour_qt_mms,flight_duration_qt_mms,booking_complete
0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.833333,0.568158,0.644802,0.517469,0.471729,0.398095,0.0
1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.833333,0.0,0.562833,0.524015,0.403835,0.398095,0.0
2,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.333333,0.568158,0.633154,0.538528,0.640805,0.398095,0.0
3,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.833333,0.0,0.550762,0.580524,0.423517,0.398095,0.0
4,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.333333,0.568158,0.522648,0.538528,0.607357,0.398095,0.0


Save the transformed dataframe in .pickle format to proceed to the next step (modeling).

In [60]:
pd.to_pickle(df_final, "df_transformed.pickle")