# Task 2: Predicting customer buying behaviour
## 7. Preproduction code

## Import libraries

In [58]:
import numpy as np
import pandas as pd
import cloudpickle

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from category_encoders import TargetEncoder

from sklearn.preprocessing import QuantileTransformer

from sklearn.preprocessing import MinMaxScaler

from sklearn.ensemble import RandomForestClassifier

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

## Load the data (original dataset)

In [59]:
df = pd.read_csv("customer_booking.csv",encoding='ISO-8859-1')
df

Unnamed: 0,num_passengers,sales_channel,trip_type,purchase_lead,length_of_stay,flight_hour,flight_day,route,booking_origin,wants_extra_baggage,wants_preferred_seat,wants_in_flight_meals,flight_duration,booking_complete
0,2,Internet,RoundTrip,262,19,7,Sat,AKLDEL,New Zealand,1,0,0,5.52,0
1,1,Internet,RoundTrip,112,20,3,Sat,AKLDEL,New Zealand,0,0,0,5.52,0
2,2,Internet,RoundTrip,243,22,17,Wed,AKLDEL,India,1,1,0,5.52,0
3,1,Internet,RoundTrip,96,31,4,Sat,AKLDEL,New Zealand,0,0,1,5.52,0
4,2,Internet,RoundTrip,68,22,15,Wed,AKLDEL,India,1,0,1,5.52,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,2,Internet,RoundTrip,27,6,9,Sat,PERPNH,Australia,1,0,1,5.62,0
49996,1,Internet,RoundTrip,111,6,4,Sun,PERPNH,Australia,0,0,0,5.62,0
49997,1,Internet,RoundTrip,24,6,22,Sat,PERPNH,Australia,0,0,1,5.62,0
49998,1,Internet,RoundTrip,15,6,11,Mon,PERPNH,Australia,1,0,1,5.62,0


## Enumerate final variables from feature selection

In [60]:
variables = ["booking_origin","length_of_stay","purchase_lead","flight_duration","flight_hour","flight_day","num_passengers","sales_channel","wants_extra_baggage","wants_preferred_seat","wants_in_flight_meals","route"]
variables

['booking_origin',
 'length_of_stay',
 'purchase_lead',
 'flight_duration',
 'flight_hour',
 'flight_day',
 'num_passengers',
 'sales_channel',
 'wants_extra_baggage',
 'wants_preferred_seat',
 'wants_in_flight_meals',
 'route']

## Modify datasets before proceeding to building the pipeline

### Remove duplicates

In [61]:
df.drop_duplicates(inplace = True)

### Select final variables from df

In [62]:
x = df[variables].reset_index(drop=True).copy()

target = "booking_complete"
y = df[target].reset_index(drop=True).copy()

## Build pipeline

### Data cleaning summary

#### Build function

In [63]:
def data_cleaning(df):
    # Execute all the steps carried out in step 1 (data cleaning script)
    temp = df
    # Removing outliers from purchase_lead and length_of_stay
    var_clip = ["purchase_lead","length_of_stay"]
    p_min = 0
    p_max = 0.99

    for variable in var_clip:
        temp[variable].clip(lower = temp[variable].quantile(p_min), 
                        upper = temp[variable].quantile(p_max), 
                        inplace = True)
        
    return temp

#### Convert function to transformer

So we can add it to the pipeline

In [64]:
trans_data_cleaning = FunctionTransformer(data_cleaning(df))

### Feature engineering summary

In [65]:
var_ohe = ["sales_channel","trip_type","route","booking_origin"]
ohe = OneHotEncoder(sparse_output = False, handle_unknown='ignore')

var_oe = ["flight_day"]
order_flight_day = ["Mon","Tue","Wed","Thu","Fri","Sat","Sun"]
oe = OrdinalEncoder(categories = [order_flight_day],
                    handle_unknown = 'use_encoded_value',
                    unknown_value = 10)

var_qt = ["num_passengers","purchase_lead","length_of_stay","flight_hour","flight_duration"]
qt = QuantileTransformer(output_distribution='normal')

var_mms = ["flight_day","num_passengers","purchase_lead","length_of_stay","flight_hour"]
mms = MinMaxScaler()

### Build preprocessing pipe

#### Create column transformer

In [66]:
ct = make_column_transformer(
    (ohe, var_ohe),
    (oe, var_oe),
    (qt, var_qt),
    (mms, var_mms),
    remainder='passthrough')

### Build preprocessing pipeline

In [67]:
pipe_prepro = make_pipeline(trans_data_cleaning, ct)

### Instantiate the model

In [68]:
model = RandomForestClassifier(n_jobs=-1,
                                max_depth=10)

### Create training pipeline

In [69]:
pipe_training = make_pipeline(pipe_prepro, model)

### Save the training pipeline

In [70]:
with open("pipe_training.pickle",mode="wb") as file:
    cloudpickle.dump(pipe_training, file)

### Train the pipeline and create production pipeline

In [71]:
pipe_production = pipe_training.fit(x,y)

InvalidParameterError: The 'func' parameter of FunctionTransformer must be a callable or None. Got        num_passengers sales_channel  trip_type  purchase_lead  length_of_stay  \
0                   2      Internet  RoundTrip            262              19   
1                   1      Internet  RoundTrip            112              20   
2                   2      Internet  RoundTrip            243              22   
3                   1      Internet  RoundTrip             96              31   
4                   2      Internet  RoundTrip             68              22   
...               ...           ...        ...            ...             ...   
49995               2      Internet  RoundTrip             27               6   
49996               1      Internet  RoundTrip            111               6   
49997               1      Internet  RoundTrip             24               6   
49998               1      Internet  RoundTrip             15               6   
49999               1      Internet  RoundTrip             19               6   

       flight_hour flight_day   route booking_origin  wants_extra_baggage  \
0                7        Sat  AKLDEL    New Zealand                    1   
1                3        Sat  AKLDEL    New Zealand                    0   
2               17        Wed  AKLDEL          India                    1   
3                4        Sat  AKLDEL    New Zealand                    0   
4               15        Wed  AKLDEL          India                    1   
...            ...        ...     ...            ...                  ...   
49995            9        Sat  PERPNH      Australia                    1   
49996            4        Sun  PERPNH      Australia                    0   
49997           22        Sat  PERPNH      Australia                    0   
49998           11        Mon  PERPNH      Australia                    1   
49999           10        Thu  PERPNH      Australia                    0   

       wants_preferred_seat  wants_in_flight_meals  flight_duration  \
0                         0                      0             5.52   
1                         0                      0             5.52   
2                         1                      0             5.52   
3                         0                      1             5.52   
4                         0                      1             5.52   
...                     ...                    ...              ...   
49995                     0                      1             5.62   
49996                     0                      0             5.62   
49997                     0                      1             5.62   
49998                     0                      1             5.62   
49999                     1                      0             5.62   

       booking_complete  
0                     0  
1                     0  
2                     0  
3                     0  
4                     0  
...                 ...  
49995                 0  
49996                 0  
49997                 0  
49998                 0  
49999                 0  

[49281 rows x 14 columns] instead.

In [None]:
x

Unnamed: 0,booking_origin,length_of_stay,purchase_lead,flight_duration,flight_hour,flight_day,num_passengers,sales_channel,wants_extra_baggage,wants_preferred_seat,wants_in_flight_meals,route
0,New Zealand,19,262,5.52,7,Sat,2,Internet,1,0,0,AKLDEL
1,New Zealand,20,112,5.52,3,Sat,1,Internet,0,0,0,AKLDEL
2,India,22,243,5.52,17,Wed,2,Internet,1,1,0,AKLDEL
3,New Zealand,31,96,5.52,4,Sat,1,Internet,0,0,1,AKLDEL
4,India,22,68,5.52,15,Wed,2,Internet,1,0,1,AKLDEL
...,...,...,...,...,...,...,...,...,...,...,...,...
49276,Australia,6,27,5.62,9,Sat,2,Internet,1,0,1,PERPNH
49277,Australia,6,111,5.62,4,Sun,1,Internet,0,0,0,PERPNH
49278,Australia,6,24,5.62,22,Sat,1,Internet,0,0,1,PERPNH
49279,Australia,6,15,5.62,11,Mon,1,Internet,1,0,1,PERPNH
