# Task 2: Predicting customer buying behaviour
## 7. Preproduction code

## Import libraries

In [1]:
import numpy as np
import pandas as pd
import cloudpickle

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

from sklearn.preprocessing import QuantileTransformer

from sklearn.preprocessing import MinMaxScaler

from sklearn.ensemble import RandomForestClassifier

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

## Load the data (original dataset)

In [2]:
# Read the raw bookings file, decoding it as ISO-8859-1, and display it.
df = pd.read_csv(
    "customer_booking.csv",
    encoding='ISO-8859-1',
)
df

Unnamed: 0,num_passengers,sales_channel,trip_type,purchase_lead,length_of_stay,flight_hour,flight_day,route,booking_origin,wants_extra_baggage,wants_preferred_seat,wants_in_flight_meals,flight_duration,booking_complete
0,2,Internet,RoundTrip,262,19,7,Sat,AKLDEL,New Zealand,1,0,0,5.52,0
1,1,Internet,RoundTrip,112,20,3,Sat,AKLDEL,New Zealand,0,0,0,5.52,0
2,2,Internet,RoundTrip,243,22,17,Wed,AKLDEL,India,1,1,0,5.52,0
3,1,Internet,RoundTrip,96,31,4,Sat,AKLDEL,New Zealand,0,0,1,5.52,0
4,2,Internet,RoundTrip,68,22,15,Wed,AKLDEL,India,1,0,1,5.52,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,2,Internet,RoundTrip,27,6,9,Sat,PERPNH,Australia,1,0,1,5.62,0
49996,1,Internet,RoundTrip,111,6,4,Sun,PERPNH,Australia,0,0,0,5.62,0
49997,1,Internet,RoundTrip,24,6,22,Sat,PERPNH,Australia,0,0,1,5.62,0
49998,1,Internet,RoundTrip,15,6,11,Mon,PERPNH,Australia,1,0,1,5.62,0


## Enumerate final variables from feature selection

In [3]:
# Predictor columns retained by the feature-selection step.
variables = [
    "booking_origin",
    "length_of_stay",
    "purchase_lead",
    "flight_duration",
    "flight_hour",
    "flight_day",
    "num_passengers",
    "sales_channel",
    "wants_extra_baggage",
    "wants_preferred_seat",
    "wants_in_flight_meals",
    "route",
]
variables

['booking_origin',
 'length_of_stay',
 'purchase_lead',
 'flight_duration',
 'flight_hour',
 'flight_day',
 'num_passengers',
 'sales_channel',
 'wants_extra_baggage',
 'wants_preferred_seat',
 'wants_in_flight_meals',
 'route']

## Modify datasets before proceeding to building the pipeline

### Remove duplicates

In [4]:
# Discard fully duplicated rows; `df` is rebound to the de-duplicated frame.
df = df.drop_duplicates()

### Select final variables from df

In [5]:
# Name of the label column.
target = "booking_complete"

# Predictors: only the selected columns, with a fresh 0..n-1 index.
x = df.loc[:, variables].reset_index(drop=True).copy()

# Label: same re-indexing so rows of `x` and `y` stay aligned.
y = df.loc[:, target].reset_index(drop=True).copy()

## Build pipeline

### Data cleaning summary

#### Build function

In [6]:
def data_cleaning(df, var_clip=("purchase_lead", "length_of_stay"), p_min=0, p_max=0.99):
    """Return a copy of ``df`` with outliers clipped in the given columns.

    Reproduces the data-cleaning step (step 1 of the project): each column in
    ``var_clip`` is clipped to the range spanned by its ``p_min`` and ``p_max``
    quantiles.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; must contain every column listed in ``var_clip``.
        It is never modified.
    var_clip : iterable of str, default ("purchase_lead", "length_of_stay")
        Columns whose extreme values are clipped.
    p_min : float, default 0
        Lower quantile used as the clipping floor.
    p_max : float, default 0.99
        Upper quantile used as the clipping ceiling.

    Returns
    -------
    pandas.DataFrame
        A new frame with the clipped columns; all other columns are unchanged.

    Notes
    -----
    The quantiles are computed on the frame passed in, so the clipping bounds
    are re-derived from whatever batch is being transformed (including at
    prediction time) rather than remembered from the training data.
    """
    # Work on a copy: the caller's frame (e.g. the training matrix) must not
    # be altered as a side effect of running the pipeline.
    temp = df.copy()

    for variable in var_clip:
        lower = temp[variable].quantile(p_min)
        upper = temp[variable].quantile(p_max)
        # Assign the result back instead of `clip(..., inplace=True)` on a
        # selected column: that form is chained assignment, which does not
        # update the frame when pandas copy-on-write is active.
        temp[variable] = temp[variable].clip(lower=lower, upper=upper)

    return temp

#### Convert function to transformer

Wrap the cleaning function in a `FunctionTransformer` so that it can be added to the pipeline as a step.

In [7]:
# Wrap the cleaning function so it can be used as a pipeline step.
trans_data_cleaning = FunctionTransformer(func=data_cleaning)

### Feature engineering summary

In [8]:
# Nominal categorical columns -> one-hot encoding, returned as a dense array;
# handle_unknown='ignore' keeps unseen categories from raising an error.
var_ohe = ["sales_channel","route","booking_origin"]
ohe = OneHotEncoder(sparse_output = False, handle_unknown='ignore')

# Day of week -> ordinal code following the Mon..Sun order below; any value
# outside that list is encoded as 10.
var_oe = ["flight_day"]
order_flight_day = ["Mon","Tue","Wed","Thu","Fri","Sat","Sun"]
oe = OrdinalEncoder(categories = [order_flight_day],
                    handle_unknown = 'use_encoded_value',
                    unknown_value = 10)

# Numeric columns mapped to a normal distribution. random_state is fixed so
# that the transformer's internal subsampling is reproducible across runs.
var_qt = ["num_passengers","purchase_lead","length_of_stay","flight_hour","flight_duration"]
qt = QuantileTransformer(output_distribution='normal', random_state=42)

# Numeric columns rescaled with min-max scaling.
# NOTE(review): every column here is also listed in var_qt. The column
# transformer receives both lists as separate entries, so these columns end up
# in the output twice (once quantile-transformed, once min-max scaled) rather
# than being transformed in sequence -- confirm this is intended.
var_mms = ["num_passengers","purchase_lead","length_of_stay","flight_hour"]
mms = MinMaxScaler()

### Build preprocessing pipe

#### Create column transformer

In [9]:
# 0/1 add-on flags that belong to the selected final variables but are not
# handled by any encoder or scaler. Without an explicit entry they would be
# discarded by remainder='drop', so forward them to the model unchanged.
var_pass = ["wants_extra_baggage","wants_preferred_seat","wants_in_flight_meals"]

# Route each group of columns to its transformer; any column not listed in one
# of the groups is dropped.
ct = make_column_transformer(
    (ohe, var_ohe),
    (oe, var_oe),
    (qt, var_qt),
    (mms, var_mms),
    ("passthrough", var_pass),
    remainder='drop')

### Build preprocessing pipeline

In [10]:
# Preprocessing = data cleaning first, then the per-column transformations.
pipe_prepro = make_pipeline(
    trans_data_cleaning,
    ct,
)

### Instantiate the model

In [11]:
# Random forest limited to depth 10, trained on all available cores.
# random_state is fixed so that retraining on the same data yields the same
# forest, which makes the saved production pipeline reproducible.
model = RandomForestClassifier(n_jobs=-1,
                                max_depth=10,
                                random_state=42)

### Create training pipeline

In [12]:
# Full training pipeline: preprocessing followed by the classifier.
pipe_training = make_pipeline(
    pipe_prepro,
    model,
)

### Save the training pipeline

In [13]:
# Persist the training pipeline to disk before it is fitted in the next step.
with open("pipe_training.pickle", "wb") as fh:
    cloudpickle.dump(pipe_training, fh)

### Train the pipeline and create production pipeline

In [14]:
# Fit the whole pipeline on the selected predictors and the target.
# NOTE(review): scikit-learn's fit() conventionally returns the estimator
# itself, so pipe_production presumably is the same (now fitted) object as
# pipe_training -- confirm if an independent copy is needed.
pipe_production = pipe_training.fit(x, y)

### Save the production pipeline

In [15]:
# Persist the fitted pipeline so it can be loaded for scoring.
with open("pipe_production.pickle", "wb") as fh:
    cloudpickle.dump(pipe_production, fh)