## Predeployment Pipeline

#### We have tested our model during the development phase. 
#### Now we retrain it on the entire dataset and save it for deployment.  

In [64]:
import warnings
# Silence every warning for the rest of the session. This keeps the notebook
# output clean, but it also hides deprecation notices from pandas/scikit-learn.
warnings.simplefilter('ignore')

In [65]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt 

In [66]:
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [67]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, LabelEncoder, Binarizer

In [68]:
# Final estimator chosen during development.
# The arguments `loss='deviance'`, `min_impurity_split=None` and
# `presort='deprecated'` were dropped: they only restated library defaults and
# have since been removed from scikit-learn, so passing them raises on current
# releases. Leaving them out keeps the same configuration on old and new versions.
classifier = GradientBoostingClassifier(
    ccp_alpha=0.0,
    criterion='friedman_mse',
    init=None,
    learning_rate=0.1,
    max_depth=3,
    max_features=None,
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    min_samples_leaf=1,
    min_samples_split=2,
    min_weight_fraction_leaf=0.0,
    n_estimators=100,
    n_iter_no_change=None,
    random_state=None,  # unseeded: retraining is not bit-for-bit reproducible
    subsample=1.0,
    tol=0.0001,
    validation_fraction=0.1,
    verbose=0,
    warm_start=False,
)

In [69]:
def processing_pipeline(df, target_name, numeric_features, categorical_features, classifier):
    """Build and fit a preprocessing + classifier pipeline on ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Training data containing the feature columns and the target column.
    target_name : str
        Name of the target column; it is split off as ``y`` and removed
        from the feature frame.
    numeric_features : list
        Columns that are mean-imputed and then standardized.
    categorical_features : list
        Columns that are imputed with the most frequent value and then
        one-hot encoded (categories unseen at fit time are ignored).
    classifier : estimator
        Final step of the pipeline; it is fitted as part of the pipeline.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The fitted pipeline with steps ``'preprocessor'`` and ``'clf'``.
    """
    labels = df[target_name]
    features = df.drop(target_name, axis=1)

    # One sub-pipeline per column family.
    numeric_steps = [
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ]
    categorical_steps = [
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]

    # Route each column list to its sub-pipeline. Further column families
    # (e.g. binary flags) can be added as extra entries here.
    column_router = ColumnTransformer(transformers=[
        ('num', Pipeline(steps=numeric_steps), numeric_features),
        ('cat', Pipeline(steps=categorical_steps), categorical_features),
    ])

    # Preprocessing followed by the classifier gives one end-to-end predictor.
    full_pipeline = Pipeline(steps=[
        ('preprocessor', column_router),
        ('clf', classifier),
    ])
    full_pipeline.fit(features, labels)

    return full_pipeline

In [70]:
# Load the full dataset; 'df.csv' is resolved relative to the current working directory.
df = pd.read_csv('df.csv')

In [71]:
# Keep only the model inputs plus the 'conversion' target, in this column order.
df = df.loc[:, [
    'recency',
    'history',
    'used_discount',
    'used_bogo',
    'is_referral',
    'channel',
    'offer',
    'conversion',
]]

In [72]:
# Split the columns by dtype: numeric columns go to the scaling branch, everything
# else to the one-hot branch. 'number' is used instead of ['int', 'float'] because
# those aliases depend on platform/pandas version and miss other numeric dtypes
# (e.g. int8, unsigned or nullable integers), which would then be one-hot encoded.
numeric_features = df.select_dtypes(include='number').columns.tolist()
categorical_features = df.select_dtypes(exclude='number').columns.tolist()

In [73]:
# The target must never be fed to the model as a feature. Filtering both lists
# (instead of list.remove) keeps this cell safe to re-run and also covers the
# case where the target was classified as non-numeric.
numeric_features = [col for col in numeric_features if col != 'conversion']
categorical_features = [col for col in categorical_features if col != 'conversion']

In [74]:
# Retrain on the entire dataset; the fitted pipeline is what gets deployed.
pipelineModel = processing_pipeline(
    df=df,
    target_name='conversion',
    numeric_features=numeric_features,
    categorical_features=categorical_features,
    classifier=classifier,
)

#### Unit Test 1: Testing Model Portability

In [75]:
import pickle

In [76]:
# Save the fitted pipeline to disk. The context manager guarantees the file is
# flushed and closed before anything tries to read it back.
filename = 'final_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(pipelineModel, model_file)

In [77]:
# Reload the pipeline from disk to confirm it survives a save/load round trip.
# NOTE: unpickling executes arbitrary code, so only load files you produced yourself.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

#### Unit Test 2: Testing Model Predictions

In [95]:
# Build the feature frame here so this cell also works when the notebook is run
# top to bottom (X was otherwise only defined in a later cell).
X = df.drop('conversion', axis=1)
# .loc slicing is label-based and inclusive: this keeps rows up to index label 0
# as a DataFrame, which is the 2-D input shape predict() expects.
x_pred = X.loc[:0]

In [97]:
# Run the reloaded pipeline on x_pred and display the first predicted label.
loaded_model.predict(x_pred)[0]

0

In [81]:
# Separate the target from the features for manual inspection.
X = df.drop(columns=['conversion'])
y = df.loc[:, 'conversion']

In [82]:
# Display the feature row with index label 1 (used as the sample payload below).
X.loc[1]

recency                 6
history            329.08
used_discount           1
used_bogo               1
is_referral             1
channel               Web
offer            No Offer
Name: 1, dtype: object

In [84]:
# Preview the first rows of the training frame.
df.head()

Unnamed: 0,recency,history,used_discount,used_bogo,is_referral,channel,offer,conversion
0,10,142.44,1,0,0,Phone,Buy One Get One,0
1,6,329.08,1,1,1,Web,No Offer,0
2,7,180.65,0,1,1,Web,Buy One Get One,0
3,9,675.83,1,0,1,Web,Discount,0
4,2,45.34,1,0,0,Web,Buy One Get One,0


In [101]:
def predict(data):
    """Score a single record with the reloaded pipeline.

    Parameters
    ----------
    data : dict
        Maps each feature name ('recency', 'history', 'used_discount',
        'used_bogo', 'is_referral', 'channel', 'offer') to a scalar value.
        Numeric fields may be given as strings (e.g. from a JSON/form
        payload). Key order does not matter and extra keys are ignored.

    Returns
    -------
    dict
        ``{'result': label}`` where ``label`` is a plain Python value, so the
        response can be JSON-serialized.

    Raises
    ------
    ValueError
        If a required field is missing or a numeric field cannot be parsed.
    """
    feature_order = ['recency', 'history', 'used_discount', 'used_bogo',
                     'is_referral', 'channel', 'offer']
    numeric_columns = ['recency', 'history', 'used_discount', 'used_bogo',
                       'is_referral']

    frame = pd.DataFrame(data, index=[0])

    missing = [name for name in feature_order if name not in frame.columns]
    if missing:
        raise ValueError('missing input field(s): ' + ', '.join(missing))

    # Select by name instead of relabelling by position: a payload whose keys
    # arrive in a different order must not end up with mislabelled columns.
    frame = frame.loc[:, feature_order].copy()

    # Convert numeric fields explicitly rather than relying on the estimator
    # to coerce strings.
    for name in numeric_columns:
        frame[name] = pd.to_numeric(frame[name])

    # tolist() turns the numpy scalar into a native Python value.
    return {'result': loaded_model.predict(frame).tolist()[0]}

In [102]:
# Sample request payload: every value is a string, as it would arrive from a
# form or JSON body. Mirrors the feature row with index label 1.
data = dict(
    recency="6",
    history="329.08",
    used_discount="1",
    used_bogo="1",
    is_referral="1",
    channel="Web",
    offer="No Offer",
)

In [103]:
# Smoke-test predict() with the sample payload and display the response dict.
predict(data)

{'result': 0}

In [104]:
# Display the columns that were routed to the categorical (one-hot) branch.
categorical_features

['channel', 'offer']

In [106]:
# Rebuild the one-row request frame for inspection.
# NOTE: this rebinds `df`, replacing the training frame loaded earlier; reload
# the CSV before re-running any cell that expects the full dataset.
# Columns are selected by name (not relabelled by position) so a payload with a
# different key order cannot end up with mislabelled columns.
df = pd.DataFrame(data, index=[0])[['recency', 'history', 'used_discount', 'used_bogo',
                                    'is_referral', 'channel', 'offer']]

In [110]:
# Check that the string-valued 'recency' field casts cleanly to float
# (the result is only displayed, not stored back into the frame).
df['recency'].astype(float)

0    6.0
Name: recency, dtype: float64