## Predeployment Pipeline

#### We have tested our model during the development phase. 
#### Now we retrain it on the entire dataset and save it for deployment.  

In [34]:
import warnings
warnings.filterwarnings('ignore')

In [35]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt 
import time

In [36]:
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

In [37]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, LabelEncoder, Binarizer

In [38]:
# Classifier used as the final pipeline step. Every hyperparameter is spelled
# out explicitly so the configuration that gets deployed is visible here.
clf = LogisticRegression(
    C=1.0,
    class_weight=None,
    dual=False,
    fit_intercept=True,
    intercept_scaling=1,
    l1_ratio=None,
    max_iter=100,
    multi_class='auto',
    n_jobs=-1,
    penalty='l2',
    random_state=None,
    solver='lbfgs',
    tol=0.0001,
    verbose=0,
    warm_start=False,
)

In [39]:
def processing_pipeline(df, target_name, numeric_features, categorical_features, classifier):
    """Build a preprocessing + classifier pipeline and fit it on ``df``.

    Args:
        df: DataFrame containing the feature columns and the target column.
        target_name: Name of the target column in ``df``.
        numeric_features: List of column names that are mean-imputed and
            then standardized.
        categorical_features: List of column names that are imputed with
            the most frequent value and then one-hot encoded (categories
            unseen during fitting are ignored at prediction time).
        classifier: Estimator used as the final ``'clf'`` step. The object
            passed in is used directly; no copy is made here.

    Returns:
        The fitted ``Pipeline`` with steps ``'preprocessor'`` and ``'clf'``.
    """
    target = df[target_name]
    features = df.drop(target_name, axis=1)

    # Per-column-type preprocessing recipes.
    numeric_steps = [
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ]
    categorical_steps = [
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]

    # Route each group of columns through its own sub-pipeline.
    column_preprocessor = ColumnTransformer(transformers=[
        ('num', Pipeline(steps=numeric_steps), numeric_features),
        ('cat', Pipeline(steps=categorical_steps), categorical_features),
    ])

    # Preprocessing followed by the classifier gives one object that goes
    # from raw rows to predictions.
    full_pipeline = Pipeline(steps=[
        ('preprocessor', column_preprocessor),
        ('clf', classifier),
    ])
    full_pipeline.fit(features, target)

    return full_pipeline

In [40]:
# Load the full dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('../data/data.csv')

In [41]:
# Keep only the model's input columns plus the 'conversion' target; any other
# column present in the CSV is dropped here.
df = df[['recency', 'history', 'used_discount', 'used_bogo', 'is_referral', 'channel', 'offer', 'conversion', 'score']]

In [42]:
# Split the columns by dtype: int/float columns are treated as numeric and
# everything else as categorical. At this point the numeric list still
# contains the 'conversion' target column.
numeric_features = df.select_dtypes(include=['int', 'float']).columns.tolist()
categorical_features = df.select_dtypes(exclude=['int', 'float']).columns.tolist()

In [43]:
# The target was picked up by the dtype-based selection; it must not be fed
# to the model as a feature.
numeric_features.remove('conversion')

In [44]:
# Fit the complete pipeline on every row of df (no train/test split here):
# this is the retrain-on-the-entire-dataset step before deployment.
pipelineModel = processing_pipeline(df, 'conversion', numeric_features, categorical_features, clf)

#### Unit Test 1: Testing Model Portability

In [45]:
import pickle

In [46]:
# Current time in seconds since the epoch; used below to timestamp the
# saved model's filename.
now = time.time()
now

1672862166.01983

In [47]:
# Path the model is saved to; the rounded epoch timestamp makes the name
# specific to this run.
# NOTE(review): open() does not create directories, so 'models/' presumably
# already exists relative to the working directory — confirm.
filename = 'models/final_model_{}.sav'.format(round(now))

In [48]:
# Serialize the fitted pipeline. The context manager guarantees the file is
# flushed and closed before anything tries to read it back.
with open(filename, 'wb') as model_file:
    pickle.dump(pipelineModel, model_file)

In [49]:
# Read the pipeline back from disk to check that it survives a round trip.
# NOTE: unpickling can execute arbitrary code, so only load model files that
# this pipeline wrote itself. The context manager closes the file handle.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

#### Unit Test 2: Testing Model Predictions

In [50]:
# Label-based .loc slicing includes the end label, so this is a one-row
# DataFrame (index label 0). The 'conversion' column is still present in it.
x_pred = df.loc[:0]

In [51]:
loaded_model.predict(x_pred)[0]

0

In [52]:
# Split the frame into the target series and the feature columns.
y = df['conversion']
X = df.drop(columns=['conversion'])

In [53]:
X.loc[1]

recency                 6
history            329.08
used_discount           1
used_bogo               1
is_referral             1
channel               Web
offer            No Offer
score            0.961538
Name: 1, dtype: object

In [54]:
df.head()

Unnamed: 0,recency,history,used_discount,used_bogo,is_referral,channel,offer,conversion,score
0,10,142.44,1,0,0,Phone,Buy One Get One,0,0.678715
1,6,329.08,1,1,1,Web,No Offer,0,0.961538
2,7,180.65,0,1,1,Web,Buy One Get One,0,0.205998
3,9,675.83,1,0,1,Web,Discount,0,1.21692
4,2,45.34,1,0,0,Web,Buy One Get One,0,1.389221


In [55]:
def predict(data):
    """Predict the target label for a single record with ``loaded_model``.

    Args:
        data: One record, normally a mapping of feature name to scalar
            value. When every expected feature name is present, columns
            are matched by name, so key order does not matter and extra
            keys are ignored. Otherwise the values are assumed to already
            be in the expected feature order and are relabelled by
            position (the original behaviour).

    Returns:
        A dict ``{'result': label}`` where ``label`` is the first (and
        only) element returned by ``loaded_model.predict``.

    Raises:
        ValueError: If the feature names are not all present and the
            record does not have exactly as many fields as there are
            expected features.
    """
    feature_columns = ['recency', 'history', 'used_discount', 'used_bogo',
                       'is_referral', 'channel', 'offer', 'score']

    record = pd.DataFrame(data, index=[0])

    if set(feature_columns).issubset(record.columns):
        # Select by name: blindly overwriting the column labels would
        # silently mislabel values whenever the payload's key order differs
        # from the training column order.
        record = record[feature_columns]
    else:
        # Fallback for payloads without the expected names: relabel by position.
        record.columns = feature_columns

    return {'result': loaded_model.predict(record)[0]}

In [56]:
# Sample payload for predict(). Every value is a string, including the
# numeric fields. The values mirror row 1 of the dataset except for 'score'.
data = {
  "recency": "6",
  "history": "329.08",
  "used_discount": "1",
  "used_bogo": "1",
  "is_referral": "1",
  "channel": "Web",
  "offer": "No Offer",
  "score": "1.3823"
}

In [57]:
predict(data)

{'result': 0}

In [58]:
categorical_features

['channel', 'offer']

In [59]:
# Rebuild the single-row frame from the sample payload, the same way
# predict() does. Note that this rebinds the module-level `df`, replacing
# the training data loaded earlier in the notebook.
df = pd.DataFrame(data, index=[0])
df.columns = ['recency', 'history', 'used_discount', 'used_bogo', 
                'is_referral', 'channel', 'offer', 'score']

In [60]:
# Check that the string-valued 'recency' column casts cleanly to float.
# The result is only displayed; df itself is not modified.
df['recency'].astype(float)

0    6.0
Name: recency, dtype: float64