In [None]:
#all imports here..
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.pipeline import Pipeline

from sklearn import metrics
from sklearn.metrics import auc, roc_auc_score, mutual_info_score, roc_curve

import warnings
warnings.filterwarnings("ignore")

%matplotlib inline

In [None]:
# Load the raw dataset from a CSV file located in the working directory.
df = pd.read_csv('online_shoppers_intention.csv')

**EDA**

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Preview the first five rows, transposed so that each column is shown as a row.
df.head().T

Convert the column names to snake_case and lowercase all string values.

In [None]:
# CamelCase -> snake_case: insert "_" at every lowercase/uppercase boundary,
# then lowercase the whole column name.
camel_boundary = r'([a-z])([A-Z])'
df.columns = df.columns.str.replace(camel_boundary, r'\1_\2', regex=True).str.lower()

In [None]:
# Names of every object-dtype column in the frame.
strings = df.dtypes[df.dtypes == 'object'].index.tolist()
strings

In [None]:
# Lowercase the values of every string column in one pass.
df[strings] = df[strings].apply(lambda column: column.str.lower())

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics of the numeric columns.
df.describe()

In [None]:
# Bar chart of how many sessions fall in each class of the target.
fig, ax = plt.subplots()
df['revenue'].value_counts().plot.bar(ax=ax)
ax.set_xlabel('gained revenue')
ax.set_ylabel('count')
plt.show()

In [None]:
# (number of sessions without revenue, number of sessions with revenue)
df['revenue'].eq(False).sum(), df['revenue'].eq(True).sum()

<ul>
<li>There are no missing values.</li>
<li>The target class ('revenue') is imbalanced.</li>
<li>'administrative', 'informational' and 'product_related' are page counts.</li>
</ul>

In [None]:
# Heatmap of pairwise correlations.
# numeric_only=True keeps the object (string) columns out of DataFrame.corr();
# without it pandas >= 2.0 raises when it tries to convert them to float.
plt.figure(figsize=(15, 10))
sns.heatmap(df.corr(numeric_only=True), annot=True, linewidths=.5, cmap="Blues")
plt.title('Heatmap showing correlations between numerical data')
plt.show()

In [None]:
# Columns fed to the models as plain numbers.
# NOTE(review): operating_systems, browser, region and traffic_type look like
# integer-coded categories — confirm that treating them as numeric is intended.
numeric_features = [
    'administrative', 'administrative_duration',
    'informational', 'informational_duration',
    'product_related', 'product_related_duration',
    'bounce_rates', 'exit_rates', 'page_values', 'special_day',
    'operating_systems', 'browser', 'region', 'traffic_type',
]

# Every object-dtype column is handled as a categorical feature.
categorical_features = df.dtypes[df.dtypes == 'object'].index.tolist()

boolean_features = ['weekend']

# Number of features in each group.
len(numeric_features), len(categorical_features), len(boolean_features)

In [None]:
# Box plots of all numeric features on one horizontal axis.
ax = sns.boxplot(data=df[numeric_features], orient="h")
# Log-scaled x axis; otherwise the figure is dominated by the product_related_duration range.
ax.set_xscale("log")
ax

In [None]:
# 'revenue' is True if the session ended in a purchase.
target_name = 'revenue'

# Features are every column but the target; the target is encoded as 0/1 integers.
data = df.drop(columns=target_name)
target = df[target_name].eq(True).astype(int)

data.shape, target.shape

In [None]:
# Hold out 20% of the rows for testing. The split is stratified on the target
# because the classes are imbalanced, so both parts keep the same class ratio.
df_train, df_test, y_train, y_test = train_test_split(
    data,
    target,
    test_size=0.2,
    random_state=1,
    stratify=target,
)

In [None]:
# Shapes of the train/test features and targets produced by the split.
df_train.shape, df_test.shape, y_train.shape, y_test.shape

In [None]:
# Rank the numeric features by how well each one, used alone as a score,
# separates the two classes (ROC-AUC on the training split).
# The per-feature value is stored in `feature_auc` rather than `auc`, so the
# `auc` function imported from sklearn.metrics is not overwritten.

feat_list = []

for c in numeric_features:
    feature_auc = roc_auc_score(y_train, df_train[c])
    if feature_auc < 0.5:
        # Negatively related feature: flip its sign so the score reflects
        # strength of association regardless of direction.
        feature_auc = roc_auc_score(y_train, -df_train[c])
    feat_list.append((c, feature_auc))

# Strongest features first.
sorted(feat_list, key=lambda tup: tup[1], reverse=True)

In [None]:
# Mutual information with the target as an importance measure for categorical features.

def calculate_mi(series):
    """Return the mutual information score between `series` and `y_train`."""
    return mutual_info_score(series, y_train)

# One score per categorical column, highest first, as a one-column frame named 'MI'.
df_mi = (
    df_train[categorical_features]
    .apply(calculate_mi)
    .sort_values(ascending=False)
    .to_frame(name='MI')
)
df_mi

**Estimating generalization error via nested cross validation.**

Note that nested cross-validation is time-consuming, so it is best suited to small datasets such as ours, which has under 10,000 rows.

First model is LogisticRegression()

In [None]:
# One dict per training row, restricted to the selected feature columns
# (the input format DictVectorizer expects).
model_features = numeric_features + categorical_features + boolean_features
train_dict = df_train[model_features].to_dict(orient='records')

In [None]:
# Two-step pipeline: dict records -> dense feature matrix -> logistic regression.
pipe_lr = Pipeline(
    steps=[
        ('vectorizer', DictVectorizer(sparse=False)),
        ('clf_lr', LogisticRegression(max_iter=10_000)),
    ]
)

In [None]:
# Candidate values for the C parameter of the 'clf_lr' pipeline step
# (the `<step>__<param>` key addresses a parameter inside the pipeline).
param_grid = {
    'clf_lr__C': [0.001, 0.01, 0.1, 0.5, 1, 5, 10]}

In [None]:
# Inner loop of the nested cross-validation: a grid search that selects the
# hyperparameters by ROC-AUC using 2-fold (stratified) cross-validation.
gs = GridSearchCV(
    pipe_lr,
    param_grid,
    scoring='roc_auc',
    cv=2,
)

In [None]:
# Nested cross-validation (5 x 2): each of the 5 outer folds scores a full grid
# search, which itself uses the 2 inner folds configured on `gs`.
# No separate validation split is needed because cross-validation plays that role.
scores = cross_val_score(
    gs, train_dict, y_train,
    scoring='roc_auc', cv=5, error_score='raise',
)

# Mean and standard deviation of the outer-fold scores.
mean_auc, std_auc = np.mean(scores), np.std(scores)
print('CV auc_roc: %.3f +/- %.3f' % (mean_auc, std_auc))

Second model is RandomForestClassifier()

In [None]:
# Two-step pipeline: dict records -> dense feature matrix -> random forest.
# The forest is seeded (random_state=1, the same seed used for the split and the
# final model) so the nested cross-validation estimate is reproducible on re-run.
pipe_rf = Pipeline([
    ('vectorizer', DictVectorizer(sparse=False)),
    ('clf_rf', RandomForestClassifier(random_state=1))
])

In [None]:
# Search space for the 'clf_rf' pipeline step: 3 x 3 x 4 = 36 combinations.
param_grid = {
    'clf_rf__max_depth': [10, 15, 20],
    'clf_rf__n_estimators' : [100, 200, 500],
    'clf_rf__min_samples_leaf': [1, 3, 5, 10]}

In [None]:
# Inner loop for the random forest: 2-fold grid search scored by ROC-AUC.
gs = GridSearchCV(
    pipe_rf,
    param_grid,
    scoring='roc_auc',
    cv=2,
)

In [None]:
# Outer loop: 5-fold cross-validation of the whole grid search held in `gs`.
scores = cross_val_score(
    gs, train_dict, y_train,
    scoring='roc_auc', cv=5, error_score='raise',
)

# Mean and standard deviation of the outer-fold scores.
mean_auc, std_auc = np.mean(scores), np.std(scores)
print('CV auc_roc: %.3f +/- %.3f' % (mean_auc, std_auc))

**Selecting the final model to use in deployment**

Having obtained a robust estimate of the generalization error of both models, we continue with the better one (RandomForestClassifier) and search for its optimal hyperparameter values with GridSearchCV. We then define the final model to use for deployment.

In [None]:
# Fit the vectorizer on the training records and encode them as a dense matrix.
# `dv` is kept so the test records can later be encoded with the same mapping.
dv = DictVectorizer(sparse=False)

X_train = dv.fit_transform(train_dict)

In [None]:
# Same search space as the pipeline grid, but with plain parameter names because
# the estimator is searched directly here instead of inside a pipeline.
param_grid = {
    'max_depth': [10, 15, 20],
    'n_estimators' : [100, 200, 500],
    'min_samples_leaf': [1, 3, 5, 10]}

In [None]:
# Final hyperparameter search: seeded random forest, 5-fold CV, scored by ROC-AUC.
gs2 = GridSearchCV(estimator=RandomForestClassifier(random_state=1), param_grid = param_grid,
                 cv = 5, scoring = 'roc_auc')

In [None]:
# Run the grid search on the full training split.
gs2.fit(X_train, y_train)

In [None]:
# Hyperparameter combination with the best mean cross-validated ROC-AUC.
gs2.best_params_

**Final model**

In [None]:
# Final model with hard-coded hyperparameters.
# NOTE(review): presumably these are the values reported by gs2.best_params_ —
# verify they still match after re-running the search.
final_model = RandomForestClassifier(max_depth = 20, n_estimators = 500,
                                    min_samples_leaf = 10, random_state = 1)

In [None]:
# Train the final model on the whole training split.
final_model.fit(X_train, y_train)

In [None]:
# Class labels in the column order used by predict_proba.
final_model.classes_

In [None]:
# Encode the test rows with the vectorizer fitted on the training data
# (transform only, no re-fitting).
test_dict = df_test[numeric_features + categorical_features + boolean_features].to_dict(orient='records')
X_test = dv.transform(test_dict)

In [None]:
# Hard class predictions, and the predicted probability of the second class
# (column 1 of predict_proba) for every test row.
y_pred = final_model.predict(X_test)
y_pred_proba = final_model.predict_proba(X_test)[:,1]

In [None]:
# ROC points must be computed from continuous scores, not hard 0/1 predictions:
# with class labels the "curve" collapses to a single operating point and the
# resulting AUC does not reflect the model's ranking quality.
fpr, tpr, threshold = roc_curve(y_test, y_pred_proba)

In [None]:
# Area under the ROC curve described by the (fpr, tpr) points.
metrics.auc(fpr, tpr)

In [None]:
# ROC curve of the final model against the diagonal of a random classifier.
fig, ax = plt.subplots(figsize=(5, 5))

ax.plot(fpr, tpr, label='Final Model')
ax.plot([0, 1], [0, 1], label='Random', linestyle='--')

ax.set_xlabel('FPR')
ax.set_ylabel('TPR')

ax.legend()

## Bento ML

In [None]:
import bentoml

In [None]:
# Save the trained forest with BentoML under the name 'shopper_intention'.
# The fitted DictVectorizer is stored alongside it as a custom object, and the
# predict_proba signature is declared batchable along dimension 0.
bentoml.sklearn.save_model(
    'shopper_intention',
    final_model,
    custom_objects={
        'dictVectorizer': dv
    },
    signatures = {
        'predict_proba':{
                'batchable': True,
                'batch_dim': 0
        }        
    }
    )

**Create Test input**

In [None]:
# Rebuilds test_dict / X_test with the same two statements used earlier in the
# notebook, before the predictions were made.
test_dict = df_test[numeric_features + categorical_features + boolean_features].to_dict(orient='records')
X_test = dv.transform(test_dict)

In [None]:
# (test rows, encoded feature columns)
X_test.shape

In [None]:
import json

In [None]:
# First test row (by position) as a dict, pretty-printed as JSON.
request = df_test.iloc[0].to_dict()
print(json.dumps(request, indent=2))

In [None]:
# Positional lookup, so the label belongs to the same row as `df_test.iloc[0]`.
# y_test keeps the original index labels after the shuffled split, so a plain
# y_test[0] would search for index label 0, which the test split may not contain.
print(y_test.iloc[0])

In [None]:
# Class probabilities for the first encoded test row; the row is wrapped in a
# list so that a single sample is passed as a 2-D input.
final_model.predict_proba([X_test[0,:]])

In [None]:
# Index label, in the full `df`, of the first row whose revenue is True.
# NOTE(review): this is a label in `df`, not a position in `df_test`; the cells
# below use the hard-coded 65 positionally via df_test.iloc[65] — confirm which
# row is actually intended.
df.index[df['revenue'] == True].tolist()[0]

In [None]:
# Test row at position 65 as a dict, pretty-printed as JSON.
request = df_test.iloc[65].to_dict()
print(json.dumps(request, indent=2))

In [None]:
# Positional lookup, so the label belongs to the same row as `df_test.iloc[65]`.
# A plain y_test[65] is label-based and would return a different row's target
# (or raise KeyError if label 65 is not in the test split).
print(y_test.iloc[65])