In [1]:
#all imports here..
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

#import plotly.express as px

from sklearn.preprocessing import RobustScaler

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import roc_curve, auc, roc_auc_score, mutual_info_score

import warnings
warnings.filterwarnings("ignore")

%xmode Minimal
%matplotlib notebook

Exception reporting mode: Minimal


In [2]:
df = pd.read_csv('online_shoppers_intention.csv')

In [None]:
df.shape

In [None]:
df.head().T

formatting to lowercase columns' titles and values

In [3]:
df.columns = df.columns.str.replace(r'([a-z])([A-Z])', r'\1_\2', regex=True).str.lower()

In [4]:
strings = list(df.dtypes[df.dtypes == 'object'].index)
strings

['month', 'visitor_type']

In [5]:
for col in strings:
    df[col] = df[col].str.lower()

In [None]:
df.info()

<li>there are no missing values.</li>
<li>administrative, informational and product_related are number of pages</li>
<li>operating_system, browser, region, traffic_type are formatted as int64 but can be considered as categorical (they have no ordinal meaning)</li>

In [None]:
plt.figure(figsize=(15,10))  
sns.heatmap(df.corr(),annot=True,linewidths=.5, cmap="Blues")
plt.title('Heatmap showing correlations between numerical data')
plt.show()

In [11]:
numeric_features = ['administrative', 'administrative_duration', 'informational',
       'informational_duration', 'product_related', 'product_related_duration',
       'bounce_rates', 'exit_rates', 'page_values', 'special_day',
                    'operating_systems', 'browser', 'region', 'traffic_type']

categorical_features = list(df.dtypes[df.dtypes == 'object'].index)

boolean_features = ['weekend']

In [6]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

y_train = df_train.revenue.values
y_val = df_val.revenue.values
y_test = df_test.revenue.values

del df_train['revenue']
del df_val['revenue']
del df_test['revenue']

In [13]:
df_train.shape, df_val.shape, df_test.shape

((7398, 17), (2466, 17), (2466, 17))

In [12]:
#one hot encoding of categorical features
dv = DictVectorizer(sparse=False)

train_dict = df_train[numeric_features + categorical_features + boolean_features].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

In [13]:
X_train.shape

(7398, 28)

In [None]:
dv.feature_names_

In [14]:
val_dict = df_val[numeric_features + categorical_features + boolean_features].to_dict(orient='records')
X_val = dv.transform(val_dict)

**Estimating generalization error via nested cross_val. Note that nested cross-val is time consuming and it is suited for small datasets like ours that is under 10000 rows**

In [22]:
#because we are going to use cross-val there is no need to separate train and val

df_full_train = df_full_train.reset_index(drop=True)
y_full_train = df_full_train.revenue.values
del df_full_train['revenue']

In [23]:
df_full_train.shape

(9864, 17)

In [24]:
#one hot encoding of categorical features (non need of separated train and val, in view of cross-validation)
dv = DictVectorizer(sparse=False)

train_dict = df_full_train[numeric_features + categorical_features + boolean_features].to_dict(orient='records')
X_full_train = dv.fit_transform(train_dict)

In [25]:
X_full_train.shape

(9864, 28)

In [None]:
dv.feature_names_

In [26]:
#first model is LogisticRegression()
model_cv = LogisticRegression(solver = 'lbfgs', max_iter = 10_000, 
                              random_state=1)

In [27]:
param_grid = [{
    'C': [0.001, 0.01, 0.1, 0.5, 1, 5, 10]}]

In [28]:
# Making a Grid Search for selecting model by hyperparameter tuning, using
# 2-fold StratifiedKFold cross validation

gs = GridSearchCV(estimator=model_cv, param_grid = param_grid,
                 cv = 2, scoring = 'roc_auc')

In [29]:
#(nested) cross validation ( 5 x 2 cross-val: inner = 5 folds, outer = 2 folds). Outer is in the cell above..
scores = cross_val_score(gs, X_full_train, y_full_train, scoring='roc_auc', cv=5, error_score='raise')

# mean scores and standard deviation
print('CV auc_roc: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

CV auc_roc: 0.903 +/- 0.007


In [30]:
#second model is RandomForestClassifier()
model2_cv = RandomForestClassifier(random_state=1)

In [31]:
param_grid = [{
    'max_depth': [10, 15, 20],
    'n_estimators' : [100, 200, 500],
    'min_samples_leaf': [1, 3, 5, 10]}]

In [32]:
gs = GridSearchCV(estimator=model2_cv, param_grid = param_grid,
                 cv = 2, scoring = 'roc_auc')

In [33]:
scores = cross_val_score(gs, X_full_train, y_full_train, scoring='roc_auc', cv=5, error_score='raise')

print('CV auc_roc: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

CV auc_roc: 0.929 +/- 0.007


**Selecting the final model to use in deployment**

after having calculated a robust estimate of the generalization errors of the models, we continue with the better one (RandomForestClassifier), to find his optimal hyperparameters values by means of gridSearchCV, then we will define the best model to export to train.py for the deployment part

In [34]:
gs.fit(X_full_train, y_full_train)

In [35]:
gs.best_params_

{'max_depth': 20, 'min_samples_leaf': 5, 'n_estimators': 200}

In [36]:
final_model = RandomForestClassifier(max_depth = 20, n_estimators = 200,
                                    min_samples_leaf = 5, random_state = 1)

In [37]:
final_model.fit(X_train, y_train)

In [43]:
test_val = df_val[numeric_features + categorical_features + boolean_features].to_dict(orient='records')
X_test = dv.transform(val_dict)

In [44]:
X_test.shape

(2466, 28)

In [38]:
y_pred = final_model.predict_proba(X_val)[:,1]

In [39]:
realized_revenue = (y_pred >= 0.5)

In [40]:
#accuracy of the RandomForestClassifier() model
(y_val == realized_revenue).mean()

0.9030819140308192

In [41]:
fpr3, tpr3, thresholds3 = roc_curve(y_val, y_pred)

In [42]:
#AUC metric
auc(fpr3, tpr3)

0.9391630848278248

In [45]:
y_test_pred = final_model.predict_proba(X_test)[:,1]

In [46]:
realized_revenue = (y_test_pred >= 0.5)

In [48]:
#accuracy of the RandomForestClassifier() model on test set
(y_test == realized_revenue).mean()

0.7761557177615572

In [49]:
fpr4, tpr4, thresholds4 = roc_curve(y_test, y_test_pred)

In [50]:
#AUC metric
auc(fpr4, tpr4)

0.5075778087598418