In [1]:
%config Completer.use_jedi = False
%matplotlib inline
import pandas as pd
import numpy as np
# Route DataFrame/Series `.plot()` calls through the plotly backend.
pd.options.plotting.backend = "plotly"
# Show up to 100 rows before pandas truncates a displayed frame.
pd.set_option('display.max_rows', 100)

# Load train/validation/test datasets

In [2]:
import pandas as pd

In [3]:
# Single CSV holding every split; rows are tagged by the `split` column used further down.
ds = pd.read_csv(filepath_or_buffer="../../data/dataset.csv")

One row in the train split contains NaN values, so we remove it.

In [4]:
# Keep only the rows whose `order_voucher_percentage__mean` value is present.
ds = ds.loc[ds["order_voucher_percentage__mean"].notna(), :]
ds.shape

(245437, 62)

In [5]:
# Columns kept out of the model inputs: `is_returning_customer` is the label,
# `split` marks the train/val/test partition, and `customer_id` is excluded too.
non_features_cols = [
    "customer_id",
    "is_returning_customer",
    "split",
]
# Everything else is a feature; the original column order is preserved.
feature_cols = [name for name in ds.columns if name not in non_features_cols]

In [6]:
def get_features_and_labels(ds, features=None, label="is_returning_customer"):
    """Split a frame into a feature matrix and a label vector.

    Parameters
    ----------
    ds : pandas.DataFrame
        Frame containing both the feature columns and the label column.
    features : sequence of str, optional
        Names of the feature columns. When omitted, the notebook-level
        ``feature_cols`` list is used (the original behaviour).
    label : str, default "is_returning_customer"
        Name of the label column.

    Returns
    -------
    (X, y) : tuple
        ``X`` is ``ds`` restricted to the feature columns, ``y`` is the label column.
    """
    if features is None:
        # Fall back to the notebook global only when the caller gives no explicit list,
        # so the function can also be used (and tested) without hidden state.
        features = feature_cols
    X = ds[list(features)]  # features
    y = ds[label]  # labels
    return X, y

In [7]:
# One (features, labels) pair per partition, selected through the `split` column.
(X_train, y_train), (X_val, y_val), (X_test, y_test) = (
    get_features_and_labels(ds.query(split_filter))
    for split_filter in ("split == 'train'", "split == 'val'", "split == 'test'")
)

In [8]:
# Sanity check: features and labels of the train split must have the same row count.
X_train.shape, y_train.shape

((172081, 59), (172081,))

In [9]:
# Sanity check: features and labels of the validation split must have the same row count.
X_val.shape, y_val.shape

((24440, 59), (24440,))

In [10]:
# Sanity check: features and labels of the test split must have the same row count.
X_test.shape, y_test.shape

((48916, 59), (48916,))

In [32]:
from sklearn.utils.class_weight import compute_sample_weight

# One weight per training row; "balanced" weights rows inversely to the frequency
# of their class in y_train, so the rarer class gets the larger weight.
sample_weights = compute_sample_weight("balanced", y_train)
sample_weights

array([0.64615829, 0.64615829, 0.64615829, ..., 2.21047426, 0.64615829,
       0.64615829])

# Hyperparameter tuning

In [11]:
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer
# Scorer mapping that wraps sklearn's f1_score under the key 'f1'.
# NOTE(review): the grid search in this notebook is configured with the string
# alias scoring="f1", not with this dict, so `scoring` looks unused — confirm.
scoring = dict(f1=make_scorer(f1_score))

In [12]:
# Re-display the training matrix shape (rows, feature columns) before starting the search.
X_train.shape

(172081, 59)

In [52]:
%%time
# Search space: one value for `loss`, two for every other hyperparameter
# (1 * 2**8 = 256 candidate combinations).
# NOTE(review): "deviance" is the legacy name of this loss; newer scikit-learn
# releases call it "log_loss" — confirm against the installed version before re-running.
parameters = dict(
    loss=["deviance"],
    learning_rate=[0.01, 0.1],
    min_samples_split=[200, 500],
    min_samples_leaf=[20, 50],
    max_depth=[3, 8],
    max_features=["log2", "sqrt"],
    n_iter_no_change=[None, 10],
    subsample=[0.5, 1.0],
    n_estimators=[100, 200],
)

# Exhaustive grid search scored with the built-in "f1" scorer (string alias),
# using 3-fold cross-validation on all available cores. refit=True retrains the
# best candidate on the whole training set so `clf` can predict directly.
clf = GridSearchCV(
    estimator=GradientBoostingClassifier(random_state=12345),
    param_grid=parameters,
    scoring="f1",
    refit=True,
    cv=3,
    n_jobs=-1,
    verbose=10,
)

# The class-balancing weights are forwarded to the estimator's fit for every candidate.
clf.fit(X_train, y_train, sample_weight=sample_weights)

Fitting 3 folds for each of 256 candidates, totalling 768 fits
CPU times: user 24.2 s, sys: 1.73 s, total: 25.9 s
Wall time: 50min 5s


GridSearchCV(cv=3, estimator=GradientBoostingClassifier(random_state=12345),
             n_jobs=-1,
             param_grid={'learning_rate': [0.01, 0.1], 'loss': ['deviance'],
                         'max_depth': [3, 8], 'max_features': ['log2', 'sqrt'],
                         'min_samples_leaf': [20, 50],
                         'min_samples_split': [200, 500],
                         'n_estimators': [100, 200],
                         'n_iter_no_change': [None, 10],
                         'subsample': [0.5, 1.0]},
             scoring='f1', verbose=10)

In [53]:
# Tabulate the per-candidate cross-validation results (one row per parameter combination).
cv_res = pd.DataFrame(clf.cv_results_)

In [54]:
# Fifty best candidates, highest mean cross-validated score first.
cv_res.sort_values(by="mean_test_score", ascending=False).iloc[:50]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_loss,param_max_depth,param_max_features,param_min_samples_leaf,param_min_samples_split,param_n_estimators,param_n_iter_no_change,param_subsample,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
241,40.07677,1.303598,0.603188,0.070026,0.1,deviance,8,sqrt,50,200,100,,1.0,"{'learning_rate': 0.1, 'loss': 'deviance', 'ma...",0.581028,0.580682,0.58822,0.58331,0.003475,1
232,24.630041,0.806103,0.590251,0.026212,0.1,deviance,8,sqrt,20,500,100,,0.5,"{'learning_rate': 0.1, 'loss': 'deviance', 'ma...",0.58099,0.580247,0.588564,0.583267,0.003758,2
239,36.631706,2.480807,0.539152,0.027577,0.1,deviance,8,sqrt,20,500,200,10.0,1.0,"{'learning_rate': 0.1, 'loss': 'deviance', 'ma...",0.579818,0.580183,0.589472,0.583157,0.004467,3
235,36.691146,2.613661,0.499621,0.034921,0.1,deviance,8,sqrt,20,500,100,10.0,1.0,"{'learning_rate': 0.1, 'loss': 'deviance', 'ma...",0.579818,0.580183,0.589472,0.583157,0.004467,3
240,25.097409,0.655284,0.621689,0.083683,0.1,deviance,8,sqrt,50,200,100,,0.5,"{'learning_rate': 0.1, 'loss': 'deviance', 'ma...",0.581635,0.578453,0.58888,0.582989,0.004363,5
245,78.193875,2.798028,1.055936,0.039467,0.1,deviance,8,sqrt,50,200,200,,1.0,"{'learning_rate': 0.1, 'loss': 'deviance', 'ma...",0.581233,0.579606,0.588112,0.582984,0.003687,6
208,20.223421,0.309719,0.600116,0.03382,0.1,deviance,8,log2,50,200,100,,0.5,"{'learning_rate': 0.1, 'loss': 'deviance', 'ma...",0.581174,0.579159,0.588465,0.582933,0.003997,7
255,26.945092,1.569878,0.280071,0.037537,0.1,deviance,8,sqrt,50,500,200,10.0,1.0,"{'learning_rate': 0.1, 'loss': 'deviance', 'ma...",0.581084,0.579068,0.58855,0.582901,0.004079,8
251,35.030608,1.769471,0.522795,0.067405,0.1,deviance,8,sqrt,50,500,100,10.0,1.0,"{'learning_rate': 0.1, 'loss': 'deviance', 'ma...",0.581084,0.579068,0.58855,0.582901,0.004079,8
221,63.816994,0.891411,1.108312,0.054456,0.1,deviance,8,log2,50,500,200,,1.0,"{'learning_rate': 0.1, 'loss': 'deviance', 'ma...",0.581184,0.579255,0.588255,0.582898,0.003869,10


In [60]:
import json

# Select the best candidate by index *label*: `.loc` is label-based, so it must be
# fed `idxmax()` (a label). `argmax()` returns a position and only happens to match
# while `cv_res` keeps its default 0..n-1 index.
best_row = cv_res.loc[cv_res["mean_test_score"].idxmax()]
best_params = best_row["params"]
best_mean_test_score = best_row["mean_test_score"]
print(best_mean_test_score)
# Pretty-print the winning hyperparameters (None is rendered as JSON null).
print(json.dumps(best_params, indent=2))

0.5833096963962175
{
  "learning_rate": 0.1,
  "loss": "deviance",
  "max_depth": 8,
  "max_features": "sqrt",
  "min_samples_leaf": 50,
  "min_samples_split": 200,
  "n_estimators": 100,
  "n_iter_no_change": null,
  "subsample": 1.0
}


In [59]:
# F1 on the validation split, predicted by the refitted best candidate inside `clf`.
y_pred = clf.predict(X_val)
f1_score(y_true=y_val, y_pred=y_pred)

0.5871144084342054

In [61]:
# Persist the fitted GridSearchCV object (search results plus the refitted model) to disk.
from joblib import dump

dump(value=clf, filename='fitted_clf.joblib')

['fitted_clf.joblib']