In [1]:
%config Completer.use_jedi = False
%matplotlib inline
import pandas as pd
import numpy as np
pd.options.plotting.backend = "plotly"
pd.set_option('display.max_rows', 100)

#Load train/test datasets

In [2]:
import pandas as pd

In [3]:
ds = pd.read_csv("../../data/dataset.csv")

In [4]:
ds.head()

Unnamed: 0,customer_id,order__count,order_amount_paid_log10__mean,order_amount_paid_log10__min,order_amount_paid_log10__max,order_amount_paid_log10__median,voucher_amount__mean,voucher_amount__min,voucher_amount__max,voucher_amount__median,...,failed_order__max,failed_order__sum,failed_order__mean,delivery_fee_not_zero__max,delivery_fee_not_zero__sum,delivery_fee_not_zero__mean,days_since_last_order,days_since_first_order,is_returning_customer,split
0,000097eabfd9,1,1.095853,1.095853,1.095853,1.095853,0.0,0.0,0.0,0.0,...,0,0,0.0,0,0,0.0,619,619,0,train
1,0000e2c6d9be,1,1.023582,1.023582,1.023582,1.023582,0.0,0.0,0.0,0.0,...,0,0,0.0,0,0,0.0,396,396,0,test
2,000133bb597f,1,0.841145,0.841145,0.841145,0.841145,0.0,0.0,0.0,0.0,...,0,0,0.0,1,1,1.0,2,2,1,val
3,00018269939b,1,1.034368,1.034368,1.034368,1.034368,0.0,0.0,0.0,0.0,...,0,0,0.0,1,1,1.0,23,23,0,train
4,0001a00468a6,1,0.788925,0.788925,0.788925,0.788925,0.0,0.0,0.0,0.0,...,0,0,0.0,1,1,1.0,574,574,0,train


In [5]:
ds.dtypes

customer_id                          object
order__count                          int64
order_amount_paid_log10__mean       float64
order_amount_paid_log10__min        float64
order_amount_paid_log10__max        float64
order_amount_paid_log10__median     float64
voucher_amount__mean                float64
voucher_amount__min                 float64
voucher_amount__max                 float64
voucher_amount__median              float64
order_voucher_percentage__mean      float64
order_voucher_percentage__min       float64
order_voucher_percentage__max       float64
order_voucher_percentage__median    float64
delivery_fee__mean                  float64
delivery_fee__min                   float64
delivery_fee__max                   float64
delivery_fee__median                float64
n_order_date_dayofweek_0            float64
n_order_date_dayofweek_1            float64
n_order_date_dayofweek_2            float64
n_order_date_dayofweek_3            float64
n_order_date_dayofweek_4        

All features are already numeric, no need to transform them

Let's check if there are NaN values

In [6]:
ds.replace([np.inf, -np.inf], np.nan, inplace=True) # we treat infinite values as NaN
ds.isna().sum()

customer_id                         0
order__count                        0
order_amount_paid_log10__mean       0
order_amount_paid_log10__min        0
order_amount_paid_log10__max        0
order_amount_paid_log10__median     0
voucher_amount__mean                0
voucher_amount__min                 0
voucher_amount__max                 0
voucher_amount__median              0
order_voucher_percentage__mean      1
order_voucher_percentage__min       1
order_voucher_percentage__max       1
order_voucher_percentage__median    1
delivery_fee__mean                  0
delivery_fee__min                   0
delivery_fee__max                   0
delivery_fee__median                0
n_order_date_dayofweek_0            0
n_order_date_dayofweek_1            0
n_order_date_dayofweek_2            0
n_order_date_dayofweek_3            0
n_order_date_dayofweek_4            0
n_order_date_dayofweek_5            0
n_order_date_dayofweek_6            0
n_order_hour_0                      0
n_order_hour

There is one row with some NaN values in train:

In [7]:
ds.loc[ds.order_voucher_percentage__mean.isna(),:]

Unnamed: 0,customer_id,order__count,order_amount_paid_log10__mean,order_amount_paid_log10__min,order_amount_paid_log10__max,order_amount_paid_log10__median,voucher_amount__mean,voucher_amount__min,voucher_amount__max,voucher_amount__median,...,failed_order__max,failed_order__sum,failed_order__mean,delivery_fee_not_zero__max,delivery_fee_not_zero__sum,delivery_fee_not_zero__mean,days_since_last_order,days_since_first_order,is_returning_customer,split
173440,b467aab6bf25,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0.0,0,0,0.0,634,634,0,train


We remove it:

In [8]:
ds = ds.loc[~ds.order_voucher_percentage__mean.isna(),:]
ds.shape

(245437, 62)

In [9]:
non_features_cols = ["customer_id", "is_returning_customer", "split"]
feature_cols = [c for c in ds.columns if c not in non_features_cols]

In [10]:
def get_features_and_labels(ds):
    X = ds[feature_cols] # features
    y = ds.is_returning_customer # labels
    return X, y

In [11]:
X_train, y_train = get_features_and_labels(ds.query("split == 'train'"))
X_val, y_val = get_features_and_labels(ds.query("split == 'test'"))

In [12]:
(X_train.shape, y_train.shape)

((172081, 59), (172081,))

In [13]:
y_train.dtype

dtype('int64')

In [14]:
(X_val.shape, y_val.shape)

((48916, 59), (48916,))

# Comparison of classifiers
Let's compare the out-of-the-box performance of the some common classifier algorithms

In [15]:
from sklearn.metrics import f1_score

In [16]:
from sklearn.ensemble import GradientBoostingClassifier

In [17]:
%%time
clf = GradientBoostingClassifier(n_estimators=100,
                                 random_state=12345)\
            .fit(X_train, y_train)

y_pred = clf.predict(X_val)
f1_score(y_val, y_pred)

CPU times: user 47.4 s, sys: 17.8 ms, total: 47.4 s
Wall time: 47.4 s


0.5538688953323099

In [18]:
# Let's add sample weights
from sklearn.utils.class_weight import compute_sample_weight
sample_weights = compute_sample_weight(class_weight="balanced", y=y_train)
sample_weights

array([0.64615829, 0.64615829, 0.64615829, ..., 2.21047426, 0.64615829,
       0.64615829])

In [19]:
%%time
# Best
clf = GradientBoostingClassifier(n_estimators=100,
                                 random_state=12345)\
            .fit(X_train, y_train,
                sample_weight=sample_weights)
y_pred = clf.predict(X_val)
f1_score(y_val, y_pred)

CPU times: user 44.9 s, sys: 20 ms, total: 45 s
Wall time: 45 s


0.5838127818379723

In [20]:
from sklearn.ensemble import RandomForestClassifier

In [21]:
%%time
clf = RandomForestClassifier(n_estimators=100,
                            random_state=12345)\
            .fit(X_train, y_train,
                sample_weight=sample_weights)
y_pred = clf.predict(X_val)
f1_score(y_val, y_pred)

CPU times: user 36.8 s, sys: 172 ms, total: 36.9 s
Wall time: 36.9 s


0.5307831598008148

In [22]:
%%time
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(random_state=12345)\
            .fit(X_train, y_train,
                sample_weight=sample_weights)
y_pred = clf.predict(X_val)
f1_score(y_val, y_pred)

CPU times: user 2.5 s, sys: 28 ms, total: 2.52 s
Wall time: 2.53 s


0.5630740528128588

In [None]:
## These are extremely slow for this dataset

In [23]:
# from sklearn import svm

In [None]:
# %%time
# clf = svm.SVC(random_state=12345, kernel='linear', C=1.0)\
#             .fit(X_train, y_train,
#                 sample_weight=sample_weights)
# y_pred = clf.predict(X_val)
# f1_score(y_val, y_pred)

In [None]:
# %%time
# clf = svm.SVC(random_state=12345, kernel='rbf', C=1.0)\
#             .fit(X_train, y_train,
#                 sample_weight=sample_weights)
# y_pred = clf.predict(X_val)
# f1_score(y_val, y_pred)