In [1]:
import numpy as np 
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format='retina'

import xgboost as xgb
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
from sklearn.tree import export_text
from sklearn.ensemble import RandomForestClassifier

import data_prep as dp

In [3]:
# Pull the pre-split feature matrices and targets from the data_prep module.
X_train, X_val, X_test = dp.X_train, dp.X_val, dp.X_test
y_train, y_val, y_test = dp.y_train, dp.y_val, dp.y_test

# DictVectorizer exposed by data_prep
# (presumably the one fitted to build the matrices above -- confirm in data_prep)
dv = dp.dv

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer `get_feature_names_out()` and fall back to the old method on older
# releases; `.tolist()` keeps `features` a plain list, as it was before.
if hasattr(dv, 'get_feature_names_out'):
    features = dv.get_feature_names_out().tolist()
else:
    features = dv.get_feature_names()

# matrices for xgboost (train / validation), labelled with the feature names
dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=features)
dval = xgb.DMatrix(X_val, label=y_val, feature_names=features)

Decision Tree final model

In [4]:
# Final decision tree.
# random_state is fixed so that refitting gives the same tree: without it,
# ties between equally good splits may be broken differently from run to run.
# The random forest and xgboost cells already pin their seeds.
dt = DecisionTreeClassifier(max_depth=6, min_samples_leaf=15, random_state=42)
dt.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=6, min_samples_leaf=15)

Random Forest final model

In [5]:
# Final random forest: hyper-parameters gathered in one mapping, seed pinned.
rf_params = {
    'n_estimators': 90,
    'max_depth': 10,
    'min_samples_leaf': 3,
    'random_state': 42,
}
rf = RandomForestClassifier(**rf_params)
rf.fit(X_train, y_train)

RandomForestClassifier(max_depth=10, min_samples_leaf=3, n_estimators=90,
                       random_state=42)

XGBoost final model

In [6]:
# Hyper-parameters of the final gradient-boosted model.
xgb_params = {
    # --- tree / booster settings ---
    'eta': 0.1,              # learning rate
    'max_depth': 3,
    'min_child_weight': 30,  # xgboost's counterpart of min_samples_leaf

    # --- learning task ---
    'objective': 'binary:logistic',  # binary classification
    'eval_metric': 'auc',

    # --- runtime ---
    'nthread': 8,
    'seed': 1,
    'verbosity': 1,  # logging level (higher = more output)
}

# Number of boosting rounds used for the final model.
n_boost_rounds = 155
model = xgb.train(xgb_params, dtrain, num_boost_round=n_boost_rounds)

Check model performance on the validation set

In [8]:
# Decision tree: validation ROC AUC, scored on the second predict_proba column
dt_val_proba = dt.predict_proba(X_val)
y_pred = dt_val_proba[:, 1]
roc_auc_score(y_true=y_val, y_score=y_pred)

0.7854738320699426

In [9]:
# Random forest: validation ROC AUC, scored on the second predict_proba column
rf_val_proba = rf.predict_proba(X_val)
y_pred = rf_val_proba[:, 1]
roc_auc_score(y_true=y_val, y_score=y_pred)

0.8286945096994501

In [11]:
# XGBoost: validation ROC AUC, scored on the booster's predictions for dval
y_pred = model.predict(data=dval)
roc_auc_score(y_true=y_val, y_score=y_pred)

0.8372435881911312

Train the best model on the full train data (train + validation)

In [16]:
# (rows, columns) of the training matrix
np.shape(X_train)

(2672, 29)

In [19]:
# Number of rows expected once train and validation are stacked
sum(part.shape[0] for part in (X_train, X_val))

3563

In [18]:
# Shape of train stacked on top of validation (row-wise)
np.concatenate((X_train, X_val), axis=0).shape

(3563, 29)

In [20]:
# Full training matrix: training rows first, validation rows after them
X_full = np.concatenate((X_train, X_val), axis=0)

In [22]:
# Sanity check: the first row of X_full must be the first training row
np.equal(X_full[0], X_train[0])

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True])

In [23]:
# Sanity check: the last training row must sit right before the validation rows.
# The index is derived from the training size rather than hard-coded (was 2671),
# so the check stays valid if the split sizes change.
X_full[X_train.shape[0] - 1] == X_train[-1]

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True])

In [24]:
# Sanity check: the first validation row must come right after the training rows.
# The index is derived from the training size rather than hard-coded (was 2672),
# so the check stays valid if the split sizes change.
X_full[X_train.shape[0]] == X_val[0]

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True])

In [29]:
# Shape of the flattened train targets followed by the validation targets
np.concatenate((np.ravel(y_train), np.ravel(y_val))).shape

(3563,)

In [30]:
# Full target vector: training targets first, validation targets after them
y_full = np.concatenate((np.ravel(y_train), np.ravel(y_val)))

In [32]:
# Sanity check on the stacked targets: first element, last training element and
# first validation element must all line up with their sources.
# Boundary positions come from len(y_train) rather than the hard-coded
# 2671 / 2672, so the check stays valid if the split sizes change.
(y_full[0] == y_train[0],
 y_full[len(y_train) - 1] == y_train[-1],
 y_full[len(y_train)] == y_val[0])

(True, True, True)