## XGBoost Model

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline

In [2]:
df = pd.read_csv('instacart_pca.csv')
df.head()

Unnamed: 0,order_id,product_id,aisle_id,orders,order_number,days_reorder_ratio,days_since_prior_order,n_orders,order_hour_of_day,reorder_interval,total_items,reordered
0,1,0.0,0.0,0.0,4,9.0,0.0,0.0,0.0,1.0,0.0,1.0
1,1,1.0,1.0,76.0,4,0.473684,0.0,10.0,0.0,19.0,11.0,1.0
2,1,2.0,2.0,4.0,4,0.552632,0.0,14.0,0.0,16.285714,31.0,0.0
3,1,3.0,2.0,6.0,4,0.75,0.0,12.0,0.0,12.0,0.0,0.0
4,1,4.0,3.0,22.0,4,0.529412,0.0,5.0,0.0,17.0,0.0,1.0


In [3]:
df.shape

(1384617, 12)

In [4]:
X = df.drop(['reordered'], axis=1)
y = df['reordered']
(X.shape, y.shape)

((1384617, 11), (1384617,))

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y.values, test_size=0.3, random_state=42)
(X_train.shape, X_test.shape)

((969231, 11), (415386, 11))

In [7]:
from xgboost import XGBClassifier
from sklearn.metrics import log_loss, accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV

In [8]:
import time
RandomState = 101

In [None]:
# XGBoost Classifier model
param_grid_xgb = {
    'n_estimators': [100, 200, 400],
    'max_depth': [4, 10],
    'gamma': [0, 0.5, 1, 2],
    'learning_rate': [0.001, 0.01, 0.1]
}

xgb = XGBClassifier(objective='binary:logistic', subsample=.8, colsample_bytree=.8, nthreads=-1, seed=RandomState)
grid_xgb = GridSearchCV(estimator=xgb, param_grid=param_grid_xgb, scoring='roc_auc', cv=3, n_jobs=-1, verbose=1)

start_time = time.time()
grid_search_xgb = grid_xgb.fit(X_train, y_train)

print("Best: %f using %s" % (grid_search_xgb.best_score_, grid_search_xgb.best_params_))
print("Execution time: " + str((time.time() - start_time)) + ' s')

Fitting 3 folds for each of 72 candidates, totalling 216 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 40 concurrent workers.
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed: 80.1min
[Parallel(n_jobs=-1)]: Done 216 out of 216 | elapsed: 131.3min finished


Parameters: { nthreads } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Best: 0.733067 using {'gamma': 0.5, 'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 400}
Execution time: 8006.329144477844 s


In [None]:
xgbm = grid_search_xgb.best_estimator_
prediction_xgb = xgbm.predict(X_test)
print("Confusion Matrix:\n", confusion_matrix(y_test,prediction_xgb))
print("\nClassification Report:\n", classification_report(y_test,prediction_xgb))
print("Accuracy Score: ", accuracy_score(y_test,prediction_xgb))

Confusion Matrix:
 [[ 84097  82728]
 [ 47559 201002]]

Classification Report:
               precision    recall  f1-score   support

         0.0       0.64      0.50      0.56    166825
         1.0       0.71      0.81      0.76    248561

    accuracy                           0.69    415386
   macro avg       0.67      0.66      0.66    415386
weighted avg       0.68      0.69      0.68    415386

Accuracy Score:  0.6863471566205891


In [None]:
predproba_xgb = xgbm.predict_proba(X_test)
log_loss( y_test, predproba_xgb)

0.5832221117533418

In [None]:
import pickle
pickle.dump(xgbm, open('xgbm_pca.pickle', 'wb'))