## LightGBM Model

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline

In [2]:
# Read the modelling table from disk and preview its first five rows.
# NOTE(review): the preview shows an "Unnamed: 0" column, which looks like a
# row index written out by an earlier to_csv call; it is not dropped later, so
# it ends up among the model features. Confirm whether that is intended.
df = pd.read_csv(filepath_or_buffer='instacart_pca.csv')
df.head(n=5)

Unnamed: 0,order_id,product_id,aisle_id,orders,order_number,days_reorder_ratio,days_since_prior_order,n_orders,order_hour_of_day,reorder_interval,total_items,reordered
0,1,0.0,0.0,0.0,4,9.0,0.0,0.0,0.0,1.0,0.0,1.0
1,1,1.0,1.0,76.0,4,0.473684,0.0,10.0,0.0,19.0,11.0,1.0
2,1,2.0,2.0,4.0,4,0.552632,0.0,14.0,0.0,16.285714,31.0,0.0
3,1,3.0,2.0,6.0,4,0.75,0.0,12.0,0.0,12.0,0.0,0.0
4,1,4.0,3.0,22.0,4,0.529412,0.0,5.0,0.0,17.0,0.0,1.0


In [3]:
# Display the loaded frame's dimensions as (rows, columns).
df.shape

(1384617, 12)

In [4]:
# Separate the target from the predictors: `reordered` is the label and every
# remaining column of `df` is kept as a feature.
y = df['reordered']
X = df.drop(columns=['reordered'])
X.shape, y.shape

((1384617, 11), (1384617,))

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
# Hold out 30% of the rows for testing. The labels are passed as a plain
# numpy array (y.values), so y_train / y_test are arrays rather than Series.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y.values,
    test_size=0.3,
    random_state=42,
)
X_train.shape, X_test.shape

((969231, 11), (415386, 11))

In [7]:
import lightgbm as lgb
from sklearn.metrics import log_loss, accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV

In [8]:
import time
# Seed passed as `random_state` to the LightGBM classifier below.
# (The train/test split uses its own literal seed, 42.)
RandomState = 101

In [9]:
# Native LightGBM datasets for the train and test splits.
# Labels belong to the same Dataset as their feature matrix and are attached
# through `label=`; a Dataset built from the label vector alone carries no
# features and cannot be used for training or validation, so the separate
# d_train_y / d_test_y objects are no longer created.
# `reference=` ties the test Dataset to the training Dataset, as LightGBM
# expects for validation data.
# NOTE(review): no later visible cell uses these objects (the grid search is
# fitted on X_train / y_train directly) -- confirm whether they are still needed.
d_train_X = lgb.Dataset(X_train, label=y_train)
d_test_X = lgb.Dataset(X_test, label=y_test, reference=d_train_X)

In [10]:
# LightGBM model
# Hyperparameters explored by the grid search (2 x 2 x 2 = 8 combinations).
param_grid_lgb = dict(
    boosting_type=['gbdt', 'dart'],
    learning_rate=[0.1, 0.5],
    n_estimators=[50, 100],
)

# Settings held fixed for every candidate in the search.
fixed_params_lgb = dict(
    objective='binary',
    max_depth=8,
    feature_fraction=0.5,
    bagging_fraction=0.8,
    bagging_freq=15,
    num_leaves=30,
    reg_alpha=1.2,
    reg_lambda=1.4,
    max_bin=350,
    random_state=RandomState,
)
model_lgb = lgb.LGBMClassifier(**fixed_params_lgb)

# 3-fold cross-validated search scored by ROC AUC, using all available cores.
grid_lgb = GridSearchCV(
    estimator=model_lgb,
    param_grid=param_grid_lgb,
    cv=3,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=1,
)

# Time only the fit itself; the value returned by fit() is kept as
# grid_search_lgb and read by the evaluation cell.
start_time = time.time()
grid_search_lgb = grid_lgb.fit(X_train, y_train)

print("Best: %f using %s" % (grid_search_lgb.best_score_, grid_search_lgb.best_params_))
elapsed_seconds = time.time() - start_time
print("Execution time: " + str(elapsed_seconds) + ' s')

Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 40 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  24 | elapsed:  1.3min remaining:   15.3s
[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  1.3min finished


Best: 0.720721 using {'boosting_type': 'dart', 'learning_rate': 0.5, 'n_estimators': 100}
Execution time: 81.36806988716125 s


In [11]:
# Score the best estimator from the grid search on the held-out test split.
lgbm = grid_search_lgb.best_estimator_
prediction_lgb = lgbm.predict(X_test)

# Compute each metric once, then report them together.
cm_lgb = confusion_matrix(y_test, prediction_lgb)
report_lgb = classification_report(y_test, prediction_lgb)
accuracy_lgb = accuracy_score(y_test, prediction_lgb)

print("Confusion Matrix:\n", cm_lgb)
print("\nClassification Report:\n", report_lgb)
print("Accuracy Score: ", accuracy_lgb)

Confusion Matrix:
 [[ 79725  87100]
 [ 48014 200547]]

Classification Report:
               precision    recall  f1-score   support

         0.0       0.62      0.48      0.54    166825
         1.0       0.70      0.81      0.75    248561

    accuracy                           0.67    415386
   macro avg       0.66      0.64      0.64    415386
weighted avg       0.67      0.67      0.67    415386

Accuracy Score:  0.6747266398000895


In [12]:
# Predicted class probabilities for the test split, summarised as log loss
# (the cell's displayed value).
predproba_lgb = lgbm.predict_proba(X_test)
log_loss(y_true=y_test, y_pred=predproba_lgb)

0.5953601013885027

In [13]:
import pickle

# Persist the tuned classifier to disk. The `with` block closes the file
# handle (flushing the pickle bytes) even if serialization raises, instead of
# leaving an anonymous open() handle for the garbage collector to clean up.
with open('lgbm_pca.pickle', 'wb') as model_file:
    pickle.dump(lgbm, model_file)