## Logistic Regression Model

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline

In [2]:
df = pd.read_csv('instacart_pca.csv')
df.head()

Unnamed: 0,order_id,product_id,aisle_id,orders,order_number,days_reorder_ratio,days_since_prior_order,n_orders,order_hour_of_day,reorder_interval,total_items,reordered
0,1,0.0,0.0,0.0,4,9.0,0.0,0.0,0.0,1.0,0.0,1.0
1,1,1.0,1.0,76.0,4,0.473684,0.0,10.0,0.0,19.0,11.0,1.0
2,1,2.0,2.0,4.0,4,0.552632,0.0,14.0,0.0,16.285714,31.0,0.0
3,1,3.0,2.0,6.0,4,0.75,0.0,12.0,0.0,12.0,0.0,0.0
4,1,4.0,3.0,22.0,4,0.529412,0.0,5.0,0.0,17.0,0.0,1.0


In [3]:
df.shape

(1384617, 12)

In [4]:
X = df.drop(['reordered'], axis=1)
y = df['reordered']
(X.shape, y.shape)

((1384617, 11), (1384617,))

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y.values, test_size=0.3, random_state=42)
(X_train.shape, X_test.shape)

((969231, 11), (415386, 11))

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss, accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV

In [8]:
import time
RandomState = 101

In [9]:
# Logistic Regression model
param_grid_lr = {'dual': [True, False],
                 'max_iter': [10, 50, 100],
                 'C': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1],
                 'penalty': ['l1', 'l2'],
                 'solver': ['liblinear', 'saga']}

lr = LogisticRegression(random_state=RandomState)
grid_lr = GridSearchCV(estimator=lr, param_grid=param_grid_lr, scoring='roc_auc', cv=3, n_jobs=-1, verbose=1)

start_time = time.time()
grid_search_lr = grid_lr.fit(X_train, y_train)

# Summarize results
print("Best: %f using %s" % (grid_search_lr.best_score_, grid_search_lr.best_params_))
print("Execution time: " + str((time.time() - start_time)) + ' s')

Fitting 3 folds for each of 144 candidates, totalling 432 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 40 concurrent workers.
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 432 out of 432 | elapsed: 53.7min finished


Best: 0.672130 using {'C': 0.001, 'dual': False, 'max_iter': 50, 'penalty': 'l1', 'solver': 'liblinear'}
Execution time: 3228.62921333313 s


In [10]:
lrm = grid_search_lr.best_estimator_
prediction_lr = lrm.predict(X_test)
print("Confusion Matrix:\n", confusion_matrix(y_test,prediction_lr))
print("\nClassification Report:\n", classification_report(y_test,prediction_lr))
print("Accuracy Score: ", accuracy_score(prediction_lr,y_test))

Confusion Matrix:
 [[ 52079 114746]
 [ 35795 212766]]

Classification Report:
               precision    recall  f1-score   support

         0.0       0.59      0.31      0.41    166825
         1.0       0.65      0.86      0.74    248561

    accuracy                           0.64    415386
   macro avg       0.62      0.58      0.57    415386
weighted avg       0.63      0.64      0.61    415386

Accuracy Score:  0.6375876895225164


In [11]:
predproba_lr = lrm.predict_proba(X_test)
log_loss(y_test,predproba_lr)

0.6313850112732169

In [12]:
import pickle

pickle.dump(lrm, open('lrm_pca.pickle', 'wb'))