In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report,balanced_accuracy_score,confusion_matrix
from sklearn.utils import resample

In [2]:
product_to_predict='oil'
df=pd.read_csv('../raw data/common purchases.csv',index_col='Member_number')

y=df[product_to_predict]
df.drop(columns=[product_to_predict],inplace=True)
X=df

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=.2)

We have an unbalanced dataset.  Here we join the training data, upsample the minority case, and then un-join.
This will prevent the algorithms from fixating on one case during training.

In [3]:
train_df=X_train.join(y_train,on='Member_number',how='outer')
minority_df=train_df[train_df[product_to_predict]==1]
majority_df=train_df[train_df[product_to_predict]==0]
minority_df=resample(minority_df,replace=True,n_samples=len(majority_df))
upsampled_df=pd.concat([majority_df,minority_df])
y_train=upsampled_df[product_to_predict]
X_train=upsampled_df.drop(columns=[product_to_predict])

X_train=X_train.reset_index(drop=True).to_numpy()
X_test=X_test.reset_index(drop=True).to_numpy()
y_train=y_train.reset_index(drop=True).to_numpy()
y_test=y_test.reset_index(drop=True).to_numpy()

In [4]:
rf_params={'criterion':['gini','entropy'],
          'n_estimators':[10,100,1000]}
rf_model=GridSearchCV(estimator=RandomForestClassifier(class_weight='balanced'),
                      param_grid=rf_params,
                      scoring='balanced_accuracy', 
                      cv=5)
rf_model.fit(X_train,y_train)
rf_pred=rf_model.predict(X_test)
print('Best Parameters: ',rf_model.best_params_,'\n')
print(classification_report(y_test,rf_pred))
print(confusion_matrix(y_test,rf_pred))

Best Parameters:  {'criterion': 'entropy', 'n_estimators': 1000} 

              precision    recall  f1-score   support

         0.0       0.95      0.99      0.97       743
         1.0       0.00      0.00      0.00        37

    accuracy                           0.95       780
   macro avg       0.48      0.50      0.49       780
weighted avg       0.91      0.95      0.93       780

[[738   5]
 [ 37   0]]



Random Forest fixated on only one class and seems to be worthless.


In [5]:
knn_params={'n_neighbors':range(1,10),
           'weights':['uniform', 'distance'],
           'p':[1,2]}
knn_model=GridSearchCV(estimator=KNeighborsClassifier(),
                      param_grid=knn_params,
                      scoring='balanced_accuracy', 
                      cv=5)
knn_model.fit(X_train,y_train)
knn_pred=knn_model.predict(X_test)
print('Best Parameters: ',knn_model.best_params_,'\n')
print(classification_report(y_test,knn_pred))
print(confusion_matrix(y_test,knn_pred))

Best Parameters:  {'n_neighbors': 2, 'p': 1, 'weights': 'uniform'} 

              precision    recall  f1-score   support

         0.0       0.95      0.99      0.97       743
         1.0       0.00      0.00      0.00        37

    accuracy                           0.94       780
   macro avg       0.48      0.50      0.49       780
weighted avg       0.91      0.94      0.92       780

[[736   7]
 [ 37   0]]



K-Nearest Neighbors also fixated on one class, only doing narrowly better at recall on the product.


In [6]:
logres_params={'penalty':['l1', 'l2', 'elasticnet'],
              'solver':['saga'],
              'l1_ratio':[1]}
logres_model=GridSearchCV(estimator=LogisticRegression(class_weight='balanced'),
                        param_grid=logres_params,
                        scoring='roc_auc', 
                        cv=5)
logres_model.fit(X_train,y_train)
logres_pred=logres_model.predict(X_test)
print('Best Parameters: ',logres_model.best_params_,'\n')
print(classification_report(y_test,logres_pred))
print(confusion_matrix(y_test,logres_pred))



Best Parameters:  {'l1_ratio': 1, 'penalty': 'l1', 'solver': 'saga'} 

              precision    recall  f1-score   support

         0.0       0.95      0.65      0.77       743
         1.0       0.04      0.32      0.08        37

    accuracy                           0.64       780
   macro avg       0.50      0.49      0.43       780
weighted avg       0.91      0.64      0.74       780

[[485 258]
 [ 25  12]]





LogRes has something we can work with, but its recall is a bit low.  We don't care much about false positives in
this business decision, so let's drop the cut-off value a bit.


In [7]:
logres_probs=logres_model.predict_proba(X_test)
new_pred=[1.0 if prob>.40 else 0.0 for prob in logres_probs[:,1]]
print(classification_report(y_test,new_pred))
print(confusion_matrix(y_test,new_pred))

              precision    recall  f1-score   support

         0.0       0.95      0.48      0.64       743
         1.0       0.05      0.51      0.09        37

    accuracy                           0.48       780
   macro avg       0.50      0.50      0.36       780
weighted avg       0.91      0.48      0.61       780

[[356 387]
 [ 18  19]]


With the new cutt-off we can accurately flag 51% of would-be purchasers, while still correctly identifying
356 people who likely wouldn't have bought the product anyway.  This is actionable.

Final model: LogRes with 'l1_ratio': 1, 'penalty': 'l1', 'solver': 'saga' and cutt-off probability of 40%.