In [58]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.model_selection import train_test_split
import imblearn
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report, roc_auc_score, confusion_matrix

In [35]:
df = pd.read_csv("superstore_data.csv")

In [62]:
df

Unnamed: 0,Id,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,...,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,Response,Complain,Age,Education_combined,Marital_combined
0,1826,1970,Graduation,Divorced,84835.0,0,0,6/16/2014,0,189,...,1,4,4,6,1,1,0,53,Low,Divorced
1,1,1961,Graduation,Single,57091.0,0,0,6/15/2014,0,464,...,1,7,3,7,5,1,0,62,Low,Single
2,10476,1958,Graduation,Married,67267.0,0,1,5/13/2014,0,134,...,1,3,2,5,2,0,0,65,Low,Married
3,1386,1967,Graduation,Together,32474.0,1,1,11/5/2014,0,10,...,1,1,0,2,7,0,0,56,Low,Together
4,5371,1989,Graduation,Single,21474.0,1,0,8/4/2014,0,6,...,2,3,1,2,7,1,0,34,Low,Single
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2235,10142,1976,PhD,Divorced,66476.0,0,1,7/3/2013,99,372,...,2,5,2,11,4,0,0,47,High,Divorced
2236,5263,1977,2n Cycle,Married,31056.0,1,0,1/22/2013,99,5,...,1,1,0,3,8,0,0,46,Low,Married
2237,22,1976,Graduation,Divorced,46310.0,1,0,3/12/2012,99,185,...,2,6,1,5,8,0,0,47,Low,Divorced
2238,528,1978,Graduation,Married,65819.0,0,0,11/29/2012,99,267,...,1,5,4,10,3,0,0,45,Low,Married


# Data Cleaning

1. Adjust existing features and add new ones
2. Handle missing values in Income field
3. Apply one-hot encoding to categorical variables
4. Split data into training and testing sets
5. Standardize data
6. Resolve class imbalance for Response target variable 

In [36]:
#Convert birth year to customer age
df['Age'] = datetime.date.today().year - df['Year_Birth']

In [46]:
#Combine education levels
#Masters and PhD are High level, rest are labeled as Low
def education_transform(level):
    if level in ['PhD', 'Master']:
        return 'High'
    return 'Low'
df['Education_combined'] = df['Education'].map(education_transform)

In [55]:
#Combine marital status levels
#Adding Alone, YOLO, and Absurd to Single
#Combining Divorced and Widow
def marital_adjust(s):
    if s in ['Alone', 'YOLO', 'Absurd']:
        return 'Single'
    elif s in ['Widow']:
        return 'Divorced'
    else:
        return s
df['Marital_combined'] = df['Marital_Status'].map(marital_adjust)

In [63]:
#Adding year joined field based on Dt_Customer
#Adding number of years individual has been a customer
df['Dt_Customer'] = pd.to_datetime(df['Dt_Customer'])
df['Year_Joined'] = pd.DatetimeIndex(df["Dt_Customer"]).year
df['Years_Customer'] = datetime.date.today().year - df['Year_Joined']

In [37]:
#Replacing missing Income values with Iterative Imputer
imputer = IterativeImputer(random_state = 10)
df['Income'] = imputer.fit_transform(df[['Income']])
df.isna().sum()

Id                     0
Year_Birth             0
Education              0
Marital_Status         0
Income                 0
Kidhome                0
Teenhome               0
Dt_Customer            0
Recency                0
MntWines               0
MntFruits              0
MntMeatProducts        0
MntFishProducts        0
MntSweetProducts       0
MntGoldProds           0
NumDealsPurchases      0
NumWebPurchases        0
NumCatalogPurchases    0
NumStorePurchases      0
NumWebVisitsMonth      0
Response               0
Complain               0
Age                    0
dtype: int64

In [7]:
#One-hot encoding for categorical variables
df = pd.get_dummies(df, columns = ['Education', 'Marital_Status'], drop_first = True)


In [8]:
#Split data using train/test split
X = df.drop(columns = ['Response', 'Id', 'Dt_Customer'], axis = 1)
y = df['Response']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 10, stratify = y)

In [9]:
#Standardize Data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [10]:
#Handle class imbalance using SMOTE
y_train.value_counts()
oversample = SMOTE()
X_train_scaled, y_train = oversample.fit_resample(X_train_scaled, y_train)

In [11]:
df['Response'].value_counts()

0    1906
1     334
Name: Response, dtype: int64

# Model Development 

In [12]:
#List containing each model
model_list = []

In [13]:
#Logistic Regression Model
lr_model = LogisticRegression()
lr_model.fit(X_train_scaled, y_train)

In [14]:
#Decision Tree Classifier
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train_scaled, y_train)

In [15]:
#Random Forest Classifier
rf_model = RandomForestClassifier()
rf_model.fit(X_train_scaled, y_train)

In [16]:
#KNeighbors Classifier
knn_model = KNeighborsClassifier(n_neighbors = 20)
knn_model.fit(X_train_scaled, y_train)

In [23]:
#Gradient Boost Classifier
import xgboost as xgb
xgb_model = xgb.XGBClassifier(objective = "binary:logistic", learning_rate = 0.05, n_estimators = 1000,
                              max_depth = 6, random_state = 42)
xgb_model.fit(X_train_scaled, y_train)

In [24]:
model_list = [lr_model, dt_model, rf_model, knn_model, xgb_model]

# GridSearch to fine-tune parameters

In [19]:
from sklearn.model_selection import GridSearchCV
search_param = {'max_depth': [3, 6, 10],
                'learning_rate': [.01, .05, .1, .3],
                'n_estimators': [100, 500, 1000]}
GS = GridSearchCV(estimator = xgb_model,
                 param_grid = search_param,
                 scoring = ['roc_auc', 'accuracy'],
                 refit = 'roc_auc', 
                 cv = 5,
                 verbose = 4)
GS.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV 1/5] END learning_rate=0.01, max_depth=3, n_estimators=100; accuracy: (test=0.770) roc_auc: (test=0.859) total time=   0.3s
[CV 2/5] END learning_rate=0.01, max_depth=3, n_estimators=100; accuracy: (test=0.848) roc_auc: (test=0.933) total time=   0.3s
[CV 3/5] END learning_rate=0.01, max_depth=3, n_estimators=100; accuracy: (test=0.821) roc_auc: (test=0.913) total time=   0.4s
[CV 4/5] END learning_rate=0.01, max_depth=3, n_estimators=100; accuracy: (test=0.869) roc_auc: (test=0.936) total time=   0.3s
[CV 5/5] END learning_rate=0.01, max_depth=3, n_estimators=100; accuracy: (test=0.856) roc_auc: (test=0.945) total time=   0.3s
[CV 1/5] END learning_rate=0.01, max_depth=3, n_estimators=500; accuracy: (test=0.793) roc_auc: (test=0.908) total time=   1.5s
[CV 2/5] END learning_rate=0.01, max_depth=3, n_estimators=500; accuracy: (test=0.911) roc_auc: (test=0.979) total time=   1.4s
[CV 3/5] END learning_rate=0.01, max_depth

[CV 5/5] END learning_rate=0.05, max_depth=6, n_estimators=100; accuracy: (test=0.934) roc_auc: (test=0.991) total time=   0.6s
[CV 1/5] END learning_rate=0.05, max_depth=6, n_estimators=500; accuracy: (test=0.716) roc_auc: (test=0.956) total time=   2.7s
[CV 2/5] END learning_rate=0.05, max_depth=6, n_estimators=500; accuracy: (test=0.962) roc_auc: (test=0.995) total time=   2.9s
[CV 3/5] END learning_rate=0.05, max_depth=6, n_estimators=500; accuracy: (test=0.952) roc_auc: (test=0.996) total time=   2.9s
[CV 4/5] END learning_rate=0.05, max_depth=6, n_estimators=500; accuracy: (test=0.967) roc_auc: (test=0.996) total time=   2.9s
[CV 5/5] END learning_rate=0.05, max_depth=6, n_estimators=500; accuracy: (test=0.944) roc_auc: (test=0.995) total time=   2.9s
[CV 1/5] END learning_rate=0.05, max_depth=6, n_estimators=1000; accuracy: (test=0.726) roc_auc: (test=0.959) total time=   4.6s
[CV 2/5] END learning_rate=0.05, max_depth=6, n_estimators=1000; accuracy: (test=0.964) roc_auc: (test=

[CV 4/5] END learning_rate=0.1, max_depth=10, n_estimators=500; accuracy: (test=0.962) roc_auc: (test=0.995) total time=   3.5s
[CV 5/5] END learning_rate=0.1, max_depth=10, n_estimators=500; accuracy: (test=0.941) roc_auc: (test=0.992) total time=   3.4s
[CV 1/5] END learning_rate=0.1, max_depth=10, n_estimators=1000; accuracy: (test=0.754) roc_auc: (test=0.958) total time=   4.2s
[CV 2/5] END learning_rate=0.1, max_depth=10, n_estimators=1000; accuracy: (test=0.966) roc_auc: (test=0.995) total time=   5.7s
[CV 3/5] END learning_rate=0.1, max_depth=10, n_estimators=1000; accuracy: (test=0.956) roc_auc: (test=0.994) total time=   5.7s
[CV 4/5] END learning_rate=0.1, max_depth=10, n_estimators=1000; accuracy: (test=0.961) roc_auc: (test=0.996) total time=   5.9s
[CV 5/5] END learning_rate=0.1, max_depth=10, n_estimators=1000; accuracy: (test=0.943) roc_auc: (test=0.992) total time=   5.6s
[CV 1/5] END learning_rate=0.3, max_depth=3, n_estimators=100; accuracy: (test=0.695) roc_auc: (tes

In [20]:
print(GS.best_estimator_, '\n', GS.best_score_)

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, feature_types=None, gamma=0, gpu_id=-1,
              grow_policy='depthwise', importance_type=None,
              interaction_constraints='', learning_rate=0.05, max_bin=256,
              max_cat_threshold=64, max_cat_to_onehot=4, max_delta_step=0,
              max_depth=6, max_leaves=0, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=1000, n_jobs=0,
              num_parallel_tree=1, predictor='auto', random_state=42, ...) 
 0.9879957000806237


In [25]:
def metric_calc(model_list, X_test_scaled, y_test):
    for elem in model_list:
        print(str(elem))
        model_predictions = elem.predict(X_test_scaled)
        print("Training Accuracy: " + str(elem.score(X_train_scaled, y_train)))
        print("Test Accuracy: " + str(accuracy_score(y_test, model_predictions)))
        print("AUC Score: " + str(roc_auc_score(y_test, model_predictions)))
        print(classification_report(y_test, model_predictions))

In [26]:
metric_calc(model_list, X_test_scaled, y_test)

LogisticRegression()
Training Accuracy: 0.8095081967213115
Test Accuracy: 0.7767857142857143
AUC Score: 0.7457593920162965
              precision    recall  f1-score   support

           0       0.94      0.79      0.86       381
           1       0.37      0.70      0.48        67

    accuracy                           0.78       448
   macro avg       0.65      0.75      0.67       448
weighted avg       0.85      0.78      0.80       448

DecisionTreeClassifier()
Training Accuracy: 0.9957377049180328
Test Accuracy: 0.7946428571428571
AUC Score: 0.6640028205429546
              precision    recall  f1-score   support

           0       0.90      0.85      0.88       381
           1       0.36      0.48      0.41        67

    accuracy                           0.79       448
   macro avg       0.63      0.66      0.64       448
weighted avg       0.82      0.79      0.81       448

RandomForestClassifier()
Training Accuracy: 0.9957377049180328
Test Accuracy: 0.8794642857142857