# Comparing Classification Models

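We want to compare various classification models for the customer dataset. We'll look at logistic regression, LDA, QDA, and $k$-nearest neighbors, followed by decision trees and tree-based ensembles (bagging, random forests, extra trees, and several boosting methods).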

In [None]:
# import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# We'll look at logistic regression, LDA, QDA, and KNN
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier

# Need to import tree stuff
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz, export_text, plot_tree

# Ensemble methods for trees
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier, AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier, HistGradientBoostingClassifier

# We'll standardize and split data into training/testing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# import RFECV class
from sklearn.feature_selection import RFECV

# Need to measure "goodness"
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, recall_score, average_precision_score, precision_score
from sklearn.metrics import f1_score, fbeta_score
from sklearn.metrics import roc_auc_score, precision_recall_curve
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import ConfusionMatrixDisplay, PrecisionRecallDisplay, RocCurveDisplay

In [None]:
# Read in the data and print out its shape
cust = pd.read_csv('./data/customers_clean.csv')
print(cust.shape)

In [None]:
# Let's drop the following columns:
# cust_id, join_date, last_purchase_date
new_cust = cust.drop(columns=['cust_id','join_date','last_purchase_date'])
new_cust.info()

In [None]:
# Create dummies and save in new DataFrame
data = pd.get_dummies(new_cust, dtype=int, drop_first=True)
data.info()

## Create "big spender"

We want to convert the $y$ variable, `spend`, into a binary variable where 1 represents a big spender and 0 otherwise. We can set the cutoff anywhere we want. Looking at the summary statistics from above, let's use \$5,700 (what you told me to do) as the cutoff. We can use the function `pd.cut()`.

In [None]:
# Label spend <= 5700 as 0 and spend > 5700 as 1.
# The outer bin edges are open-ended so that every numeric spend value
# (including 0 or anything above 10000) lands in a bin; with finite outer
# edges such rows become NaN and the .astype(int) conversion fails.
data['big_spender'] = pd.cut(data.spend,
                             bins=[-np.inf, 5700, np.inf],
                             right=True,
                             labels=[0, 1]).astype(int)
data.info()

In [None]:
# See summary statistics
data.describe()

## Split Data into Training and Test Sets

Need to first define `X` and `y`. Then we can try train/test split.

In [None]:
# define the output variable, y
y = data.big_spender

# define the X
# Notice this time we are only dropping the original 'spend'
# and the 'big_spender' columns.
X = data.drop(columns=['spend', 'big_spender'])

In [None]:
# Time to split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                   test_size=0.2,
                                                   random_state=163)

In [None]:
# What is the percentage of big spenders in both training and test?
print(f'Training percentage of big spenders is {y_train.mean():.2%}')
print(f'Testing percentage of big spenders is  {y_test.mean():.2%}')

## Scale the Data

Fit only on the training set. Use that fit to transform both the training and test sets.

In [None]:
# Fit the scaler on just the training X variables
# Let's start with StandardScaler which will center
# each variable at 0 and give each a unit variance (=1)
s_scaler = StandardScaler().fit(X_train)
s_scaler

In [None]:
# Transform X_train and put in DataFrame.
# Reuse X_train's row index so the scaled frame stays aligned with
# y_train (the scaler returns a bare array, which would otherwise get a
# fresh 0..n-1 index that no longer matches the shuffled split).
X_train_ss = pd.DataFrame(s_scaler.transform(X_train),
                          columns=X_train.columns,
                          index=X_train.index)

# Take a look at the DataFrame
X_train_ss.describe()

In [None]:
# Transform X_test (using the scaler fit on the training set) and put in
# DataFrame. Reuse X_test's row index so the scaled frame stays aligned
# with y_test.
X_test_ss = pd.DataFrame(s_scaler.transform(X_test),
                         columns=X_test.columns,
                         index=X_test.index)
X_test_ss.describe()

# Function to Look at Different Models

I have written a custom function that takes in different models (classifiers) and computes a common set of metrics, so we can compare the different classifiers easily.

In [None]:
# define our function
def modelMetrics(classifier, name, X_test, y_test):
    """
    Compute a common set of test-set metrics for a fitted binary classifier.

    We want to see how the different models react to the same
    dataset, so we capture multiple metrics for each model and return
    them as a one-column DataFrame that can be concatenated with the
    results from other models.

    classifier: a fitted classifier providing predict() and
        predict_proba()

    name: give it a descriptive name; used as the column label

    X_test: the X array for the test set

    y_test: the output variable (actual) for test set; assumed to be
        coded 0 (negative) / 1 (positive)

    Returns: DataFrame with a single column `name` and one row per metric.
    """
    metrics = {}
    predictions = classifier.predict(X_test)
    metrics['a_score'] = accuracy_score(y_test, predictions)
    metrics['r_score'] = recall_score(y_test, predictions)
    metrics['p_score'] = precision_score(y_test, predictions)
    metrics['f1_score'] = f1_score(y_test, predictions)
    metrics['f2_score'] = fbeta_score(y_test, predictions, beta=2)
    metrics['f0.5_score'] = fbeta_score(y_test, predictions, beta=0.5)

    # labels=[0, 1] forces a 2x2 matrix even when a class is absent from
    # both y_test and predictions, so the four-way unpacking always works.
    tn, fp, fn, tp = confusion_matrix(y_test, predictions,
                                      labels=[0, 1]).ravel()
    totPositives = y_test.sum()
    totNegatives = len(y_test) - totPositives

    # Error rate negatives = false positives / total negatives
    metrics['errorNegatives'] = fp/totNegatives
    # Accuracy for negatives = true negatives / total negatives
    metrics['accNegatives'] = tn/totNegatives
    # Error rate for positives = false negatives / total positives
    metrics['errorPositives'] = fn/totPositives
    # Accuracy for positives = true positives / total positives
    metrics['accPositives'] = tp/totPositives

    # Probability of the positive class (second column); computed once
    # and shared by both threshold-free metrics below.
    positive_proba = classifier.predict_proba(X_test)[:, 1]
    metrics['roc_auc_score'] = roc_auc_score(y_test, positive_proba)
    metrics['avg_p_score'] = average_precision_score(y_test, positive_proba)

    return pd.DataFrame({name: metrics})

# Create a Logistic Model

We will use the scaled data and include all the features.

In [None]:
# Create a LogisticRegression
logReg = LogisticRegression()

# fit the logistic regression model
logReg.fit(X_train_ss, y_train)

In [None]:
# Print out the estimated intercept and coefficients
print(logReg.intercept_)
print(logReg.coef_)

In [None]:
# Get the model metrics using user-defined function
lr_metrics = modelMetrics(logReg, 'logistic regression', X_test_ss, y_test)
lr_metrics

In [None]:
# Let's make a plot of confusion matrix
ConfusionMatrixDisplay.from_estimator(logReg, X_test_ss, y_test, cmap='cividis')

In [None]:
# Create an ROC Curve display
RocCurveDisplay.from_estimator(logReg, X_test_ss, y_test)

In [None]:
# Make a Precision Recall display
PrecisionRecallDisplay.from_estimator(logReg, X_test_ss, y_test)

## Logistic Regression CV

We can use `LogisticRegressionCV` to regularize the coefficients on the input variables to attempt to get a better model.

In [None]:
from sklearn.linear_model import LogisticRegressionCV

In [None]:
logRegCV = LogisticRegressionCV().fit(X_train_ss, y_train)

In [None]:
logRegCV_metrics = modelMetrics(logRegCV, 'logistic regression CV', X_test_ss, y_test)
logRegCV_metrics

## Recursive Feature Elimination

We can also use `RFECV` (recursive feature elimination with cross-validation) to find the smallest set of input variables for a model. Let's try it with logistic regression.

In [None]:
logReg_rfecv = LogisticRegression()
rfecv = RFECV(logReg_rfecv)

In [None]:
# Fit the RFECV selector on the scaled training data, then fit
# logReg_rfecv on the transformed (feature-reduced) training matrix.
# NOTE(review): the cells below evaluate `rfecv` itself rather than
# logReg_rfecv, so this separate fit looks redundant -- confirm before
# relying on logReg_rfecv elsewhere.
logReg_rfecv.fit(rfecv.fit_transform(X_train_ss, y_train), y_train)

In [None]:
rfecv.n_features_

In [None]:
rfecv.ranking_

In [None]:
X_train_ss.columns

In [None]:
ConfusionMatrixDisplay.from_estimator(rfecv, X_test_ss, y_test)

In [None]:
rfe_metrics = modelMetrics(rfecv, 'RFE logistic regression', X_test_ss, y_test)
rfe_metrics

# Linear Discriminant Analysis
Let's now try LDA. We already imported the appropriate packages for LDA.

In [None]:
# Create an LDA instance
lda = LinearDiscriminantAnalysis()

In [None]:
# Fit the LDA model with X and y training set created previously
lda.fit(X_train_ss, y_train)

In [None]:
# Get the model metrics using user-defined function
lda_metrics = modelMetrics(lda, 'LDA', X_test_ss, y_test)
lda_metrics

# Quadratic Discriminant Analysis
We can also try QDA (already imported package)

In [None]:
# Create a QDA instance
qda = QuadraticDiscriminantAnalysis()

In [None]:
# Fit the qda with X and y
qda.fit(X_train_ss, y_train)

In [None]:
# Get the model metrics using user-defined function
qda_metrics = modelMetrics(qda, 'QDA', X_test_ss, y_test)
qda_metrics

# KNN
Might as well try $k$-nearest neighbors. Let's use $k=3$.

In [None]:
# Let's use k=3
knn = KNeighborsClassifier(n_neighbors=3)

In [None]:
# fit the model
knn.fit(X_train_ss, y_train)

In [None]:
# Get the model metrics using user-defined function
knn3_metrics = modelMetrics(knn, 'KNN-3', X_test_ss, y_test)
knn3_metrics

## A Simple Decision Tree

In [None]:
# Fully grown (unpruned) decision tree on the scaled training data.
# Seed it like the other tree-based models in this notebook so the fitted
# tree and its metrics are reproducible from run to run.
dt = DecisionTreeClassifier(random_state=42).fit(X_train_ss, y_train)

In [None]:
dt_metrics = modelMetrics(dt, 'Decision Tree', X_test_ss, y_test)
dt_metrics

In [None]:
# Draw the full tree on a large canvas.
fig = plt.figure(figsize=(25, 20))
# Pass the feature names as a plain list of strings; some scikit-learn
# releases reject a pandas Index for this argument.
plot_tree(dt, feature_names=list(X.columns), filled=True, rounded=True);

In [None]:
dt3 = DecisionTreeClassifier(max_depth=3, random_state=42)
dt3.fit(X_train_ss, y_train)

In [None]:
# Draw the depth-3 tree on a large canvas.
fig = plt.figure(figsize=(25, 20))
# Pass the feature names as a plain list of strings; some scikit-learn
# releases reject a pandas Index for this argument.
plot_tree(dt3, feature_names=list(X.columns), filled=True, rounded=True);

In [None]:
dt3_metrics = modelMetrics(dt3, 'Decision Tree - 3', X_test_ss, y_test)
dt3_metrics

In [None]:
# Feature importances of the depth-3 tree, expressed as percentages
importance = pd.DataFrame({'Importance': dt3.feature_importances_*100},
                          index=X.columns)
# Horizontal bars sorted so the most important feature ends up on top.
# legend=False suppresses the legend through the public plotting API
# instead of overwriting the axes' private legend_ attribute afterwards.
importance.sort_values('Importance', axis=0, ascending=True).plot(kind='barh',
                                                                   legend=False)
plt.xlabel('Variable Importance')

## Bagging

In [None]:
bag = BaggingClassifier(random_state=42).fit(X_train_ss, y_train)

In [None]:
bag_metrics = modelMetrics(bag, 'Bagging', X_test_ss, y_test)
bag_metrics

## Random Forest

In [None]:
rf = RandomForestClassifier(random_state=42).fit(X_train_ss, y_train)

In [None]:
rf_metrics = modelMetrics(rf, 'Random Forest', X_test_ss, y_test)
rf_metrics

## Extra Trees (Extremely Randomized Trees)

In [None]:
xtClf = ExtraTreesClassifier(random_state=42).fit(X_train_ss, y_train)

In [None]:
xt_metrics = modelMetrics(xtClf, 'Extra Trees', X_test_ss, y_test)
xt_metrics

## Adaptive Boosting

In [None]:
ada = AdaBoostClassifier(random_state=42, n_estimators=500)
ada.fit(X_train_ss, y_train)

In [None]:
ada_metrics = modelMetrics(ada, 'Adaptive Boosting', X_test_ss, y_test)
ada_metrics

## Gradient Tree Boosting

In [None]:
gboost = GradientBoostingClassifier(random_state=42, n_estimators=500, learning_rate=0.01)
gboost.fit(X_train_ss, y_train)

In [None]:
gboost_metrics = modelMetrics(gboost, 'Gradient Boosting', X_test_ss, y_test)
gboost_metrics

## Histogram-based Gradient Boosting

In [None]:
hgb = HistGradientBoostingClassifier(random_state=42, learning_rate=0.01, max_iter=500)
hgb.fit(X_train_ss, y_train)

In [None]:
hgb_metrics = modelMetrics(hgb, 'Hist Gradient Boosting', X_test_ss, y_test)
hgb_metrics

## Extreme Gradient Boosting (XG Boost)

We will probably need to install it first

In [None]:
import xgboost as xgb

In [None]:
xgbClf = xgb.XGBClassifier()

In [None]:
xgbClf.fit(X_train_ss, y_train)

In [None]:
xgb_metrics = modelMetrics(xgbClf, 'XGB', X_test_ss, y_test)
xgb_metrics

In [None]:
# NOTE(review): this cell repeats the fit from the "Histogram-based
# Gradient Boosting" section with identical settings, so `hgb` is simply
# rebuilt and refit here -- likely a leftover duplicate.
hgb = HistGradientBoostingClassifier(random_state=42, learning_rate=0.01, max_iter=500)
hgb.fit(X_train_ss, y_train)

In [None]:
# NOTE(review): same call as in the "Histogram-based Gradient Boosting"
# section; hgb_metrics is recomputed and overwritten here.
hgb_metrics = modelMetrics(hgb, 'Hist Gradient Boosting', X_test_ss, y_test)
hgb_metrics

In [None]:
# Put all metrics in one DataFrame to examine.
# Every model evaluated above is listed, including the cross-validated
# and RFE logistic regressions, so the comparison table is complete.
all_dfs = [lr_metrics, logRegCV_metrics, rfe_metrics,
           lda_metrics, qda_metrics, knn3_metrics,
           dt_metrics, dt3_metrics, bag_metrics, rf_metrics,
           xt_metrics, ada_metrics, gboost_metrics, hgb_metrics, xgb_metrics]

In [None]:
# Look at all metrics together
all_metrics = pd.concat(all_dfs, axis=1)
all_metrics

In [None]:
print(all_metrics.to_markdown())

In [None]:
# The scope of these changes made to
# pandas settings are local to with statement.
with pd.option_context('display.max_rows', None,
                       'display.max_columns', None,
                       'display.precision', 3,
                       ):
    print(all_metrics)

In [None]:
all_metrics.to_csv('all_metrics.csv')