# Classification

When we are trying to predict if something will happen or not, we have entered the realm of **classification**. In this example, we are going to look at only two possible outcome classes from our customer dataset we have been using: "big spender" or "not big spender". The data is the file `./data/customers_clean.csv`.

In [None]:
# Import our most-used packages
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from sklearn.model_selection import train_test_split

# We'll use a DummyClassifier for fun
from sklearn.dummy import DummyClassifier

# Logistic Regression modules
import statsmodels.formula.api as smf
from sklearn.linear_model import LogisticRegression

# Need to measure "goodness"
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, recall_score, average_precision_score
from sklearn.metrics import f1_score, fbeta_score
from sklearn.metrics import roc_auc_score, precision_recall_curve
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import ConfusionMatrixDisplay, PrecisionRecallDisplay, RocCurveDisplay

# Scaling
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the cleaned customer file into a DataFrame,
# then report its dimensions as (rows, columns)
cust = pd.read_csv(filepath_or_buffer='./data/customers_clean.csv')
print(cust.shape)

In [None]:
# Show summary statistics for the raw customer data;
# they are used further down to pick the "big spender" cutoff
cust.describe()

In [None]:
# Remove the customer id and the two date columns, keeping the
# result in a new DataFrame so the raw `cust` data stays untouched
new_cust = cust.drop(labels=['cust_id', 'join_date', 'last_purchase_date'],
                     axis='columns')
# List the remaining columns with their dtypes and non-null counts
new_cust.info()

In [None]:
# Dummy-encode the categorical columns as integer 0/1 indicators.
# drop_first=True leaves out one level per variable (the baseline).
data = pd.get_dummies(new_cust, drop_first=True, dtype=int)
# Confirm the new indicator columns and their dtypes
data.info()

## Create "big spender"

We want to convert the $y$ variable, `spend`, into a binary variable where 1 represents a big spender and a 0 otherwise. We can set the cutoff anywhere we want. Looking at the summary statistics from above, let's use \$5,700 as the cutoff. We can use the function `pd.cut()`.

In [None]:
# Preview how many customers fall on each side of the $5,700 cutoff.
# The outer bin edges are open-ended so every non-missing spend value lands
# in a bin; with edges [0, 5700, 10000] a spend of exactly 0 or anything
# above 10,000 would silently be left out of the counts.
pd.cut(data.spend, bins=[-np.inf, 5700, np.inf], right=True).value_counts()

In [None]:
# Encode the target: 1 = big spender (spend above $5,700), 0 = otherwise.
# The outer bin edges are open-ended so that every non-missing spend value
# gets a label; with edges [0, 5700, 10000] a spend of exactly 0 or above
# 10,000 falls outside the bins, becomes NaN, and makes .astype(int) fail.
data['big_spender'] = pd.cut(data.spend, bins=[-np.inf, 5700, np.inf],
                             right=True, labels=[0, 1]).astype(int)
# Confirm the new column exists and has an integer dtype
data.info()

In [None]:
# Show summary statistics again, now including big_spender;
# because it is coded 0/1, its mean is the share of big spenders
data.describe()

In [None]:
# As an experiment, fit an ordinary least squares line to the 0/1 outcome
# and plot it to see how a straight line behaves on a binary target
#
# x-axis: household_income, y-axis: big_spender, no confidence band
sns.regplot(data=data, x='household_income', y='big_spender', ci=None)

In [None]:
# The OLS line above is a poor fit for a 0/1 outcome,
# which is the motivation for logistic regression.
# statsmodels.formula.api provides logit() for formula-style models.
logitResults = smf.logit(formula='big_spender ~ household_income', data=data).fit()
# Display the fitted model's summary table
logitResults.summary()

In [None]:
# The header of the summary differs from what OLS reports, while the
# coefficient table is still what we read p-values and predictions from.
# Now draw the fitted logistic curve over the data.
# The confidence interval is switched off because it takes a while to run.
sns.lmplot(data=data, x='household_income', y='big_spender',
           ci=None, logistic=True)

## Split Data into Training and Test Sets

We first need to define `X` and `y`. Then we can split the data into training and test sets.

In [None]:
# Target: the binary big_spender indicator
y = data['big_spender']

# Predictors: everything except the target and `spend` (the column the
# target was derived from); age, num_children and num_vehicles are
# left out of this model as well
X = data.drop(['spend', 'big_spender', 'age', 'num_children', 'num_vehicles'],
              axis=1)

In [None]:
# Hold out 20% of the rows as a test set; the fixed random_state
# makes the split reproducible from run to run
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=163, test_size=0.2
)

In [None]:
# What is the percentage of big spenders in both training and test?
# big_spender is coded 0/1, so the mean of y is the share of big spenders.
# Comparing the two shares shows whether the random split kept them similar.
print(f'Training percentage of big spenders is {y_train.mean():.2%}')
print(f'Testing percentage of big spenders is  {y_test.mean():.2%}')

In [None]:
# Learn the scaling parameters from the training predictors only,
# so nothing about the test set leaks into the transformation.
# StandardScaler centers each variable at 0 with unit variance (=1).
s_scaler = StandardScaler()
s_scaler.fit(X_train)
s_scaler

In [None]:
# Transform X_train and put the result back in a DataFrame.
# transform() returns a bare array, so restore the column names and also the
# original row index; without index= the frame gets a fresh 0..n-1 index
# that no longer lines up with y_train for any index-based pandas operation.
X_train_ss = pd.DataFrame(s_scaler.transform(X_train),
                          columns=X_train.columns,
                          index=X_train.index)

In [None]:
# Take a look at the scaled training DataFrame;
# after StandardScaler each column should have mean ~0 and std ~1
X_train_ss.describe()

In [None]:
# Transform X_test with the scaler fitted on the training data and put the
# result back in a DataFrame. Keep the original row index as well as the
# column names so the rows stay aligned with y_test.
X_test_ss = pd.DataFrame(s_scaler.transform(X_test),
                         columns=X_test.columns,
                         index=X_test.index)
# The test means/stds will be close to, but not exactly, 0 and 1 because
# the scaling parameters came from the training set
X_test_ss.describe()

# DummyClassifier

Let's create a `DummyClassifier` where we will predict the most frequently occurring class, **not** a big spender. How good do you expect this model to be?

In [None]:
# Baseline model: always predict class 0 ("not a big spender")
dummy = DummyClassifier(constant=0, strategy='constant')
# Fit on the scaled training predictors and training labels
dummy.fit(X=X_train_ss, y=y_train)

In [None]:
# Call predict on the scaled test set.
# Use the X_test_ss DataFrame built earlier rather than re-transforming
# X_test: it is the same data every other cell uses, and it keeps the
# column names the classifier was fitted with (a bare array triggers
# scikit-learn's "X does not have valid feature names" warning).
dummy.predict(X_test_ss)

In [None]:
# Mean accuracy of the baseline on the held-out test set
dummy.score(X=X_test_ss, y=y_test)

In [None]:
# Plot the baseline's confusion matrix on the test set
ConfusionMatrixDisplay.from_estimator(estimator=dummy, X=X_test_ss, y=y_test,
                                      cmap='cividis')

In [None]:
# Plot the baseline's ROC curve on the test set
RocCurveDisplay.from_estimator(estimator=dummy, X=X_test_ss, y=y_test)

In [None]:
# Plot the baseline's precision-recall curve on the test set
PrecisionRecallDisplay.from_estimator(estimator=dummy, X=X_test_ss, y=y_test)

# Create a Logistic Model

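Let's now create a logistic regression model. Note that the cells below fit it on the scaled training data (`X_train_ss`) and evaluate it on the scaled test data (`X_test_ss`), so the results can be compared directly with the `DummyClassifier` above.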

In [None]:
# Create a LogisticRegression classifier; no arguments are passed,
# so scikit-learn's default settings are used
logReg = LogisticRegression()

In [None]:
# Estimate the model on the scaled training predictors and training labels
logReg.fit(X=X_train_ss, y=y_train)

In [None]:
# Print out the estimated intercept and coefficients.
# The model was fitted on standardized predictors, so each coefficient
# is expressed per one standard deviation of its variable.
print(logReg.intercept_)
print(logReg.coef_)

In [None]:
# Predicted class probabilities for the test set:
# one row per observation, one column per class
logReg.predict_proba(X=X_test_ss)

In [None]:
# Predicted class labels (0 or 1) for the test set
logReg.predict(X=X_test_ss)

In [None]:
# Confusion matrix on the test set: actual classes in the rows,
# predicted classes in the columns
confusion_matrix(y_true=y_test, y_pred=logReg.predict(X_test_ss))

In [None]:
# Same confusion matrix, drawn as a heat map
ConfusionMatrixDisplay.from_estimator(estimator=logReg, X=X_test_ss, y=y_test,
                                      cmap='cividis')

In [None]:
# Pull the four cells out of the 2x2 confusion matrix by unpacking it
# row by row: the first row is actual class 0, the second is actual class 1
(trueNeg, falsePos), (falseNeg, truePos) = confusion_matrix(
    y_test, logReg.predict(X_test_ss)
)

# Right-align the counts so the four lines are easy to compare
print(f'trueNeg : {trueNeg:>4}')
print(f'falsePos: {falsePos:>4}')
print(f'falseNeg: {falseNeg:>4}')
print(f'truePos : {truePos:>4}')

In [None]:
# Let's first look at the overall error rate and accuracy
# Overall error rate = total misclassifications (false negatives + false
# positives) / total chances (test sample size)
# Overall accuracy = correct classifications / test sample size,
# i.e. 1 - overall error rate
print(f'Overall Error Rate: {(falseNeg+falsePos)/len(y_test):>6.2%}')
print(f'Overall Accuracy  : {(trueNeg+truePos)/len(y_test):>6.2%}')

In [None]:
# Per-class rates need the size of each class in the test set.
# y_test is coded 0/1, so its sum is the number of big spenders;
# everyone else in the test set is an "other".
bigSpenders = y_test.sum()
others = y_test.shape[0] - bigSpenders
print(f'# of big spenders: {bigSpenders}')
print(f'# of others      : {others}')

In [None]:
# How good are we with others (actual class 0)?
# falsePos is the number of others we misclassified as big spenders
# Error rate for others = false positives / total others
print(f'Error rate for others: {falsePos/others:>6.2%}')
# Accuracy for others = true negatives / total others
print(f'Accuracy for others  : {trueNeg/others:>6.2%}')

In [None]:
# What about big spenders (actual class 1)?
# Error rate for big spenders = false negatives / total big spenders
print(f'Error rate for big spenders: {falseNeg/bigSpenders:>6.2%}')
# Accuracy for big spenders = true positives / total big spenders
print(f'Accuracy for big spenders  : {truePos/bigSpenders:>6.2%}')

In [None]:
# Per-class precision, recall and F1 on the test set;
# class 0 is labelled 'No' and class 1 'Yes'
print(classification_report(y_true=y_test,
                            y_pred=logReg.predict(X_test_ss),
                            target_names=['No', 'Yes']))

In [None]:
# Plot the logistic model's ROC curve on the test set
RocCurveDisplay.from_estimator(estimator=logReg, X=X_test_ss, y=y_test)

In [None]:
# Plot the logistic model's precision-recall curve on the test set
PrecisionRecallDisplay.from_estimator(estimator=logReg, X=X_test_ss, y=y_test)