In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

%matplotlib inline

# Logistic Regression and Imbalanced Classes

## Data

In [None]:
# Read the dataset from a CSV file (path is relative to the working directory).
df = pd.read_csv('abalone19.csv')

The target variable is 'Class'. Check the distribution of its values.

Let's make 'Class' a boolean variable instead of a string.

Keep track of different variable types

In [None]:
# Column the models are trained to predict.
target = 'Class'

# Numeric columns; these are the ones that get standardized further down.
continuous = [
    'Length', 'Diameter',
    'Whole_weight', 'Shucked_weight', 'Viscera_weight', 'Shell_weight',
]

# Non-numeric columns, tracked separately from the continuous ones.
discrete = ['Sex']

# Full feature list: continuous columns first, then the discrete ones.
predictors = continuous + discrete

## Logistic Regression

#### Train/Test Split

This time, let's separate X from y

In [None]:
# Split features (X) and target (y) into train and test parts in one call.
# random_state is fixed so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    df[predictors],
    df[target],
    random_state=2
)

### Feature Engineering

#### Normalize the continuous features

Wrong way (the test set is scaled with its own mean and standard deviation instead of the training set's):

In [None]:
# Deliberately flawed: each split is standardized with its OWN mean and std,
# so the train and test columns end up on different scales.
train_cont = X_train[continuous]
test_cont = X_test[continuous]
X_train_normalized = (train_cont - train_cont.mean()) / train_cont.std()
X_test_normalized = (test_cont - test_cont.mean()) / test_cont.std()

Correct way:

In [None]:
from sklearn.preprocessing import StandardScaler

ss = StandardScaler()
# Compute mean and std of training data
ss.fit(X_train[continuous])
# Use that mean and std to normalize columns of training data
# (wrapped back into a DataFrame so column names and row index are kept)
X_train_normalized = pd.DataFrame(
    ss.transform(X_train[continuous]),
    columns=continuous,
    index=X_train.index
)
# Use _the same_ mean and std to normalize columns of test data
X_test_normalized = pd.DataFrame(
    ss.transform(X_test[continuous]),
    columns=continuous,
    index=X_test.index
)
# NOTE: StandardScaler divides by the population std (ddof=0), whereas
# pandas' .std() defaults to ddof=1, so values differ very slightly from a
# hand-rolled pandas computation.

#### Binarize the categorical column

In [None]:
from sklearn.feature_extraction import DictVectorizer

# One-hot encode string-valued columns: each row becomes a dict, and
# DictVectorizer expands string values into indicator columns while passing
# numeric values through unchanged.
dv = DictVectorizer(sparse=False)

# Learn the feature vocabulary from the training rows only.
# The original row index is carried over so the encoded frame stays
# label-aligned with y_train (the previous version reset it to 0..n-1).
X_train = pd.DataFrame(
    dv.fit_transform(X_train.to_dict(orient='records')),
    columns=dv.feature_names_,
    index=X_train.index
)

# Re-use the vocabulary learned on the training rows for the test rows.
X_test = pd.DataFrame(
    dv.transform(X_test.to_dict(orient='records')),
    columns=dv.feature_names_,
    index=X_test.index
)

Train the logistic regression model

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression with default settings on the training split.
# Later cells call `lr.predict` / `lr.predict_proba`, so the fitted model
# must be bound to the name `lr`.
lr = LogisticRegression()
lr.fit(X_train, y_train)

Compute the accuracy on the test set

Look at a confusion matrix to get a better idea how well our classifier is doing

In [None]:
from sklearn.metrics import confusion_matrix, roc_curve

# Hard class predictions from the model's default decision rule.
pred = lr.predict(X_test)

# Confusion matrix as a labelled frame: rows are actual classes, columns
# are predicted classes.
c = pd.DataFrame(
        confusion_matrix(y_test, pred),
        columns=['Predicted=0', 'Predicted=1'],
        index=['Actual=0', 'Actual=1']
)

# Parenthesized print works on both Python 2 and Python 3 kernels.
print(c)

# Rates are taken row-wise: correct/incorrect "1" predictions divided by the
# number of rows that actually belong to that class.
tpr = c.loc['Actual=1', 'Predicted=1'].astype(float) / c.loc['Actual=1', :].sum()
fpr = c.loc['Actual=0', 'Predicted=1'].astype(float) / c.loc['Actual=0', :].sum()

# Two blank lines between the matrix and the summary rates.
print('')
print('')
print("True positive rate (detection rate): {:.3}".format(tpr))
print("False positive rate (false alarm rate): {:.3}".format(fpr))

Closer look at model behavior

In [None]:
def logistic(x):
    """Return the logistic (sigmoid) function 1 / (1 + exp(-x)) of `x`.

    `x` may be a scalar or a NumPy array; arrays are handled element-wise.
    """
    denominator = 1 + np.exp(-x)
    return 1 / denominator

# Inverse of logistic
def logit(p):
    """Return the log-odds log(p / (1 - p)) of `p`."""
    odds = p / (1 - p)
    return np.log(odds)

# One row per test example: the actual label, the model's probability from
# column 1 of predict_proba, and that probability converted to log-odds.
pltdf = pd.DataFrame({
    'Class': y_test,
    'pred': lr.predict_proba(X_test)[:, 1]
})
pltdf['logit'] = pltdf['pred'].apply(logit)

# Reference sigmoid sampled on an evenly spaced grid of log-odds values.
logits = np.linspace(-6, 6, 200)
probs = logistic(logits)

# Draw the sigmoid, then scatter the actual labels at each row's log-odds.
# Guide lines sit at log-odds 0 and at the matching probability logistic(0).
ax = pd.DataFrame({'probs': probs}, index=logits).plot()
pltdf.plot(x='logit', y='Class', kind='scatter', ax=ax, label='Actual')
ax.legend(loc='lower right')
ax.axvline(x=0, color='red')
ax.axhline(y=logistic(0), color='purple')

#### Threshold: p >= 0.01

In [None]:
# Sigmoid plus actual labels, with the guide lines placed at a probability
# cutoff of 0.01 and at the log-odds value that corresponds to it.
threshold = .01
ax = pd.DataFrame({'probs': probs}, index=logits).plot()
pltdf.plot(x='logit', y='Class', kind='scatter', ax=ax, label='Actual')
ax.legend(loc='lower right')
ax.axvline(x=logit(threshold), color='red')
ax.axhline(y=threshold, color='purple')

In [None]:
# Predict "1" whenever the probability in column 1 of predict_proba reaches
# the 0.01 cutoff, instead of using the model's default decision rule.
pred = lr.predict_proba(X_test)[:, 1] >= .01

# Confusion matrix as a labelled frame: rows are actual classes, columns
# are predicted classes.
c = pd.DataFrame(
        confusion_matrix(y_test, pred),
        columns=['Predicted=0', 'Predicted=1'],
        index=['Actual=0', 'Actual=1']
)

# Parenthesized print works on both Python 2 and Python 3 kernels.
print(c)

# Rates are taken row-wise: correct/incorrect "1" predictions divided by the
# number of rows that actually belong to that class.
tpr = c.loc['Actual=1', 'Predicted=1'].astype(float) / c.loc['Actual=1', :].sum()
fpr = c.loc['Actual=0', 'Predicted=1'].astype(float) / c.loc['Actual=0', :].sum()

# Two blank lines between the matrix and the summary rates.
print('')
print('')
print("True positive rate (detection rate): {:.3}".format(tpr))
print("False positive rate (false alarm rate): {:.3}".format(fpr))

### ROC Curve

In [None]:
from sklearn.metrics import roc_curve
# Scores for the ROC sweep: column 1 of the model's predicted probabilities.
class_probabilities = lr.predict_proba(X_test)
pred = class_probabilities[:, 1]

# False-positive and true-positive rates at each candidate threshold.
fpr, tpr, thresh = roc_curve(y_test, pred)

# Plot TPR against FPR on the current axes.
plt.plot(fpr, tpr)
roc_ax = plt.gca()
roc_ax.set_xlabel('FPR')
roc_ax.set_ylabel('TPR')

In [None]:
from sklearn.metrics import roc_auc_score
# Area under the ROC curve for the scores currently held in `pred`.
# As the last expression in the cell, the value is shown as the cell output.
roc_auc_score(y_test, pred)

## Find optimal model

### Add polynomial features

In [None]:
from sklearn.preprocessing import PolynomialFeatures

# Fit a degree-2 polynomial feature expansion on the training features.
# Nothing is transformed here; `poly` is rebound to a fresh
# PolynomialFeatures instance in the next cell, where it is placed in a pipeline.
poly = PolynomialFeatures(degree=2)
poly.fit(X_train)

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LogisticRegressionCV
from sklearn.pipeline import Pipeline


# Degree-2 polynomial expansion feeding a cross-validated logistic
# regression whose cross-validation is scored by ROC AUC.
poly = PolynomialFeatures(degree=2)
lrcv = LogisticRegressionCV(scoring='roc_auc')

# Chain both steps so the expansion is applied identically at fit and
# predict time.
polylr = Pipeline([('poly', poly), ('logistic', lrcv)])
polylr.fit(X_train, y_train)

# Test-set AUC from column 1 of the predicted probabilities.
# Parenthesized print works on both Python 2 and Python 3 kernels.
pred = polylr.predict_proba(X_test)[:, 1]
print(roc_auc_score(y_test, pred))

# ROC curve for the pipeline. The plot call is not the last expression in
# the cell, so its return value is not displayed and needs no `_ =` guard
# (which would also overwrite IPython's last-output variable `_`).
fpr, tpr, thresh = roc_curve(y_test, pred)
plt.plot(fpr, tpr)
plt.xlabel('FPR')
plt.ylabel('TPR')

### Gradient Boosted Decision Trees

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient-boosted trees with default hyper-parameters. random_state is
# pinned (same seed as the train/test split) so repeated runs of the
# notebook fit the same model.
gb = GradientBoostingClassifier(random_state=2)
gb.fit(X_train, y_train)

# Test-set AUC from column 1 of the predicted probabilities.
# Parenthesized print works on both Python 2 and Python 3 kernels.
pred = gb.predict_proba(X_test)[:, 1]
print(roc_auc_score(y_test, pred))

# ROC curve for the boosted model.
fpr, tpr, thresh = roc_curve(y_test, pred)
plt.plot(fpr, tpr)
plt.xlabel('FPR')
plt.ylabel('TPR')

## Compare all three models

In [None]:
# Overlay the ROC curves of the three fitted models on one set of axes and
# record each model's test-set AUC. `pred`, `fpr`, `tpr` and `thresh` are
# reused for each model in turn.

# Standard logistic regression
pred = lr.predict_proba(X_test)[:, 1]
fpr, tpr, thresh = roc_curve(y_test, pred)
standard_auc = roc_auc_score(y_test, pred)
plt.plot(fpr, tpr, label = 'Standard Logistic')

# Polynomial logistic regression
pred = polylr.predict_proba(X_test)[:, 1]
fpr, tpr, thresh = roc_curve(y_test, pred)
poly_auc = roc_auc_score(y_test, pred)
plt.plot(fpr, tpr, label = 'Polynomial Logistic')

# Gradient Boosting
# NOTE: `rf_auc` holds the gradient-boosting AUC despite its name.
pred = gb.predict_proba(X_test)[:, 1]
fpr, tpr, thresh = roc_curve(y_test, pred)
rf_auc = roc_auc_score(y_test, pred)
plt.plot(fpr, tpr, label = 'Gradient Boosting')

plt.xlabel('FPR')
plt.ylabel('TPR')
plt.legend(loc='lower right')

# Parenthesized print works on both Python 2 and Python 3 kernels.
print("Standard AUC: {:.3}".format(standard_auc))
print("Polynomial AUC: {:.3}".format(poly_auc))
print("Gradient Boosting AUC: {:.3}".format(rf_auc))