In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# Logistic Regression and Imbalanced Classes

## Abalone Data

In [None]:
# Read the abalone19 CSV from the working directory and preview it
df = pd.read_csv('abalone19.csv')
df.head(n=5)

The target variable is 'Class'. Check the distribution of its values.

In [None]:
# How many rows fall into each target class
df['Class'].value_counts()

Let's make 'Class' a boolean variable instead of a string.

In [None]:
# Convert the string label to a boolean target (True = positive class).
# Guarded on dtype so that re-running this cell is a no-op: comparing an
# already-boolean column against a string would silently set every row to False.
if df['Class'].dtype != bool:
    # strip() so the match does not depend on stray whitespace around the label
    df['Class'] = df['Class'].str.strip() == 'positive'

Keep track of different variable types

In [None]:
# Column roles used by the rest of the notebook
target = 'Class'

# Categorical column(s), one-hot encoded later
discrete = ['Sex']

# Numeric measurements, standardized later
continuous = [
    'Length', 'Diameter',
    'Whole_weight', 'Shucked_weight', 'Viscera_weight', 'Shell_weight',
]

# Model inputs: numeric columns first, then categorical
predictors = continuous + discrete

## Train/Test Split

This time, let's separate X from y

In [None]:
from sklearn.model_selection import train_test_split
# Split predictors and target together; fixed seed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    df[predictors],
    df[target],
    random_state=2,
)

## Feature Engineering

### Normalize the continuous features

In [None]:
from sklearn.preprocessing import StandardScaler

# pandas may treat the frames returned by train_test_split as slices of the
# original frame; take explicit copies so the column assignments below do not
# raise SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# Fit the scaler on the training rows only, then apply the same transform to
# the test rows so no test-set statistics leak into preprocessing.
ss = StandardScaler()
X_train[continuous] = ss.fit_transform(X_train[continuous])
X_test[continuous] = ss.transform(X_test[continuous])

### Binarize the categorical column

In [None]:
from sklearn.feature_extraction import DictVectorizer

# Turn each row into a dict and let DictVectorizer expand the string-valued
# column(s) into indicator columns. The vectorizer is fitted on the training
# rows only and reused for the test rows so both frames get identical columns.
dv = DictVectorizer(sparse=False)

# Pass the original row index through: without it the rebuilt frames would get
# a fresh 0..n-1 index and no longer share labels with y_train / y_test.
X_train = pd.DataFrame(
    dv.fit_transform(X_train.to_dict(orient='records')),
    columns=dv.feature_names_,
    index=X_train.index
)

X_test = pd.DataFrame(
    dv.transform(X_test.to_dict(orient='records')),
    columns=dv.feature_names_,
    index=X_test.index
)

In [None]:
# Inspect the engineered training features
X_train.head(n=5)

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with default settings; fit() returns the estimator itself
lr = LogisticRegression().fit(X_train, y_train)
lr

### Accuracy on test set

In [None]:
# Mean accuracy of the fitted model on the held-out rows
lr.score(X=X_test, y=y_test)

### Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix

# Hard class predictions at the model's default decision threshold
pred = lr.predict(X_test)

# confusion_matrix lays its first argument out along the rows, so passing the
# predictions first gives rows = predicted and columns = actual. `labels` pins
# the matrix to 2x2 even if one class never shows up.
c = pd.DataFrame(
    confusion_matrix(pred, y_test, labels=[False, True]),
    columns=['Actual=0', 'Actual=1'],
    index=['Predicted=0', 'Predicted=1']
)

print(c)

# TPR = TP / (all actual positives)
# FPR = FP / (all actual negatives)  -- dividing by the predicted-positive row
# instead would give the false discovery rate, not the false alarm rate.
# float() forces true division regardless of Python version.
tpr = float(c.loc['Predicted=1', 'Actual=1']) / c.loc[:, 'Actual=1'].sum()
fpr = float(c.loc['Predicted=1', 'Actual=0']) / c.loc[:, 'Actual=0'].sum()

# Two blank lines between the table and the summary
print("\n")
print("True positive rate (detection rate): {}".format(tpr))
print("False positive rate (false alarm rate): {}".format(fpr))

### Closer look at model

In [None]:
def logistic(x):
    """Return the sigmoid 1 / (1 + e^-x) of ``x`` (scalar or numpy array)."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator

# Inverse of logistic
def logit(p):
    """Return the log-odds log(p / (1 - p)) of probability ``p``."""
    odds = p / (1 - p)
    return np.log(odds)

# Test-set labels next to the model's positive-class probability and its log-odds
pltdf = pd.DataFrame({'Class': y_test, 'pred': lr.predict_proba(X_test)[:, 1]})
pltdf['logit'] = pltdf['pred'].apply(logit)

# Reference sigmoid evaluated on an even grid of log-odds
logits = np.linspace(-6, 6, 200)
probs = logistic(logits)

# Sigmoid curve with the actual labels scattered at each sample's log-odds;
# the red/purple lines mark log-odds 0 and its probability, logistic(0)
ax = pd.DataFrame({'probs': probs}, index=logits).plot()
pltdf.plot(kind='scatter', x='logit', y='Class', label='Actual', ax=ax)
ax.legend(loc='lower right')
ax.axvline(x=0, color='red')
ax.axhline(y=logistic(0), color='purple')

#### Threshold: p >= 0.01

In [None]:
# Sigmoid curve and actual labels, with the threshold lines drawn at p = 0.01
ax = pd.DataFrame({'probs': probs}, index=logits).plot()
pltdf.plot(kind='scatter', x='logit', y='Class', label='Actual', ax=ax)
ax.legend(loc='lower right')
ax.axvline(x=logit(.01), color='red')
ax.axhline(y=.01, color='purple')

In [None]:
# Call a sample positive whenever its predicted probability reaches 1%
pred = lr.predict_proba(X_test)[:, 1] >= .01

# confusion_matrix lays its first argument out along the rows, so passing the
# predictions first gives rows = predicted and columns = actual. `labels` pins
# the matrix to 2x2 even if one class never shows up.
c = pd.DataFrame(
    confusion_matrix(pred, y_test, labels=[False, True]),
    columns=['Actual=0', 'Actual=1'],
    index=['Predicted=0', 'Predicted=1']
)

print(c)

# TPR = TP / (all actual positives)
# FPR = FP / (all actual negatives)  -- dividing by the predicted-positive row
# instead would give the false discovery rate, not the false alarm rate.
# float() forces true division regardless of Python version.
tpr = float(c.loc['Predicted=1', 'Actual=1']) / c.loc[:, 'Actual=1'].sum()
fpr = float(c.loc['Predicted=1', 'Actual=0']) / c.loc[:, 'Actual=0'].sum()

# Two blank lines between the table and the summary
print("\n")
print("True positive rate (detection rate): {:.3}".format(tpr))
print("False positive rate (false alarm rate): {:.3}".format(fpr))

## ROC Curve

Each point on the ROC curve is one confusion matrix, defined by one prediction threshold.

In [None]:
from sklearn.metrics import roc_curve
# Score every test row with its predicted positive-class probability
pred = lr.predict_proba(X_test)[:, 1]
fpr, tpr, thresh = roc_curve(y_test, pred)

# Draw the curve on the current axes
roc_ax = plt.gca()
roc_ax.plot(fpr, tpr)
roc_ax.set_xlabel('FPR')
roc_ax.set_ylabel('TPR')

### Area under the Curve

In [None]:
from sklearn.metrics import roc_auc_score
# Area under the ROC curve for the scores in `pred`
roc_auc_score(y_true=y_test, y_score=pred)

## Try some other models

### Add polynomial features

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LogisticRegressionCV

# Expand the inputs with polynomial terms up to degree 3 and feed them to a
# cross-validated logistic regression scored by ROC AUC.
poly = PolynomialFeatures(degree=3)
lrcv = LogisticRegressionCV(scoring='roc_auc')


from sklearn.pipeline import Pipeline
polylr = Pipeline([('poly', poly), ('logistic', lrcv)])

polylr.fit(X_train, y_train)

pred = polylr.predict_proba(X_test)[:, 1]
# A bare expression in the middle of a cell is never displayed, so print the
# test AUC explicitly instead of discarding it.
print("Polynomial AUC: {:.3}".format(roc_auc_score(y_test, pred)))

fpr, tpr, thresh = roc_curve(y_test, pred)
plt.plot(fpr, tpr)
plt.xlabel('FPR')
plt.ylabel('TPR')

### Gradient Boosted Decision Trees

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Seed the booster so Restart & Run All reproduces the same trees; the value
# matches the seed used for the train/test split.
gb = GradientBoostingClassifier(random_state=2)
gb.fit(X_train, y_train)

pred = gb.predict_proba(X_test)[:, 1]
# A bare expression in the middle of a cell is never displayed, so print the
# test AUC explicitly instead of discarding it.
print("Gradient Boosting AUC: {:.3}".format(roc_auc_score(y_test, pred)))

fpr, tpr, thresh = roc_curve(y_test, pred)
plt.plot(fpr, tpr)
plt.xlabel('FPR')
plt.ylabel('TPR')

## Compare all three models

In [None]:
# Overlay the test-set ROC curve of each fitted model and report its AUC.

# Standard logistic regression
pred = lr.predict_proba(X_test)[:, 1]
fpr, tpr, thresh = roc_curve(y_test, pred)
standard_auc = roc_auc_score(y_test, pred)
plt.plot(fpr, tpr, label='Standard Logistic')

# Polynomial logistic regression
pred = polylr.predict_proba(X_test)[:, 1]
fpr, tpr, thresh = roc_curve(y_test, pred)
poly_auc = roc_auc_score(y_test, pred)
plt.plot(fpr, tpr, label='Polynomial Logistic')

# Gradient boosted trees (`gb` is a GradientBoostingClassifier, not a random
# forest, so it is labelled accordingly)
pred = gb.predict_proba(X_test)[:, 1]
fpr, tpr, thresh = roc_curve(y_test, pred)
gb_auc = roc_auc_score(y_test, pred)
rf_auc = gb_auc  # old (misleading) name kept so existing references still resolve
plt.plot(fpr, tpr, label='Gradient Boosting')

plt.xlabel('FPR')
plt.ylabel('TPR')
plt.legend(loc='lower right')

print("Standard AUC: {:.3}".format(standard_auc))
print("Polynomial AUC: {:.3}".format(poly_auc))
print("Gradient Boosting AUC: {:.3}".format(gb_auc))