In [1]:
## working on finalizing a model given reduced set of data columns.
from datetime import datetime, timedelta
import itertools
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
import scipy.stats as stats
import seaborn as sns
from sklearn import linear_model
from sklearn import metrics
from sklearn import preprocessing
from sklearn.cross_validation import train_test_split
from sklearn.pipeline import make_pipeline

import networkx as nx
from networkx.algorithms.components.connected import connected_components

%matplotlib notebook
%timeit

#show up to 100 columns.
pd.set_option('display.max_columns', 100, 'display.max_rows', 200)

In [2]:
## load pickle file with reduced data columns
file_name = 'theorem_reduced_2.pkl'
reduced_data = pd.read_pickle(file_name)

In [3]:
reduced_data.head()

Unnamed: 0_level_0,Cancelled,BoolPriorProsperLoanee,DolLoanAmountRequested,BorrowerRate,NumMonthsTerm,DolMonthlyIncome,FracDebtToIncomeRatio,StrEmploymentStatus,NumMonthsEmployed,NumPriorProsperLoansLatePayments,NumPriorProsperLoans61dpd,BoolIsLender,BoolInGroup,NumTradesOpened6,NumOpenTradesDelinqOrPastDue6,DolTotalBalanceOnPublicRecords,NumRealEstateTrades,DolMonthlyDebt,NumCurrentDelinquencies,NumPublicRecordsLast10Years,NumPublicRecords12,DolAmountDelinquent,PctBankcardUtil,NumTotalInquiries,DaysSinceFirstCredit
ListingNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
973605,1,False,15000.0,0.162,60,6000.0,0.27,Employed,445.0,,,0,False,1,0,0,2,1242,0,0,0,0,0.97,5,13901
981099,1,False,15000.0,0.1585,60,7916.6667,0.35,Other,32.0,,,0,False,1,0,0,2,2289,0,0,0,0,0.48,3,14238
1025766,0,True,4000.0,0.2085,36,2083.3333,0.53,Employed,4.0,0.0,,0,False,0,0,0,0,911,0,0,0,0,0.93,5,4146
1003835,1,False,10000.0,0.1299,36,3750.0,0.14,Employed,2.0,,,0,False,1,0,0,0,223,0,0,0,0,0.26,1,2942
1011335,0,True,20000.0,0.144,60,9000.0,0.16,Employed,90.0,0.0,,0,False,1,0,1249,1,1264,1,2,0,0,0.81,17,8329


In [4]:
## should booleans be converted with standard scaler?
m1 = linear_model.LogisticRegression()
x1 = reduced_data['BoolPriorProsperLoanee'].to_frame()
y = reduced_data['Cancelled']
m1.fit(x1,y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [6]:
m2 = linear_model.LogisticRegression()
standard_scaler = preprocessing.StandardScaler()
x2 = standard_scaler.fit_transform(x1)
m2.fit(x2,y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [7]:
m3 = linear_model.LogisticRegression()
robust_scaler = preprocessing.RobustScaler()
x3 = robust_scaler.fit_transform(x1)
m3.fit(x3,y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [12]:
print(m1.coef_)
print(m2.coef_)
print(m3.coef_)
print(m1.intercept_)
print(m2.intercept_)
print(m3.intercept_)

[[-1.24148665]]
[[-0.30515356]]
[[-1.24148665]]
[-0.62693258]
[-0.7070492]
[-0.62693258]


In [14]:
print(m1.predict_proba(x1.min()))
print(m1.predict_proba(x1.max()))
print(m2.predict_proba(x2.min()))
print(m2.predict_proba(x2.max()))
print(m3.predict_proba(x3.min()))
print(m3.predict_proba(x3.max()))

[[ 0.65179361  0.34820639]]
[[ 0.86627526  0.13372474]]
[[ 0.65178852  0.34821148]]
[[ 0.86634409  0.13365591]]
[[ 0.65179361  0.34820639]]
[[ 0.86627526  0.13372474]]




A sanity check: Using RobustScaler and StandardScaler on a boolean variable makes a very minimal difference.

In [101]:
## let's try throwing in two variables and see what happens.
mm = linear_model.LogisticRegression()
standard_scaler = preprocessing.StandardScaler()
robust_scaler = preprocessing.RobustScaler()
x = reduced_data[['BoolPriorProsperLoanee','DolLoanAmountRequested']]
x_s = standard_scaler.fit_transform(x)
y = reduced_data['Cancelled']
mm.fit(x_s,y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [102]:
mm.coef_

array([[-0.30160269,  0.15214037]])

In [104]:
xx, yy = np.mgrid[-4:4:.01, -4:4:.01]
grid = np.c_[xx.ravel(), yy.ravel()]
probs = mm.predict_proba(grid)[:, 1].reshape(xx.shape)

In [105]:
f, ax = plt.subplots(figsize=(8, 6))
contour = ax.contourf(xx, yy, probs, 25, cmap="RdBu",
                      vmin=0, vmax=1)
ax_c = f.colorbar(contour)
ax_c.set_label("$P(y = 1)$")
ax_c.set_ticks([0, .25, .5, .75, 1])

ax.scatter(x_s[100:,0], x_s[100:, 1], c=y[100:], s=50,
           cmap="RdBu", vmin=-.2, vmax=1.2,
           edgecolor="white", linewidth=1)

ax.set(aspect="equal",
       xlim=(-4, 4), ylim=(-4, 4),
       xlabel="$X_1$", ylabel="$X_2$")

<IPython.core.display.Javascript object>

[(-4, 4),
 <matplotlib.text.Text at 0x11bb56e48>,
 (-4, 4),
 <matplotlib.text.Text at 0x11b457ba8>,
 None]

Plot above reveals that when first-time loanees request large loans, they are more likely to wind up canceling.

In [None]:
## let's start plotting some ROC curves and see if adding variables is improving our skill.

In [132]:
variables = [['DolLoanAmountRequested'],['BoolPriorProsperLoanee','DolLoanAmountRequested'],\
       ['BoolPriorProsperLoanee','DolLoanAmountRequested','PctBankcardUtil'],\
            ['BoolPriorProsperLoanee','DolLoanAmountRequested','PctBankcardUtil','NumTotalInquiries']]

y = reduced_data['Cancelled']
X = []
X_train = []
X_test = []
y_train = []
y_test = []

fig, ax = plt.subplots()
plt.plot([0, 1], [0, 1], color='navy', linestyle='--')

for i,v in enumerate(variables):
    standard_scaler = preprocessing.StandardScaler()
    X.append(standard_scaler.fit_transform(reduced_data[v]))
    X_tr, X_te, y_tr, y_te = train_test_split(X[i], y, test_size=0.3, random_state=0)
    logm = linear_model.LogisticRegression()
    logm.fit(X_tr,y_tr)
    probs = logm.predict_proba(X_te)
    fpr, tpr, thresholds = metrics.roc_curve(y_te,probs[:,1])
    plt.plot(fpr,tpr)
    plt.legend()
    print(logm.coef_)

<IPython.core.display.Javascript object>



[[ 0.16317715]]
[[-0.30253921  0.15834587]]
[[-0.30678644  0.16329316 -0.1515659 ]]
[[-0.30796493  0.16270451 -0.14653551  0.04711068]]


Pretty cool - this series of ROC curves shows that starting with DolLoanAmountRequested, then adding BoolPriorProsperLoanee, then PctBankcardUtil, all make successive contributions to quality of fit.

On the other hand, adding NumTotalInquiries makes no perceptable impact on the resulting ROC curve.

In [None]:
## 