In [68]:
## working on finalizing a model given reduced set of data columns.
from datetime import datetime, timedelta
import itertools
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
import scipy.stats as stats
import seaborn as sns
from sklearn import linear_model
from sklearn import metrics
from sklearn import preprocessing
from sklearn.cross_validation import train_test_split
from sklearn.pipeline import make_pipeline

import networkx as nx
from networkx.algorithms.components.connected import connected_components

%matplotlib notebook
%timeit

#show up to 100 columns.
pd.set_option('display.max_columns', 100, 'display.max_rows', 200)

In [10]:
## load pickle file with reduced data columns
file_name = 'theorem_reduced.pkl'
reduced_data = pd.read_pickle(file_name)

In [11]:
reduced_data.head()

Unnamed: 0_level_0,DolLoanAmountRequested,BoolPartialFundingApproved,BorrowerRate,NumMonthsTerm,EnumListingCategory,DolMonthlyIncome,FracDebtToIncomeRatio,StrEmploymentStatus,StrOccupation,NumMonthsEmployed,NumPriorProsperLoansLatePayments,NumPriorProsperLoans61dpd,BoolIsLender,BoolInGroup,EnumChannelCode,NumTradesOpened6,NumOpenTradesDelinqOrPastDue6,DolTotalBalanceOnPublicRecords,NumRealEstateTrades,DolMonthlyDebt,NumCurrentDelinquencies,NumPublicRecordsLast10Years,NumPublicRecords12,DolAmountDelinquent,PctBankcardUtil,NumTotalInquiries,BoolEverWholeLoan,DaysSinceFirstCredit,Cancelled,BoolPriorProsperLoanee
ListingNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1
973605,15000.0,True,0.162,60,1,6000.0,0.27,Employed,Tradesman - Mechanic,445.0,,,0,False,70000,1,0,0,2,1242,0,0,0,0,0.97,5,True,13901,1,False
981099,15000.0,True,0.1585,60,1,7916.6667,0.35,Other,,32.0,,,0,False,70000,1,0,0,2,2289,0,0,0,0,0.48,3,True,14238,1,False
1025766,4000.0,True,0.2085,36,1,2083.3333,0.53,Employed,Professional,4.0,0.0,,0,False,80000,0,0,0,0,911,0,0,0,0,0.93,5,False,4146,0,True
1003835,10000.0,True,0.1299,36,13,3750.0,0.14,Employed,Medical Technician,2.0,,,0,False,90000,1,0,0,0,223,0,0,0,0,0.26,1,True,2942,1,False
1011335,20000.0,True,0.144,60,1,9000.0,0.16,Employed,Executive,90.0,0.0,,0,False,80000,1,0,1249,1,1264,1,2,0,0,0.81,17,False,8329,0,True


In [12]:
## 18 numerical variables only - show plot for each one. Also include column for cancellation.
numerics = reduced_data[['DolLoanAmountRequested','BorrowerRate','NumMonthsTerm','DolMonthlyIncome','FracDebtToIncomeRatio',\
'NumMonthsEmployed','NumPriorProsperLoansLatePayments',\
'NumTradesOpened6','NumOpenTradesDelinqOrPastDue6','DolTotalBalanceOnPublicRecords','NumRealEstateTrades',\
'DolMonthlyDebt','NumCurrentDelinquencies','NumPublicRecords12','DolAmountDelinquent','PctBankcardUtil',\
'NumTotalInquiries','DaysSinceFirstCredit','Cancelled']]

## 5 boolean variables
booleans = reduced_data[['BoolPartialFundingApproved','BoolIsLender','BoolInGroup','BoolEverWholeLoan','BoolPriorProsperLoanee']]

## 4 categorical variables left: EnumListingCategory, StrEmploymentStatus, StrOccupation, EnumChannelCode

In [16]:
## create one big figure behind subpanels
fig, axes = plt.subplots(6,3,figsize=(12,16), facecolor='w')

## cycle through each numeric variable and logistic regress with cancellation (excluding cancellation itself)
for ax, var in zip(axes.reshape(-1),numerics.columns.values):
    sns.regplot(ax=ax, x=var, y="Cancelled", data=reduced_data, logistic=True, y_jitter=.03, ci = None, x_bins = 500)
    
## LOOKS SIGNIFICANT: DolLoanAmountRequested, DolMonthlyIncome,FracDebtToIncomeRatio,NumMonthsEmployed,NumTradesOpened6,
## NumOpenTradesDelinqOrPastDue6, NumRealEstateTrades, DolMonthlyDebt, NumCurrentDelinquencies, PctBankcardUtil,
## NumTotalInquiries

## DaysSinceFirstCredit has notable parabolic behavior in probability space - may actually be more useful to convert
## to categorical behavior. Actually, same with NumMonthsEmployed?

## NumPriorProsperLoans is a TREMENDOUS predictor of not cancelling - if you've been through the process before, much
## easier to follow through. Not only that, but it should pretty clearly be a boolean cutoff variable - 

<IPython.core.display.Javascript object>

In [17]:
## create one big figure behind subpanels
fig, axes = plt.subplots(3,2,figsize=(8,10), facecolor='w')

## cycle through each numeric variable and logistic regress with cancellation (excluding cancellation itself)
for ax, var in zip(axes.reshape(-1), booleans.columns.values):
    sns.regplot(ax=ax, x=var, y="Cancelled", data=reduced_data, logistic=True, ci = None, x_bins = 500)
    
## CONCLUSION: BoolPriorProsperLoanee is highly significant, BoolIsLender and BoolInGroup significant, 
## BoolPartialFundingApproved and BoolEverWholeLoan not.

<IPython.core.display.Javascript object>

In [27]:
## 4 categorical variables left: EnumListingCategory, StrEmploymentStatus, StrOccupation, EnumChannelCode
# let's build a dummy variable for each, then see what the associated plots look like...although may have to take a
# different approach for occupation.
enum_listing_dummies = pd.get_dummies(reduced_data['StrEmploymentStatus'])
df = pd.concat([enum_listing_dummies, reduced_data['Cancelled']], axis =1)
fig, axes = plt.subplots(3,2,figsize=(12,10), facecolor='w')
df.head()

for ax, var in zip(axes.reshape(-1), df.columns):
    sns.regplot(ax=ax, x=var, y='Cancelled', data=df, logistic=True, ci = None)
    
reduced_data['StrEmploymentStatus'].value_counts()
    
## massive impact of taking employment status into account - listing "other" dramatically increases chances of not
## following through with loan, whereas having a "full-time" or "self-employed" status strongly increases chances.
## the change with employed looks small, but the sample size is huge, so very likely also significant.

# since this column looks pretty significant, we'll probably end up using OneHotEncoder or get_dummies to split a
# categorical variable into individual columns.

<IPython.core.display.Javascript object>

Employed         216678
Other             19544
Self-employed     15624
Full-time           614
Part-time             5
Not employed          4
Name: StrEmploymentStatus, dtype: int64

In [31]:
## fold Part-time and Not employed into Other
reduced_data['StrEmploymentStatus'].replace(['Part-time','Not employed'],'Other',inplace = True)
print(reduced_data['StrEmploymentStatus'].value_counts())

Employed         216678
Other             19553
Self-employed     15624
Full-time           614
Name: StrEmploymentStatus, dtype: int64


In [59]:
## For these other booleans with lots of categories, let's just use groupby
g = reduced_data.groupby('EnumListingCategory')
h = [[g['Cancelled'].mean(),g.size()]]
print(h)

## I'm not sure what the categories entail, and the changes aren't massive - leave out of analysis entirely for now.

[[EnumListingCategory
0     0.355263
1     0.322420
2     0.352350
3     0.421997
6     0.374668
7     0.376605
8     0.330864
9     0.343750
11    0.336336
12    0.518182
13    0.343267
14    0.402126
15    0.394137
16    0.371429
17    0.410959
18    0.269542
19    0.431329
20    0.351734
21    0.244094
Name: Cancelled, dtype: float64, EnumListingCategory
0        228
1     195363
2      12638
3       6301
6       2258
7      17756
8        405
9        160
11       333
12       110
13      2718
14      4046
15      5356
16       280
17       146
18      1484
19      1347
20      1413
21       127
dtype: int64]]


In [65]:
## For these other booleans with lots of categories, let's just use groupby
g = reduced_data.groupby('EnumListingCategory')
h = pd.DataFrame([g['Cancelled'].mean(),g.size()]).transpose()
h.columns = 'Cancel Prob','Size'
print(h)

## I'm not sure what the categories entail, and the changes aren't massive - leave out of analysis entirely for now.

                     Cancel Prob      Size
EnumListingCategory                       
0                       0.355263     228.0
1                       0.322420  195363.0
2                       0.352350   12638.0
3                       0.421997    6301.0
6                       0.374668    2258.0
7                       0.376605   17756.0
8                       0.330864     405.0
9                       0.343750     160.0
11                      0.336336     333.0
12                      0.518182     110.0
13                      0.343267    2718.0
14                      0.402126    4046.0
15                      0.394137    5356.0
16                      0.371429     280.0
17                      0.410959     146.0
18                      0.269542    1484.0
19                      0.431329    1347.0
20                      0.351734    1413.0
21                      0.244094     127.0


In [69]:
## For these other booleans with lots of categories, let's just use groupby
g = reduced_data.groupby('StrOccupation')
h = pd.DataFrame([g['Cancelled'].mean(),g.size()]).transpose()
h.columns = 'Cancel Prob','Size'
print(h)
print(reduced_data['StrOccupation'].value_counts())
## I'm not sure what the categories entail, and the changes aren't massive - leave out of analysis entirely for now.

##There are clearly some really rich and probably non-accidental correlations going on with different jobs and loan
##cancellation, especially given the non-negligible sample size in most professions (at least hundreds).

##I'm going to leave occupation out of the analysis for now and seek to reintegrate later.

                                    Cancel Prob     Size
StrOccupation                                           
Accountant/CPA                         0.289733   6292.0
Administrative Assistant               0.305882   4930.0
Analyst                                0.230218   6698.0
Architect                              0.291925    483.0
Attorney                               0.319324   2070.0
Biologist                              0.303191    188.0
Bus Driver                             0.424473   1185.0
Car Dealer                             0.391241    685.0
Chemist                                0.314465    318.0
Civil Service                          0.231803   2679.0
Clergy                                 0.456621    657.0
Clerical                               0.309499   6811.0
Computer Programmer                    0.227259   6442.0
Construction                           0.397276   6240.0
Dentist                                0.505556    360.0
Doctor                         

In [71]:
## For these other booleans with lots of categories, let's just use groupby
g = reduced_data.groupby('EnumChannelCode')
h = pd.DataFrame([g['Cancelled'].mean(),g.size()]).transpose()
h.columns = 'Cancel Prob','Size'
print(h)
 
## clearly, 80000 is the code used for a potential new loanee - no new information, so we can drop this one.

                 Cancel Prob      Size
EnumChannelCode                       
40000               0.347317  115347.0
50000               0.350827   14081.0
70000               0.369273   29095.0
80000               0.133648   16289.0
90000               0.341167   77657.0


In [72]:
reduced_data['BoolPriorProsperLoanee'].value_counts()

## proof that this is equivalent to being a prior loanee - had we known what EnumChannelCode was before, would've been
## slightly faster. oh well.

False    236180
True      16289
Name: BoolPriorProsperLoanee, dtype: int64

In [76]:
## end result of dealing with categoricals dealt with here

## ALWAYS DROP THIS ONE:
reduced_data.drop('EnumChannelCode', axis=1, inplace = True)

## TEMPORARILY DROP OCCUPATION until we can make rest of model work:
reduced_data.drop('StrOccupation', axis=1, inplace = True)

Unnamed: 0_level_0,DolLoanAmountRequested,BoolPartialFundingApproved,BorrowerRate,NumMonthsTerm,EnumListingCategory,DolMonthlyIncome,FracDebtToIncomeRatio,StrEmploymentStatus,NumMonthsEmployed,NumPriorProsperLoansLatePayments,NumPriorProsperLoans61dpd,BoolIsLender,BoolInGroup,NumTradesOpened6,NumOpenTradesDelinqOrPastDue6,DolTotalBalanceOnPublicRecords,NumRealEstateTrades,DolMonthlyDebt,NumCurrentDelinquencies,NumPublicRecordsLast10Years,NumPublicRecords12,DolAmountDelinquent,PctBankcardUtil,NumTotalInquiries,BoolEverWholeLoan,DaysSinceFirstCredit,Cancelled,BoolPriorProsperLoanee
ListingNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1
973605,15000.0,True,0.162,60,1,6000.0,0.27,Employed,445.0,,,0,False,1,0,0,2,1242,0,0,0,0,0.97,5,True,13901,1,False
981099,15000.0,True,0.1585,60,1,7916.6667,0.35,Other,32.0,,,0,False,1,0,0,2,2289,0,0,0,0,0.48,3,True,14238,1,False
1025766,4000.0,True,0.2085,36,1,2083.3333,0.53,Employed,4.0,0.0,,0,False,0,0,0,0,911,0,0,0,0,0.93,5,False,4146,0,True
1003835,10000.0,True,0.1299,36,13,3750.0,0.14,Employed,2.0,,,0,False,1,0,0,0,223,0,0,0,0,0.26,1,True,2942,1,False
1011335,20000.0,True,0.144,60,1,9000.0,0.16,Employed,90.0,0.0,,0,False,1,0,1249,1,1264,1,2,0,0,0.81,17,False,8329,0,True


In [86]:
## let's reposition Cancelled and BoolPriorProsperLoanee to the front.
cols = reduced_data.columns
reduced_data.columns = np.append(cols[-2:],cols[:-2])

In [87]:
reduced_data.head()

Unnamed: 0_level_0,Cancelled,BoolPriorProsperLoanee,DolLoanAmountRequested,BoolPartialFundingApproved,BorrowerRate,NumMonthsTerm,EnumListingCategory,DolMonthlyIncome,FracDebtToIncomeRatio,StrEmploymentStatus,NumMonthsEmployed,NumPriorProsperLoansLatePayments,NumPriorProsperLoans61dpd,BoolIsLender,BoolInGroup,NumTradesOpened6,NumOpenTradesDelinqOrPastDue6,DolTotalBalanceOnPublicRecords,NumRealEstateTrades,DolMonthlyDebt,NumCurrentDelinquencies,NumPublicRecordsLast10Years,NumPublicRecords12,DolAmountDelinquent,PctBankcardUtil,NumTotalInquiries,BoolEverWholeLoan,DaysSinceFirstCredit
ListingNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1
973605,15000.0,True,0.162,60,1,6000.0,0.27,Employed,445.0,,,0,False,1,0,0,2,1242,0,0,0,0,0.97,5,True,13901,1,False
981099,15000.0,True,0.1585,60,1,7916.6667,0.35,Other,32.0,,,0,False,1,0,0,2,2289,0,0,0,0,0.48,3,True,14238,1,False
1025766,4000.0,True,0.2085,36,1,2083.3333,0.53,Employed,4.0,0.0,,0,False,0,0,0,0,911,0,0,0,0,0.93,5,False,4146,0,True
1003835,10000.0,True,0.1299,36,13,3750.0,0.14,Employed,2.0,,,0,False,1,0,0,0,223,0,0,0,0,0.26,1,True,2942,1,False
1011335,20000.0,True,0.144,60,1,9000.0,0.16,Employed,90.0,0.0,,0,False,1,0,1249,1,1264,1,2,0,0,0.81,17,False,8329,0,True
