In [23]:
import numpy as np
import pandas as pd
import matplotlib as plt
%matplotlib inline

In [24]:
# Load the loan statistics export. header=1 takes the column names from the second
# line of the file. low_memory=False makes pandas infer each column's dtype from the
# whole file at once rather than chunk by chunk, which avoids the mixed-dtype
# warning this load otherwise emits.
df = pd.read_csv(
    'Desktop/Thinkful/LoanStats3d.csv',
    skipinitialspace=True,
    header=1,
    low_memory=False,
)

  interactivity=interactivity, compiler=compiler, result=result)


In [25]:
# Preview the first three rows to sanity-check the load.
df.head(n=3)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit
0,68009401,72868139.0,16000.0,16000.0,16000.0,60 months,14.85%,379.39,C,C5,...,0.0,2.0,78.9,0.0,0.0,2.0,298100.0,31329.0,281300.0,13400.0
1,68354783,73244544.0,9600.0,9600.0,9600.0,36 months,7.49%,298.58,A,A4,...,0.0,2.0,100.0,66.7,0.0,0.0,88635.0,55387.0,12500.0,75635.0
2,68466916,73356753.0,25000.0,25000.0,25000.0,36 months,7.49%,777.55,A,A4,...,0.0,0.0,100.0,20.0,0.0,0.0,373572.0,68056.0,38400.0,82117.0


## The Blind Approach

Now, as we've seen before, creating a model is the easy part. Let's try just using everything we've got and throwing it without much thought into a Random Forest. SKLearn requires the independent variables to be numeric, and all we want is dummy variables, so let's use `get_dummies` from Pandas to generate a dummy variable for every categorical column and see what happens with this kind of naive approach.

from sklearn import ensemble
from sklearn.model_selection import cross_val_score

rfc = ensemble.RandomForestClassifier()
# Name the target column via `columns=`: passing the axis positionally
# (`df.drop('loan_status', 1)`) was deprecated in pandas 1.3 and is a TypeError in 2.x.
X = df.drop(columns='loan_status')
Y = df['loan_status']
# One indicator column per distinct value of every categorical column.
X = pd.get_dummies(X)

# 5-fold cross-validated scores for the untuned forest on the raw dummy-encoded data.
cross_val_score(rfc, X, Y, cv = 5)

## Data Cleaning

Let's look at all our categorical variables and see how many distinct values each one has...

In [26]:
# Report the cardinality of every object-typed (string) column: the column name
# on one line, followed by its number of distinct non-null values on the next.
categorical = df.select_dtypes(include='object')
for col_name, col_values in categorical.items():
    print(col_name)
    print(col_values.nunique())

id
421097
term
2
int_rate
110
grade
7
sub_grade
35
emp_title
120812
emp_length
11
home_ownership
4
verification_status
3
issue_d
12
loan_status
7
pymnt_plan
1
url
421095
desc
34
purpose
14
title
27
zip_code
914
addr_state
49
earliest_cr_line
668
revol_util
1211
initial_list_status
2
last_pymnt_d
25
next_pymnt_d
4
last_credit_pull_d
26
application_type
2
verification_status_joint
3


Well that right there is what's called a problem. Some of these have over a hundred thousand distinct values. Let's drop the ones with over 30 unique values, converting to numeric where it makes sense. In doing this there's a lot of code that gets written to just see if the numeric conversion makes sense. It's a manual process that we'll abstract away and just include the conversion.

You could extract numeric features from the dates, but here we'll just drop them. There's a lot of data, it shouldn't be a huge problem.

In [27]:
# Convert ID and Interest Rate to numeric. Values that cannot be parsed become NaN
# (errors='coerce'); the trailing '%' is stripped from the interest rate first.
df['id'] = pd.to_numeric(df['id'], errors='coerce')
df['int_rate'] = pd.to_numeric(df['int_rate'].str.strip('%'), errors='coerce')

# Drop other columns with many unique values.
# `columns=` replaces the positional axis argument (`df.drop([...], 1, ...)`), which
# was deprecated in pandas 1.3 and raises a TypeError in pandas 2.x.
df = df.drop(columns=['url', 'emp_title', 'zip_code', 'earliest_cr_line', 'revol_util',
                      'sub_grade', 'addr_state', 'desc'])

Wonder what was causing the dtype error on the id column, which should have all been integers? Let's look at the end of the file.

In [28]:
# Inspect the last five rows of the file.
df.tail(n=5)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,emp_length,...,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit
421092,36271333.0,38982739.0,13000.0,13000.0,13000.0,60 months,15.99,316.07,D,5 years,...,0.0,3.0,100.0,50.0,1.0,0.0,51239.0,34178.0,10600.0,33239.0
421093,36490806.0,39222577.0,12000.0,12000.0,12000.0,60 months,19.99,317.86,E,1 year,...,1.0,2.0,95.0,66.7,0.0,0.0,96919.0,58418.0,9700.0,69919.0
421094,36271262.0,38982659.0,20000.0,20000.0,20000.0,36 months,11.99,664.2,B,10+ years,...,0.0,1.0,100.0,50.0,0.0,1.0,43740.0,33307.0,41700.0,0.0
421095,,,,,,,,,,,...,,,,,,,,,,
421096,,,,,,,,,,,...,,,,,,,,,,


In [29]:
# Remove the summary rows at the end that don't actually contain data.
# They are the only rows without a numeric id, so filter on that instead of
# slicing off the last two rows: `df = df[:-2]` would discard two real loans
# every time this cell is re-run, whereas this filter is idempotent.
df = df[pd.to_numeric(df['id'], errors='coerce').notna()]

In [30]:
# Confirm the trailing summary rows are gone.
df.tail(n=5)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,emp_length,...,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit
421090,36371250.0,39102635.0,10000.0,10000.0,10000.0,36 months,11.99,332.1,B,8 years,...,0.0,1.0,100.0,100.0,0.0,0.0,32950.0,25274.0,9200.0,15850.0
421091,36441262.0,39152692.0,24000.0,24000.0,24000.0,36 months,11.99,797.03,B,10+ years,...,0.0,2.0,56.5,100.0,0.0,0.0,152650.0,8621.0,9000.0,0.0
421092,36271333.0,38982739.0,13000.0,13000.0,13000.0,60 months,15.99,316.07,D,5 years,...,0.0,3.0,100.0,50.0,1.0,0.0,51239.0,34178.0,10600.0,33239.0
421093,36490806.0,39222577.0,12000.0,12000.0,12000.0,60 months,19.99,317.86,E,1 year,...,1.0,2.0,95.0,66.7,0.0,0.0,96919.0,58418.0,9700.0,69919.0
421094,36271262.0,38982659.0,20000.0,20000.0,20000.0,36 months,11.99,664.2,B,10+ years,...,0.0,1.0,100.0,50.0,0.0,1.0,43740.0,33307.0,41700.0,0.0


Now this should be better. Let's try again.

In [31]:
# Dummy-encode every remaining categorical column of the cleaned frame.
pd.get_dummies(data=df)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,int_rate,installment,annual_inc,dti,delinq_2yrs,...,last_credit_pull_d_Nov-2016,last_credit_pull_d_Oct-2015,last_credit_pull_d_Oct-2016,last_credit_pull_d_Sep-2015,last_credit_pull_d_Sep-2016,application_type_INDIVIDUAL,application_type_JOINT,verification_status_joint_Not Verified,verification_status_joint_Source Verified,verification_status_joint_Verified
0,68009401.0,72868139.0,16000.0,16000.0,16000.0,14.85,379.39,48000.0,33.18,0.0,...,0,0,0,0,0,1,0,0,0,0
1,68354783.0,73244544.0,9600.0,9600.0,9600.0,7.49,298.58,60000.0,22.44,0.0,...,0,0,0,0,0,1,0,0,0,0
2,68466916.0,73356753.0,25000.0,25000.0,25000.0,7.49,777.55,109000.0,26.02,0.0,...,0,0,0,0,0,1,0,0,0,0
3,68466961.0,73356799.0,28000.0,28000.0,28000.0,6.49,858.05,92000.0,21.60,0.0,...,0,0,0,0,0,1,0,0,0,0
4,68495092.0,73384866.0,8650.0,8650.0,8650.0,19.89,320.99,55000.0,25.49,0.0,...,0,0,0,0,0,1,0,0,0,0
5,68506798.0,73396623.0,23000.0,23000.0,23000.0,8.49,471.77,64000.0,18.28,0.0,...,0,0,0,0,0,1,0,0,0,0
6,68566886.0,73456723.0,29900.0,29900.0,29900.0,12.88,678.49,65000.0,21.77,0.0,...,0,0,0,0,0,1,0,0,0,0
7,68577849.0,73467703.0,18000.0,18000.0,18000.0,11.99,400.31,112000.0,8.68,0.0,...,0,0,0,0,0,1,0,0,0,0
8,66310712.0,71035433.0,35000.0,35000.0,35000.0,14.85,829.90,110000.0,17.06,0.0,...,0,0,0,0,0,1,0,0,0,0
9,68476807.0,73366655.0,10400.0,10400.0,10400.0,22.45,289.91,104433.0,25.37,1.0,...,0,0,0,0,0,1,0,0,0,0


It finally works! We had to sacrifice sub grade, state address and description, but that's fine. If you want to include them you could run the dummies independently and then append them back to the dataframe.

## Second Attempt

Now let's try this model again.

We're also going to drop NA columns, rather than impute, because our data is rich enough that we can probably get away with it.

This model may take a few minutes to run.

In [32]:
from sklearn import ensemble
from sklearn.model_selection import cross_val_score
rfc = ensemble.RandomForestClassifier()
# Name the target column via `columns=`: passing the axis positionally
# (`df.drop('loan_status', 1)`) was deprecated in pandas 1.3 and is a TypeError in 2.x.
X = df.drop(columns='loan_status')
Y = df['loan_status']
X = pd.get_dummies(X)
# Drop every feature column that still contains a missing value (no imputation).
X = X.dropna(axis=1)

# 10-fold cross-validated scores for the untuned forest on the cleaned features.
cross_val_score(rfc, X, Y, cv=10)



array([0.98021895, 0.9801477 , 0.98093135, 0.98162   , 0.96891475,
       0.97838993, 0.93877793, 0.98074046, 0.97983708, 0.98062034])

The score that cross-validation reports is the accuracy of the forest. Here we're about 98% accurate. **Accuracy** is the fraction of samples whose predicted class matches the true class.

That works pretty well, but there are a few potential problems. Firstly, we didn't really do much in the way of feature selection or model refinement. As such there are a lot of features in there that we don't really need. Some of them are actually quite impressively useless.

There's also some variance in the scores. The fact that one gave us only 93% accuracy while others gave higher than 98 is concerning. This variance could be corrected by increasing the number of estimators. That will make it take even longer to run, however, and it is already quite slow.

# DRILL: Third Attempt

So here's your task. Get rid of as much data as possible without dropping below an average of 90% accuracy in a 10-fold cross validation.

You'll want to do a few things in this process. First, dive into the data that we have and see which features are most important. This can be the raw features or the generated dummies. You may want to use PCA or correlation matrices

## 1. Univariate Selection

Statistical tests can be used to select those features that have the strongest relationship with the output variable.

The example below uses the chi-squared ($\chi^2$) statistical test, which requires non-negative features, to select the 4 best features.

In [33]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [34]:
#numerical = df.select_dtypes(include=['int', 'float'])

In [36]:
#array = numerical.values
#X = numerical[:,0:8]
#Y = numerical[:,8]
# Print floats with three significant decimals in the summaries below.
np.set_printoptions(precision=3)

# Feature extraction: score each column of X against Y with the chi-squared
# test and keep the four highest-scoring columns.
test = SelectKBest(chi2, k=4)
fit = test.fit(X, Y)

# Chi-squared score for every input column, in column order.
print(fit.scores_)

# First five rows of the reduced (four-column) feature matrix.
features = fit.transform(X)
print(features[:5])

[1.734e+10 1.851e+10 2.848e+06 2.848e+06 2.839e+06 3.177e+04 6.344e+04
 3.071e+07 1.820e+04 5.402e+02 4.988e+03 3.178e+02 1.964e+02 1.860e+07
 7.431e+03 1.177e+09 1.177e+09 5.875e+08 5.873e+08 9.388e+08 3.303e+07
 4.828e+05 3.048e+08 5.341e+07 2.884e+09 9.570e+01 0.000e+00 1.978e+01
 1.608e+05 2.536e+08 4.052e+07 1.813e+04 3.343e+07 2.011e+01 1.275e+05
 8.202e+04 4.609e+04 3.172e+04 4.986e+03 4.562e+02 1.671e+03 3.109e+03
 1.511e+02 1.961e+03 4.441e+03 2.596e+02 3.355e+03 3.237e+02 8.788e+00
 1.188e+02 1.182e+04 1.969e+02 7.980e+02 3.942e+02 3.112e+08 3.005e+06
 4.017e+07 1.981e+06 1.168e+03 2.397e+03 4.901e+03 2.571e+03 8.723e+01
 2.786e+03 4.624e+03 4.109e+03 1.660e+03 4.390e+01 1.924e+02 1.984e+01
 3.920e+00 6.659e+00 3.306e+00 6.552e+00 7.814e+00 5.900e+00 4.631e+00
 4.662e+01 9.302e-01 7.828e+02 1.481e+01 8.774e+02 1.291e+03 1.623e+01
 1.192e+03 5.591e+02 1.748e+02 2.254e+03 1.092e+03 3.311e+03 1.574e+01
 9.633e+01 8.246e+02 3.465e+02 1.373e+03 1.031e+03 2.521e+02 0.000e+00
 2.761

# PCA

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [45]:

# PCA centers the data but does not rescale it, so on the raw feature matrix the
# components are dominated by the largest-magnitude columns (ids, dollar amounts).
# Standardize every feature to zero mean / unit variance before projecting.
X_scaled = StandardScaler().fit_transform(X)

# Project the standardized features onto the first five principal components.
pca = PCA(n_components=5)
X2 = pca.fit_transform(X_scaled)


In [46]:
# Fit the forest on the full feature matrix so its feature importances can be read.
rfc.fit(X, Y)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [47]:

# Column labels, in the same order as the forest's importance vector.
feat_names = X.columns

# Per-feature importances from the fitted forest, plus the positions that would
# sort them in ascending order.
rfc1_fi = rfc.feature_importances_
indicies = rfc1_fi.argsort()

In [48]:
def feat_importance(feat_names, model):
    """Print one ``(name, importance)`` tuple per feature.

    Args:
        feat_names: Iterable of feature names, ordered like the model's inputs.
        model: Fitted estimator exposing a ``feature_importances_`` attribute.

    Names and importances are paired positionally; if the two differ in
    length, pairing stops at the shorter one. Nothing is returned.
    """
    importances = model.feature_importances_
    for name, importance in zip(feat_names, importances):
        print((name, importance))

In [49]:
# Show the importance the fitted forest assigns to every feature column.
feat_importance(feat_names=feat_names, model=rfc)

('id', 0.003197352282297878)
('member_id', 0.002648053210357592)
('loan_amnt', 0.010072880565264725)
('funded_amnt', 0.005999597336135382)
('funded_amnt_inv', 0.007019301928885368)
('int_rate', 0.0030650321351553492)
('installment', 0.007802707377695427)
('annual_inc', 0.0020583887887574903)
('dti', 0.002276849264743594)
('delinq_2yrs', 0.0005830093684539843)
('inq_last_6mths', 0.0006804398577938995)
('open_acc', 0.0012867556968383812)
('pub_rec', 0.000394929079652644)
('revol_bal', 0.0022636171933354365)
('total_acc', 0.0016319441678736245)
('out_prncp', 0.17545077953869495)
('out_prncp_inv', 0.112109500196573)
('total_pymnt', 0.030786002788133003)
('total_pymnt_inv', 0.04433913422243831)
('total_rec_prncp', 0.03884648181622664)
('total_rec_int', 0.013503421319581502)
('total_rec_late_fee', 0.002890545283530937)
('recoveries', 0.016975585511429032)
('collection_recovery_fee', 0.018932833372708395)
('last_pymnt_amnt', 0.06494633265794894)
('collections_12_mths_ex_med', 0.00013286594224

In [50]:
from sklearn.feature_selection import SelectFromModel

# Wrap the random forest classifier in a selector that keeps only the features
# whose importance meets the 0.01 threshold.
sfm = SelectFromModel(estimator=rfc, threshold=0.01)

# Fit the selector on the full feature matrix and labels.
sfm.fit(X, Y)

SelectFromModel(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
        max_features=None, norm_order=1, prefit=False, threshold=0.01)

In [51]:
# Print the name of every feature the selector kept, one per line.
kept_positions = sfm.get_support(indices=True)
for kept_name in feat_names[kept_positions]:
    print(kept_name)

funded_amnt
out_prncp
out_prncp_inv
total_pymnt
total_pymnt_inv
total_rec_prncp
recoveries
collection_recovery_fee
last_pymnt_amnt
last_pymnt_d_Dec-2016
last_pymnt_d_Jan-2017
last_pymnt_d_Nov-2016
last_pymnt_d_Oct-2016
last_pymnt_d_Sep-2016
next_pymnt_d_Feb-2017


### Create A Data Subset With Only The Most Important Features

In [52]:
# Transform the data to create a new dataset containing only the most important features.
# Note: this notebook has no separate train/test split, so the selector is applied to
# the full feature matrix X; any held-out data would need the same transform.
X_important = sfm.transform(X)

### Train A New Random Forest Classifier Using Only Most Important Features

In [53]:
# Cross-validate (10 folds) on the reduced feature matrix. No new classifier is
# created here: the existing `rfc` estimator is passed to cross_val_score.

cross_val_score(rfc, X_important, Y, cv=10)

array([0.966, 0.973, 0.977, 0.98 , 0.969, 0.975, 0.972, 0.979, 0.974,
       0.981])