In [None]:
!pip install imblearn

In [None]:
!pip install --upgrade sklearn

In [1]:
from scipy import stats
from imblearn.over_sampling import SMOTE
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import GridSearchCV,ShuffleSplit, train_test_split
from sklearn.preprocessing import StandardScaler
#from sklearn.feature_selection import SelectFromModel, SelectKBest, f_classif
#from sklearn.linear_model import ElasticNet, LogisticRegression
from sklearn.metrics import confusion_matrix, roc_auc_score, f1_score, precision_score
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
# joblib is used below to persist the fitted grid searches.
# Newer scikit-learn releases no longer ship `sklearn.externals.joblib`, so prefer the
# standalone package and only fall back to the vendored copy on old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
%matplotlib inline
# Display settings: never truncate columns of wide frames, cap printed rows at 100.
pd.set_option("display.max_columns", None)
pd.set_option('display.max_rows', 100)

In [None]:
# Display the installed pandas version for this run.
pd.__version__

In [73]:
# Directory holding the prepared loan data; kept in one place so the location only has
# to be changed once when the notebook is run on another machine.
DATA_DIR = '/Users/pwu/dsi-plus/capstone/docker/postgres/data'

# NOTE(review): unpickling executes arbitrary code embedded in the file; these files are
# assumed to be produced by this project's own preprocessing — do not load untrusted pickles.
# `df` is the full data set, `df_no_outlier` the variant saved with outliers removed.
df = pd.read_pickle(DATA_DIR + '/final_long_low_09092017.pkl')
df_no_outlier = pd.read_pickle(DATA_DIR + '/final_long_low_no_outlier_09092017.pkl')

In [74]:
# (rows, columns) of the full data set.
df.shape

(135671, 60)

In [75]:
# (rows, columns) of the no-outlier data set.
df_no_outlier.shape

(93080, 60)

In [84]:
# List every column with its non-null count and dtype (verbose forces the full listing).
df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 135671 entries, 1 to 1524046
Data columns (total 60 columns):
loan_amnt                     135671 non-null float64
int_rate                      135671 non-null float64
sub_grade                     135671 non-null object
emp_length                    135671 non-null object
home_ownership                135671 non-null object
annual_inc                    135671 non-null float64
verification_status           135671 non-null object
loan_status                   135671 non-null object
purpose                       135671 non-null object
addr_state                    135671 non-null object
dti                           135671 non-null float64
delinq_2yrs                   135671 non-null float64
inq_last_6mths                135671 non-null float64
open_acc                      135671 non-null float64
pub_rec                       135671 non-null float64
revol_bal                     135671 non-null float64
revol_util                    1

In [77]:
#long_term_outlier.info(verbose = True)

In [78]:
# One-hot encode the categorical columns of the full data set. `loan_status` is encoded
# too, so that its 'Fully Paid' indicator can be pulled out as the target below.
long_term_outlier = pd.get_dummies(
    df,
    columns=[
        'sub_grade',
        'emp_length',
        'home_ownership',
        'verification_status',
        'loan_status',
        'purpose',
        'addr_state',
    ],
)
# Target: indicator column for loans whose status is 'Fully Paid'.
y_outlier = long_term_outlier['loan_status_Fully Paid']


In [79]:
# Same one-hot encoding as for the full data set, applied to the no-outlier frame.
long_term_no_outlier = pd.get_dummies(
    df_no_outlier,
    columns=[
        'sub_grade',
        'emp_length',
        'home_ownership',
        'verification_status',
        'loan_status',
        'purpose',
        'addr_state',
    ],
)
# Target: indicator column for loans whose status is 'Fully Paid'.
y_no_outlier = long_term_no_outlier['loan_status_Fully Paid']


In [80]:
# (rows, columns) after one-hot encoding, target dummies still included.
long_term_outlier.shape

(135671, 165)

In [81]:
# (rows, columns) after one-hot encoding, target dummies still included.
long_term_no_outlier.shape

(93080, 165)

In [82]:
# Remove every one-hot column derived from the target so no label information leaks into
# the feature matrix. Selecting by prefix and rebinding (instead of an in-place drop of two
# named columns) keeps the cell re-runnable: a second run simply finds nothing to drop.
label_cols = [col for col in long_term_outlier.columns if str(col).startswith('loan_status_')]
long_term_outlier = long_term_outlier.drop(label_cols, axis=1)

In [83]:
# Remove every one-hot column derived from the target so no label information leaks into
# the feature matrix. Selecting by prefix and rebinding (instead of an in-place drop of two
# named columns) keeps the cell re-runnable: a second run simply finds nothing to drop.
label_cols = [col for col in long_term_no_outlier.columns if str(col).startswith('loan_status_')]
long_term_no_outlier = long_term_no_outlier.drop(label_cols, axis=1)

In [85]:
# 70/30 train/test split of the full data set, stratified on the target so both parts
# keep the same class proportions; seeded for repeatability.
X_train_out, X_test_out, y_train_out, y_test_out = train_test_split(
    long_term_outlier,
    y_outlier,
    test_size=0.3,
    stratify=y_outlier,
    random_state=42,
)

In [86]:
# 70/30 train/test split of the no-outlier data set, stratified on the target so both
# parts keep the same class proportions; seeded for repeatability.
X_train_no_out, X_test_no_out, y_train_no_out, y_test_no_out = train_test_split(
    long_term_no_outlier,
    y_no_outlier,
    test_size=0.3,
    stratify=y_no_outlier,
    random_state=42,
)

In [87]:
# SMOTE oversampler with a fixed seed. The resampling target is left at its default
# ('auto'): that is the value the removed `ratio` keyword was given here, and it is also
# the default of its replacement `sampling_strategy`, so omitting it works on old and new
# imbalanced-learn releases alike.
sm = SMOTE(random_state=12)

In [88]:
# Oversample the training split of the full data set only; the test split is not passed
# through SMOTE. `fit_sample` was renamed to `fit_resample` and later removed from
# imbalanced-learn, so use the new name when it exists and fall back on old installs.
_resample = getattr(sm, 'fit_resample', None) or sm.fit_sample
X_train_res_out, y_train_res_out = _resample(X_train_out, y_train_out)

In [89]:
# Oversample the training split of the no-outlier data set only; the test split is not
# passed through SMOTE. `fit_sample` was renamed to `fit_resample` and later removed from
# imbalanced-learn, so use the new name when it exists and fall back on old installs.
_resample = getattr(sm, 'fit_resample', None) or sm.fit_sample
X_train_res_no_out, y_train_res_no_out = _resample(X_train_no_out, y_train_no_out)

In [None]:
#BASELINE - Outliers - NO oversampling

In [90]:
# Baseline experiment: a single-step pipeline around AdaBoost.
# The step is called 'gbc'; the grid keys below and the later named_steps['gbc']
# lookups rely on that name.
adapipe = Pipeline(steps=[('gbc', AdaBoostClassifier())])

# Candidate hyper-parameters: 2 learning rates x 2 ensemble sizes.
adaparams = dict(
    gbc__learning_rate=[1, 1.5],
    gbc__n_estimators=[250, 300],
)

# Exhaustive search evaluated on 5 seeded shuffle splits, using all available cores.
adags = GridSearchCV(
    estimator=adapipe,
    param_grid=adaparams,
    cv=ShuffleSplit(n_splits=5, random_state=42),
    n_jobs=-1,
)

In [91]:
# Run the baseline grid search on the non-resampled training split of the full data set.
adamodel = adags.fit(X_train_out, y_train_out)

In [92]:
# Persist the fitted baseline grid search so it can be reloaded without refitting.
joblib.dump(
    value=adamodel,
    filename='/Users/pwu/dsi-plus/capstone/docker/postgres/data/adamodel1.pkl',
)


['/Users/pwu/dsi-plus/capstone/docker/postgres/data/adamodel1.pkl']

In [34]:
# Hyper-parameter combination selected by the baseline grid search.
adamodel.best_params_

{'gbc__learning_rate': 1, 'gbc__n_estimators': 300}

In [35]:
# Pipeline refit with the selected hyper-parameters.
adamodel.best_estimator_

Pipeline(memory=None,
     steps=[('gbc', AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1,
          n_estimators=300, random_state=None))])

In [42]:
#Baseline Training Score

In [36]:
# Predictions on the training split, used for the training-set score below.
# Goes through `adamodel` (the handle returned by the fit cell) like every other
# prediction cell, rather than reaching back to the `adags` search object.
y_train_out_pred = adamodel.predict(X_train_out)

In [37]:
 tn, fp, fn, tp = confusion_matrix(y_train_out, y_train_out_pred).ravel()
(tn, fp, fn, tp)

(54528, 6581, 24471, 9389)

In [38]:
# Rates derived from the confusion-matrix counts unpacked in the previous cell.
specificity = tn / (tn + fp)                  # share of actual negatives predicted negative
accuracy = (tn + tp) / (tn + fp + fn + tp)    # share of all predictions that are correct
precision = tp / (fp + tp)                    # share of predicted positives that are correct
recall = tp / (fn + tp)                       # share of actual positives predicted positive

# Print in a fixed order, each rounded to four decimals.
for template, value in (
    ('specificity: {}', specificity),
    ('accuracy: {}', accuracy),
    ('precision: {}', precision),
    ('recall/sensitivity: {}', recall),
):
    print(template.format(value.round(4)))

specificity: 0.8923
accuracy: 0.673
precision: 0.5879
recall/sensitivity: 0.2773


In [39]:
# Pair each training column name with the importance reported by the tuned AdaBoost step.
adaresult_df = pd.DataFrame(
    list(zip(X_train_out.columns,
             adamodel.best_estimator_.named_steps['gbc'].feature_importances_)),
    columns=['Feature_Name', 'Feature_Importance'],
)
# Show the 20 highest-importance features.
adaresult_df.sort_values(by='Feature_Importance', ascending=False).head(20)

Unnamed: 0,Feature_Name,Feature_Importance
1,int_rate,0.063333
3,dti,0.06
2,annual_inc,0.05
47,tot_hi_cred_lim,0.04
0,loan_amnt,0.036667
10,total_acc,0.03
27,mths_since_recent_bc,0.03
23,mo_sin_old_rev_tl_op,0.03
51,fico_mean,0.026667
19,bc_util,0.026667


In [None]:
#Testing Data Score

In [112]:
# Baseline predictions on the held-out test split of the full data set.
y_pred_out = adamodel.predict(X_test_out)

In [113]:
 tn, fp, fn, tp = confusion_matrix(y_test_out, y_pred_out).ravel()
(tn, fp, fn, tp)

(3918, 10594, 3002, 23188)

In [114]:
# Rates derived from the confusion-matrix counts unpacked in the previous cell.
specificity = tn / (tn + fp)                  # share of actual negatives predicted negative
accuracy = (tn + tp) / (tn + fp + fn + tp)    # share of all predictions that are correct
precision = tp / (fp + tp)                    # share of predicted positives that are correct
recall = tp / (fn + tp)                       # share of actual positives predicted positive

# Print in a fixed order, each rounded to four decimals.
for template, value in (
    ('specificity: {}', specificity),
    ('accuracy: {}', accuracy),
    ('precision: {}', precision),
    ('recall/sensitivity: {}', recall),
):
    print(template.format(value.round(4)))

specificity: 0.27
accuracy: 0.666
precision: 0.6864
recall/sensitivity: 0.8854


In [43]:
# Pair each column name with the importance reported by the tuned AdaBoost step.
# The importances are read from the fitted model; X_test_out only supplies the names.
adaresult_df = pd.DataFrame(
    list(zip(X_test_out.columns,
             adamodel.best_estimator_.named_steps['gbc'].feature_importances_)),
    columns=['Feature_Name', 'Feature_Importance'],
)
# Show the 10 highest-importance features.
adaresult_df.sort_values(by='Feature_Importance', ascending=False).head(10)

Unnamed: 0,Feature_Name,Feature_Importance
1,int_rate,0.063333
3,dti,0.06
2,annual_inc,0.05
47,tot_hi_cred_lim,0.04
0,loan_amnt,0.036667
10,total_acc,0.03
27,mths_since_recent_bc,0.03
23,mo_sin_old_rev_tl_op,0.03
51,fico_mean,0.026667
19,bc_util,0.026667


In [None]:
# No Outliers - No Oversampling

In [19]:
# AdaBoost pipeline for the no-outlier / no-oversampling experiment.
# The step is called 'gbc'; the grid keys below and the later named_steps['gbc']
# lookups rely on that name.
adapipe2 = Pipeline([
    ('gbc', AdaBoostClassifier())
])
# Hyper-parameter grid: 2 learning rates x 2 ensemble sizes.
adaparams2 = {
    'gbc__learning_rate': [1, 1.5],
    'gbc__n_estimators': [250, 300]
}
# Grid search over THIS experiment's pipeline and grid. The search used to be built from
# `adapipe`/`adaparams` of the baseline cell (copy-paste slip), which left adapipe2 and
# adaparams2 unused and made this cell depend on the baseline cell having been run.
adags2 = GridSearchCV(adapipe2,
                      adaparams2,
                      cv=ShuffleSplit(n_splits=5, random_state=42),
                      n_jobs=-1
                     )

In [93]:
# Run the grid search on the non-resampled training split of the no-outlier data set.
adamodel2 = adags2.fit(X_train_no_out, y_train_no_out)

In [94]:
# Persist the fitted no-outlier grid search so it can be reloaded without refitting.
joblib.dump(
    value=adamodel2,
    filename='/Users/pwu/dsi-plus/capstone/docker/postgres/data/adamodel2.pkl',
)

['/Users/pwu/dsi-plus/capstone/docker/postgres/data/adamodel2.pkl']

In [44]:
# Hyper-parameter combination selected by the no-outlier grid search.
adamodel2.best_params_

{'gbc__learning_rate': 1, 'gbc__n_estimators': 250}

In [45]:
# Pipeline refit with the selected hyper-parameters.
adamodel2.best_estimator_

Pipeline(memory=None,
     steps=[('gbc', AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1,
          n_estimators=250, random_state=None))])

In [None]:
#Training Data Score

In [115]:
# Predictions on the no-outlier training split, used for the training-set score below.
y_train_pred_no_out = adamodel2.predict(X_train_no_out)

In [116]:
 tn, fp, fn, tp = confusion_matrix(y_train_no_out, y_train_pred_no_out).ravel()
(tn, fp, fn, tp)

(6660, 16885, 4616, 36995)

In [117]:
# Rates derived from the confusion-matrix counts unpacked in the previous cell.
specificity = tn / (tn + fp)                  # share of actual negatives predicted negative
accuracy = (tn + tp) / (tn + fp + fn + tp)    # share of all predictions that are correct
precision = tp / (fp + tp)                    # share of predicted positives that are correct
recall = tp / (fn + tp)                       # share of actual positives predicted positive

# Print in a fixed order, each rounded to four decimals.
for template, value in (
    ('specificity: {}', specificity),
    ('accuracy: {}', accuracy),
    ('precision: {}', precision),
    ('recall/sensitivity: {}', recall),
):
    print(template.format(value.round(4)))

specificity: 0.2829
accuracy: 0.67
precision: 0.6866
recall/sensitivity: 0.8891


In [118]:
# Pair each training column name with the importance reported by the tuned AdaBoost step.
adaresult_df = pd.DataFrame(
    list(zip(X_train_no_out.columns,
             adamodel2.best_estimator_.named_steps['gbc'].feature_importances_)),
    columns=['Feature_Name', 'Feature_Importance'],
)
# Show the 20 highest-importance features.
adaresult_df.sort_values(by='Feature_Importance', ascending=False).head(20)

Unnamed: 0,Feature_Name,Feature_Importance
3,dti,0.046667
1,int_rate,0.046667
47,tot_hi_cred_lim,0.043333
2,annual_inc,0.04
8,revol_bal,0.04
0,loan_amnt,0.036667
18,bc_open_to_buy,0.033333
15,total_rev_hi_lim,0.033333
17,avg_cur_bal,0.03
48,total_bal_ex_mort,0.03


In [None]:
#Testing Data Score

In [119]:
# Predictions on the held-out test split of the no-outlier data set.
y_pred_no_out = adamodel2.predict(X_test_no_out)

In [120]:
# Unpack the 2x2 confusion matrix row by row:
# first row -> (true negatives, false positives), second row -> (false negatives, true positives).
(tn, fp), (fn, tp) = confusion_matrix(y_test_no_out, y_pred_no_out)
(tn, fp, fn, tp)

(2700, 7391, 2089, 15744)

In [121]:
# Rates derived from the confusion-matrix counts unpacked in the previous cell.
specificity = tn / (tn + fp)                  # share of actual negatives predicted negative
accuracy = (tn + tp) / (tn + fp + fn + tp)    # share of all predictions that are correct
precision = tp / (fp + tp)                    # share of predicted positives that are correct
recall = tp / (fn + tp)                       # share of actual positives predicted positive

# Print in a fixed order, each rounded to four decimals.
for template, value in (
    ('specificity: {}', specificity),
    ('accuracy: {}', accuracy),
    ('precision: {}', precision),
    ('recall/sensitivity: {}', recall),
):
    print(template.format(value.round(4)))

specificity: 0.2676
accuracy: 0.6605
precision: 0.6805
recall/sensitivity: 0.8829


In [54]:
# Pair each column name with the importance reported by the tuned AdaBoost step.
# The importances are read from the fitted model; X_test_no_out only supplies the names.
adaresult_df = pd.DataFrame(
    list(zip(X_test_no_out.columns,
             adamodel2.best_estimator_.named_steps['gbc'].feature_importances_)),
    columns=['Feature_Name', 'Feature_Importance'],
)
# Show the 20 highest-importance features.
adaresult_df.sort_values(by='Feature_Importance', ascending=False).head(20)


Unnamed: 0,Feature_Name,Feature_Importance
3,dti,0.08
1,int_rate,0.06
47,tot_hi_cred_lim,0.048
2,annual_inc,0.044
0,loan_amnt,0.036
18,bc_open_to_buy,0.032
22,mo_sin_old_il_acct,0.028
48,total_bal_ex_mort,0.028
23,mo_sin_old_rev_tl_op,0.024
49,total_bc_limit,0.024


In [None]:
#Outliers with Sampling

In [20]:
# AdaBoost pipeline for the outliers-with-oversampling experiment.
# The step is called 'gbc'; the grid keys below and the later named_steps['gbc']
# lookups rely on that name.
adapipe3 = Pipeline([
    #('scaler', StandardScaler()),   # scaling step left disabled
    ('gbc', AdaBoostClassifier())
])
# Hyper-parameter grid: 2 learning rates x 2 ensemble sizes.
adaparams3 = {
    'gbc__learning_rate': [1, 1.5],
    'gbc__n_estimators': [250, 300]
}
# Grid search over THIS experiment's pipeline and grid. The search used to be built from
# `adapipe`/`adaparams` of the baseline cell (copy-paste slip), which left adapipe3 and
# adaparams3 unused and made this cell depend on the baseline cell having been run.
adags3 = GridSearchCV(adapipe3,
                      adaparams3,
                      cv=ShuffleSplit(n_splits=5, random_state=42),
                      n_jobs=-1
                     )


In [95]:
# Run the grid search on the SMOTE-resampled training data of the full data set.
# NOTE(review): the cross-validation splits are drawn from already-resampled data, so the
# validation folds presumably contain synthetic samples and CV scores may be optimistic — confirm.
adamodel3 = adags3.fit(X_train_res_out, y_train_res_out)

In [96]:
# Persist the fitted oversampled-data grid search so it can be reloaded without refitting.
joblib.dump(
    value=adamodel3,
    filename='/Users/pwu/dsi-plus/capstone/docker/postgres/data/adamodel3.pkl',
)

['/Users/pwu/dsi-plus/capstone/docker/postgres/data/adamodel3.pkl']

In [108]:
# Score the model trained on resampled data against the non-resampled test split.
y_pred_res_out = adamodel3.predict(X_test_out)

In [109]:
# Unpack the 2x2 confusion matrix row by row:
# first row -> (true negatives, false positives), second row -> (false negatives, true positives).
(tn, fp), (fn, tp) = confusion_matrix(y_test_out, y_pred_res_out)
(tn, fp, fn, tp)

(4457, 10055, 3698, 22492)

In [110]:
# Rates derived from the confusion-matrix counts unpacked in the previous cell.
specificity = tn / (tn + fp)                  # share of actual negatives predicted negative
accuracy = (tn + tp) / (tn + fp + fn + tp)    # share of all predictions that are correct
precision = tp / (fp + tp)                    # share of predicted positives that are correct
recall = tp / (fn + tp)                       # share of actual positives predicted positive

# Print in a fixed order, each rounded to four decimals.
for template, value in (
    ('specificity: {}', specificity),
    ('accuracy: {}', accuracy),
    ('precision: {}', precision),
    ('recall/sensitivity: {}', recall),
):
    print(template.format(value.round(4)))

specificity: 0.3071
accuracy: 0.6621
precision: 0.6911
recall/sensitivity: 0.8588


In [111]:
# Pair each column name with the importance reported by the tuned AdaBoost step.
# The importances are read from the fitted model; X_test_out only supplies the names.
adaresult_df = pd.DataFrame(
    list(zip(X_test_out.columns,
             adamodel3.best_estimator_.named_steps['gbc'].feature_importances_)),
    columns=['Feature_Name', 'Feature_Importance'],
)
# Show the 20 highest-importance features.
adaresult_df.sort_values(by='Feature_Importance', ascending=False).head(20)

Unnamed: 0,Feature_Name,Feature_Importance
5,inq_last_6mths,0.083333
101,purpose_debt_consolidation,0.053333
79,emp_length_10+ years,0.04
7,pub_rec,0.04
4,delinq_2yrs,0.036667
3,dti,0.03
98,verification_status_Verified,0.03
117,addr_state_CA,0.03
1,int_rate,0.03
97,verification_status_Source Verified,0.026667


In [None]:
#No Outliers with Sampling

In [21]:
# AdaBoost pipeline for the no-outliers-with-oversampling experiment.
# The step is called 'gbc'; the grid keys below and the later named_steps['gbc']
# lookups rely on that name.
adapipe4 = Pipeline([
    #('scaler', StandardScaler()),   # scaling step left disabled
    ('gbc', AdaBoostClassifier())
])
# Hyper-parameter grid: 2 learning rates x 2 ensemble sizes.
adaparams4 = {
    'gbc__learning_rate': [1, 1.5],
    'gbc__n_estimators': [250, 300]
}
# Grid search over THIS experiment's pipeline and grid. The search used to be built from
# `adapipe`/`adaparams` of the baseline cell (copy-paste slip), which left adapipe4 and
# adaparams4 unused and made this cell depend on the baseline cell having been run.
adags4 = GridSearchCV(adapipe4,
                      adaparams4,
                      cv=ShuffleSplit(n_splits=5, random_state=42),
                      n_jobs=-1
                     )


In [97]:
# Run the grid search on the SMOTE-resampled training data of the no-outlier data set.
# NOTE(review): the cross-validation splits are drawn from already-resampled data, so the
# validation folds presumably contain synthetic samples and CV scores may be optimistic — confirm.
adamodel4 = adags4.fit(X_train_res_no_out, y_train_res_no_out)

In [98]:
# Persist the fitted no-outlier oversampled-data grid search so it can be reloaded without refitting.
joblib.dump(
    value=adamodel4,
    filename='/Users/pwu/dsi-plus/capstone/docker/postgres/data/adamodel4.pkl',
)

['/Users/pwu/dsi-plus/capstone/docker/postgres/data/adamodel4.pkl']

In [103]:
# Pipeline refit with the selected hyper-parameters.
adamodel4.best_estimator_

Pipeline(memory=None,
     steps=[('gbc', AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1,
          n_estimators=300, random_state=None))])

In [99]:
# Predictions of the no-outlier oversampled model on its own resampled training data.
# Stored under a dedicated name: assigning to `y_train_res_out` (as before) overwrote the
# SMOTE-resampled training labels that `adamodel3` is fit on, so re-running that fit cell
# afterwards would silently train on these predictions instead of the real labels.
y_train_pred_res_no_out = adamodel4.predict(X_train_res_no_out)

In [100]:
# Score the model trained on resampled data against the non-resampled no-outlier test split.
y_pred_res_no_out = adamodel4.predict(X_test_no_out)

In [101]:
# Unpack the 2x2 confusion matrix row by row:
# first row -> (true negatives, false positives), second row -> (false negatives, true positives).
(tn, fp), (fn, tp) = confusion_matrix(y_test_no_out, y_pred_res_no_out)
(tn, fp, fn, tp)

(2973, 7118, 2442, 15391)

In [102]:
# Rates derived from the confusion-matrix counts unpacked in the previous cell.
specificity = tn / (tn + fp)                  # share of actual negatives predicted negative
accuracy = (tn + tp) / (tn + fp + fn + tp)    # share of all predictions that are correct
precision = tp / (fp + tp)                    # share of predicted positives that are correct
recall = tp / (fn + tp)                       # share of actual positives predicted positive

# Print in a fixed order, each rounded to four decimals.
for template, value in (
    ('specificity: {}', specificity),
    ('accuracy: {}', accuracy),
    ('precision: {}', precision),
    ('recall/sensitivity: {}', recall),
):
    print(template.format(value.round(4)))

specificity: 0.2946
accuracy: 0.6576
precision: 0.6838
recall/sensitivity: 0.8631


In [104]:
# Pair each column name with the importance reported by the tuned AdaBoost step.
# The importances are read from the fitted model; X_test_no_out only supplies the names.
adaresult_df = pd.DataFrame(
    list(zip(X_test_no_out.columns,
             adamodel4.best_estimator_.named_steps['gbc'].feature_importances_)),
    columns=['Feature_Name', 'Feature_Importance'],
)
# Show the 20 highest-importance features.
adaresult_df.sort_values(by='Feature_Importance', ascending=False).head(20)

Unnamed: 0,Feature_Name,Feature_Importance
5,inq_last_6mths,0.06
101,purpose_debt_consolidation,0.06
79,emp_length_10+ years,0.056667
80,emp_length_2 years,0.04
97,verification_status_Source Verified,0.04
98,verification_status_Verified,0.04
117,addr_state_CA,0.03
1,int_rate,0.03
4,delinq_2yrs,0.03
81,emp_length_3 years,0.026667
