In [None]:
#import modules
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.ensemble import  ExtraTreesClassifier

#ignore Warnings
# NOTE: this suppresses every warning category for the rest of the session,
# so warnings raised by the modelling cells below are not shown.
import warnings
warnings.filterwarnings("ignore")

#set show all columns
# None lifts the column limit when a DataFrame is displayed.
pd.set_option("display.max_columns",None)

#plotting
%matplotlib inline
import matplotlib.pyplot as plt

<h2 align = "center"> Data Cleansing

In [None]:
#read in data
df_reference = pd.read_csv('data_YearsAdded_OutliersDropped.csv', sep=',')

# Work on a copy: a plain assignment would alias df_reference, so the derived
# columns below would be written into the reference frame as well. The copy
# keeps the same row index, so boolean masks built from df_reference still align.
df = df_reference.copy()

#create binary response variable: 1 when grade is A or B, 0 otherwise
df["depAB"] = df["grade"].isin(["A", "B"]).astype(int)

#make bins for employee length
# Lookup table from the raw label to its bin; any label that is not listed
# (missing values included) ends up in the "n/a" bin.
emp_length_bins = {"10+ years": "10+ years", "< 1 year": "<1 year"}
emp_length_bins.update(dict.fromkeys(
    ["1 year", "2 years", "3 years", "4 years", "5 years"], "1-5 years"))
emp_length_bins.update(dict.fromkeys(
    ["6 years", "7 years", "8 years", "9 years"], "6-9 years"))
df["emp_length_new"] = df["emp_length"].map(emp_length_bins).fillna("n/a")

# The raw column is replaced by its binned version.
df = df.drop('emp_length', axis = 1)


<h2 align = "center"> Feature Selection

In [None]:
# pre_df is another name for df (no copy is made); the last line displays
# its column labels as the cell output.
pre_df = df
pre_df.columns

In [None]:
#Feature selection

# Keep only the rows whose 'Year' value in the reference frame equals 2014.
pre_df = pre_df.loc[df_reference['Year'].eq(2014)]

# Target column, kept as a one-element list so pre_df[y] is a DataFrame.
y = ['depAB']

# Predictor columns.
X = [
    'acc_open_past_24mths_x',
    'annual_inc_x',
    'bc_util_x',
    'dti_x',
    'inq_last_6mths_x',
    'mo_sin_old_rev_tl_op_x',
    'mo_sin_rcnt_rev_tl_op_x',
    'mo_sin_rcnt_tl_x',
    'mort_acc_x',
    'mths_since_recent_bc_x',
    'num_accts_ever_120_pd_x',
    'num_actv_bc_tl_x',
    'num_il_tl_x',
    'num_bc_sats_x',
    'open_acc_x',
    'pub_rec_bankruptcies_x',
    'pub_rec_x',
    'recoveries_x',
    'tot_cur_bal_x',
    'total_bc_limit_x',
]


In [None]:
#Split the 2014 rows into train and test sets (no model is trained here);
#random_state=42 fixes the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(pre_df[X], pre_df[y], random_state=42)


<h2 align = "center"> Random Forest

In [None]:
#Run model
#Grid Search
# Hyperparameters are collected first, then handed to the forest in one go.
rf_grid_params = dict(
    random_state=42,
    n_jobs=-1,
    max_depth=8,
    n_estimators=500,
    max_features=5,
    criterion='gini',
)
rnd_clf = RandomForestClassifier(**rf_grid_params)
rnd_clf.fit(X_train, y_train)

# Accuracy on the held-out split.
y_pred = rnd_clf.predict(X_test)
print( accuracy_score(y_test, y_pred))

In [None]:
#random search
# random_state pins the bootstrap samples and feature draws so the reported
# accuracy is repeatable, matching the seeded forest in the previous cell;
# n_jobs=-1 only parallelises the 1200 trees and does not change the result.
rnd_clf = RandomForestClassifier(bootstrap=True,
                                 max_depth=20,
                                 max_features='sqrt',
                                 min_samples_leaf=2,
                                 min_samples_split=2,
                                 n_estimators=1200,
                                 random_state=42,
                                 n_jobs=-1)

rnd_clf.fit(X_train, y_train)

# Accuracy on the held-out split.
y_pred = rnd_clf.predict(X_test)
print( accuracy_score(y_test, y_pred))

<h2 align = "center"> Bagging

In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Bagging over k-nearest-neighbour estimators: max_samples=0.5 draws half of
# the training rows and max_features=5 draws five columns per estimator.
knn_base = KNeighborsClassifier(n_jobs=-1)
bagging_clf = BaggingClassifier(
    knn_base,
    max_samples=0.5,
    max_features=5,
).fit(X_train, y_train)

In [None]:
# Accuracy of the bagged model on the held-out split.
y_pred = bagging_clf.predict(X_test)
print( accuracy_score(y_test, y_pred))

<h2 align = "center"> Logistic Regression

In [None]:
#Import modules
from sklearn.linear_model import LogisticRegression

In [None]:
#split test train data
# Seeded like the other splits in this notebook; without random_state every
# run draws a different split, so the scores reported by all later cells
# change from run to run.
X_train, X_test, y_train, y_test = train_test_split(pre_df[X], pre_df[y], random_state=42)

In [None]:
#fit model
# Settings are gathered in a dict and unpacked into the estimator.
log_reg_params = dict(
    random_state=0,
    solver='lbfgs',
    multi_class='multinomial',
)
log_clf = LogisticRegression(**log_reg_params).fit(X_train, y_train)

In [None]:
#accuracy score
# Predictions come from the logistic model fitted in the previous cell.
y_pred = log_clf.predict(X_test)
print( accuracy_score(y_test, y_pred))

<h2 align = "center"> AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Base learner for boosting: a shallow decision tree.
ada_base_tree = DecisionTreeClassifier(
    criterion='gini',
    max_depth=2,
    max_leaf_nodes=5,
    min_samples_leaf=10,
    min_samples_split=2,
)

# 500 boosting rounds with a learning rate of 0.1.
ada_clf = AdaBoostClassifier(
    base_estimator=ada_base_tree,
    learning_rate=0.1,
    n_estimators=500,
    random_state=29,
)
ada_clf.fit(X_train, y_train)

# Accuracy on the held-out split.
y_pred = ada_clf.predict(X_test)
print( accuracy_score(y_test, y_pred))

<h2 align = "center"> Gradient Boost

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
# Build the booster first, then fit; fit() returns the estimator itself.
gb_model = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=0,
)
GB_clf = gb_model.fit(X_train, y_train)

# Mean accuracy on the held-out split, shown as the cell output.
GB_clf.score(X_test, y_test)

<h2 align = "center"> Naive Bayes </h2>

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
# Gaussian naive Bayes fitted on the training split only, so the test rows
# stay unseen by the model that is scored below. A fit on the complete frame
# is not needed: each fit() call starts from scratch, so only the most recent
# fit determines the predictions.
NB_clf = GaussianNB()
NB_clf.fit(X_train, y_train)

# Accuracy on the held-out split.
y_pred = NB_clf.predict(X_test)
print( accuracy_score(y_test, y_pred))

<h2 align = "center"> XGBoost

In [None]:
#import module
from xgboost import XGBClassifier

In [None]:
# fit model on training data
# All booster settings live in one dict that is unpacked into the classifier.
xgb_params = dict(
    base_score=0.5,
    booster='gbtree',
    colsample_bylevel=1,
    colsample_bytree=0.8,
    gamma=0.1,
    learning_rate=0.0005,
    max_delta_step=0,
    max_depth=8,
    min_child_weight=15,
    missing=None,
    n_estimators=10000,
    n_jobs=-1,
    nthread=None,
    objective='binary:logistic',
    random_state=0,
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    seed=None,
    silent=True,
    subsample=0.5,
)
XGB_clf = XGBClassifier(**xgb_params)
XGB_clf.fit(X_train, y_train)

# Accuracy on the held-out split.
y_pred = XGB_clf.predict(X_test)
print( accuracy_score(y_test, y_pred))

<h2 align = "center"> Voting Classifier

In [None]:
# Hard voting: every member model predicts a class label and the label with
# the most votes wins (averaging class probabilities would be voting='soft').
voting_clf = VotingClassifier(
    estimators=[('rf', rnd_clf),
                ('Ada', ada_clf),
                ('GB', GB_clf),
                ('XGB', XGB_clf)],
    n_jobs=-1,
    voting='hard')

# Fit each member and then the ensemble on the current split and print the
# test accuracy of each. The ensemble is fitted only here, inside the loop;
# fitting it once more before the loop would just repeat the same training.
for clf in (rnd_clf, ada_clf, GB_clf, XGB_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

<h2 align="center"> Grid Search

In [None]:
#Record the time it takes to complete grid search
def timer(start_time=None):
    """Start a stopwatch, or print the time elapsed since it was started.

    Parameters
    ----------
    start_time : datetime.datetime or None
        Pass nothing (or a falsy value) to start timing. Pass the value
        returned by an earlier call to report the elapsed time.

    Returns
    -------
    datetime.datetime or None
        The current time when called without a start time; None after the
        elapsed time has been printed.
    """
    # Imported here so the function works regardless of which cell (if any)
    # has already imported datetime at notebook level.
    from datetime import datetime

    if not start_time:
        return datetime.now()

    elapsed_seconds = (datetime.now() - start_time).total_seconds()
    thour, temp_sec = divmod(elapsed_seconds, 3600)
    tmin, tsec = divmod(temp_sec, 60)
    print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))

- **Random Forest**

In [None]:
#define grid search
rfc = RandomForestClassifier(random_state=42, n_jobs=-1)

# 'auto' is left out of max_features: for a forest classifier it selects the
# same setting as 'sqrt', so it only repeated a third of the grid.
param_grid = {
    'n_estimators': [200, 500],
    'max_features': ['sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['gini', 'entropy'],
    'min_samples_split' :[2,3,4,5],
    'bootstrap' :[True, False]
}

# 5-fold cross-validated search over every combination in param_grid.
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5)
# The training features are bound to X_train (capital X) by the split cells;
# a lowercase x_train is never defined in this notebook.
CV_rfc.fit(X_train, y_train)

#print parameters
CV_rfc.best_params_

- **XGBoost**

In [None]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from datetime import datetime
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier

# Candidate values sampled by the randomized search.
params = {
        'min_child_weight': [1, 5, 10, 15, 20],
        'gamma': [0.1, 0.5, 1, 1.5, 2, 5],
        'subsample': [0.4, 0.5, 0.6, 0.8, 1.0],
        'colsample_bytree': [0.2, 0.3, 0.4, 0.5,0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5, 6, 7, 8, 9, 10],
        'n_estimators': [200, 500, 1000]
        }
xgb = XGBClassifier(learning_rate=0.005, objective='binary:logistic',
                    silent=True, n_jobs=-1)

folds = 5          # number of cross-validation folds
param_comb = 50    # number of parameter combinations to sample

skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 1001)

# The splitter object itself is passed as cv. skf.split(...) would hand over a
# one-shot generator, which is used up by the first fit and leaves nothing for
# a re-run; the seeded splitter yields the same folds every time it is asked.
random_search = RandomizedSearchCV(xgb,
                                   param_distributions=params,
                                   n_iter=param_comb,
                                   scoring='roc_auc',
                                   n_jobs=-1,
                                   cv=skf,
                                   verbose=3,
                                   random_state=42 )

# Here we go
start_time = timer(None) # timing starts from this point for "start_time" variable
random_search.fit(X_train, y_train)
timer(start_time) # timing ends here for "start_time" variable
                    
            

In [None]:
# Print the full search log, then the winning estimator, score and settings.
search_log = random_search.cv_results_
print('\n All results:')
print(search_log)

print('\n Best estimator:')
print(random_search.best_estimator_)

# The search is scored with ROC-AUC; the figure printed here is 2 * score - 1.
normalized_gini = random_search.best_score_ * 2 - 1
print('\n Best normalized gini score for %d-fold search with %d parameter combinations:' % (folds, param_comb))
print(normalized_gini)

print('\n Best hyperparameters:')
print(random_search.best_params_)

# Persist the per-candidate results table.
results = pd.DataFrame(search_log)
results.to_csv('xgb-random-grid-search-results-01.csv', index=False)

- **AdaBoost**

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
# GridSearchCV lives in sklearn.model_selection, the module the rest of this
# notebook already imports it from; the old sklearn.grid_search module is gone.
from sklearn.model_selection import GridSearchCV

# Keys prefixed with "base_estimator__" are routed to the decision tree wrapped
# by AdaBoost. Tree depth is a tree parameter, so it needs the prefix too:
# AdaBoostClassifier itself has no max_depth parameter.
param_grid = {"base_estimator__criterion" : ["gini", "entropy"],
              "base_estimator__splitter" :   ["best", "random"],
              "base_estimator__max_depth" : [1, 2, 3, 4, 5, 6, 7],
              "n_estimators": [100, 500, 1000],
              "learning_rate" : [0.5, 0.25, 0.05]
             }

# "balanced" is the supported spelling for automatic class re-weighting, and
# "sqrt" is the feature-subset rule that "auto" used to select for classifiers.
DTC = DecisionTreeClassifier(random_state = 42,
                             max_features = "sqrt",
                             class_weight = "balanced",
                             max_depth = None)

ABC = AdaBoostClassifier(base_estimator = DTC)

# define the grid search (it is only constructed here, not fitted)
grid_search_ABC = GridSearchCV(ABC, param_grid=param_grid, scoring = 'roc_auc')
grid_search_ABC

In [None]:
from datetime import datetime
def tune_score_model(model, param_grid, X, y, n_jobs=-1, cv=3):
    """Grid-search `model` over `param_grid` and return the best result.

    Parameters
    ----------
    model : estimator
        Estimator handed to GridSearchCV.
    param_grid : dict
        Parameter grid handed to GridSearchCV.
    X, y :
        Features and target the search is fitted on.
    n_jobs : int, default -1
        Passed through to GridSearchCV.
    cv : int, default 3
        Passed through to GridSearchCV.

    Returns
    -------
    tuple
        (best_score_, best_params_) of the fitted search, scored by ROC-AUC.
    """
    #Runs a GridSearchCV for the model and param_grid passed into the function
    grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring='roc_auc', n_jobs=n_jobs, verbose=1, cv=cv)
    # Fit on the data the caller passed in rather than on notebook-level
    # variables, so the function tunes against whatever it is given.
    grid.fit(X, y)

    #Returns the best score and params used to get the score
    return grid.best_score_, grid.best_params_

In [None]:
# NOTE(review): best_params and best_scores are not assigned anywhere in the
# visible notebook, so on a fresh kernel this cell raises NameError.
# Presumably they were meant to hold the result of tune_score_model(...) --
# TODO confirm and add the missing call.
print(best_params)
best_scores

<h2 align="center"> Graphing Important Features

In [None]:
#important features Graph

import seaborn as sns

# Boosted one-level trees; their feature importances are plotted below.
stump = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=1,
    max_leaf_nodes=None,
    min_samples_leaf=5,
    min_samples_split=2,
)
classifier = AdaBoostClassifier(
    base_estimator=stump,
    learning_rate=0.1,
    n_estimators=500,
    random_state=29,
)
classifier.fit(X_train, y_train)

# One row per feature, largest importance first; reset_index moves the
# feature names into a column called 'index' for plotting.
feature_imp = (
    pd.DataFrame(data=classifier.feature_importances_,
                 index=X_train.columns.values,
                 columns=['values'])
    .sort_values(['values'], ascending=False)
    .reset_index(level=0)
)

sns.barplot(x='index', y='values', data=feature_imp, palette='deep')
plt.xticks(rotation=90)
plt.show()

In [None]:


import seaborn as sns

#important features Graph
# The forest is bound to both rnd_clf and classifier (same object).
rnd_clf = RandomForestClassifier(
    bootstrap=True,
    max_depth=20,
    max_features='sqrt',
    min_samples_leaf=2,
    min_samples_split=2,
    n_estimators=1200,
)
classifier = rnd_clf
classifier.fit(X_train, y_train)

# One row per feature, largest importance first; reset_index moves the
# feature names into a column called 'index' for plotting.
feature_imp = (
    pd.DataFrame(data=classifier.feature_importances_,
                 index=X_train.columns.values,
                 columns=['values'])
    .sort_values(['values'], ascending=False)
    .reset_index(level=0)
)

sns.barplot(x='index', y='values', data=feature_imp, palette='deep')
plt.xticks(rotation=90)
plt.show()


<h3 align="center"> Random Search

- **Random Forest**

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.svm import SVC
import scipy as sp

def tune(X , y, search_type, n_iter):
    """Tune a StandardScaler + SVC pipeline once per search budget.

    Parameters
    ----------
    X, y :
        Features and target the searches are fitted on.
    search_type : str
        'grid' for an exhaustive search over a square C x gamma grid, or
        'random' for a randomized search. Any other value runs no search.
    n_iter : sequence of int
        Search budgets. For 'random' each value is the number of sampled
        candidates; for 'grid' the rounded square root of each value is the
        number of points per axis.

    Returns
    -------
    tuple of (list, list)
        Best cross-validated score and best parameters for each budget.
    """
    # Explicit submodule import: scipy.stats is not guaranteed to be loaded
    # by a bare `import scipy`.
    from scipy.stats import expon

    scores = []
    params = []
    for budget in n_iter:
        pipe = Pipeline(steps=[('scaler', StandardScaler()),
                               ('svc', SVC())])
        if search_type == 'grid':
            # np.logspace requires an integer number of samples; np.round
            # returns a float, so convert explicitly.
            points_per_axis = int(round(budget ** 0.5))
            param_grid = dict(svc__C = np.logspace(-2, 5, points_per_axis),
                              svc__gamma = np.logspace(-5, 1, points_per_axis))
            gridsearch = GridSearchCV(pipe, param_grid = param_grid, cv = 3)
            gridsearch.fit(X, y)
            scores.append(gridsearch.best_score_)
            params.append(gridsearch.best_params_)
        elif search_type == 'random':
            param_distributions = {'svc__C': expon(scale=10),
                                   'svc__gamma': expon(scale=0.1)}
            randsearch = RandomizedSearchCV(pipe, param_distributions = param_distributions, n_iter= budget, cv = 3, random_state = 333)
            randsearch.fit(X, y)
            scores.append(randsearch.best_score_)
            params.append(randsearch.best_params_)

        print(search_type, "with", str(budget), "iterations completed")

    return scores, params

In [None]:
# Search budgets handed to tune(); in 'random' mode each value is the number
# of candidates RandomizedSearchCV samples.
n_iterations = [9, 25, 64, 100, 169]


# Best score and best parameters per budget for the randomized SVC search.
scores_random, params_random = tune(X_train, y_train, 'random', n_iterations)

In [None]:
from sklearn.model_selection import RandomizedSearchCV
# Tree counts: ten evenly spaced values between 200 and 2000.
n_estimators = np.linspace(start = 200, stop = 2000, num = 10).astype(int).tolist()
# Feature-subset rules tried at each split.
max_features = ['auto', 'sqrt']
# Depth limits: eleven evenly spaced values between 10 and 110, plus None.
max_depth = np.linspace(10, 110, num = 11).astype(int).tolist() + [None]
# Minimum samples needed to split a node.
min_samples_split = [2, 5, 10]
# Minimum samples needed in a leaf.
min_samples_leaf = [1, 2, 4]
# Whether each tree is trained on a bootstrap sample.
bootstrap = [True, False]
# Assemble the grid keyed by estimator parameter name.
random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    bootstrap=bootstrap,
)
print(random_grid)

In [None]:
# Base model whose hyperparameters are tuned.
rf = RandomForestClassifier()

# Randomized search: 100 sampled combinations from random_grid, 3-fold
# cross-validation, all available cores.
rf_search_settings = dict(
    estimator = rf,
    param_distributions = random_grid,
    n_iter = 100,
    cv = 3,
    verbose=2,
    random_state=42,
    n_jobs = -1,
)
rf_random = RandomizedSearchCV(**rf_search_settings)

# Fit the random search model
rf_random.fit(X_train, y_train)

In [None]:
# Parameter combination that scored best in the randomized search above.
rf_random.best_params_

<h2 align="center"> Post Loan Analysis

In [None]:
# Every row whose 'Year' value in the reference frame is not 2014.
post_df = df.loc[df_reference['Year'].ne(2014)]

# Target column, kept as a one-element list so post_df[y] is a DataFrame.
y = ['depAB']

# Predictor columns.
X = [
    'acc_open_past_24mths_x',
    'annual_inc_x',
    'bc_util_x',
    'dti_x',
    'inq_last_6mths_x',
    'mo_sin_old_rev_tl_op_x',
    'mo_sin_rcnt_rev_tl_op_x',
    'mo_sin_rcnt_tl_x',
    'mort_acc_x',
    'mths_since_recent_bc_x',
    'num_accts_ever_120_pd_x',
    'num_actv_bc_tl_x',
    'num_il_tl_x',
    'num_bc_sats_x',
    'open_acc_x',
    'pub_rec_bankruptcies_x',
    'pub_rec_x',
    'recoveries_x',
    'tot_cur_bal_x',
    'total_bc_limit_x',
]


In [None]:
#Split the non-2014 rows into train and test sets (no model is trained here);
#random_state=42 fixes the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(post_df[X], post_df[y], random_state=42)

In [None]:
# fit model on training data
# All booster settings live in one dict that is unpacked into the classifier.
xgb_params = dict(
    base_score=0.5,
    booster='gbtree',
    colsample_bylevel=1,
    colsample_bytree=0.8,
    gamma=0.1,
    learning_rate=0.0005,
    max_delta_step=0,
    max_depth=8,
    min_child_weight=15,
    missing=None,
    n_estimators=10000,
    n_jobs=-1,
    nthread=None,
    objective='binary:logistic',
    random_state=0,
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    seed=None,
    silent=True,
    subsample=0.5,
)
XGB_clf = XGBClassifier(**xgb_params)
XGB_clf.fit(X_train, y_train)

# Accuracy on the held-out split.
y_pred = XGB_clf.predict(X_test)
print( accuracy_score(y_test, y_pred))

In [None]:
#classification report for the current y_test / y_pred (not a correlation matrix)
print(classification_report(y_test, y_pred))

In [None]:
#2014 model on 2016/17 data
#Split the 2014 rows, then train the model on the 2014 training split
X_train, X_test, y_train, y_test = train_test_split(pre_df[X], pre_df[y], random_state=42)

# All booster settings live in one dict that is unpacked into the classifier.
xgb_params = dict(
    base_score=0.5,
    booster='gbtree',
    colsample_bylevel=1,
    colsample_bytree=0.8,
    gamma=0.1,
    learning_rate=0.0005,
    max_delta_step=0,
    max_depth=8,
    min_child_weight=15,
    missing=None,
    n_estimators=10000,
    n_jobs=-1,
    nthread=None,
    objective='binary:logistic',
    random_state=0,
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    seed=None,
    silent=True,
    subsample=0.5,
)
XGB_clf = XGBClassifier(**xgb_params)
XGB_clf.fit(X_train, y_train)

# Accuracy on the 2014 held-out split.
y_pred = XGB_clf.predict(X_test)
print( accuracy_score(y_test, y_pred))



In [None]:
# Re-split the non-2014 rows; only X_test / y_test are used below.
X_train, X_test, y_train, y_test = train_test_split(post_df[X], post_df[y], random_state=42)


# XGB_clf is not refitted here, so these scores come from whichever fit of
# XGB_clf ran most recently (in notebook order, the fit on the 2014 split).
y_pred = XGB_clf.predict(X_test)
print( accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))