In [72]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score
from imblearn.over_sampling import SMOTE
from keras.models import Sequential
from keras.layers import Dense

In [40]:
from IPython.display import display

pd.options.display.max_columns = None

In [45]:
df = pd.read_excel("Input_dataset.xlsx", sheetname='Input_dataset')
df.head()

Unnamed: 0,"financial statement type (es. Manufacturing, financial, real estate)",geographic area,counterparty ID,Default flag,Exposure,Industry,Industry 2,Type of counterparty,Date of observation,Time with bank in months (-999 = new customer)
0,3.0,SUD ISOLE,4110,0,37734.0,Edilizia,FOCUS,DITTA,2013,19
1,4.0,SUD ISOLE,5278,0,0.0,Professionisti,PROFESSIONAL,DITTA,2013,-999
2,4.0,SUD ISOLE,5297,0,46934.0,Artigianato,FOCUS,DITTA,2013,22
3,1.0,SUD ISOLE,5312,0,33333.0,Artigianato,FOCUS,DITTA,2013,15
4,4.0,SUD ISOLE,5887,0,0.0,Professionisti,PROFESSIONAL,DITTA,2013,-999


# EDA and Data Pre-processing

In [25]:
a = len(df)
a

686914

In [7]:
df['financial statement type (es. Manufacturing, financial, real estate)'].unique()

array([ 3.,  4.,  1.,  5.,  2., nan])

In [8]:
df['geographic area'].unique()

array(['SUD ISOLE', 'CENTRO', nan, 'NORD EST', 'NORD OVEST'], dtype=object)

In [9]:
df['counterparty ID'].nunique()

366042

In [10]:
df['Industry'].unique()

array(['Edilizia', 'Professionisti', 'Artigianato', 'Turismo',
       'Agricoltura', 'Commercio', 'Piccole imprese', 'Altro',
       'Consumatori'], dtype=object)

In [11]:
df['Industry 2'].unique()

array(['FOCUS', 'PROFESSIONAL'], dtype=object)

In [12]:
df['Type of counterparty'].unique()

array(['DITTA', 'SRL', 'COOP', 'SNC', 'SAS', 'SPA', 'SS', 'ALTRO', 'SDF'],
      dtype=object)

In [13]:
df['Date of observation'].unique()

array([2013, 2014, 2015])

In [34]:
default = df[df['Default flag']==1]
len(default)
print (len(default)/len(df)*100)

4.400114133647007


In [36]:
df = df.dropna(how='any')
b = len(df)
print('No of rows dropped = ', a-b)

No of rows dropped =  21175


In [37]:
df['Exposure'].describe()

count    6.657390e+05
mean     1.268008e+05
std      4.326883e+05
min      0.000000e+00
25%      2.365900e+04
50%      6.000000e+04
75%      1.493790e+05
max      1.283407e+08
Name: Exposure, dtype: float64

In [38]:
df['Time with bank in months (-999 = new customer)'].describe()

count    665739.000000
mean        -48.788419
std         245.669484
min        -999.000000
25%           5.000000
50%          11.000000
75%          20.000000
max         209.000000
Name: Time with bank in months (-999 = new customer), dtype: float64

In [6]:
default2 = df[df['Default flag']==1]
print(len(default2))
print (len(default2)/len(df)*100)

29397
4.415694438811606


# Base Models (without Class Imbalance Correction)

In [46]:
df_target = df['Default flag']
df = df.drop(['Default flag','counterparty ID'], axis = 1)

min_max_scaler = preprocessing.StandardScaler()
df['Time with bank in months (-999 = new customer)'] = df['Time with bank in months (-999 = new customer)'].replace(-999, 0)

numericals = ['Exposure', 'Time with bank in months (-999 = new customer)']
categoricals = ['financial statement type (es. Manufacturing, financial, real estate)', 
                'geographic area', 'Industry', 'Industry 2', 'Type of counterparty', 'Date of observation']

for n in numericals:
    mean = df[n].mean()
    std = df[n].std()
    df[n] = df[n].apply(lambda x: (x-mean)/std)
    #x_scaled = min_max_scaler.fit_transform(df[[n]])
    #x_scaled = pd.DataFrame(x_scaled)
    #df = pd.concat([df, x_scaled], axis=1)
    
for c in categoricals:
    dummies = pd.get_dummies(df[c], prefix = c) 
    df = pd.concat([df, dummies], axis=1)
    
    
df = df.drop(['financial statement type (es. Manufacturing, financial, real estate)', 'geographic area', 
              'Industry', 'Industry 2', 'Type of counterparty', 'Date of observation'], axis = 1)
df.head()

Unnamed: 0,Exposure,Time with bank in months (-999 = new customer),"financial statement type (es. Manufacturing, financial, real estate)_1.0","financial statement type (es. Manufacturing, financial, real estate)_2.0","financial statement type (es. Manufacturing, financial, real estate)_3.0","financial statement type (es. Manufacturing, financial, real estate)_4.0","financial statement type (es. Manufacturing, financial, real estate)_5.0",geographic area_CENTRO,geographic area_NORD EST,geographic area_NORD OVEST,geographic area_SUD ISOLE,Industry_Agricoltura,Industry_Altro,Industry_Artigianato,Industry_Commercio,Industry_Consumatori,Industry_Edilizia,Industry_Piccole imprese,Industry_Professionisti,Industry_Turismo,Industry 2_FOCUS,Industry 2_PROFESSIONAL,Type of counterparty_ALTRO,Type of counterparty_COOP,Type of counterparty_DITTA,Type of counterparty_SAS,Type of counterparty_SDF,Type of counterparty_SNC,Type of counterparty_SPA,Type of counterparty_SRL,Type of counterparty_SS,Date of observation_2013,Date of observation_2014,Date of observation_2015
0,-0.208726,0.39671,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0
1,-0.296869,-1.014198,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0
2,-0.187235,0.619485,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0
3,-0.219006,0.099677,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0
4,-0.296869,-1.014198,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0


In [47]:
train_features, test_features, train_labels, test_labels = train_test_split(df, df_target, test_size = 0.25, 
                                                                            random_state = 42)

In [48]:
# 1. Logistic Regression 

param_grid_lr = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000] }

lr = LogisticRegression(penalty='l2')

grid_search_lr = GridSearchCV(estimator = lr, param_grid = param_grid_lr, 
                          cv = 3, n_jobs = -1, verbose = 2)

grid_search_lr.fit(train_features, train_labels)

grid_search_lr.best_params_

Fitting 3 folds for each of 7 candidates, totalling 21 fits
[CV] C=0.001 .........................................................
[CV] .......................................... C=0.001, total=   1.5s
[CV] C=0.001 .........................................................


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    1.6s remaining:    0.0s


[CV] .......................................... C=0.001, total=   1.0s
[CV] C=0.001 .........................................................
[CV] .......................................... C=0.001, total=   1.1s
[CV] C=0.01 ..........................................................
[CV] ........................................... C=0.01, total=   1.2s
[CV] C=0.01 ..........................................................
[CV] ........................................... C=0.01, total=   1.7s
[CV] C=0.01 ..........................................................
[CV] ........................................... C=0.01, total=   1.3s
[CV] C=0.1 ...........................................................
[CV] ............................................ C=0.1, total=   1.9s
[CV] C=0.1 ...........................................................
[CV] ............................................ C=0.1, total=   2.0s
[CV] C=0.1 ...........................................................
[CV] .

[Parallel(n_jobs=-1)]: Done  21 out of  21 | elapsed:   43.0s finished


{'C': 0.001}

In [85]:
best_lr_model = grid_search_lr.best_estimator_
pred = best_lr_model.predict_proba(test_features)[:,1]
pred_binary = best_lr_model.predict(test_features)
ginni = 2*roc_auc_score(test_labels, pred) - 1

print ('Logistic Regression (Base Model)')
print('Ginni Coeff: ', ginni)
print('Precision: ', precision_score(test_labels, pred_binary, average="macro"))
print('Recall:', recall_score(test_labels, pred_binary, average="macro")) 
print('Confusion Matrix:')
print(confusion_matrix(test_labels, pred_binary))

Logistic Regression (Base Model)
Ginni Coeff:  0.20104197272156554
Precision:  0.4780118675354774
Recall: 0.5
Confusion Matrix:


  'precision', 'predicted', average, warn_for)


[[164177      0]
 [  7552      0]]


In [56]:
# 2. Random Forest

param_grid_rf = {
    'max_depth': [1, 5, 25, 50],
    'max_features': ['auto'],
    'min_samples_leaf': [3000, 5000, 10000],
    'n_estimators': [100, 250, 500, 1000]}

rf = RandomForestClassifier()

grid_search_rf = GridSearchCV(estimator = rf, param_grid = param_grid_rf, 
                          cv = 3, n_jobs = -1, verbose = 2)

grid_search_rf.fit(train_features, train_labels)

grid_search_rf.best_params_

Fitting 3 folds for each of 48 candidates, totalling 144 fits
[CV] max_depth=1, max_features=auto, min_samples_leaf=3000, n_estimators=100 
[CV]  max_depth=1, max_features=auto, min_samples_leaf=3000, n_estimators=100, total=   6.2s
[CV] max_depth=1, max_features=auto, min_samples_leaf=3000, n_estimators=100 


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    7.5s remaining:    0.0s


[CV]  max_depth=1, max_features=auto, min_samples_leaf=3000, n_estimators=100, total=   6.2s
[CV] max_depth=1, max_features=auto, min_samples_leaf=3000, n_estimators=100 
[CV]  max_depth=1, max_features=auto, min_samples_leaf=3000, n_estimators=100, total=   6.0s
[CV] max_depth=1, max_features=auto, min_samples_leaf=3000, n_estimators=250 
[CV]  max_depth=1, max_features=auto, min_samples_leaf=3000, n_estimators=250, total=  18.2s
[CV] max_depth=1, max_features=auto, min_samples_leaf=3000, n_estimators=250 
[CV]  max_depth=1, max_features=auto, min_samples_leaf=3000, n_estimators=250, total=  18.6s
[CV] max_depth=1, max_features=auto, min_samples_leaf=3000, n_estimators=250 
[CV]  max_depth=1, max_features=auto, min_samples_leaf=3000, n_estimators=250, total=  16.9s
[CV] max_depth=1, max_features=auto, min_samples_leaf=3000, n_estimators=500 
[CV]  max_depth=1, max_features=auto, min_samples_leaf=3000, n_estimators=500, total=  34.8s
[CV] max_depth=1, max_features=auto, min_samples_lea

[CV]  max_depth=5, max_features=auto, min_samples_leaf=5000, n_estimators=100, total=  16.6s
[CV] max_depth=5, max_features=auto, min_samples_leaf=5000, n_estimators=100 
[CV]  max_depth=5, max_features=auto, min_samples_leaf=5000, n_estimators=100, total=  23.4s
[CV] max_depth=5, max_features=auto, min_samples_leaf=5000, n_estimators=250 
[CV]  max_depth=5, max_features=auto, min_samples_leaf=5000, n_estimators=250, total=  54.3s
[CV] max_depth=5, max_features=auto, min_samples_leaf=5000, n_estimators=250 
[CV]  max_depth=5, max_features=auto, min_samples_leaf=5000, n_estimators=250, total= 1.1min
[CV] max_depth=5, max_features=auto, min_samples_leaf=5000, n_estimators=250 
[CV]  max_depth=5, max_features=auto, min_samples_leaf=5000, n_estimators=250, total= 1.2min
[CV] max_depth=5, max_features=auto, min_samples_leaf=5000, n_estimators=500 
[CV]  max_depth=5, max_features=auto, min_samples_leaf=5000, n_estimators=500, total= 1.3min
[CV] max_depth=5, max_features=auto, min_samples_lea

[CV]  max_depth=25, max_features=auto, min_samples_leaf=10000, n_estimators=100, total=  15.2s
[CV] max_depth=25, max_features=auto, min_samples_leaf=10000, n_estimators=100 
[CV]  max_depth=25, max_features=auto, min_samples_leaf=10000, n_estimators=100, total=  15.2s
[CV] max_depth=25, max_features=auto, min_samples_leaf=10000, n_estimators=250 
[CV]  max_depth=25, max_features=auto, min_samples_leaf=10000, n_estimators=250, total=  37.6s
[CV] max_depth=25, max_features=auto, min_samples_leaf=10000, n_estimators=250 
[CV]  max_depth=25, max_features=auto, min_samples_leaf=10000, n_estimators=250, total=  37.5s
[CV] max_depth=25, max_features=auto, min_samples_leaf=10000, n_estimators=250 
[CV]  max_depth=25, max_features=auto, min_samples_leaf=10000, n_estimators=250, total=  37.6s
[CV] max_depth=25, max_features=auto, min_samples_leaf=10000, n_estimators=500 
[CV]  max_depth=25, max_features=auto, min_samples_leaf=10000, n_estimators=500, total= 1.3min
[CV] max_depth=25, max_feature

[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed: 209.8min finished


{'max_depth': 1,
 'max_features': 'auto',
 'min_samples_leaf': 3000,
 'n_estimators': 100}

In [86]:
best_rf_model = grid_search_rf.best_estimator_
pred_rf = best_rf_model.predict_proba(test_features)[:,1]
pred_rf_binary = best_rf_model.predict(test_features)
ginni_rf = 2*roc_auc_score(test_labels, pred_rf) - 1

print ('Random Forest (Base Model)')
print('Ginni Coeff: ', ginni_rf)
print('Precision: ', precision_score(test_labels, pred_rf_binary, average="macro"))
print('Recall:', recall_score(test_labels, pred_rf_binary, average="macro")) 
print('Confusion Matrix:')
print(confusion_matrix(test_labels, pred_rf_binary))

Random Forest (Base Model)
Ginni Coeff:  0.18795731441355712
Precision:  0.4780118675354774
Recall: 0.5
Confusion Matrix:
[[164177      0]
 [  7552      0]]


  'precision', 'predicted', average, warn_for)


In [60]:
# 3. XG Boost

model_xgb = xgb.XGBClassifier(colsample_bytree=0.2, gamma=0.0, 
                             learning_rate=0.05, max_depth=6, 
                             min_child_weight=1.5, n_estimators=7200,
                             reg_alpha=0.9, reg_lambda=0.6,
                             subsample=0.2,seed=42, silent=1,
                             random_state =7)

model_xgb.fit(train_features, train_labels)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.2, gamma=0.0, learning_rate=0.05,
       max_delta_step=0, max_depth=6, min_child_weight=1.5, missing=None,
       n_estimators=7200, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=7, reg_alpha=0.9,
       reg_lambda=0.6, scale_pos_weight=1, seed=42, silent=1,
       subsample=0.2)

In [87]:
pred_xgb = model_xgb.predict_proba(test_features)[:,1]
pred_xgb_binary = model_xgb.predict(test_features)
ginni_xgb = 2*roc_auc_score(test_labels, pred_xgb) - 1
print ('XGBoost (Base Model)')
print('Ginni Coeff: ', ginni_xgb)
print('Precision: ', precision_score(test_labels, pred_xgb_binary, average="macro"))
print('Recall:', recall_score(test_labels, pred_xgb_binary, average="macro")) 
print('Confusion Matrix:')
print(confusion_matrix(test_labels, pred_xgb_binary))

  if diff:


XGBoost (Base Model)
Ginni Coeff:  0.2703629814757593
Precision:  0.7280145230511219
Recall: 0.5000631621335355
Confusion Matrix:
[[164176      1]
 [  7551      1]]


In [64]:
# 4. Deep Neural Nets

a = train_features.shape[1]

model2 = Sequential()
model2.add(Dense(a+50, input_dim=a, kernel_initializer='normal', activation='relu'))
model2.add(Dense(200, kernel_initializer='normal', activation='relu'))
model2.add(Dense(400, kernel_initializer='normal', activation='relu'))
model2.add(Dense(200, kernel_initializer='normal', activation='relu'))
model2.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))
model2.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model2.fit(train_features, train_labels, epochs= 3, batch_size=50,  verbose=1)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f3ff2d53710>

In [88]:
pred_nn = model2.predict(test_features)
pred_nn = pd.DataFrame(pred_nn,columns = ['Pred'])
pred_nn['Pred_binary'] = pred_nn['Pred'].apply(lambda x: 1 if x >= 0.5 else 0)
ginni_nn = 2*roc_auc_score(test_labels, pred_nn['Pred']) - 1
print ('Deep Neural Net (Base Model)')
print('Ginni Coeff: ', ginni_nn)
print('Precision: ', precision_score(test_labels, pred_nn['Pred_binary'], average="macro"))
print('Recall:', recall_score(test_labels, pred_nn['Pred_binary'], average="macro")) 
print('Confusion Matrix:')
print(confusion_matrix(test_labels, pred_nn['Pred_binary']))

Deep Neural Net (Base Model)
Ginni Coeff:  0.2368937344957276
Precision:  0.4780118675354774
Recall: 0.5
Confusion Matrix:
[[164177      0]
 [  7552      0]]


  'precision', 'predicted', average, warn_for)


In [91]:
# 5. Ensemble (of the above models)

pred_ensemble = (pred + pred_rf + pred_xgb + pred_nn['Pred'].values)/4
pred_ensemble = pd.DataFrame(pred_ensemble, columns = ['Pred'])
pred_ensemble['Pred_binary'] = pred_ensemble['Pred'].apply(lambda x: 1 if x>=0.5 else 0)
ginni_ensemble = 2*roc_auc_score(test_labels, pred_ensemble['Pred']) - 1
print ('Ensemble (Base Model)')
print('Ginni Coeff: ', ginni_ensemble)
print('Precision: ', precision_score(test_labels, pred_ensemble['Pred_binary'], average="macro"))
print('Recall:', recall_score(test_labels, pred_ensemble['Pred_binary'], average="macro")) 
print('Confusion Matrix:')
print(confusion_matrix(test_labels, pred_ensemble['Pred_binary']))

Ensemble (Base Model)
Ginni Coeff:  0.27137432569416875
Precision:  0.4780118675354774
Recall: 0.5
Confusion Matrix:


  'precision', 'predicted', average, warn_for)


[[164177      0]
 [  7552      0]]


# SMOTE (for Class Imbalance Correction)

In [67]:
sm = SMOTE(random_state=12, ratio = 1.0)
train_features_2, train_labels_2 = sm.fit_sample(train_features, train_labels)



In [68]:
# 1. Logistic Regression

param_grid_lr = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000] }

lr = LogisticRegression(penalty='l2')

grid_search_lr_smote = GridSearchCV(estimator = lr, param_grid = param_grid_lr, 
                          cv = 3, n_jobs = -1, verbose = 2)

grid_search_lr_smote.fit(train_features_2, train_labels_2)

grid_search_lr_smote.best_params_

best_lr_model_smote = grid_search_lr_smote.best_estimator_

pred_lr_smote = best_lr_model_smote.predict_proba(test_features)[:,1]

ginni_lr_smote = 2*roc_auc_score(test_labels, pred_lr_smote) - 1

print ('Logistic Regression (SMOTE)')
print('Ginni Coeff: ', ginni_lr_smote)

Fitting 3 folds for each of 7 candidates, totalling 21 fits
[CV] C=0.001 .........................................................
[CV] .......................................... C=0.001, total=   2.7s
[CV] C=0.001 .........................................................


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    2.8s remaining:    0.0s


[CV] .......................................... C=0.001, total=   2.5s
[CV] C=0.001 .........................................................
[CV] .......................................... C=0.001, total=   2.6s
[CV] C=0.01 ..........................................................
[CV] ........................................... C=0.01, total=   3.8s
[CV] C=0.01 ..........................................................
[CV] ........................................... C=0.01, total=   2.8s
[CV] C=0.01 ..........................................................
[CV] ........................................... C=0.01, total=   3.7s
[CV] C=0.1 ...........................................................
[CV] ............................................ C=0.1, total=   4.1s
[CV] C=0.1 ...........................................................
[CV] ............................................ C=0.1, total=   3.5s
[CV] C=0.1 ...........................................................
[CV] .

[Parallel(n_jobs=-1)]: Done  21 out of  21 | elapsed:  1.5min finished


Logistic Regression (SMOTE)
Ginni Coeff:  0.21525812222814933


In [89]:
pred_binary_lr_smote_binary = best_lr_model_smote.predict(test_features)


print ('Logistic Regression (SMOTE)')

print('Precision: ', precision_score(test_labels, pred_binary_lr_smote_binary, average="macro"))
print('Recall:', recall_score(test_labels, pred_binary_lr_smote_binary, average="macro")) 
print('Confusion Matrix:')
print(confusion_matrix(test_labels, pred_binary_lr_smote_binary))

Logistic Regression (SMOTE)
Precision:  0.5138324131936419
Recall: 0.5812441818651852
Confusion Matrix:
[[92352 71825]
 [ 3021  4531]]


In [69]:
# 2. Deep Neural Nets

a = train_features.shape[1]

model3 = Sequential()
model3.add(Dense(a+50, input_dim=a, kernel_initializer='normal', activation='relu'))
model3.add(Dense(200, kernel_initializer='normal', activation='relu'))
model3.add(Dense(400, kernel_initializer='normal', activation='relu'))
model3.add(Dense(200, kernel_initializer='normal', activation='relu'))
model3.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))
model3.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model3.fit(train_features_2, train_labels_2, epochs= 3, batch_size=50,  verbose=1)

pred_nn_smote = model3.predict(test_features)
ginni_nn_smote = 2*roc_auc_score(test_labels, pred_nn_smote) - 1
print ('Deep Neural Net (SMOTE)')
print('Ginni Coeff: ', ginni_nn_smote)

Epoch 1/3
Epoch 2/3
Epoch 3/3
Deep Neural Net (SMOTE)
Ginni Coeff:  0.19906956073813675


In [94]:
print ('Deep Neural Nets (SMOTE)')

pred_nn_smote = pd.DataFrame(pred_nn_smote, columns = ['Pred'])
pred_nn_smote['Pred_binary'] = pred_nn_smote['Pred'].apply(lambda x: 1 if x>=0.5 else 0)

print('Precision: ', precision_score(test_labels, pred_nn_smote['Pred_binary'], average="macro"))
print('Recall:', recall_score(test_labels, pred_nn_smote['Pred_binary'], average="macro")) 
print('Confusion Matrix:')
print(confusion_matrix(test_labels, pred_nn_smote['Pred_binary']))

Deep Neural Nets (SMOTE)
Precision:  0.5136257685491911
Recall: 0.5776678747199824
Confusion Matrix:
[[99917 64260]
 [ 3423  4129]]


In [97]:
# 3. XG Boost

#model_xgb_smote = xgb.XGBClassifier(colsample_bytree=0.2, gamma=0.0, 
#                             learning_rate=0.05, max_depth=6, 
#                             min_child_weight=1.5, n_estimators=7200,
#                             reg_alpha=0.9, reg_lambda=0.6,
#                             subsample=0.2,seed=42, silent=1,
#                             random_state =7)

#model_xgb_smote.fit(train_features_2, train_labels_2)

#pred_xgb_smote = model_xgb_smote.predict_proba(test_features)[:,1]
#ginni_xgb_smote = 2*roc_auc_score(test_labels, pred_xgb_smote) - 1
#print ('XGBoost (SMOTE)')
#('Ginni Coeff: ', ginni_xgb_smote)

In [71]:
# 4. Random Forest

param_grid_rf = {
    'max_depth': [1, 5, 25, 50],
    'max_features': ['auto'],
    'min_samples_leaf': [3000, 5000, 10000],
    'n_estimators': [100, 250, 500, 1000]}

rf = RandomForestClassifier()

grid_search_rf_smote = GridSearchCV(estimator = rf, param_grid = param_grid_rf, 
                          cv = 3, n_jobs = -1, verbose = 2)

grid_search_rf_smote.fit(train_features_2, train_labels_2)

grid_search_rf_smote.best_params_

best_rf_model_smote = grid_search_rf_smote.best_estimator_
pred_rf_smote = best_rf_model_smote.predict_proba(test_features)[:,1]
ginni_rf_smote = 2*roc_auc_score(test_labels, pred_rf_smote) - 1

print ('Random Forest (SMOTE)')
print('Ginni Coeff: ', ginni_rf_smote)

Fitting 3 folds for each of 48 candidates, totalling 144 fits
[CV] max_depth=1, max_features=auto, min_samples_leaf=3000, n_estimators=100 
[CV]  max_depth=1, max_features=auto, min_samples_leaf=3000, n_estimators=100, total=  16.6s
[CV] max_depth=1, max_features=auto, min_samples_leaf=3000, n_estimators=100 


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   20.2s remaining:    0.0s


[CV]  max_depth=1, max_features=auto, min_samples_leaf=3000, n_estimators=100, total=  17.1s
[CV] max_depth=1, max_features=auto, min_samples_leaf=3000, n_estimators=100 
[CV]  max_depth=1, max_features=auto, min_samples_leaf=3000, n_estimators=100, total=  18.4s
[CV] max_depth=1, max_features=auto, min_samples_leaf=3000, n_estimators=250 
[CV]  max_depth=1, max_features=auto, min_samples_leaf=3000, n_estimators=250, total=  45.5s
[CV] max_depth=1, max_features=auto, min_samples_leaf=3000, n_estimators=250 
[CV]  max_depth=1, max_features=auto, min_samples_leaf=3000, n_estimators=250, total=  46.6s
[CV] max_depth=1, max_features=auto, min_samples_leaf=3000, n_estimators=250 
[CV]  max_depth=1, max_features=auto, min_samples_leaf=3000, n_estimators=250, total=  41.5s
[CV] max_depth=1, max_features=auto, min_samples_leaf=3000, n_estimators=500 
[CV]  max_depth=1, max_features=auto, min_samples_leaf=3000, n_estimators=500, total= 1.4min
[CV] max_depth=1, max_features=auto, min_samples_lea

[CV]  max_depth=5, max_features=auto, min_samples_leaf=5000, n_estimators=100, total= 1.1min
[CV] max_depth=5, max_features=auto, min_samples_leaf=5000, n_estimators=100 
[CV]  max_depth=5, max_features=auto, min_samples_leaf=5000, n_estimators=100, total= 1.1min
[CV] max_depth=5, max_features=auto, min_samples_leaf=5000, n_estimators=250 
[CV]  max_depth=5, max_features=auto, min_samples_leaf=5000, n_estimators=250, total= 2.5min
[CV] max_depth=5, max_features=auto, min_samples_leaf=5000, n_estimators=250 
[CV]  max_depth=5, max_features=auto, min_samples_leaf=5000, n_estimators=250, total= 2.6min
[CV] max_depth=5, max_features=auto, min_samples_leaf=5000, n_estimators=250 
[CV]  max_depth=5, max_features=auto, min_samples_leaf=5000, n_estimators=250, total= 2.6min
[CV] max_depth=5, max_features=auto, min_samples_leaf=5000, n_estimators=500 
[CV]  max_depth=5, max_features=auto, min_samples_leaf=5000, n_estimators=500, total= 5.3min
[CV] max_depth=5, max_features=auto, min_samples_lea

[CV]  max_depth=25, max_features=auto, min_samples_leaf=10000, n_estimators=100, total= 1.3min
[CV] max_depth=25, max_features=auto, min_samples_leaf=10000, n_estimators=100 
[CV]  max_depth=25, max_features=auto, min_samples_leaf=10000, n_estimators=100, total= 1.3min
[CV] max_depth=25, max_features=auto, min_samples_leaf=10000, n_estimators=250 
[CV]  max_depth=25, max_features=auto, min_samples_leaf=10000, n_estimators=250, total= 3.0min
[CV] max_depth=25, max_features=auto, min_samples_leaf=10000, n_estimators=250 
[CV]  max_depth=25, max_features=auto, min_samples_leaf=10000, n_estimators=250, total= 3.0min
[CV] max_depth=25, max_features=auto, min_samples_leaf=10000, n_estimators=250 
[CV]  max_depth=25, max_features=auto, min_samples_leaf=10000, n_estimators=250, total= 3.0min
[CV] max_depth=25, max_features=auto, min_samples_leaf=10000, n_estimators=500 
[CV]  max_depth=25, max_features=auto, min_samples_leaf=10000, n_estimators=500, total= 6.9min
[CV] max_depth=25, max_feature

[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed: 781.9min finished


Random Forest (SMOTE)
Ginni Coeff:  0.24986634428783616


In [93]:
pred_rf_smote_binary = best_rf_model_smote.predict(test_features)


print ('Random Forest (SMOTE)')

print('Precision: ', precision_score(test_labels, pred_rf_smote_binary, average="macro"))
print('Recall:', recall_score(test_labels, pred_rf_smote_binary, average="macro")) 
print('Confusion Matrix:')
print(confusion_matrix(test_labels, pred_rf_smote_binary))

Random Forest (SMOTE)
Precision:  0.5156598858780064
Recall: 0.5893720630505181
Confusion Matrix:
[[99847 64330]
 [ 3243  4309]]


In [95]:
# 5. Ensemble of the above models

pred_ensemble_smote = (pred_lr_smote + pred_rf_smote + pred_nn_smote['Pred'].values)/3
pred_ensemble_smote = pd.DataFrame(pred_ensemble_smote, columns = ['Pred'])
pred_ensemble_smote['Pred_binary'] = pred_ensemble_smote['Pred'].apply(lambda x: 1 if x>=0.5 else 0)
ginni_ensemble_smote = 2*roc_auc_score(test_labels, pred_ensemble_smote['Pred']) - 1
print ('Ensemble (SMOTE)')
print('Ginni Coeff: ', ginni_ensemble_smote)
print('Precision: ', precision_score(test_labels, pred_ensemble_smote['Pred_binary'], average="macro"))
print('Recall:', recall_score(test_labels, pred_ensemble_smote['Pred_binary'], average="macro")) 
print('Confusion Matrix:')
print(confusion_matrix(test_labels, pred_ensemble_smote['Pred_binary']))

Ensemble (SMOTE)
Ginni Coeff:  0.24114803900410076
Precision:  0.5160056630517554
Recall: 0.5901396677713635
Confusion Matrix:
[[102273  61904]
 [  3343   4209]]
