In [1]:
from FeatureExtraction import FeatureExtraction
from Graph import Model
from ModelDataGenerator import ModelDataGenerator
import pandas as pd
import numpy as np
import tensorflow as tf

# Model Selection

In [2]:
# Select which trained model (and matching data set) this notebook works with.
# Set exactly ONE of the four flags to True.
MAIN_MODEL = False
MAIN_ORIGIN_MODEL = False
NPL_MODEL = True
NPL_ORIGIN_MODEL = False

# Fail fast on an ambiguous selection. The helper functions check the flags in
# the order listed above and silently use the first one that is True, so a
# second True flag would otherwise be ignored without any warning.
if sum(bool(_flag) for _flag in (MAIN_MODEL, MAIN_ORIGIN_MODEL, NPL_MODEL, NPL_ORIGIN_MODEL)) > 1:
    raise ValueError('More than one model type is selected; set exactly one flag to True.')

## Helper Functions

In [3]:
def get_data_dir():
    """Return the data path prefix that belongs to the selected model type.

    The flags are examined in the order MAIN_MODEL, MAIN_ORIGIN_MODEL,
    NPL_MODEL, NPL_ORIGIN_MODEL; the first truthy flag decides the result.

    Returns:
        str: path prefix without a file extension, e.g. "Data/npl".

    Raises:
        ValueError: if none of the model flags is set.
    """
    flag_and_path = (
        (MAIN_MODEL, "Data/main"),
        (MAIN_ORIGIN_MODEL, "Data/main_origin"),
        (NPL_MODEL, "Data/npl"),
        (NPL_ORIGIN_MODEL, "Data/npl_origin"),
    )
    for is_selected, data_path in flag_and_path:
        if is_selected:
            return data_path
    raise ValueError('No model type is selected.')

def get_model_dir():
    """Return the directory holding the trained model for the selected model type.

    The flags are examined in the order MAIN_MODEL, MAIN_ORIGIN_MODEL,
    NPL_MODEL, NPL_ORIGIN_MODEL; the first truthy flag decides the result.

    Returns:
        str: directory path with a trailing slash, e.g. "TrainedModels/npl/".

    Raises:
        ValueError: if none of the model flags is set.
    """
    flag_and_dir = (
        (MAIN_MODEL, "TrainedModels/main/"),
        (MAIN_ORIGIN_MODEL, "TrainedModels/main_origin/"),
        (NPL_MODEL, "TrainedModels/npl/"),
        (NPL_ORIGIN_MODEL, "TrainedModels/npl_origin/"),
    )
    for is_selected, model_dir in flag_and_dir:
        if is_selected:
            return model_dir
    raise ValueError('No model type is selected.')

def get_data():
    """Load the loan table for the selected model type from its HDF5 store.

    The store is opened at ``get_data_dir() + ".h5"``. Two layouts are
    supported:

    * chunked: the store has a 'chunk_size' entry whose value at [0][0] is the
      number of chunks; the frames stored under 'data_0' ... 'data_<n-1>' are
      read and concatenated row-wise in that order.
    * single: the whole table is stored under the key 'data'.

    The data path prefix is printed before loading.

    Returns:
        pandas.DataFrame: the loaded table (an empty frame when the chunked
        layout reports zero chunks).

    Raises:
        ValueError: propagated from get_data_dir() when no model type is selected.
    """
    data_dir = get_data_dir()
    print(data_dir)
    store = pd.HDFStore(data_dir + ".h5")

    # Always release the file handle, whichever layout is read and even if a
    # read fails (the original referenced an undefined name when closing and
    # never closed the store at all in the chunked branch).
    try:
        if 'chunk_size' in store:
            chunk_size = store['chunk_size'][0][0]
            # Collect the chunks first and concatenate once; growing a frame
            # with concat inside the loop copies all previous rows every time.
            chunks = [store['data_' + str(i)] for i in range(0, chunk_size)]
            df = pd.concat(chunks, axis=0) if chunks else pd.DataFrame()
        else:
            df = store['data']
    finally:
        store.close()
    return df

# Start

In [4]:
# Load the complete loan table for the selected model type (get_data also prints the data path)
df_all = get_data()

Data/npl


In [5]:
# List the name of every column (feature) in the loaded table
print(df_all.columns.values)

['svcg_cycle' 'st' 'occr_default_per_state'
 'occr_default_per_state_12_mon' 'occr_paid_off_per_state'
 'occr_paid_off_per_state_12_mon' 'new_loans_per_state_12_mon'
 'active_loans_per_state' 'new_loans_per_state' 'zipcode'
 'occr_default_per_zipcode' 'occr_default_per_zipcode_12_mon'
 'occr_paid_off_per_zipcode' 'occr_paid_off_per_zipcode_12_mon'
 'new_loans_per_zipcode_12_mon' 'active_loans_per_zipcode'
 'new_loans_per_zipcode' 'nat_int_rt' 'unemploy_rt' 'hous_prc_indx_st'
 'label_good_bad_loan' 'id_loan' 'label_month_final' 'fico' 'dt_first_pi'
 'flag_fthb' 'dt_matr' 'cd_msa' 'mi_pct' 'cnt_units' 'occpy_sts' 'cltv'
 'dti' 'orig_upb' 'ltv' 'int_rt' 'channel' 'ppmt_pnlty' 'prod_type'
 'prop_type' 'loan_purpose' 'orig_loan_term' 'cnt_borr' 'flag_sc'
 'current_upb' 'delq_sts' 'loan_age' 'mths_remng' 'repch_flag' 'flag_mod'
 'cd_zero_bal' 'dt_zero_bal' 'current_int_rt' 'non_int_brng_upb'
 'dt_lst_pi' 'mi_recoveries' 'net_sale_proceeds' 'non_mi_recoveries'
 'expenses' 'legal_costs' 'maint

#### Isolate Default and Fully Paid loans

In [6]:
# Rows labelled 0 in 'label_good_bad_loan' are the default loans
df_default = df_all[df_all['label_good_bad_loan'] == 0]
print("Number of default instances: " + str(len(df_default)))

# Rows labelled 1 in 'label_good_bad_loan' are the fully paid loans
df_fully_paid = df_all[df_all['label_good_bad_loan'] == 1]
print("Number of fully paid instances: " + str(len(df_fully_paid)))

# How many loans of each kind are passed to the model
# CONDITION (as stated by the original author): num_of_default + num_of_fully_paid > 10
# NOTE(review): the values below add up to 8, which does not satisfy that
# condition - confirm whether the condition still applies.
num_of_default = 3
num_of_fully_paid = 5

# Default loans come first, fully paid loans after them. reset_index() gives
# the rows fresh positions 0..n-1 and keeps the old row labels in an 'index' column.
df = pd.concat(
    [df_default.head(num_of_default), df_fully_paid.head(num_of_fully_paid)],
    axis=0,
).reset_index()

Number of default instances: 25123
Number of fully paid instances: 508510


#### View and Change feature values

In [7]:
# Print a selection of feature values, one per line, for the first fully paid loan
for _feature_name in (
    'fico',
    'dti',
    'int_rt',
    'ltv',
    'occr_default_per_state_12_mon',
    'occr_default_per_zipcode_12_mon',
    'hous_prc_indx_st',
):
    print(df_fully_paid.iloc[0][_feature_name])

707.0
36.0
3.99
79.0
201
3
212.23


In [8]:
def _override_loan(frame, row, values):
    """Overwrite selected columns of a single row of `frame` in place.

    Args:
        frame: pandas DataFrame to modify.
        row: row label, used with ``frame.loc``.
        values: iterable of (column name, new value) pairs, applied in order.
    """
    for column, value in values:
        frame.loc[row, column] = value


# EX1: Example parameters for Default - change values of the loan instance at row 0.
# 'index' is set to -1 only to make the edited row easy to spot when the frame is displayed.
_override_loan(df, 0, [
    ('index', -1),
    ('fico', 649),
    ('dti', 47),
    ('int_rt', 9.75),
    ('ltv', 90.0),
    ('occr_default_per_state_12_mon', 20),
    ('occr_default_per_zipcode_12_mon', 5),
    ('hous_prc_indx_st', 93.51),
    # Set label to Default. The original wrote this label to row 1 (where EX2
    # immediately overwrote it), so EX1's own label was never set.
    ('label_good_bad_loan', 0),
])

# EX2: Example parameters for Fully Paid - change values of the loan instance at row 1.
# 'index' is set to -2 only to make the edited row easy to spot when the frame is displayed.
_override_loan(df, 1, [
    ('index', -2),
    ('fico', 850),
    ('dti', 30),
    ('int_rt', 7.75),
    ('ltv', 54.0),
    ('occr_default_per_state_12_mon', 5),
    ('occr_default_per_zipcode_12_mon', 0),
    ('hous_prc_indx_st', 99.96),
    ('label_good_bad_loan', 1),  # Set label to Fully Paid
])

What should happen? For example, lowering the 'fico' value should increase the predicted probability of default, which will be reflected in the EX1/EX2 predicted output values (below).

In [9]:
# Display the whole frame to check that the changes above have occurred.
# The 'index' values -1 / -2 make the two edited rows easy to spot.
# Lift the display limits so no column or row is truncated.
for _display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_option, None)

df.reset_index()

Unnamed: 0,level_0,index,svcg_cycle,st,occr_default_per_state,occr_default_per_state_12_mon,occr_paid_off_per_state,occr_paid_off_per_state_12_mon,new_loans_per_state_12_mon,active_loans_per_state,new_loans_per_state,zipcode,occr_default_per_zipcode,occr_default_per_zipcode_12_mon,occr_paid_off_per_zipcode,occr_paid_off_per_zipcode_12_mon,new_loans_per_zipcode_12_mon,active_loans_per_zipcode,new_loans_per_zipcode,nat_int_rt,unemploy_rt,hous_prc_indx_st,label_good_bad_loan,id_loan,label_month_final,fico,dt_first_pi,flag_fthb,dt_matr,cd_msa,mi_pct,cnt_units,occpy_sts,cltv,dti,orig_upb,ltv,int_rt,channel,ppmt_pnlty,prod_type,prop_type,loan_purpose,orig_loan_term,cnt_borr,flag_sc,current_upb,delq_sts,loan_age,mths_remng,repch_flag,flag_mod,cd_zero_bal,dt_zero_bal,current_int_rt,non_int_brng_upb,dt_lst_pi,mi_recoveries,net_sale_proceeds,non_mi_recoveries,expenses,legal_costs,maint_pres_costs,taxes_ins_costs,misc_costs,actual_loss,modcost,status_month_0,label_month_1,label_month_2,label_month_3,label_month_4,label_month_5,label_month_6,label_month_7,label_month_8,label_month_9,label_month_10,label_month_11,label_month_12,label_month_13,label_month_14,label_month_15,label_month_16,label_month_17,label_month_18,time_since_origin,pct_change,crt_minus_nat_int_rt,occr_crt_less_than_nat_int_rate,label_prepaid_ratio,occr_curr_12_mon,occr_curr,occr_30dd_12_mon,occr_30dd,occr_60dd_12_mon,occr_60dd,occr_90dd_12_mon,occr_90dd,occr_foreclosed_12_mon,occr_foreclosed,ones,small,rt_default_per_zipcode,rt_default_per_zipcode_12_mon,rt_default_per_state,rt_default_per_state_12_mon
0,0,-1,201611,NV,1318,20,136727,9446,915,5895,145223,89100.0,727,5,59757,4394,393,2828,63771,3.77,0.0,93.51,0,F116Q2296489,4,649.0,201608.0,U,204607,29820.0,0.0,1.0,O,80.0,47.0,139000.0,90.0,9.75,R,N,FRM,PU,C,360.0,1.0,0,139000.0,3,4.0,356,,,1.0,0.0,4.25,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3,3,3,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,5,0.0,0.48,0,17375.0,2.0,2,1.0,1,1.0,1,1.0,1,0.0,0,1,0.0001,0.0114,0.002474,0.009076,0.002205
1,1,-2,201612,NV,1320,5,137747,9791,774,4989,145223,89100.0,727,0,59965,4553,335,2408,63771,4.2,0.0,99.96,1,F116Q2296489,4,850.0,201608.0,U,204607,29820.0,0.0,1.0,O,80.0,30.0,139000.0,54.0,7.75,R,N,FRM,PU,C,360.0,1.0,0,139000.0,4,5.0,355,,,1.0,0.0,4.25,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3,3,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,6,0.0,0.05,0,17375.0,2.0,2,1.0,1,1.0,1,2.0,2,0.0,0,1,0.0001,0.0114,0.002906,0.009089,0.002806
2,2,102,201701,NV,1320,16,137747,9962,628,3965,145223,89100.0,727,8,59965,4625,274,1920,63771,4.15,0.0,161.66,0,F116Q2296489,4,739.0,201608.0,U,204607,29820.0,0.0,1.0,O,80.0,45.0,139000.0,80.0,4.25,R,N,FRM,PU,C,360.0,1.0,0,139000.0,5,6.0,354,,,1.0,0.0,4.25,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,7,0.0,0.1,0,17375.0,2.0,2,1.0,1,1.0,1,3.0,3,0.0,0,1,0.0001,0.0114,0.004165,0.009089,0.004034
3,3,0,201703,CA,7598,201,1898643,157818,7852,30653,1929430,93600.0,148,3,20349,1382,59,356,21854,4.2,0.0,212.23,1,F116Q2018297,6,707.0,201606.0,U,204605,23420.0,0.0,1.0,O,79.0,36.0,314000.0,79.0,3.99,R,N,FRM,SF,N,360.0,1.0,0,311397.78,3,10.0,350,,,1.0,0.0,3.99,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,11,0.0,-0.21,4,26166.666667,8.0,8,1.0,1,1.0,1,1.0,1,0.0,0,1,0.0001,0.006772,0.008404,0.003938,0.006557
4,4,1,201704,CA,7609,196,1905322,152355,5260,23152,1929430,93600.0,148,3,20349,1340,41,259,21854,4.04,0.0,214.9,1,F116Q2018297,6,707.0,201606.0,U,204605,23420.0,0.0,1.0,O,79.0,36.0,314000.0,79.0,3.99,R,N,FRM,SF,N,360.0,1.0,0,0.0,4,11.0,349,N,,1.0,201704.0,3.99,0.0,201611.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,12,0.0,-0.05,5,26166.666667,8.0,8,1.0,1,1.0,1,2.0,2,0.0,0,1,0.0001,0.006772,0.011539,0.003944,0.008465
5,5,2,201612,VA,1403,39,460572,30439,1218,13212,472669,23200.0,109,2,24324,1686,70,809,26011,4.2,0.0,178.73,1,F116Q2025762,6,751.0,201606.0,U,204605,40060.0,0.0,1.0,O,80.0,35.0,148000.0,80.0,4.25,R,N,FRM,SF,C,360.0,1.0,0,147386.12,3,7.0,353,,,1.0,0.0,4.25,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3,3,3,3,3,3,6,6,6,6,6,6,6,6,6,6,6,6,6,8,0.002627,0.05,0,10571.428571,5.0,5,1.0,1,1.0,1,1.0,1,0.0,0,1,0.0001,0.00419,0.002469,0.002968,0.002952
6,6,3,201701,VA,1405,40,462279,30748,981,10613,472669,23200.0,109,2,24324,1706,60,684,26011,4.15,0.0,178.49,1,F116Q2025762,6,751.0,201606.0,U,204605,40060.0,0.0,1.0,O,80.0,35.0,148000.0,80.0,4.25,R,N,FRM,SF,C,360.0,1.0,0,147386.12,4,8.0,352,,,1.0,0.0,4.25,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3,3,3,3,3,6,6,6,6,6,6,6,6,6,6,6,6,6,6,9,0.0,0.1,0,10571.428571,5.0,5,1.0,1,1.0,1,2.0,2,0.0,0,1,0.0001,0.00419,0.00292,0.002972,0.003769
7,7,4,201702,VA,1405,37,462279,30651,820,8906,472669,23200.0,109,2,24324,1683,48,589,26011,4.17,0.0,179.56,1,F116Q2025762,6,751.0,201606.0,U,204605,40060.0,0.0,1.0,O,80.0,35.0,148000.0,80.0,4.25,R,N,FRM,SF,C,360.0,1.0,0,147386.12,5,9.0,351,,,1.0,0.0,4.25,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3,3,3,3,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,10,0.0,0.08,0,10571.428571,5.0,5,1.0,1,1.0,1,3.0,3,0.0,0,1,0.0001,0.00419,0.00339,0.002972,0.004154


In [10]:
# Prepare the selected loans for the model

# Remove the helper 'index' column (left behind by reset_index) when it is present
if 'index' in df.columns:
    df = df.drop('index', axis=1)

# Append 1000 further rows so the column-wise normalisation method has more data to work with
df_tmp = pd.concat([df, df_all.head(1000)])
_feature_extractor = FeatureExtraction()
df_OHE = _feature_extractor.filter_main(df_tmp)

--- prepare_labels: 0.0302119255066 seconds ---
--- create_ohc: 1.91645479202 seconds ---
--- end: 0.00238800048828 seconds ---


# Model 

In [11]:
# Hand the encoded frame to the generator, naming the two label columns
modelDataGenerator = ModelDataGenerator()
modelDataGenerator.load_data(df_OHE, ['label_good_bad_loan_0', 'label_good_bad_loan_1'])

# Keep only the first len(df) entries of both parts of the generator's data,
# dropping the rows that were appended for the column-wise normalisation method.
# NOTE(review): this relies on the selected loans still being the first rows
# after feature extraction - confirm against FeatureExtraction.filter_main.
_row_count = len(df)
_part_0 = modelDataGenerator._data[0]
_part_1 = modelDataGenerator._data[1]
modelDataGenerator._data = (_part_0[:_row_count], _part_1[:_row_count])

In [12]:
# Directory of the trained model for the selected model type
MODEL_DIR = get_model_dir()
# Sizes reported by the data generator; used below to shape the placeholders
INPUT_LENGTH = modelDataGenerator._input_length
CLASS_COUNT = modelDataGenerator._class_count

#### Restore Model

In [13]:
# Clear the default graph before building, so the network below is created in an empty graph
tf.reset_default_graph()

# Build the graph for the deep net
with tf.name_scope('inputs'):
    # x accepts any number of rows with INPUT_LENGTH values each;
    # y_ accepts any number of rows with CLASS_COUNT values each (y_ is not fed in the cells shown here)
    x = tf.placeholder(tf.float32, [None, INPUT_LENGTH])
    y_ = tf.placeholder(tf.float32, [None, CLASS_COUNT])

with tf.name_scope('model'):
    # NOTE(review): the second argument (False) is presumably a training-mode switch - confirm in Graph.Model.network
    y_conv = Model().network(x, False, CLASS_COUNT)

# Target shape used later when the input data is reshaped before being fed to x
x_shape = [-1, INPUT_LENGTH]

# NOTE(review): this Saver object is replaced a few lines below by the one
# returned from import_meta_graph, so the name `saver` does not refer to it afterwards.
saver = tf.train.Saver(tf.global_variables(), max_to_keep=1, save_relative_paths=True)
sess = tf.Session()

# Load the saved meta graph from MODEL_DIR, then restore the latest checkpoint found there into sess
# NOTE(review): import_meta_graph adds the saved graph to the current default
# graph, which already contains the network built above - confirm that the
# restored values end up in the variables that y_conv reads.
saver = tf.train.import_meta_graph(MODEL_DIR + 'model.meta')
saver.restore(sess, tf.train.latest_checkpoint(MODEL_DIR))

INFO:tensorflow:Restoring parameters from TrainedModels/npl/model


#### Get model data

In [14]:
# Fetch the model inputs and their matching labels from the generator
data, labels = modelDataGenerator.getData()

#### Run model 

In [15]:
# Apply softmax to the network output so each row sums to 1 across the classes
prediction = tf.nn.softmax(y_conv)

# Feed the inputs, reshaped to x_shape, and evaluate the predictions
_feed = {x: data.reshape(x_shape)}
model_output = sess.run(prediction, feed_dict=_feed)

In [16]:
# Derive the hard 0/1 predictions from the probabilities exactly as the model
# returned them. Doing this after the 3-decimal rounding below would turn e.g.
# 0.5003 into 0.5, which np.round then maps to 0.
# (If this cell is re-run without re-running the model cell, model_output has
# already been rounded for display and the predictions come from those values.)
labels_pred = np.round(model_output)

# Round to 3 decimals for readable printing
model_output = np.around(model_output, decimals=3)

# Probability of default for the two edited example loans (rows 0 and 1)
print("EX1: P(Default | i = 0) = " + str(model_output[:, 0][0]))  # P(Default)
print("EX2: P(Default | i = 1) = " + str(model_output[:, 0][1]))  # P(Default)
print("\n")

# Print model output; column 0 is the Default column
print("Labels real      = " + str(labels[:, 0]))       # 0 = Fully Paid, 1 = Default
print("Labels predicted = " + str(labels_pred[:, 0]))  # 0 = Fully Paid, 1 = Default
print("P(Default)       = " + str(model_output[:, 0]))  # P(Default)

EX1: P(Default | i = 0) = 0.835
EX2: P(Default | i = 1) = 0.468


Labels real      = [1 0 1 1 1 1 1 0]
Labels predicted = [1. 0. 1. 1. 1. 1. 1. 0.]
P(Default)       = [0.835 0.468 0.605 0.725 0.716 0.605 1.    0.194]


#### Get stats

In [17]:
# Column 0 is the Default column: 1 = Default, 0 = Fully Paid
labels_real = labels[:, 0]
# Only slice while labels_pred is still 2-D, so this cell can be re-run
# without failing on the already-sliced 1-D array.
if np.ndim(labels_pred) > 1:
    labels_pred = labels_pred[:, 0]

from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, roc_curve, auc, roc_auc_score


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator as a float, or nan when the denominator is 0."""
    if denominator == 0:
        return float('nan')
    return float(numerator) / float(denominator)


print("Confusion Matrix: " )
# labels=[0, 1] keeps the matrix 2x2 even when one class is missing from the sample.
# Rows are the real classes, columns the predicted classes.
cm = confusion_matrix(labels_real, labels_pred, labels=[0, 1])
print(cm)
print("")
# Row 1 holds the real Default loans: share of them predicted as Default
tpr = _safe_ratio(cm[1][1], cm[1][0] + cm[1][1])
# Row 0 holds the real Fully Paid loans: share of them predicted as Fully Paid
tnr = _safe_ratio(cm[0][0], cm[0][0] + cm[0][1])
# ROC AUC ranks the samples by score, so it is given P(Default) rather than
# the thresholded 0/1 predictions. It is undefined (and roc_auc_score raises)
# when only one class is present in the real labels.
if len(np.unique(labels_real)) > 1:
    roc_auc = roc_auc_score(labels_real, model_output[:, 0])
else:
    roc_auc = float('nan')
print("TPR (Recall): %.4f" % tpr)
print("TNR (Specificity): %.4f" % tnr)
print("ROC AUC: %.4f" % roc_auc)
print("")
acc = accuracy_score(labels_real, labels_pred)
print("Accuracy: %.4f" % acc)

Confusion Matrix: 
[[2 0]
 [0 6]]

TPR (Recall): 1.0000
TNR (Specificity): 1.0000
ROC AUC: 1.0000

Accuracy: 1.0000


In [18]:
# Confusion Matrix: 
#     Top Left:     Number of correctly classified Fully Paid loans
#     Bottom Right: Number of correctly classified Default loans
#     Top Right:    Number of incorrectly classified Fully Paid loans 
#     Bottom Left:  Number of incorrectly classified Default loans  
    