Imports

In [None]:
from google.colab import drive

drive.mount('/content/drive')

%cd '/content/drive/My Drive/Colab Notebooks'

!pip install import-ipynb

import import_ipynb

from hmda_data_loader import *

import pandas as pd

import numpy as np

import random as rd

import tensorflow as tf

from tensorflow.python.framework import constant_op

from tensorflow import keras

from keras import layers

from scipy.stats import chi2_contingency

Constants and variables

In [None]:
# Google Drive folders used by this notebook: trained models are saved to and
# loaded from MODELS, and the CSV reports are written under OUTPUT.
# NOTE(review): 'ouput' looks like a misspelling of 'output'; it is kept
# verbatim because it has to match the folder that exists on Drive - confirm.
DIRECTORIES = dict(
    MODELS='/content/drive/MyDrive/Colab Notebooks/models/hmda/',
    OUTPUT='/content/drive/MyDrive/Colab Notebooks/ouput/hmda/',
)

Get data

In [None]:
# Attribute passed to get_data() and used to name the saved models and reports.
category = 'race'

# ---------------------------------------------------------------- training
print('\nloading training data\n')

train_X, train_Y, train_ = get_data(
    dataset_type='train',
    category=category,
    normalization='std',
)

# Fields of the third return value, as they are consumed in this notebook:
#   [0] features for the "category" model, [1] targets,
#   [3] mapping iterated as code -> name when per-group metrics are reported.
# NOTE(review): the exact layout is defined by hmda_data_loader - confirm there.
train_Xc = train_[0]
train_labels = train_[1]
cat_names = train_[3]

train_data = tf.convert_to_tensor(train_X, dtype='float32')
train_targets = tf.convert_to_tensor(train_labels, dtype='float32')
train_data_c = tf.convert_to_tensor(train_Xc, dtype='float32')

train_accept_sum = np.sum(train_labels)
train_total = len(train_labels)

print('train accept:\t\t', int(train_accept_sum))
print('train denied:\t\t', train_total - int(train_accept_sum))
print('train accept rate:\t', train_accept_sum / train_total)

for _train_tensor in (train_data, train_targets, train_data_c):
    print(_train_tensor.shape)

# -------------------------------------------------------------------- test
print('\nloading test data\n')

test_X, test_Y, test_ = get_data(
    dataset_type='test',
    category=category,
    sample=False,
    sample_method='random',
    normalization='std',
)

# Same layout as for the training split; [2] holds the per-row group codes
# that are later used to bucket predictions.
test_Xc = test_[0]
test_labels = test_[1]
test_cats = test_[2]

test_data = tf.convert_to_tensor(test_X, dtype='float32')
test_targets = tf.convert_to_tensor(test_labels, dtype='float32')
test_data_c = tf.convert_to_tensor(test_Xc, dtype='float32')

test_accept_sum = np.sum(test_labels)
test_total = len(test_labels)

print('test accept:\t\t', int(test_accept_sum))
print('test denied:\t\t', test_total - int(test_accept_sum))
print('test accept rate:\t', test_accept_sum / test_total)

for _test_tensor in (test_data, test_targets, test_data_c):
    print(_test_tensor.shape)

Create and train models

In [None]:
# Stop when val_loss has not improved by at least 1e-4 for three consecutive
# epochs, and roll the model back to the best weights seen.
callbacks = [tf.keras.callbacks.EarlyStopping(
                monitor='val_loss',
                min_delta=1e-4,
                patience=3,
                verbose=0,
                mode='auto',
                baseline=None,
                restore_best_weights=True,
                start_from_epoch=0,)]

num_epochs = 15

batch_size = 128

# Fraction of the training rows that fit() holds out for validation.
validation_split = 0.2

# fit() trains on the rows left after the validation split, so one epoch is
# exactly ceil(rows / batch_size) batches. The previous expression
# ((len * 0.8) // batch_size) + 1 produced a float and asked for one batch too
# many whenever the row count was an exact multiple of batch_size.
num_fit_samples = int(len(train_data) * (1.0 - validation_split))
steps_per_epoch = -(-num_fit_samples // batch_size)  # integer ceiling division

activation = 'sigmoid'

learning_rate = 0.01

verbose = 1


def _train_and_save(features, targets, path):
    """Fit a single-unit dense classifier on ``features`` and save it to ``path``.

    Uses the hyper-parameters defined above in this cell (activation,
    learning_rate, num_epochs, batch_size, steps_per_epoch, validation_split,
    verbose, callbacks).

    Args:
        features: training inputs passed to ``fit``.
        targets: training labels passed to ``fit``.
        path: destination handed to ``model.save``.

    Returns:
        ``(model, history)`` - the trained model and the object returned by ``fit``.
    """
    net = keras.Sequential([layers.Dense(1, activation=activation)])

    net.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss=tf.keras.losses.BinaryCrossentropy(),
        metrics=[
            tf.keras.metrics.BinaryAccuracy(),
            tf.keras.metrics.TruePositives(name='true_positives'),
            tf.keras.metrics.TrueNegatives(name='true_negatives'),
            tf.keras.metrics.FalsePositives(name='false_positives'),
            tf.keras.metrics.FalseNegatives(name='false_negatives'),
        ],
    )

    fit_history = net.fit(
        features,
        targets,
        validation_split=validation_split,
        shuffle=True,
        epochs=num_epochs,
        batch_size=batch_size,
        steps_per_epoch=steps_per_epoch,
        verbose=verbose,
        callbacks=callbacks,
    )

    # These are the values of the last epoch that ran; because
    # restore_best_weights=True, the saved weights may come from an earlier epoch.
    print('acc:\t', fit_history.history['binary_accuracy'][-1])
    print('loss:\t', fit_history.history['loss'][-1])

    print('tp history:\t', fit_history.history['true_positives'][-1])
    print('tn history:\t', fit_history.history['true_negatives'][-1])
    print('fp history:\t', fit_history.history['false_positives'][-1])
    print('fn history:\t', fit_history.history['false_negatives'][-1])

    net.save(path)

    return net, fit_history


# Model trained on train_data, saved with the "_b" suffix.
model_path = DIRECTORIES['MODELS'] + 'standard_hdma_ca_22_bce_{}_b.keras'.format(category)

model, history = _train_and_save(train_data, train_targets, model_path)

# Model trained on train_data_c, saved with the "_c" suffix.
model_path = DIRECTORIES['MODELS'] + 'standard_hdma_ca_22_bce_{}_c.keras'.format(category)

model, history = _train_and_save(train_data_c, train_targets, model_path)


Test and evaluate models

In [None]:
def metrics(y_true,y_pred,data_type,category,write=False,writer=None):
    """Compute binary-classification statistics and optionally log them as one CSV row.

    Args:
        y_true: ground-truth labels; ``y_true.shape[0]`` is used as the sample count.
        y_pred: model outputs scored against ``y_true``.
        data_type: value written to the first column of the row.
        category: value written to the second column of the row.
        write: when true, a row is written to ``writer``.
        writer: object exposing ``write(str)``; only touched when ``write`` is true.

    Returns:
        ``(tar, par)`` where ``tar = (TP + FN) / n`` and ``par = (TP + FP) / n``.
    """
    accuracy = tf.keras.metrics.BinaryAccuracy()
    cross_entropy = tf.keras.losses.BinaryCrossentropy()
    true_pos = tf.keras.metrics.TruePositives()
    true_neg = tf.keras.metrics.TrueNegatives()
    false_pos = tf.keras.metrics.FalsePositives()
    false_neg = tf.keras.metrics.FalseNegatives()

    # The loss object is stateless and is called directly when the row is built;
    # only the stateful metrics need to accumulate the batch.
    for tracker in (accuracy, true_pos, true_neg, false_pos, false_neg):
        tracker.update_state(y_true, y_pred)

    sample_count = y_true.shape[0]
    tp_count = true_pos.result().numpy()
    fp_count = false_pos.result().numpy()
    fn_count = false_neg.result().numpy()

    tar = (tp_count + fn_count) / sample_count
    par = (tp_count + fp_count) / sample_count

    if not write:
        return tar,par

    # Column order matches the header written by the reporting cell:
    # data_type, category, accuracy, bce loss, par, tar, tp, tn, fp, fn.
    row = '{},{},{},{},{},{},{},{},{},{}\n'.format(
        data_type,
        category,
        accuracy.result().numpy(),
        cross_entropy(y_true,y_pred).numpy(),
        par,
        tar,
        int(tp_count),
        int(true_neg.result().numpy()),
        int(fp_count),
        int(fn_count),
    )
    writer.write(row)

    return tar,par

def test_model(model,data,targets,targets_c,cats,data_type,category,write=False,writer=None):
    """Score ``model`` overall and per group, then test whether predictions depend on group.

    The per-group buckets are keyed by the codes in the notebook-level
    ``cat_names`` mapping plus any further code found in ``cats``. They used to
    be picked from ``category``, but callers pass a row label such as ``'all'``
    there, which selected a key set that did not match the data and led to a
    ``KeyError``.

    Args:
        model: object with a ``predict`` method.
        data: inputs handed to ``model.predict``.
        targets: labels used for the overall metrics row.
        targets_c: labels indexed row by row to fill the per-group buckets.
        cats: group code of each row, indexed in step with ``targets_c``.
        data_type: label forwarded to ``metrics`` for every row.
        category: label forwarded to ``metrics`` for the overall row.
        write: forwarded to ``metrics``.
        writer: forwarded to ``metrics``.

    Returns:
        The p-value returned by ``chi2_contingency`` (without continuity
        correction) on the group x thresholded-prediction contingency table.
    """
    y_pred = model.predict(data)

    # Overall row.
    metrics(targets,tf.convert_to_tensor(y_pred),data_type,category,write,writer)

    # Every reported group starts with an empty bucket so that the lookup in the
    # reporting loop below cannot fail, even for a group with no rows.
    cat_y_trues = {code: [] for code in cat_names}
    cat_y_preds = {code: [] for code in cat_names}

    for i in range(len(cats)):

        code = cats[i]

        # setdefault also accepts codes that are absent from cat_names; such
        # rows are bucketed but not reported, instead of raising KeyError.
        cat_y_trues.setdefault(code, []).append(targets_c[i])

        cat_y_preds.setdefault(code, []).append(y_pred[i])

    # One metrics row per named group.
    for code, name in cat_names.items():

        metrics(tf.convert_to_tensor(cat_y_trues[code]),tf.convert_to_tensor(cat_y_preds[code]),data_type,name,write,writer)

    # Threshold the scores at 0.5 to obtain hard decisions for the chi-square test.
    y_predl = [1 if y > 0.5 else 0 for y in list(y_pred)]

    y_preds = pd.Series(y_predl,name='predictions')

    catss = pd.Series(cats,name='categories')

    contingency = pd.crosstab(index=catss, columns=y_preds)

    print('\n\n',contingency)

    # Same table normalised per group (each row sums to 1), printed for inspection.
    contingency_pct = pd.crosstab(catss, y_preds, normalize='index')

    print('\n',contingency_pct)

    c, p, dof, expected = chi2_contingency(contingency,correction=False)

    return p

def evaluate_model(model_path,data_type,category,eval_data,eval_targets,targets_c,eval_cats,batch_size=128,writer=None):
    """Load a saved model, print its evaluation results and run ``test_model`` on it.

    Args:
        model_path: path handed to ``tf.keras.models.load_model``.
        data_type: label forwarded to ``test_model``.
        category: label forwarded to ``test_model``.
        eval_data: inputs used for ``model.evaluate`` and ``test_model``.
        eval_targets: labels used for ``model.evaluate`` and the overall metrics.
        targets_c: labels forwarded to ``test_model`` for per-group bucketing.
        eval_cats: per-row group codes forwarded to ``test_model``.
        batch_size: batch size used by ``model.evaluate`` (it was previously
            ignored in favour of a hard-coded 128).
        writer: optional object with ``write(str)``; metric rows are written
            only when one is supplied.

    Returns:
        The p-value returned by ``test_model``.
    """
    model = tf.keras.models.load_model(model_path)

    results = model.evaluate(eval_data,eval_targets,batch_size=batch_size)

    print('eval loss, eval acc:', results)

    # Writing used to be forced on, which failed on ``None.write`` whenever the
    # default writer=None was used.
    write = writer is not None

    return test_model(model,eval_data,eval_targets,targets_c,eval_cats,data_type,category,write,writer)




In [None]:
# Templates for the two models saved by the training cell; '{}' takes the category.
model_paths = {'base': DIRECTORIES['MODELS'] + 'standard_hdma_ca_22_bce_{}_b.keras',
            'category': DIRECTORIES['MODELS'] + 'standard_hdma_ca_22_bce_{}_c.keras'}

# fn receives one p-value row per model; fn2 receives the metric rows.
fn = DIRECTORIES['OUTPUT']  + 'standard_hmda_ca_22_{}_p_values.csv'.format(category)

fn2 = DIRECTORIES['OUTPUT'] + 'standard_hmda_ca_22_{}_data.csv'.format(category)

# The context manager closes both files even if loading or evaluating a model
# raises; previously the handles stayed open in that case.
with open(fn, 'w') as f, open(fn2, 'w') as f2:

    f.write('model,category,p-value\n')

    f2.write('data_type,category,bin_accuracy,bce_loss,par,tar,true_positives,true_negatives,false_positives,false_negatives\n')

    p_value = evaluate_model(model_paths['base'].format(category),'base_features','all',test_data,test_targets,test_[1],test_cats,writer=f2)

    # NOTE(review): this row records `category` in the second column while the
    # row for the second model records 'all' - confirm which one is intended.
    f.write('{},{},{}\n'.format('base',category,p_value))

    p_value = evaluate_model(model_paths['category'].format(category),'category_features','all',test_data_c,test_targets,test_[1],test_cats,writer=f2)

    f.write('{},{},{}\n'.format('category','all',p_value))