Imports

In [None]:
from google.colab import drive

drive.mount('/content/drive')

import pandas as pd

import numpy as np

import random as rd

from scipy.stats import chi2_contingency

Constants and Variables

In [None]:
# CSV locations on the mounted Google Drive, keyed by dataset name.
# get_data() reads 'HMDA_TRAIN' or 'HMDA_TEST' depending on its dataset_type.
FILEPATHS = {
    'HMDA_TRAIN': '/content/drive/MyDrive/Colab Notebooks/data/hmda_ca_train.csv',
    'HMDA_TEST': '/content/drive/MyDrive/Colab Notebooks/data/hmda_ca_test.csv',
}

# Output directory on the mounted Drive (not referenced by the code in this section).
OUTDIR = '/content/drive/MyDrive/Colab Notebooks/output/'

# Column -> dtype mapping handed to pandas.read_csv() in get_data().
# 'Int64' is pandas' nullable integer dtype; columns declared as `object`
# are loaded as-is, and simplify() later recodes debt_to_income_ratio,
# conforming_loan_limit, derived_loan_product_type, derived_dwelling_category
# and total_units into numbers.
DTYPES = {
    # identifiers and protected attributes
    'index': 'Int64',
    'race': 'Int64',
    'sex': 'Int64',
    'age': object,
    # applicant and loan features
    'income': 'float64',
    'debt_to_income_ratio': object,
    'loan_amount': 'float64',
    'property_value': 'float64',
    'loan_term': 'float64',
    'loan_to_value_ratio': 'float64',
    'conforming_loan_limit': object,
    'loan_type': 'Int64',
    'loan_purpose': 'Int64',
    'derived_loan_product_type': object,
    'purchaser_type': 'Int64',
    'occupancy_type': 'Int64',
    'construction_method': 'Int64',
    'business_or_commercial_purpose': 'Int64',
    'derived_dwelling_category': object,
    'total_units': object,
    'lien_status': 'Int64',
    'open-end_line_of_credit': 'Int64',
    'manufactured_home_secured_property_type': 'Int64',
    'manufactured_home_land_property_interest': 'Int64',
    'hoepa_status': 'Int64',
    'applicant_credit_score_type': 'Int64',
    'aus-1': 'Int64',
    'submission_of_application': 'Int64',
    # census-tract features
    'tract_population': 'Int64',
    'tract_minority_population_percent': 'float64',
    'ffiec_msa_md_median_family_income': 'Int64',
    'tract_to_msa_income_percentage': 'Int64',
    'tract_owner_occupied_units': 'float64',
    'tract_one_to_four_family_homes': 'Int64',
    'tract_median_age_of_housing_units': 'Int64',
    # target column
    'action_taken': 'Int64',
}

# Feature columns shared by every model input (no protected attribute).
FEATURES_BASE = [
    'income',
    'debt_to_income_ratio',
    'loan_amount',
    'property_value',
    'loan_term',
    'loan_to_value_ratio',
    'conforming_loan_limit',
    'loan_type',
    'loan_purpose',
    'derived_loan_product_type',
    'purchaser_type',
    'occupancy_type',
    'construction_method',
    'business_or_commercial_purpose',
    'derived_dwelling_category',
    'total_units',
    'lien_status',
    'open-end_line_of_credit',
    'manufactured_home_secured_property_type',
    'manufactured_home_land_property_interest',
    'hoepa_status',
    'applicant_credit_score_type',
    'aus-1',
    'submission_of_application',
    'tract_population',
    'tract_minority_population_percent',
    'ffiec_msa_md_median_family_income',
    'tract_to_msa_income_percentage',
    'tract_owner_occupied_units',
    'tract_one_to_four_family_homes',
    'tract_median_age_of_housing_units',
]

# Base features plus exactly one category column appended at the end.
# Each name is bound to its own new list, so editing one never affects
# FEATURES_BASE or the other variants.
FEATURES_RACE = [*FEATURES_BASE, 'race']
FEATURES_SEX = [*FEATURES_BASE, 'sex']
FEATURES_AGE = [*FEATURES_BASE, 'age']
FEATURES_SIMU = [*FEATURES_BASE, 'simu']
FEATURES_RACE_SEX = [*FEATURES_BASE, 'race_sex']

Data and preprocessing functions

In [None]:
def normalize_data_div(df):
    """Scale every column of ``df`` by that column's maximum, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are rescaled; it is modified directly.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with each column divided by its maximum.
    """
    # Maxima are taken once, before any column is overwritten.
    column_max = df.max()

    for name in list(df.columns):
        df[name] = df[name] / column_max[name]

    return df

def normalize_data_std(df):
    """Return a standardized copy of ``df``: (value - column mean) / column std.

    The input frame is not modified. ``DataFrame.std`` is called with its
    defaults.
    """
    centered = df.sub(df.mean())

    return centered.div(df.std())

def merge_ycs(Y,C):
    """Prefix every category row with its matching target value.

    Parameters
    ----------
    Y : iterable
        Target values, one per sample.
    C : iterable of iterables
        Per-sample category rows, aligned with ``Y``.

    Returns
    -------
    list of list
        One new list ``[y, c_0, c_1, ...]`` per ``(y, c)`` pair. Pairing
        stops at the shorter of the two inputs (``zip`` semantics).

    Notes
    -----
    New rows are built instead of inserting into the caller's rows, so the
    rows in ``C`` are left untouched, repeated calls do not stack extra
    prefixes, and rows may be any iterable (tuple, array, ...) rather than
    only lists.
    """
    return [[y, *c] for y, c in zip(Y, C)]


def shuffle_data(Xb,Xc,Y,R,C):
    """Shuffle five parallel sequences with one shared random permutation.

    The inputs are zipped into per-sample records, shuffled with the
    module-level ``random`` generator (``rd``), and transposed back.

    Returns
    -------
    zip
        An iterator yielding the shuffled ``Xb, Xc, Y, R, C`` as tuples;
        callers normally unpack it into five names.
    """
    records = [*zip(Xb, Xc, Y, R, C)]

    # One shuffle of whole records keeps the five sequences aligned.
    rd.shuffle(records)

    columns = zip(*records)

    return columns

def preprocess_data(Xb,Xc,Y,R,C):
    """Shuffle the samples and convert them to float64 NumPy arrays.

    Parameters
    ----------
    Xb, Xc : sequence of rows
        Feature rows (get_data passes base features and base + category).
    Y : sequence
        Target values.
    R : sequence
        Raw category values; shuffled with the rest and returned as-is.
    C : sequence of rows
        Per-sample category rows; each one is prefixed with its target
        value by ``merge_ycs`` to form ``Yc``.

    Returns
    -------
    tuple
        ``(Xb, Xc, Yc, Y, R)`` where ``Xb``, ``Xc`` and ``Y`` are float64
        arrays, ``Yc`` is the array built from ``merge_ycs(Y, C)`` and
        ``R`` is the shuffled tuple of raw category values.
    """
    # All five sequences share one permutation so rows stay aligned.
    Xb, Xc, Y, R, C = shuffle_data(Xb, Xc, Y, R, C)

    Xb = np.array(Xb).astype('float64')
    Xc = np.array(Xc).astype('float64')
    Y = np.array(Y).astype('float64')

    merged_rows = merge_ycs(Y, C)
    Yc = np.array(merged_rows)

    return Xb, Xc, Yc, Y, R

def get_category(s,c,c1):
    """Return a 0/1 indicator Series marking the rows of ``s`` equal to ``c1``.

    Parameters
    ----------
    s : pandas.Series
        Category codes, one per sample.
    c : list
        All known category codes; must contain ``c1``. It is only read,
        never modified.
    c1 : scalar
        The category code to flag with 1.

    Returns
    -------
    pandas.Series
        ``int64`` Series with the index of ``s``: 1 where ``s == c1``,
        0 everywhere else (including missing values and codes that are
        not listed in ``c``).

    Raises
    ------
    ValueError
        If ``c1`` is not in ``c``.
    """
    print('get category')

    if c1 not in c:
        raise ValueError('category {!r} is not in the category list'.format(c1))

    # A direct comparison (rather than chained replace calls) guarantees a
    # strict 0/1 result: unlisted codes cannot leak through unchanged, and
    # a category code of 0 or 1 cannot collide with the indicator values.
    is_member = s.eq(c1).fillna(False)

    return is_member.astype('int64')


def simulate_data(df,sample_method='random'):
    """Build a dataset with three simulated groups that share the same rows.

    A sample of ``df`` is drawn and given a ``simu`` column, then stacked
    three times with ``simu`` set to 1, 2 and 3 respectively, so every
    simulated group contains identical samples.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``sex``; ``action_taken`` is also required when
        ``sample_method == 'balanced'``. ``df`` itself is not modified.
    sample_method : str
        ``'balanced'`` draws the same number of rows from
        ``action_taken == 0`` and ``action_taken == 1``
        (``len(df) // 6`` each, capped at the size of the smaller class).
        Any other value draws ``len(df) // 3`` rows at random.

    Returns
    -------
    (pandas.DataFrame, dict)
        The stacked frame and the ``{code: label}`` names of the groups.

    Notes
    -----
    ``simu`` starts as a copy of ``sex``; rows whose ``sex`` is 1 or 2 get
    the group code, any other ``sex`` value is carried over unchanged.
    """
    sg_names = {
        1: 'simulated group 1',
        2: 'simulated group 2',
        3: 'simulated group 3',
    }

    if sample_method == 'balanced':
        target0 = df.loc[df['action_taken'] == 0]
        target1 = df.loc[df['action_taken'] == 1]

        # Sampling without replacement raises when asked for more rows than
        # a class holds, so cap the per-class size at the smaller class.
        n = min(len(df.index) // 6, len(target0.index), len(target1.index))

        sg1 = pd.concat([target0.sample(n), target1.sample(n)])
    else:
        n = len(df.index) // 3

        sg1 = df.sample(n=n)

    sg1['simu'] = sg1['sex']
    sg1.loc[sg1['sex'].isin([1, 2]), 'simu'] = 1

    sg2 = sg1.copy()
    sg3 = sg1.copy()

    sg2.loc[sg2['simu'] == 1, 'simu'] = 2
    sg3.loc[sg3['simu'] == 1, 'simu'] = 3

    return pd.concat([sg1, sg2, sg3]), sg_names


def combine_race_sex(df,sort=True):
    """Add a ``race_sex`` column that encodes race and sex as one code.

    The code is ``10 * race + sex`` for race in 1..5 and sex in 1..2
    (e.g. race 3, sex 2 -> 32). Rows that match none of those
    combinations keep their plain ``race`` value in ``race_sex``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``race`` and ``sex``; the new column is added in place.
    sort : bool
        Accepted but not used by this function.

    Returns
    -------
    (pandas.DataFrame, dict)
        The same ``df`` object and the ``{code: label}`` names of the codes.
    """
    rs_names = {
        11: 'native female', 12: 'native male',
        21: 'asian female', 22: 'asian male',
        31: 'black female', 32: 'black male',
        41: 'pi female', 42: 'pi male',
        51: 'white female', 52: 'white male',
    }

    # Start from the race code; recognised combinations are overwritten below.
    df['race_sex'] = df['race']

    for code in rs_names:
        # Tens digit is the race code, units digit is the sex code.
        race_code, sex_code = divmod(code, 10)
        matches = (df['race'] == race_code) & (df['sex'] == sex_code)
        df.loc[matches, 'race_sex'] = code

    return df, rs_names

def simplify(df):
    """Recode raw columns of ``df`` into small numeric codes, in place.

    Recodes applied, in this order:

    * ``sex``: codes 1 and 2 are swapped (0 is used as a temporary value).
    * ``debt_to_income_ratio``: the listed range labels become one
      representative number each, then the column is made numeric.
    * ``total_units``, ``conforming_loan_limit``,
      ``derived_loan_product_type``, ``derived_dwelling_category``: each
      listed label becomes its 1-based position, then the column is made
      numeric.
    * ``manufactured_home_secured_property_type``,
      ``submission_of_application``: codes 3 and 4 become 0.
    * ``loan_purpose``: 5 -> 0, 4 -> 5, 31 -> 3, 32 -> 4.
    * ``manufactured_home_land_property_interest``: 5 -> 0.
    * ``applicant_credit_score_type``: 9 -> 0.

    ``pd.to_numeric`` is called with its defaults, so any value left in
    those columns after recoding has to be numeric already.

    Returns
    -------
    (pandas.DataFrame, dict, dict)
        The same ``df`` object, the race ``{code: label}`` names and the
        sex ``{code: label}`` names (valid after the swap above).
    """
    print('simplify')

    r_names = {1: 'native', 2: 'asian', 3: 'black', 4: 'pi', 5: 'white'}

    s_names = {1: 'female', 2: 'male'}

    def recode(column, old, new):
        # Overwrite every cell of `column` equal to `old` with `new`.
        df.loc[df[column] == old, column] = new

    def recode_by_position(column, labels):
        # Map each label to its 1-based position, then make the column numeric.
        for position, label in enumerate(labels, start=1):
            recode(column, label, position)
        df[column] = pd.to_numeric(df[column])

    # sex: swap 1 <-> 2, parking the original 1s at 0 in between
    for old, new in ((1, 0), (2, 1), (0, 2)):
        recode('sex', old, new)

    # debt ratio: range label -> representative value
    debt_buckets = (
        ('<20%', 10),
        ('20%-<30%', 25),
        ('30%-<36%', 33),
        ('50%-60%', 55),
        ('>60%', 80),
    )
    for label, value in debt_buckets:
        recode('debt_to_income_ratio', label, value)
    df['debt_to_income_ratio'] = pd.to_numeric(df['debt_to_income_ratio'])

    # total units
    recode_by_position(
        'total_units',
        ['1', '2', '3', '4', '5-24', '25-49', '50-99', '100-149', '>149'],
    )

    # change to zero
    for column in ('manufactured_home_secured_property_type', 'submission_of_application'):
        for code in (3, 4):
            recode(column, code, 0)
        df[column] = pd.to_numeric(df[column])

    # loan purpose (order matters: 4 -> 5 must run before 32 -> 4)
    for old, new in ((5, 0), (4, 5), (31, 3), (32, 4)):
        recode('loan_purpose', old, new)

    # conforming loan limit
    recode_by_position('conforming_loan_limit', ['C', 'NC', 'U'])

    # derived_loan_product_type
    recode_by_position(
        'derived_loan_product_type',
        [
            'Conventional:First Lien',
            'FHA:First Lien',
            'VA:First Lien',
            'FSA/RHS:First Lien',
            'Conventional:Subordinate Lien',
            'FHA:Subordinate Lien',
            'VA:Subordinate Lien',
            'FSA/RHS:Subordinate Lien',
        ],
    )

    # derived_dwelling_category
    recode_by_position(
        'derived_dwelling_category',
        [
            'Single Family (1-4 Units):Manufactured',
            'Single Family (1-4 Units):Site-Built',
            'Multifamily:Site-Built',
            'Multifamily:Manufactured',
        ],
    )

    recode('manufactured_home_land_property_interest', 5, 0)

    recode('applicant_credit_score_type', 9, 0)

    return df, r_names, s_names


def get_data(dataset_type='train',category='race',sample=False,sample_method='random',normalization='std'):
    """Load an HMDA split and turn it into shuffled model inputs.

    Steps: read the CSV, recode columns with ``simplify``, optionally derive
    the ``simu`` or ``race_sex`` category column, print a chi-square test of
    category vs. target, build one 0/1 indicator per category, normalize the
    features and shuffle everything with ``preprocess_data``.

    Parameters
    ----------
    dataset_type : str
        ``'train'`` reads ``FILEPATHS['HMDA_TRAIN']``; any other value reads
        ``FILEPATHS['HMDA_TEST']``.
    category : str
        ``'sex'``, ``'simu'`` or ``'race'``; any other value is handled as
        ``'race_sex'`` when choosing feature columns and category codes.
    sample : bool
        Accepted but not used by this function.
    sample_method : str
        Forwarded to ``simulate_data`` when ``category == 'simu'``
        (previously it was dropped, so ``'balanced'`` had no effect).
    normalization : str
        ``'std'`` uses ``normalize_data_std``; any other value uses
        ``normalize_data_div``.

    Returns
    -------
    tuple
        ``(Xb, Yc, [Xc, Y, R, c_names])``: base features, target merged with
        the category indicators, and a list holding the category-augmented
        features, the plain target, the raw category values and the
        ``{code: label}`` names.
    """
    target = 'action_taken'

    # (feature columns incl. the category column, category codes)
    known_categories = {
        'sex': (FEATURES_SEX, [1, 2]),
        'simu': (FEATURES_SIMU, [1, 2, 3]),
        'race': (FEATURES_RACE, [1, 2, 3, 4, 5]),
    }
    race_sex_setup = (FEATURES_RACE_SEX, [11, 12, 21, 22, 31, 32, 41, 42, 51, 52])

    xc_cols, cats = known_categories.get(category, race_sex_setup)

    n_cats = len(cats)

    xb_cols = FEATURES_BASE

    path_key = 'HMDA_TRAIN' if dataset_type == 'train' else 'HMDA_TEST'

    df = pd.read_csv(FILEPATHS[path_key],dtype=DTYPES,na_values=[])

    print('target:\t\t\t{}\ncategory feature:\t{}\n# of categories:\t{}\ndataset type:\t\t{}'.format(target,category,n_cats,dataset_type))

    # convert

    df, r_names, s_names = simplify(df)

    c_names = r_names if category == 'race' else s_names

    if category == 'simu':

        # Pass the caller's sampling choice through instead of always
        # falling back to simulate_data's default.
        df, c_names = simulate_data(df, sample_method=sample_method)

    elif category == 'race_sex':

        df, c_names = combine_race_sex(df)

    contingency = pd.crosstab(index=df[category], columns=df[target])

    print('\n\n',contingency)

    chi2, p, dof, expected = chi2_contingency(contingency,correction=False)

    print('\nc:\t\t',chi2)
    print('p_value:\t',p)
    print('dof:\t',dof)
    print('expected: ',expected)

    # get y and x data

    xb_data = df[xb_cols].copy()

    xc_data = df[xc_cols].copy()

    y_data = df[target].copy()

    r_data = df[category].copy()

    print('data b NA values:\t',xb_data.columns[xb_data.isna().any()].tolist())

    print('data c NA values:\t',xc_data.columns[xc_data.isna().any()].tolist())

    # create categories: one indicator Series per category code

    c_data = {}

    for cat in cats:

        c_data[cat] = get_category(df[category].copy(),cats.copy(),cat)

    # normalize data

    normalize = normalize_data_std if normalization == 'std' else normalize_data_div

    xb_data = normalize(xb_data)

    xc_data = normalize(xc_data)

    Xb = xb_data.values.tolist()

    Xc = xc_data.values.tolist()

    print('data size b:\t\t',len(Xb))

    print('data size c:\t\t',len(Xc))

    Y = list(y_data)

    R = list(r_data)

    C = []

    for k,indicator in c_data.items():

        values = list(indicator)

        C.append(values)

        print('C{}:\t\t\t{}\t\t{}'.format(k,int(sum(values)),c_names[k]))

    # Transpose from one row per category to one row per sample.
    C = np.array(C).T.tolist()

    Xb,Xc,Yc,Y,R = preprocess_data(Xb,Xc,Y,R,C)

    return Xb,Yc,[Xc,Y,R,c_names]

