# HMDA Loan data

Investigating the relationship between recent HMDA mortgage loan data and historical redlining in Pueblo, CO.

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

In [2]:
# Path to the mortgage data file. Despite the .csv extension, the notebook
# reads it as tab-delimited.
data_path = 'pueblo_mortgages.csv'

In [3]:
# Read only the header row to recover the column names.
# The context manager closes the file even if reading fails.
# The line is split on tabs as-is, so the final name keeps its trailing
# newline (e.g. 'redlining_score\n').
with open(data_path) as f:
    column_names = f.readline().split('\t')

In [4]:
# Applicant race occupies five numbered slots; each slot contributes an
# 'applicant_race_name_N' column immediately followed by 'applicant_race_N'.
race_cols = [
    col
    for slot in range(1, 6)
    for col in (f'applicant_race_name_{slot}', f'applicant_race_{slot}')
]

In [5]:
# Header position of every applicant-race column, in the order of race_cols.
race_col_idxs = [column_names.index(cname) for cname in race_cols]

In [6]:
# Echo the parsed header names for reference.
column_names

['# as_of_year',
 'respondent_id',
 'agency_name',
 'agency_abbr',
 'agency_code',
 'loan_type_name',
 'loan_type',
 'property_type_name',
 'property_type',
 'loan_purpose_name',
 'loan_purpose',
 'owner_occupancy_name',
 'owner_occupancy',
 'loan_amount_000s',
 'preapproval_name',
 'preapproval',
 'action_taken_name',
 'action_taken',
 'msamd_name',
 'msamd',
 'state_name',
 'state_abbr',
 'state_code',
 'county_name',
 'county_code',
 'census_tract_number',
 'applicant_ethnicity_name',
 'applicant_ethnicity',
 'co_applicant_ethnicity_name',
 'co_applicant_ethnicity',
 'applicant_race_name_1',
 'applicant_race_1',
 'applicant_race_name_2',
 'applicant_race_2',
 'applicant_race_name_3',
 'applicant_race_3',
 'applicant_race_name_4',
 'applicant_race_4',
 'applicant_race_name_5',
 'applicant_race_5',
 'co_applicant_race_name_1',
 'co_applicant_race_1',
 'co_applicant_race_name_2',
 'co_applicant_race_2',
 'co_applicant_race_name_3',
 'co_applicant_race_3',
 'co_applicant_race_name_4',
 

In [7]:
# Standardiser (zero mean, unit variance) for continuous feature columns.
scaler = StandardScaler()

In [8]:
# Accumulates the per-feature column blocks, in order, as they are built.
running_X = []

In [9]:
# Locate the 'minority_population' column in the header; the bare name on the
# last line echoes its position.
index = column_names.index('minority_population')
index

72

In [10]:
# Load every field as a string; numeric columns are cast individually later.
# The header row begins with '#', which genfromtxt treats as a comment marker
# by default, so the header is not part of raw_data.
# NOTE(review): the same default would truncate any data row containing '#'
# inside a field -- confirm the file has none.
raw_data = np.genfromtxt(data_path, delimiter='\t', dtype='str')

In [11]:
# Pull the minority-population column out of the string table and turn it
# into a float column vector of shape (n_rows, 1).
minority_col = raw_data[:, index]
percent_minoritized = minority_col.astype('float').reshape(-1, 1)

In [12]:
# Standardise the column (this fits `scaler` on it and replaces the raw values).
percent_minoritized = scaler.fit_transform(percent_minoritized)

In [13]:
# Feature block 1. Re-running this cell appends the same block again.
running_X.append(percent_minoritized)

In [14]:
# Header position of the 'loan_type' column.
loan_type_index = column_names.index('loan_type')

In [15]:
# Peek at the raw loan_type values (still strings at this point).
raw_data[:, loan_type_index]

array(['1', '1', '2', ..., '3', '1', '1'], dtype='<U81')

In [16]:
# One-hot encoder for the categorical loan_type column.
loan_type_encoder = OneHotEncoder()

In [17]:
# The encoder expects a 2-D input, so reshape the loan_type column to
# (n_rows, 1) before fitting and encoding it.
loan_type_column = raw_data[:, loan_type_index].reshape(-1,1)
loan_type_onehots = loan_type_encoder.fit_transform(loan_type_column)

In [18]:
# Convert the encoder output to a dense matrix so it can be concatenated with
# the other (dense) feature blocks.
loan_type_onehots = loan_type_onehots.todense()

In [19]:
# Feature block 2. Re-running this cell appends the same block again.
running_X.append(loan_type_onehots)

In [20]:
# RACE
race = raw_data[:, column_names.index('applicant_race_1')]
race = loan_type_encoder.fit_transform(race.reshape(-1,1)).todense()

In [21]:
# Feature block 3. Re-running this cell appends the same block again.
running_X.append(race)

In [22]:
# One-hot encode the 'applicant_sex' column with its own encoder, so that
# fitting it does not overwrite the categories held by `loan_type_encoder`
# and the sex categories stay available via `sex_encoder.categories_`.
sex_encoder = OneHotEncoder()
sex_column = raw_data[:, column_names.index('applicant_sex')].reshape(-1,1)
sex_onehots = sex_encoder.fit_transform(sex_column).todense()

In [23]:
# Feature block 4. Re-running this cell appends the same block again.
running_X.append(sex_onehots)

In [24]:
# Loan amount ('loan_amount_000s') as a standardised float column vector.
# A dedicated scaler is fitted here so that `scaler` keeps the statistics it
# learned from the minority_population column instead of being overwritten.
loan_amt_scaler = StandardScaler()
loan_amt = raw_data[:, column_names.index('loan_amount_000s')].astype('float').reshape(-1,1)
loan_amt = loan_amt_scaler.fit_transform(loan_amt)

In [25]:
# Feature block 5. Re-running this cell appends the same block again.
running_X.append(loan_amt)

In [26]:
# 'redlining_score' is the last header field, so depending on how the header
# line was split its name may carry a trailing newline. Match on
# whitespace-stripped names instead of hard-coding 'redlining_score\n', so the
# lookup works either way.
rl_index = [name.strip() for name in column_names].index('redlining_score')
# Kept unscaled: this column is used as-is (no standardisation applied here).
rl_score = raw_data[:, rl_index].astype('float').reshape(-1,1)

In [27]:
# Final block (redlining score). Re-running this cell appends it again.
running_X.append(rl_score)

In [28]:
# Assemble the full matrix from the named column blocks rather than from the
# append history in `running_X`: re-running any append cell would otherwise
# silently duplicate columns. The order is the same as the appends, with the
# redlining score as the last column.
feature_blocks = [
    percent_minoritized,
    loan_type_onehots,
    race,
    sex_onehots,
    loan_amt,
    rl_score,
]
full_X = np.concatenate(feature_blocks, 1)

In [29]:
# Sanity check: (rows, total columns including the trailing score column).
full_X.shape

(3552, 16)

In [30]:
# Inspect the last column, i.e. the redlining score block appended last.
full_X[:,-1]

matrix([[0.50153497],
        [0.35456869],
        [0.        ],
        ...,
        [0.666     ],
        [0.        ],
        [0.        ]])

In [31]:
# Every column except the last is a predictor; the last column (the redlining
# score) is the regression target.
X, y = full_X[:, :-1], full_X[:, -1]

In [32]:
# Ordinary least-squares baseline.
l_model = LinearRegression()

In [36]:
# X and y are np.matrix objects at this point; convert to plain ndarrays
# before handing them to scikit-learn.
l_model.fit(np.asarray(X), np.asarray(y))

In [37]:
import torch
from sklearn.model_selection import train_test_split

In [38]:
# Hold out 15% of the rows for testing; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=11)

In [39]:
batch_size = 64

# Wrap the training split as float tensors. The target is reshaped to
# (n_rows, 1) so it lines up with the network's single output unit.
train_features = torch.tensor(X_train).float()
train_targets = torch.tensor(y_train.reshape(-1,1)).float()

train_data = torch.utils.data.TensorDataset(train_features, train_targets)
# Mini-batches are reshuffled on every pass over the data.
train_loader = torch.utils.data.DataLoader(
    train_data,
    batch_size=batch_size,
    shuffle=True,
)

In [40]:
# Seed torch's global RNG so the weight initialisation below (and anything
# drawing from the global RNG afterwards, when cells are run in order) is
# repeatable. 11 mirrors the random_state used for the train/test split.
torch.manual_seed(11)

# Small fully connected regressor: two hidden layers of 128 ReLU units and a
# sigmoid output, which constrains predictions to the (0, 1) interval.
model = torch.nn.Sequential(
    torch.nn.Linear(X.shape[1], 128),
    torch.nn.ReLU(),
    torch.nn.Linear(128, 128),
    torch.nn.ReLU(),
    torch.nn.Linear(128, 1),
    torch.nn.Sigmoid()
)

In [41]:
# Adam over all network parameters, using the optimiser's default settings.
optimizer = torch.optim.Adam(model.parameters())

In [42]:
# Mean-squared-error objective between predictions and targets.
loss = torch.nn.MSELoss()

In [43]:
# Train for 100 passes over the training loader, taking one optimiser step
# per mini-batch.
for i in range(100):
    for batch in train_loader:
        batchX, batchY = batch

        # Gradients accumulate by default, so reset them before each step.
        optimizer.zero_grad()

        preds = model(batchX)
        l = loss(preds, batchY)

        l.backward()
        optimizer.step()


In [44]:
# Predict on the held-out rows. Gradient tracking is switched off because
# this is inference only; no autograd graph is needed.
with torch.no_grad():
    test_preds = model(torch.tensor(X_test).float())
test_preds

tensor([[1.9138e-06],
        [1.5145e-01],
        [2.9175e-11],
        [3.5892e-02],
        [7.2342e-01],
        [1.7575e-02],
        [1.6929e-01],
        [1.5381e-01],
        [2.3022e-01],
        [3.5341e-01],
        [1.5040e-01],
        [1.4441e-01],
        [8.3687e-02],
        [2.8117e-02],
        [2.6521e-03],
        [5.3436e-01],
        [5.3573e-11],
        [1.7119e-01],
        [2.8541e-03],
        [1.9586e-01],
        [2.1086e-01],
        [1.1239e-01],
        [2.9093e-02],
        [2.4564e-11],
        [4.1026e-01],
        [1.4312e-10],
        [2.8813e-01],
        [9.1684e-02],
        [3.2963e-01],
        [1.0042e-01],
        [1.4792e-01],
        [4.5812e-01],
        [3.4650e-01],
        [4.1031e-01],
        [5.7318e-01],
        [1.4945e-06],
        [2.5337e-03],
        [1.5429e-01],
        [5.5844e-04],
        [2.1163e-01],
        [5.3879e-02],
        [8.5287e-02],
        [4.1543e-01],
        [7.7552e-02],
        [1.2948e-02],
        [5