# HMDA Loan data

In [163]:
import os

import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

In [169]:
# Location of the tab-separated HMDA extract. Set the HMDA_DATA_PATH environment
# variable to run the notebook on another machine; otherwise the original
# local path is used.
data_path = os.environ.get('HMDA_DATA_PATH', '/Users/maxperozek/py4sci/pueblo_mortgages.csv')

In [170]:
# Read only the header line to recover the column names. The context manager
# guarantees the file handle is closed even if reading fails.
# NOTE: the line is split on tabs without stripping, so the last name keeps its
# trailing newline; the later lookup of 'redlining_score\n' relies on that.
with open(data_path) as f:
    column_names = f.readline().split('\t')

In [171]:
# Applicant race columns come in (name, code) pairs numbered 1 through 5.
race_cols = [
    col
    for slot in range(1, 6)
    for col in (f'applicant_race_name_{slot}', f'applicant_race_{slot}')
]

In [172]:
# Positions of the race columns within the header, in the same order as race_cols.
race_col_idxs = list(map(column_names.index, race_cols))

In [173]:
# Show the parsed header so the column names used below can be looked up.
column_names

['# as_of_year',
 'respondent_id',
 'agency_name',
 'agency_abbr',
 'agency_code',
 'loan_type_name',
 'loan_type',
 'property_type_name',
 'property_type',
 'loan_purpose_name',
 'loan_purpose',
 'owner_occupancy_name',
 'owner_occupancy',
 'loan_amount_000s',
 'preapproval_name',
 'preapproval',
 'action_taken_name',
 'action_taken',
 'msamd_name',
 'msamd',
 'state_name',
 'state_abbr',
 'state_code',
 'county_name',
 'county_code',
 'census_tract_number',
 'applicant_ethnicity_name',
 'applicant_ethnicity',
 'co_applicant_ethnicity_name',
 'co_applicant_ethnicity',
 'applicant_race_name_1',
 'applicant_race_1',
 'applicant_race_name_2',
 'applicant_race_2',
 'applicant_race_name_3',
 'applicant_race_3',
 'applicant_race_name_4',
 'applicant_race_4',
 'applicant_race_name_5',
 'applicant_race_5',
 'co_applicant_race_name_1',
 'co_applicant_race_1',
 'co_applicant_race_name_2',
 'co_applicant_race_2',
 'co_applicant_race_name_3',
 'co_applicant_race_3',
 'co_applicant_race_name_4',
 

In [174]:
# Scaler used with fit_transform to standardize numeric feature columns.
scaler = StandardScaler()

In [175]:
# Feature blocks are accumulated here and concatenated column-wise later.
# NOTE: the cells that append to this list are not idempotent; re-run this
# cell first before re-running any of them, or columns will be duplicated.
running_X = []

In [176]:
# Position of the 'minority_population' column in the header.
index = column_names.index('minority_population')
index

72

In [177]:
# Read the whole tab-separated file as strings; numeric columns are cast
# individually when each feature is built.
# NOTE(review): the header line starts with '#', which genfromtxt treats as a
# comment by default, so it is presumably skipped here — confirm.
raw_data = np.genfromtxt(data_path, delimiter='\t', dtype='str')

In [178]:
# Pull the minority-population column, cast it to float and shape it as a
# single-column 2-D array.
minority_column = raw_data[:, index]
percent_minoritized = minority_column.astype('float').reshape(-1, 1)

In [179]:
# Replace the raw values with their standardized version. The scaler is fit on
# the full column, i.e. before any train/test split.
percent_minoritized = scaler.fit_transform(percent_minoritized)

In [180]:
# First feature block: standardized minority-population column.
running_X.append(percent_minoritized)

In [181]:
# Position of the 'loan_type' code column in the header.
loan_type_index = column_names.index('loan_type')

In [182]:
# Peek at the raw loan-type values (still strings at this point).
raw_data[:, loan_type_index]

array(['1', '1', '2', ..., '3', '1', '1'], dtype='<U81')

In [183]:
# One-hot encoder for categorical code columns.
loan_type_encoder = OneHotEncoder()

In [184]:
# The encoder expects 2-D input, so the loan-type codes are reshaped into a
# single column before being one-hot encoded.
loan_type_column = raw_data[:, loan_type_index].reshape(-1, 1)
loan_type_onehots = loan_type_encoder.fit_transform(loan_type_column)

In [185]:
# Convert the encoder output to a dense matrix so it can be concatenated with
# the other feature blocks. Note that todense() yields an np.matrix.
loan_type_onehots = loan_type_onehots.todense()

In [186]:
# Feature block: one-hot loan-type columns.
running_X.append(loan_type_onehots)

In [187]:
# RACE
race = raw_data[:, column_names.index('applicant_race_1')]
race = loan_type_encoder.fit_transform(race.reshape(-1,1)).todense()

In [188]:
# Feature block: one-hot applicant-race columns.
running_X.append(race)

In [189]:
# One-hot encode the 'applicant_sex' code column with a dedicated encoder
# rather than refitting the one created for loan type, so each encoder's
# fitted categories keep describing the feature it was built for.
sex_encoder = OneHotEncoder()
sex_codes = raw_data[:, column_names.index('applicant_sex')].reshape(-1,1)
sex_onehots = sex_encoder.fit_transform(sex_codes).todense()

In [190]:
# Feature block: one-hot applicant-sex columns.
running_X.append(sex_onehots)

In [191]:
# Loan amount as a standardized single-column float array. A dedicated scaler
# is used so the statistics already fitted on `scaler` are not overwritten and
# this feature's own mean/scale remain available for inverse transforms.
loan_amt_scaler = StandardScaler()
loan_amt = raw_data[:, column_names.index('loan_amount_000s')].astype('float').reshape(-1,1)
loan_amt = loan_amt_scaler.fit_transform(loan_amt)

In [192]:
# Feature block: standardized loan amount.
running_X.append(loan_amt)

In [193]:
# The header was split without stripping, so the final column name still
# carries its newline; hence the '\n' in the lookup key.
rl_col = column_names.index('redlining_score\n')
rl_score = raw_data[:, rl_col].astype('float').reshape(-1, 1)

In [194]:
# Appended last, so the (unscaled) redlining score ends up as the final column.
running_X.append(rl_score)

In [195]:
# Stack every accumulated feature block side by side (column-wise).
full_X = np.concatenate(running_X, axis=1)

In [196]:
# Sanity check: (rows, total columns across all appended blocks).
full_X.shape

(3552, 16)

In [197]:
# Inspect the last column, which is used as the regression target below.
full_X[:,-1]

matrix([[0.50153497],
        [0.35456869],
        [0.        ],
        ...,
        [0.666     ],
        [0.        ],
        [0.        ]])

In [198]:
# Split the stacked array into features (all but the last column) and the
# target (last column).
# full_X is an np.matrix because some blocks come from .todense(); recent
# scikit-learn versions reject np.matrix input, so both pieces are converted to
# plain ndarrays. The target is reshaped to (n, 1) to keep the same column
# shape the matrix slice had.
X = np.asarray(full_X[:, :-1])
y = np.asarray(full_X[:, -1]).reshape(-1, 1)

In [199]:
# Ordinary least-squares linear regression baseline.
l_model = LinearRegression()

In [200]:
# Fit the baseline on the full dataset (the train/test split only happens later).
l_model.fit(X, y)



In [206]:
import torch
from sklearn.model_selection import train_test_split

In [207]:
# Hold out 15% of the rows for evaluation; the fixed random_state keeps the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=11,
)

In [217]:
batch_size = 64

# Features and targets as float32 tensors; the targets are shaped as a column
# so they line up with the single-unit output of the network.
train_features = torch.tensor(X_train).float()
train_targets = torch.tensor(y_train.reshape(-1, 1)).float()

# Shuffled mini-batches over the training split.
train_data = torch.utils.data.TensorDataset(train_features, train_targets)
train_loader = torch.utils.data.DataLoader(
    train_data,
    batch_size=batch_size,
    shuffle=True,
)

In [218]:
# Seed PyTorch's global RNG so weight initialisation (and DataLoader shuffling,
# which draws from the same generator) is repeatable across Restart & Run All.
torch.manual_seed(11)

# Two-hidden-layer MLP: input features -> 128 -> 128 -> 1, with ReLU between
# the hidden layers and a sigmoid that squashes the prediction into (0, 1).
model = torch.nn.Sequential(
    torch.nn.Linear(X.shape[1], 128),
    torch.nn.ReLU(),
    torch.nn.Linear(128, 128),
    torch.nn.ReLU(),
    torch.nn.Linear(128, 1),
    torch.nn.Sigmoid()
)

In [221]:
# Adam over all network parameters, using its default hyperparameters.
optimizer = torch.optim.Adam(model.parameters())

In [222]:
# Mean-squared-error criterion; the training loop calls it as loss(preds, batchY).
loss = torch.nn.MSELoss()

In [226]:
# Train for 100 passes over the shuffled mini-batches.
for i in range(100):
    for batch in train_loader:
        batchX, batchY = batch[0], batch[1]

        # Clear gradients left over from the previous step before backprop.
        optimizer.zero_grad()

        # Forward pass, MSE against the targets, then one optimizer update.
        preds = model(batchX)
        l = loss(preds, batchY)
        l.backward()
        optimizer.step()


In [228]:
# Predict on the held-out rows. Gradient tracking is disabled because this is
# inference only, which avoids building an autograd graph for the outputs.
with torch.no_grad():
    test_preds = model(torch.tensor(X_test).float())
test_preds

tensor([[1.0192e-04],
        [1.2481e-01],
        [1.6514e-07],
        [3.0516e-02],
        [7.3884e-01],
        [2.2924e-02],
        [2.2366e-01],
        [1.1334e-01],
        [1.5390e-01],
        [4.5552e-01],
        [1.2913e-01],
        [9.7043e-02],
        [8.5871e-02],
        [2.4292e-02],
        [8.9357e-03],
        [4.0514e-01],
        [1.0819e-07],
        [1.4098e-01],
        [2.6956e-02],
        [1.3917e-01],
        [1.3847e-01],
        [1.0463e-01],
        [4.0053e-02],
        [1.5887e-07],
        [3.0381e-01],
        [4.9564e-09],
        [1.8603e-01],
        [1.2381e-01],
        [3.0977e-01],
        [1.0258e-01],
        [1.0810e-01],
        [4.5295e-01],
        [2.6914e-01],
        [3.7654e-01],
        [5.8001e-01],
        [7.7626e-05],
        [8.7319e-03],
        [1.3923e-01],
        [3.2555e-03],
        [1.8784e-01],
        [5.0196e-02],
        [8.6168e-02],
        [7.5871e-01],
        [1.2157e-01],
        [1.7063e-02],
        [6