In [47]:
import pickle as pk
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn import tree
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression

In [48]:
# Load the pickled DataFrame and discard the county_code and
# census_tract_number columns in one step.
# NOTE(review): unpickling can execute arbitrary code; only load this file
# if it comes from a trusted source.
file_path = Path("Resources/final_pickle.pkl")
df = pd.read_pickle(file_path).drop(columns=["county_code", "census_tract_number"])
df.head()

Unnamed: 0,agency_code,loan_type,property_type,owner_occupancy,loan_amount_000s,preapproval,msamd,applicant_ethnicity,co_applicant_ethnicity,applicant_race_1,...,co_applicant_sex,applicant_income_000s,population,minority_population,hud_median_family_income,tract_to_msamd_income,number_of_owner_occupied_units,number_of_1_to_4_family_units,action_taken_summary,Region
0,CFPB,FHA-Insured,One to Four-Family,Owner Occupied,92.0,Preapproval Requested,33860.0,Not Hispanic or Latino,No Co-Applicant,Black or African American,...,No Co-Applicant,29.0,1948.0,12.58,59700.0,122.93,507.0,724.0,1,SE
1,HUD,VA-Guaranteed,One to Four-Family,Owner Occupied,94.0,Preapproval Not Requested,33860.0,Not Hispanic or Latino,No Co-Applicant,White,...,No Co-Applicant,53.0,1948.0,12.58,59700.0,122.93,507.0,724.0,1,SE
2,HUD,FHA-Insured,One to Four-Family,Owner Occupied,147.0,Preapproval Not Requested,33860.0,Not Hispanic or Latino,Not Hispanic or Latino,White,...,Female,90.0,1948.0,12.58,59700.0,122.93,507.0,724.0,1,SE
3,HUD,FHA-Insured,One to Four-Family,Owner Occupied,115.0,Preapproval Not Requested,33860.0,Not Hispanic or Latino,No Co-Applicant,White,...,No Co-Applicant,30.0,1948.0,12.58,59700.0,122.93,507.0,724.0,0,SE
4,HUD,VA-Guaranteed,One to Four-Family,Owner Occupied,160.0,Preapproval Not Requested,33860.0,Not Hispanic or Latino,No Co-Applicant,White,...,No Co-Applicant,50.0,1948.0,12.58,59700.0,122.93,507.0,724.0,1,SE


In [49]:
# Collect the names of every object-dtype column; these are treated as the
# categorical variables. Then show how many distinct values each one has.
action_cat = [name for name, dtype in df.dtypes.items() if dtype == "object"]
df[action_cat].nunique()

agency_code               6
loan_type                 4
property_type             3
owner_occupancy           2
preapproval               2
applicant_ethnicity       2
co_applicant_ethnicity    3
applicant_race_1          6
co_applicant_race_1       7
applicant_sex             2
co_applicant_sex          3
Region                    7
dtype: int64

In [50]:
# Create a OneHotEncoder instance that returns a dense array.
# The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
# removed in 1.4, so try the current spelling first and fall back to the old
# one on installations that predate the rename.
try:
    enc = OneHotEncoder(sparse_output=False)
except TypeError:
    enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list.
# Reuse df's own index so a later index-based merge lines up row for row even
# when df does not carry a default 0..n-1 index.
encode_df = pd.DataFrame(enc.fit_transform(df[action_cat]), index=df.index)

# Add the encoded variable names to the dataframe.
# `get_feature_names` was replaced by `get_feature_names_out` (old method
# removed in scikit-learn 1.2); use whichever this installation provides.
if hasattr(enc, "get_feature_names_out"):
    encode_df.columns = enc.get_feature_names_out(action_cat)
else:
    encode_df.columns = enc.get_feature_names(action_cat)
encode_df.head()



Unnamed: 0,agency_code_CFPB,agency_code_FDIC,agency_code_FRS,agency_code_HUD,agency_code_NCUA,agency_code_OCC,loan_type_Conventional,loan_type_FHA-Insured,loan_type_FSA/RHS,loan_type_VA-Guaranteed,...,co_applicant_sex_Female,co_applicant_sex_Male,co_applicant_sex_No Co-Applicant,Region_MA,Region_MW,Region_NE,Region_NW,Region_SE,Region_SW,Region_W
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [51]:
# Attach the one-hot columns to df by row index, then remove the original
# categorical columns they replace. The final line shows per-column non-null counts.
df = pd.merge(df, encode_df, left_index=True, right_index=True)
df = df.drop(columns=action_cat)
df.count()

loan_amount_000s                                                 1700356
msamd                                                            1700356
applicant_income_000s                                            1700356
population                                                       1700356
minority_population                                              1700356
hud_median_family_income                                         1700356
tract_to_msamd_income                                            1700356
number_of_owner_occupied_units                                   1700356
number_of_1_to_4_family_units                                    1700356
action_taken_summary                                             1700356
agency_code_CFPB                                                 1700356
agency_code_FDIC                                                 1700356
agency_code_FRS                                                  1700356
agency_code_HUD                                    

In [52]:
# For the two-level categories, one dummy column is the inverse of the other,
# so only one column of each pair is kept.
inverse_dummy_columns = [
    "owner_occupancy_Not Owner Occupied",
    "applicant_sex_Male",
    "applicant_ethnicity_Not Hispanic or Latino",
    "preapproval_Preapproval Not Requested",
]
df = df.drop(columns=inverse_dummy_columns)

In [53]:
# Separate the target column from the feature matrix
target_column = "action_taken_summary"
X = df.drop(columns=[target_column])
y = df[target_column]

In [54]:
# Hold out a test set with a fixed seed for reproducibility.
# test_size=0.25 is the fraction train_test_split uses when no size is given.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=21,
)

In [55]:
# Fit a StandardScaler on the training split only, then apply the same
# fitted scaler to both the training and test features.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [56]:
# Configure and train the logistic regression model on the scaled training data.
logreg_params = dict(
    solver="lbfgs",
    max_iter=100,
    class_weight="balanced",
    random_state=21,
)
classifier = LogisticRegression(**logreg_params)
classifier.fit(X_train_scaled, y_train)

LogisticRegression(class_weight='balanced', random_state=21)

In [57]:
# Predict class labels for the scaled test features with the fitted classifier
predictions = classifier.predict(X_test_scaled)

In [58]:
# Build a labelled confusion-matrix table and compute overall accuracy
# for the test-set predictions.
actual_labels = ["Actual 0", "Actual 1"]
predicted_labels = ["Predicted 0", "Predicted 1"]

cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(cm, index=actual_labels, columns=predicted_labels)

acc_score = accuracy_score(y_test, predictions)

In [59]:
# Show the confusion matrix table, then the accuracy and the per-class report
print("Confusion Matrix")
display(cm_df)
report_text = classification_report(y_test, predictions)
print(f"Accuracy Score: {acc_score}", report_text, sep="\n")

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,30589,25202
Actual 1,138437,230861


Accuracy Score: 0.6150476723697861
              precision    recall  f1-score   support

           0       0.18      0.55      0.27     55791
           1       0.90      0.63      0.74    369298

    accuracy                           0.62    425089
   macro avg       0.54      0.59      0.51    425089
weighted avg       0.81      0.62      0.68    425089



In [60]:
# Pair each feature name with its fitted coefficient in a two-column table.
# classifier.coef_ is flattened to one value per feature column.
coef_df = pd.DataFrame(
    {
        "feature": list(X_train.columns),
        "coefficent": np.ravel(classifier.coef_),
    }
)

coef_df

Unnamed: 0,feature,coefficent
0,loan_amount_000s,-0.017834
1,msamd,0.03509
2,applicant_income_000s,0.058015
3,population,0.049136
4,minority_population,-0.057646
5,hud_median_family_income,0.111514
6,tract_to_msamd_income,0.06088
7,number_of_owner_occupied_units,0.051095
8,number_of_1_to_4_family_units,-0.054437
9,agency_code_CFPB,-0.094331


In [61]:
# Summarise the coefficient table: row count, column dtypes and non-null counts
coef_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52 entries, 0 to 51
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   feature     52 non-null     object 
 1   coefficent  52 non-null     float64
dtypes: float64(1), object(1)
memory usage: 960.0+ bytes


In [62]:
# Show the coefficients ordered by feature name. The string sort is
# case-sensitive, so capitalised names (e.g. "Region_*") come before lowercase ones.
coef_df.sort_values(by="feature")

Unnamed: 0,feature,coefficent
45,Region_MA,-0.054209
46,Region_MW,0.038077
47,Region_NE,-0.039706
48,Region_NW,0.036211
49,Region_SE,-0.022004
50,Region_SW,0.030883
51,Region_W,-0.004333
9,agency_code_CFPB,-0.094331
10,agency_code_FDIC,0.014474
11,agency_code_FRS,0.069093


In [63]:
# Build (feature, coefficient) tuples and order them by feature name
sorted_list = list(zip(X_train.columns, np.squeeze(np.transpose(classifier.coef_))))
sorted_list.sort()
sorted_list

[('Region_MA', -0.05420901704597905),
 ('Region_MW', 0.03807667459531207),
 ('Region_NE', -0.039705992182215856),
 ('Region_NW', 0.036211417722230396),
 ('Region_SE', -0.022003991450284487),
 ('Region_SW', 0.030882755586740758),
 ('Region_W', -0.0043332841344851936),
 ('agency_code_CFPB', -0.09433074624610177),
 ('agency_code_FDIC', 0.014474279424126676),
 ('agency_code_FRS', 0.06909319077558577),
 ('agency_code_HUD', 0.0696727784348255),
 ('agency_code_NCUA', -0.0891034785073702),
 ('agency_code_OCC', 0.01620012663664437),
 ('applicant_ethnicity_Hispanic or Latino', -0.05952963419072328),
 ('applicant_income_000s', 0.058014725912785524),
 ('applicant_race_1_American Indian or Alaska Native', -0.004537324937039528),
 ('applicant_race_1_Asian', 0.005314025509952996),
 ('applicant_race_1_Black or African American', -0.08553498388771179),
 ('applicant_race_1_Information not Provided', -0.04646020518911452),
 ('applicant_race_1_Native Hawaiian or Other Pacific Islander',
  -0.0002126107574