In [20]:
%matplotlib inline
import pickle as pk
import pandas as pd
import tensorflow as tf
from pathlib import Path
from sklearn import tree
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from imblearn.under_sampling import RandomUnderSampler

In [21]:
# Read the pickled dataset and discard the county_code and census_tract_number columns.
# NOTE(review): unpickling can execute arbitrary code; only load this file from a trusted source.
file_path = Path("Resources/final_pickle.pkl")
df = pd.read_pickle(file_path).drop(columns=["county_code", "census_tract_number"])
# Preview the first rows
df.head()

Unnamed: 0,agency_code,loan_type,property_type,owner_occupancy,loan_amount_000s,preapproval,msamd,applicant_ethnicity,co_applicant_ethnicity,applicant_race_1,...,co_applicant_sex,applicant_income_000s,population,minority_population,hud_median_family_income,tract_to_msamd_income,number_of_owner_occupied_units,number_of_1_to_4_family_units,action_taken_summary,Region
0,CFPB,FHA-Insured,One to Four-Family,Owner Occupied,92.0,Preapproval Requested,33860.0,Not Hispanic or Latino,No Co-Applicant,Black or African American,...,No Co-Applicant,29.0,1948.0,12.58,59700.0,122.93,507.0,724.0,1,SE
1,HUD,VA-Guaranteed,One to Four-Family,Owner Occupied,94.0,Preapproval Not Requested,33860.0,Not Hispanic or Latino,No Co-Applicant,White,...,No Co-Applicant,53.0,1948.0,12.58,59700.0,122.93,507.0,724.0,1,SE
2,HUD,FHA-Insured,One to Four-Family,Owner Occupied,147.0,Preapproval Not Requested,33860.0,Not Hispanic or Latino,Not Hispanic or Latino,White,...,Female,90.0,1948.0,12.58,59700.0,122.93,507.0,724.0,1,SE
3,HUD,FHA-Insured,One to Four-Family,Owner Occupied,115.0,Preapproval Not Requested,33860.0,Not Hispanic or Latino,No Co-Applicant,White,...,No Co-Applicant,30.0,1948.0,12.58,59700.0,122.93,507.0,724.0,0,SE
4,HUD,VA-Guaranteed,One to Four-Family,Owner Occupied,160.0,Preapproval Not Requested,33860.0,Not Hispanic or Latino,No Co-Applicant,White,...,No Co-Applicant,50.0,1948.0,12.58,59700.0,122.93,507.0,724.0,1,SE


In [22]:
# List the column labels that remain after the two columns were dropped at load time
df.columns

Index(['agency_code', 'loan_type', 'property_type', 'owner_occupancy',
       'loan_amount_000s', 'preapproval', 'msamd', 'applicant_ethnicity',
       'co_applicant_ethnicity', 'applicant_race_1', 'co_applicant_race_1',
       'applicant_sex', 'co_applicant_sex', 'applicant_income_000s',
       'population', 'minority_population', 'hud_median_family_income',
       'tract_to_msamd_income', 'number_of_owner_occupied_units',
       'number_of_1_to_4_family_units', 'action_taken_summary', 'Region'],
      dtype='object')

In [23]:
# Collect the name of every object-dtype column; these are treated as the categorical variables
action_cat = [col for col, dtype in df.dtypes.items() if dtype == "object"]
# Show how many distinct values each of those columns holds
df[action_cat].nunique()

agency_code               6
loan_type                 4
property_type             3
owner_occupancy           2
preapproval               2
applicant_ethnicity       2
co_applicant_ethnicity    3
applicant_race_1          6
co_applicant_race_1       7
applicant_sex             2
co_applicant_sex          3
Region                    7
dtype: int64

In [24]:
# Create a OneHotEncoder instance that returns a dense array.
# scikit-learn 1.2 renamed `sparse` to `sparse_output` (the old keyword was removed in 1.4),
# so try the current keyword first and fall back to the old one on older installs.
try:
    enc = OneHotEncoder(sparse_output=False)
except TypeError:
    enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encoded_values = enc.fit_transform(df[action_cat])

# `get_feature_names_out` replaced `get_feature_names` in scikit-learn 1.0 (old name removed in 1.2);
# use whichever this install provides. Both yield names of the form "<column>_<category>".
feature_namer = getattr(enc, "get_feature_names_out", None) or enc.get_feature_names

# Build the encoded frame with the encoded variable names as columns.
# Reusing df's index keeps the rows aligned with df when the two frames are later
# combined on their indexes, even if df does not carry a default 0..n-1 index.
encode_df = pd.DataFrame(
    encoded_values,
    index=df.index,
    columns=feature_namer(action_cat),
)
encode_df.head()



Unnamed: 0,agency_code_CFPB,agency_code_FDIC,agency_code_FRS,agency_code_HUD,agency_code_NCUA,agency_code_OCC,loan_type_Conventional,loan_type_FHA-Insured,loan_type_FSA/RHS,loan_type_VA-Guaranteed,...,co_applicant_sex_Female,co_applicant_sex_Male,co_applicant_sex_No Co-Applicant,Region_MA,Region_MW,Region_NE,Region_NW,Region_SE,Region_SW,Region_W
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [25]:
# Attach the one-hot columns by index (inner join), then remove the original categorical columns
df = (
    df.merge(encode_df, left_index=True, right_index=True)
      .drop(columns=action_cat)
)
# Non-null count per column, as a quick check after the merge
df.count()

loan_amount_000s                                                 1700356
msamd                                                            1700356
applicant_income_000s                                            1700356
population                                                       1700356
minority_population                                              1700356
hud_median_family_income                                         1700356
tract_to_msamd_income                                            1700356
number_of_owner_occupied_units                                   1700356
number_of_1_to_4_family_units                                    1700356
action_taken_summary                                             1700356
agency_code_CFPB                                                 1700356
agency_code_FDIC                                                 1700356
agency_code_FRS                                                  1700356
agency_code_HUD                                    

In [26]:
# Each of these four categories has exactly two values, so one indicator column
# fully determines the other; keep one column per pair and drop its inverse.
df = df.drop(
    columns=[
        "owner_occupancy_Not Owner Occupied",
        "applicant_sex_Male",
        "applicant_ethnicity_Not Hispanic or Latino",
        "preapproval_Preapproval Not Requested",
    ]
)

In [27]:
# Features are every column except action_taken_summary; that column is the target
X = df.drop("action_taken_summary", axis="columns")
y = df.loc[:, "action_taken_summary"]

In [28]:
# Stratified train/test split, seeded for repeatability.
# test_size=0.25 is the library default, written out here so the split ratio is visible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=21,
)

In [29]:
# Learn the scaling statistics from the training split only, then apply them to both splits
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
# fit() returns the scaler itself, so X_scaler and scaler name the same fitted object
X_scaler = scaler
X_test_scaled = X_scaler.transform(X_test)

In [30]:
# Randomly undersample the scaled training data (seeded); the test split is left untouched
rus = RandomUnderSampler(random_state=1)
X_train_scaled_resampled, y_train_resampled = rus.fit_resample(
    X=X_train_scaled,
    y=y_train,
)

In [31]:
# Random forest with 100 trees, max_features set to 8, and a fixed seed for repeatability
model = RandomForestClassifier(
    n_estimators=100,
    max_features=8,
    random_state=21,
)

In [32]:
# Train the forest on the scaled, undersampled training data
model = model.fit(X=X_train_scaled_resampled, y=y_train_resampled)

In [33]:
# Predict a label for every row of the scaled test split
predictions = model.predict(X=X_test_scaled)

In [34]:
# Overall accuracy of the test-set predictions
acc_score = accuracy_score(y_test, predictions)

# Confusion matrix (rows are actual classes, columns are predicted classes), wrapped in a labelled frame
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)

In [35]:
# Show the labelled confusion matrix, the overall accuracy, and the per-class report
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score: {acc_score}")
print(classification_report(y_true=y_test, y_pred=predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,33336,22440
Actual 1,144976,224337


Accuracy Score: 0.6061624742112828
              precision    recall  f1-score   support

           0       0.19      0.60      0.28     55776
           1       0.91      0.61      0.73    369313

    accuracy                           0.61    425089
   macro avg       0.55      0.60      0.51    425089
weighted avg       0.81      0.61      0.67    425089



In [36]:
# Pair each importance with its feature name, then order the pairs from largest to smallest
imp_list = list(zip(model.feature_importances_, X.columns))
imp_list.sort(reverse=True)
imp_list

[(0.12000048393764576, 'applicant_income_000s'),
 (0.11979985632189868, 'loan_amount_000s'),
 (0.09497165306130258, 'minority_population'),
 (0.09416987708994332, 'tract_to_msamd_income'),
 (0.09034281890865958, 'population'),
 (0.08990517712888153, 'number_of_owner_occupied_units'),
 (0.08978284478230675, 'number_of_1_to_4_family_units'),
 (0.06456434618613452, 'hud_median_family_income'),
 (0.0583127291140913, 'msamd'),
 (0.015388944782636717, 'applicant_sex_Female'),
 (0.012237435249731986, 'preapproval_Preapproval Requested'),
 (0.007941334225199197, 'agency_code_HUD'),
 (0.007741475573889319, 'loan_type_Conventional'),
 (0.0066379798914231744, 'Region_SE'),
 (0.006452475769794819, 'applicant_ethnicity_Hispanic or Latino'),
 (0.006437911689400588, 'owner_occupancy_Owner Occupied'),
 (0.006374922853518869, 'loan_type_FHA-Insured'),
 (0.0063512376030717185, 'applicant_race_1_White'),
 (0.006034128747169506, 'Region_MW'),
 (0.005970546673994602, 'Region_SW'),
 (0.005814130488684473, '

In [37]:
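# Optional (left disabled): plot the ten largest feature importances as a horizontal bar chart and save it to disk.
# pd.Series(model.feature_importances_, index=X.columns).nlargest(10).plot(kind='barh').get_figure().savefig("Resources/importances8f_st_rus.png")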