In [7]:
import pandas as pd
import tensorflow as tf
from pathlib import Path
from sklearn import tree
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load data
file_path = Path("Resources/final_sample.csv")
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,agency_code,loan_type,property_type,loan_purpose,owner_occupancy,loan_amount_000s,preapproval,action_taken,applicant_ethnicity,co_applicant_ethnicity,...,hoepa_status,lien_status,population,minority_population,hud_median_family_income,tract_to_msamd_income,number_of_owner_occupied_units,number_of_1_to_4_family_units,Region,action_taken_summary
0,NCUA,Conventional,One to Four-Family,Refinancing,Not Owner Occupied,123.0,Not Applicable,Application Approved but not Accepted,Not Hispanic or Latino,Not Hispanic or Latino,...,Not a HOEPA Loan,Secured by First Lien,2310.0,20.35,65300.0,74.28,941.0,3848.0,MW,0
1,HUD,Conventional,One to Four-Family,Home Purchase,Owner Occupied,387.0,Not Applicable,Application Approved but not Accepted,Hispanic or Latino,Not Hispanic or Latino,...,Not a HOEPA Loan,Secured by First Lien,5192.0,96.51,64300.0,53.53,397.0,1056.0,W,0
2,HUD,Conventional,One to Four-Family,Refinancing,Owner Occupied,376.0,Not Applicable,Application Approved but not Accepted,Not Hispanic or Latino,Not Hispanic or Latino,...,Not a HOEPA Loan,Secured by First Lien,12130.0,66.65,63200.0,112.07,921.0,2099.0,W,0
3,HUD,FHA-Insured,One to Four-Family,Refinancing,Owner Occupied,272.0,Not Applicable,Application Approved but not Accepted,Not Hispanic or Latino,Not Hispanic or Latino,...,Not a HOEPA Loan,Secured by First Lien,6335.0,27.73,91500.0,136.37,1406.0,1541.0,MA,0
4,NCUA,Conventional,One to Four-Family,Home Improvement,Owner Occupied,40.0,Not Applicable,Application Approved but not Accepted,Info not Provided by Applicant,No Co-Applicant,...,Not a HOEPA Loan,Secured by a Subordinate Loan,5625.0,5.33,79600.0,153.58,1693.0,1679.0,MW,0


In [3]:
df.isnull().sum()

agency_code                       0
loan_type                         0
property_type                     0
loan_purpose                      0
owner_occupancy                   0
loan_amount_000s                  0
preapproval                       0
action_taken                      0
applicant_ethnicity               0
co_applicant_ethnicity            0
applicant_race_1                  0
co_applicant_race_1               0
applicant_sex                     0
co_applicant_sex                  0
applicant_income_000s             0
purchaser_type                    0
hoepa_status                      0
lien_status                       0
population                        0
minority_population               0
hud_median_family_income          0
tract_to_msamd_income             0
number_of_owner_occupied_units    0
number_of_1_to_4_family_units     0
Region                            0
action_taken_summary              0
dtype: int64

In [4]:
# Drop action_taken column
df = df.drop(columns=["action_taken"])

In [5]:
# Generate our categorical variable lists
action_cat = df.dtypes[df.dtypes == "object"].index.tolist()
df[action_cat].nunique()

agency_code                6
loan_type                  4
property_type              2
loan_purpose               3
owner_occupancy            3
preapproval                3
applicant_ethnicity        4
co_applicant_ethnicity     5
applicant_race_1           7
co_applicant_race_1        8
applicant_sex              4
co_applicant_sex           5
purchaser_type            10
hoepa_status               2
lien_status                4
Region                     7
dtype: int64

In [9]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(df[action_cat]))

# Add the encoded variable names to the dataframe
encode_df.columns = enc.get_feature_names(action_cat)
encode_df.columns



Index(['agency_code_CFPB', 'agency_code_FDIC', 'agency_code_FRS',
       'agency_code_HUD', 'agency_code_NCUA', 'agency_code_OCC',
       'loan_type_Conventional', 'loan_type_FHA-Insured', 'loan_type_FSA/RHS',
       'loan_type_VA-Guaranteed', 'property_type_Manufactured',
       'property_type_One to Four-Family', 'loan_purpose_Home Improvement',
       'loan_purpose_Home Purchase', 'loan_purpose_Refinancing',
       'owner_occupancy_Not Applicable', 'owner_occupancy_Not Owner Occupied',
       'owner_occupancy_Owner Occupied', 'preapproval_Not Applicable',
       'preapproval_Preapproval Not Requested',
       'preapproval_Preapproval Requested',
       'applicant_ethnicity_Hispanic or Latino',
       'applicant_ethnicity_Info not Provided by Applicant',
       'applicant_ethnicity_Not Applicable',
       'applicant_ethnicity_Not Hispanic or Latino',
       'co_applicant_ethnicity_Hispanic or Latino',
       'co_applicant_ethnicity_Info not Provided by Applicant',
       'co_applican

In [10]:
###

# NEED TO DROP ALL N/A COLUMNS

###

In [17]:
# Merge dummy columns and drop originals
df = df.merge(encode_df,left_index=True, right_index=True)
df = df.loc[:,~df.columns.duplicated()]
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100725 entries, 0 to 100724
Columns: 240 entries, loan_amount_000s to Region_W
dtypes: float64(239), int64(1)
memory usage: 184.4 MB


In [18]:
# Define features and targets
y = df["action_taken_summary"]
X = df.drop(columns="action_taken_summary")

In [19]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=21)

In [20]:
# Create StandardScaler instance, fit to X, scale
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [22]:
# Create decision tree classifier
model1 = tree.DecisionTreeClassifier()
model1 = model1.fit(X_train_scaled, y_train)

In [23]:
predictions = model1.predict(X_test_scaled)

In [24]:
# Calculate confusion matrix and accuracy score
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(cm, index= ["Actual 0", "Actual 1"], columns = ["Predicted 0", "Predicted 1"])

acc_score = accuracy_score(y_test, predictions)

In [25]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score: {acc_score}")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,3793,1791
Actual 1,1808,17790


Accuracy Score: 0.8570804542927488
              precision    recall  f1-score   support

           0       0.68      0.68      0.68      5584
           1       0.91      0.91      0.91     19598

    accuracy                           0.86     25182
   macro avg       0.79      0.79      0.79     25182
weighted avg       0.86      0.86      0.86     25182



In [26]:
# Create a random forest classifier
model2 = RandomForestClassifier(n_estimators=1000, random_state=21)

In [27]:
# Fit model
model2 = model2.fit(X_train_scaled, y_train)

In [29]:
predictions2 = model2.predict(X_test_scaled)

In [30]:
# Calculate confusion matrix and accuracy score
cm = confusion_matrix(y_test, predictions2)
cm_df = pd.DataFrame(cm, index= ["Actual 0", "Actual 1"], columns = ["Predicted 0", "Predicted 1"])

acc_score = accuracy_score(y_test, predictions2)

In [31]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score: {acc_score}")
print(classification_report(y_test, predictions2))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,4149,1435
Actual 1,1341,18257


Accuracy Score: 0.8897625287904058
              precision    recall  f1-score   support

           0       0.76      0.74      0.75      5584
           1       0.93      0.93      0.93     19598

    accuracy                           0.89     25182
   macro avg       0.84      0.84      0.84     25182
weighted avg       0.89      0.89      0.89     25182



In [32]:
# Sort features by importance
imp_list = sorted(zip(model2.feature_importances_, X.columns), reverse=True)
imp_list

[(0.09525298470934059, 'purchaser_type_Loan Not Originated_x'),
 (0.09346883415996919, 'purchaser_type_Loan Not Originated_y'),
 (0.08727183584162009, 'purchaser_type_Loan Not Originated'),
 (0.0466193778080658, 'applicant_income_000s'),
 (0.039606021201398244, 'loan_amount_000s'),
 (0.035411922823232994, 'tract_to_msamd_income'),
 (0.0350633593945881, 'minority_population'),
 (0.03361268470708645, 'population'),
 (0.03360541282035773, 'number_of_owner_occupied_units'),
 (0.033514167209898006, 'number_of_1_to_4_family_units'),
 (0.030965360700094823, 'hud_median_family_income'),
 (0.01894135179317065, 'agency_code_HUD'),
 (0.017142273958399172, 'agency_code_HUD_x'),
 (0.016698597593311076, 'agency_code_HUD_y'),
 (0.011502205314293112, 'purchaser_type_FNMA_x'),
 (0.010451315380386551, 'purchaser_type_FNMA_y'),
 (0.009494245429982465, 'purchaser_type_FNMA'),
 (0.008585373978138574, 'loan_purpose_Home Purchase'),
 (0.007446652408190848, 'loan_purpose_Home Purchase_y'),
 (0.007070429702041