In [1]:
import pandas as pd
import tensorflow as tf
from pathlib import Path
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Read the action-taken summary CSV into a DataFrame and preview its first five rows
file_path = Path("SQL_cleaning_resources/action_taken_summary.csv")
df = pd.read_csv(filepath_or_buffer=file_path)
df.head(n=5)

Unnamed: 0,as_of_year,agency_code,loan_type,property_type,loan_purpose,owner_occupancy,loan_amount_000s,preapproval,action_taken,action_taken_summary,...,purchaser_type,denial_reason_1,hoepa_status,lien_status,population,minority_population,hud_median_family_income,tract_to_msamd_income,number_of_owner_occupied_units,number_of_1_to_4_family_units
0,7/9/05,9,1,1,1,1,424,3,1,1.0,...,3,0,2,1,6585.0,70.16,75200.0,110.94,1352.0,1612.0
1,7/9/05,2,1,1,3,1,153,3,4,,...,0,0,2,1,8152.0,22.46,67300.0,96.86,2404.0,3345.0
2,7/9/05,9,1,1,3,1,56,3,3,0.0,...,0,3,2,1,7388.0,4.24,65500.0,120.1,2651.0,2999.0
3,7/9/05,7,1,1,3,1,207,3,1,1.0,...,1,0,2,1,6993.0,3.15,87200.0,107.98,2168.0,2732.0
4,7/9/05,7,1,1,1,1,424,3,4,,...,0,0,2,1,4775.0,67.64,51800.0,219.02,1305.0,1448.0


In [3]:
# Per-column count of missing values in the raw frame
df.isna().sum()

as_of_year                           0
agency_code                          0
loan_type                            0
property_type                        0
loan_purpose                         0
owner_occupancy                      0
loan_amount_000s                     0
preapproval                          0
action_taken                         0
action_taken_summary              3724
msamd                             2827
state_code                         325
county_code                        394
census_tract_number                457
applicant_ethnicity                  0
co_applicant_ethnicity               0
applicant_race_1                     0
co_applicant_race_1                  0
applicant_sex                        0
co_applicant_sex                     0
applicant_income_000s             3100
purchaser_type                       0
denial_reason_1                      0
hoepa_status                         0
lien_status                          0
population               

In [11]:
# Build the modelling frame: remove `action_taken` and `as_of_year`, then
# discard every row that still contains a missing value in any column.
# NOTE(review): `action_taken` is presumably dropped because
# `action_taken_summary` is derived from it — confirm.
df2 = (
    df
    .drop(columns=["action_taken", "as_of_year"])
    .dropna()
)

# Sanity check: every column should now report zero missing values
df2.isna().sum()

agency_code                       0
loan_type                         0
property_type                     0
loan_purpose                      0
owner_occupancy                   0
loan_amount_000s                  0
preapproval                       0
action_taken_summary              0
msamd                             0
state_code                        0
county_code                       0
census_tract_number               0
applicant_ethnicity               0
co_applicant_ethnicity            0
applicant_race_1                  0
co_applicant_race_1               0
applicant_sex                     0
co_applicant_sex                  0
applicant_income_000s             0
purchaser_type                    0
denial_reason_1                   0
hoepa_status                      0
lien_status                       0
population                        0
minority_population               0
hud_median_family_income          0
tract_to_msamd_income             0
number_of_owner_occupied_uni

In [12]:
# Separate the label column from the predictor columns.
# NOTE(review): `denial_reason_1` and `purchaser_type` dominate the random-forest
# feature importances at the end of this notebook and look like fields that are
# only filled in after the lending decision — possible target leakage. Confirm
# against the data dictionary before trusting the reported scores.
target_col = "action_taken_summary"
y = df2[target_col]
X = df2.drop(columns=[target_col])

In [13]:
# Hold out a test partition; the fixed seed keeps the shuffle identical between runs
train_test_parts = train_test_split(X, y, random_state=21)
X_train, X_test, y_train, y_test = train_test_parts

In [14]:
# Standardise the features. The scaler is fitted on the training partition only
# and the same fitted scaler is then applied to both partitions.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled, X_test_scaled = (
    X_scaler.transform(part) for part in (X_train, X_test)
)

In [15]:
# Creating the decision tree classifier instance and fitting it to the scaled
# training data.
# random_state is pinned (same seed as the train/test split and the random
# forest in this notebook): without it the tree can differ between runs, so the
# confusion matrix and accuracy below would not be reproducible on
# Restart & Run All.
model = tree.DecisionTreeClassifier(random_state=21)
model = model.fit(X_train_scaled, y_train)

In [17]:
# Decision-tree class predictions for the scaled test partition
predictions = model.predict(X=X_test_scaled)

In [18]:
# Share of test rows the decision tree classified correctly
acc_score = accuracy_score(y_test, predictions)

# Confusion matrix wrapped in a labelled frame (rows: actual class, columns: predicted class)
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    data=cm,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)

In [19]:
# Show the decision-tree evaluation: confusion matrix, accuracy, per-class report
tree_report = classification_report(y_test, predictions)

print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score: {acc_score}")
print(tree_report)

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,596,121
Actual 1,113,2743


Accuracy Score: 0.9345088161209067
              precision    recall  f1-score   support

         0.0       0.84      0.83      0.84       717
         1.0       0.96      0.96      0.96      2856

    accuracy                           0.93      3573
   macro avg       0.90      0.90      0.90      3573
weighted avg       0.93      0.93      0.93      3573



In [20]:
# Random forest classifier: 500 trees, fixed seed
rf_model = RandomForestClassifier(
    random_state=21,
    n_estimators=500,
)

In [21]:
# Train the forest on the scaled training partition
rf_model = rf_model.fit(X=X_train_scaled, y=y_train)

In [27]:
# Random-forest class predictions for the scaled test partition
rf_predictions = rf_model.predict(X=X_test_scaled)

In [28]:
# Calculating the confusion matrix for the random forest.
# It must be built from the forest's own test-set predictions (`rf_predictions`);
# scoring `predictions` here would silently re-report the decision tree.
rf_cm = confusion_matrix(y_test, rf_predictions)

# Labelled view of the matrix (rows: actual class, columns: predicted class)
rf_cm_df = pd.DataFrame(rf_cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

# Calculating accuracy score for the random forest
rf_acc_score = accuracy_score(y_test, rf_predictions)

In [29]:
# Displaying results for the random forest
print("Confusion Matrix")
display(rf_cm_df)
print(f"Accuracy Score : {rf_acc_score}")
print("Classification Report")
# Score the forest's own predictions (`rf_predictions`); `predictions` holds the
# decision tree's output and would print the wrong model's report.
print(classification_report(y_test, rf_predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,598,119
Actual 1,74,2782


Accuracy Score : 0.9459837671424574
Classification Report
              precision    recall  f1-score   support

         0.0       0.89      0.83      0.86       717
         1.0       0.96      0.97      0.97      2856

    accuracy                           0.95      3573
   macro avg       0.92      0.90      0.91      3573
weighted avg       0.95      0.95      0.95      3573



In [31]:
# Pair each importance score with its column name, then order the pairs from
# highest to lowest importance
imp_list = list(zip(rf_model.feature_importances_, X.columns))
imp_list.sort(reverse=True)
imp_list

[(0.30988966258141765, 'denial_reason_1'),
 (0.25956982624713965, 'purchaser_type'),
 (0.050507882195858674, 'agency_code'),
 (0.03659933460537553, 'applicant_income_000s'),
 (0.032230681262797614, 'loan_amount_000s'),
 (0.02515164917266415, 'minority_population'),
 (0.023706289973303478, 'tract_to_msamd_income'),
 (0.02282621047258952, 'census_tract_number'),
 (0.020579157366010816, 'population'),
 (0.020406507335042148, 'number_of_1_to_4_family_units'),
 (0.020059741259851636, 'number_of_owner_occupied_units'),
 (0.019583211944576396, 'county_code'),
 (0.019545771849316173, 'msamd'),
 (0.019390107709036906, 'hud_median_family_income'),
 (0.017693578089985312, 'loan_purpose'),
 (0.01749867172417757, 'lien_status'),
 (0.016414433933114776, 'property_type'),
 (0.016373673265563077, 'state_code'),
 (0.01451498368871719, 'loan_type'),
 (0.006369974720950051, 'applicant_race_1'),
 (0.005513127971726035, 'applicant_sex'),
 (0.005302525900951376, 'applicant_ethnicity'),
 (0.00486018427023607