In [72]:
%matplotlib inline
import pickle as pk
import pandas as pd
import tensorflow as tf
from pathlib import Path
from sklearn import tree
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Load data
# The path is relative, so it resolves against the notebook's working directory.
# NOTE(review): unpickling can execute arbitrary code - only load this file if it
# comes from a trusted source.
file_path = Path("Resources/pickled_raws.pkl")
df = pd.read_pickle(file_path)
# Preview the first rows as a sanity check on the load
df.head()

In [None]:
# Discard the columns that are not used in the rest of this notebook
df = df.drop(
    labels=[
        "action_taken",
        "purchaser_type",
        "denial_reason_1",
        "lien_status",
        "state_code",
        "hoepa_status",
    ],
    axis="columns",
)

In [None]:
# Print the column dtypes and non-null counts of the remaining columns
df.info()

In [None]:
# Start of the row clean-up: remove N/A, incomplete, or irrelevant records.
# Only home-purchase loans are kept; the column is constant afterwards, so it is dropped.
is_home_purchase = df["loan_purpose"] == "Home Purchase"
df = df.loc[is_home_purchase].drop(columns="loan_purpose")

In [None]:
# Exclude rows where owner occupancy is marked as not applicable
df = df[df["owner_occupancy"].ne("Not Applicable")]

In [None]:
# Exclude rows where the applicant's sex was withheld or does not apply
df = df.loc[~df["applicant_sex"].isin(["Information not Provided", "Not Applicable"])]

In [None]:
# Exclude rows where the co-applicant's sex was withheld or does not apply
df = df.loc[~df["co_applicant_sex"].isin(["Information not Provided", "Not Applicable"])]

In [None]:
# Exclude rows where the applicant's ethnicity was withheld or does not apply
df = df.loc[~df["applicant_ethnicity"].isin(["Not Applicable", "Info not Provided by Applicant"])]

In [None]:
# Exclude rows where the co-applicant's ethnicity was withheld or does not apply
df = df.loc[~df["co_applicant_ethnicity"].isin(["Not Applicable", "Info not Provided by Applicant"])]

In [None]:
# Exclude rows where preapproval is marked as not applicable
df = df[df["preapproval"].ne("Not Applicable")]

In [None]:
# Remove outlier homes by capping the loan amount
# (presumably expressed in thousands, going by the column name - confirm against the data dictionary)
max_loan_amount_000s = 1650
df = df.loc[df["loan_amount_000s"].le(max_loan_amount_000s)]

In [None]:
# Keep only approvals and denials in the target by excluding summary code 2
df = df.loc[df["action_taken_summary"].ne(2)]

In [None]:
# Non-null count per column after the row filters above
df.count()

In [None]:
# Preview the filtered frame
df.head()

In [None]:
# Replace the index left over from row filtering with a fresh 0..n-1 index
# (the old index is discarded, not kept as a column). The one-hot encoded frame
# built later is merged back on index, so the two frames must share row labels.
df = df.reset_index(drop=True)

In [None]:
# Collect the names of every object-dtype column; these are treated as categorical
action_cat = [name for name, dtype in df.dtypes.items() if dtype == "object"]
# Number of distinct values in each categorical column
df[action_cat].nunique()

In [None]:
# Create a OneHotEncoder instance that returns a dense array.
# scikit-learn renamed the `sparse` keyword to `sparse_output` (1.2) and later
# removed the old spelling, so try the current name first and fall back for
# older installations.
try:
    enc = OneHotEncoder(sparse_output=False)
except TypeError:
    enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encoded_values = enc.fit_transform(df[action_cat])

# Look up the encoded variable names. `get_feature_names` was removed in
# scikit-learn 1.2 in favour of `get_feature_names_out`; support both.
if hasattr(enc, "get_feature_names_out"):
    encoded_names = enc.get_feature_names_out(action_cat)
else:
    encoded_names = enc.get_feature_names(action_cat)

# Build the dummy frame on df's own index so that an index-based merge lines up
# row for row even if df's index is not a clean 0..n-1 range.
encode_df = pd.DataFrame(encoded_values, index=df.index, columns=encoded_names)
encode_df.head()

In [None]:
# Attach the dummy columns by index, then remove the original categorical columns
df = pd.merge(df, encode_df, left_index=True, right_index=True).drop(columns=action_cat)
# Non-null count per column after the merge
df.count()

In [None]:
# List the column names after encoding
df.columns

In [None]:
# For binary categories one dummy fully determines the other, so drop the inverse column
inverse_dummy_columns = [
    "owner_occupancy_Not Owner Occupied",
    "applicant_sex_Male",
    "applicant_ethnicity_Not Hispanic or Latino",
    "preapproval_Preapproval Not Requested",
]
df = df.drop(labels=inverse_dummy_columns, axis="columns")

In [None]:
# Separate the feature matrix from the target column
target_column = "action_taken_summary"
X = df.drop(columns=target_column)
y = df[target_column]

In [None]:
# Hold out a test partition; the fixed seed keeps the split reproducible.
# No stratification is requested here.
partitions = train_test_split(X, y, random_state=21)
X_train, X_test, y_train, y_test = partitions

In [None]:
# Standardise the features: the scaler is fitted on the training partition only,
# then the same fitted transform is applied to both partitions.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled, X_test_scaled = (X_scaler.transform(part) for part in (X_train, X_test))

In [None]:
# Configure the random forest classifier; hyperparameters are gathered in one place
rf_params = {
    "n_estimators": 100,
    "max_depth": 3,
    "max_features": 3,
    "class_weight": "balanced",
    "random_state": 21,
}
model = RandomForestClassifier(**rf_params)

In [None]:
# Fit model on the scaled training partition.
# The result of fit() is assigned back to `model`, which also keeps the cell
# from echoing the estimator as its output.
model = model.fit(X_train_scaled, y_train)

In [None]:
# Predict the target for the scaled held-out test partition
predictions = model.predict(X_test_scaled)

In [None]:
# Calculate confusion matrix and accuracy score.
# The class labels come from the fitted model rather than being hard-coded as
# 0/1, so the row/column captions always match the actual target codes, and
# passing `labels=` guarantees one row and column per class even if a class is
# missing from the test partition.
class_labels = model.classes_
cm = confusion_matrix(y_test, predictions, labels=class_labels)
cm_df = pd.DataFrame(
    cm,
    index=[f"Actual {label}" for label in class_labels],
    columns=[f"Predicted {label}" for label in class_labels],
)

acc_score = accuracy_score(y_test, predictions)

In [None]:
# Show the evaluation summary: confusion matrix, accuracy, then per-class metrics
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score: {acc_score}")
report = classification_report(y_test, predictions)
print(report)

In [None]:
# Pair each importance with its feature name and order from most to least important
imp_list = list(zip(model.feature_importances_, X.columns))
imp_list.sort(reverse=True)
imp_list

In [None]:
# Horizontal bar chart of the ten largest feature importances
importances = pd.Series(model.feature_importances_, index=X.columns)
importances.nlargest(10).plot.barh()