In [70]:
%matplotlib inline
import pandas as pd
import numpy as np 
from path import Path 
import matplotlib.pyplot as plt 
from collections import Counter
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import balanced_accuracy_score
from imblearn.metrics import classification_report_imbalanced
from imblearn.ensemble import EasyEnsembleClassifier



In [44]:
# Locations of the two CSV exports read by this notebook
sml_df_file_path = Path('./analysis/sml_df.csv')
target_df_file_path = Path('./analysis/target_df.csv')

# Load each export into its own DataFrame
sml_df = pd.read_csv(sml_df_file_path)
target_df = pd.read_csv(target_df_file_path)

In [45]:
# Preview the first rows of the loaded frame to sanity-check the columns
sml_df.head()

Unnamed: 0,action_taken,loan_amount_000s,msamd,applicant_income_000s,population,minority_population,hud_median_family_income,tract_to_msamd_income,number_of_owner_occupied_units,number_of_1_to_4_family_units,loan_type_name,property_type_name,loan_purpose_name,owner_occupancy_name,preapproval_name,applicant_ethnicity_name,co_applicant_ethnicity_name,purchaser_type_name
0,3,570.0,40900.0,144.0,4824.0,37.23,75200.0,57.419998,818.0,1626.0,Conventional,One-to-four family dwelling (other than manufa...,Home improvement,Owner-occupied as a principal dwelling,Not applicable,Not Hispanic or Latino,Not Hispanic or Latino,Loan was not originated or was not sold in cal...
1,3,185.0,40140.0,51.0,7404.0,57.52,63200.0,116.010002,1215.0,1743.0,Conventional,One-to-four family dwelling (other than manufa...,Home purchase,Not owner-occupied as a principal dwelling,Not applicable,Not Hispanic or Latino,No co-applicant,Loan was not originated or was not sold in cal...
2,1,1079.0,36084.0,278.0,3372.0,33.189999,97400.0,141.740005,592.0,1105.0,Conventional,One-to-four family dwelling (other than manufa...,Home purchase,Owner-occupied as a principal dwelling,Not applicable,"Information not provided by applicant in mail,...","Information not provided by applicant in mail,...",Loan was not originated or was not sold in cal...
3,3,417.0,36084.0,125.0,8787.0,65.129997,97400.0,97.269997,1463.0,2164.0,Conventional,One-to-four family dwelling (other than manufa...,Refinancing,Owner-occupied as a principal dwelling,Not applicable,Not Hispanic or Latino,Not Hispanic or Latino,Loan was not originated or was not sold in cal...
4,2,379.0,40900.0,86.0,5356.0,23.1,75200.0,126.690002,1711.0,2102.0,FHA-insured,One-to-four family dwelling (other than manufa...,Refinancing,Owner-occupied as a principal dwelling,Not applicable,Not Hispanic or Latino,No co-applicant,Loan was not originated or was not sold in cal...


In [46]:
# Column dtypes and non-null counts, to confirm there are no missing values
sml_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1477033 entries, 0 to 1477032
Data columns (total 18 columns):
 #   Column                          Non-Null Count    Dtype  
---  ------                          --------------    -----  
 0   action_taken                    1477033 non-null  int64  
 1   loan_amount_000s                1477033 non-null  float64
 2   msamd                           1477033 non-null  float64
 3   applicant_income_000s           1477033 non-null  float64
 4   population                      1477033 non-null  float64
 5   minority_population             1477033 non-null  float64
 6   hud_median_family_income        1477033 non-null  float64
 7   tract_to_msamd_income           1477033 non-null  float64
 8   number_of_owner_occupied_units  1477033 non-null  float64
 9   number_of_1_to_4_family_units   1477033 non-null  float64
 10  loan_type_name                  1477033 non-null  object 
 11  property_type_name              1477033 non-null  object 
 12  

In [47]:
# Helper that recodes a raw action_taken code into a binary label
def action_taken_binary(member):
    """Collapse an ``action_taken`` code into a two-class label.

    Parameters
    ----------
    member
        A single value from the ``action_taken`` column.

    Returns
    -------
    int
        0 when ``member`` equals 1, and 1 for every other value.
    """
    return 0 if member == 1 else 1

In [48]:
# Recode the label column in place: 1 -> 0, every other code -> 1.
# NOTE(review): this overwrites the raw codes, so running the cell a second
# time without reloading sml_df swaps the 0/1 labels.
sml_df['action_taken'] = sml_df['action_taken'].map(action_taken_binary)

In [49]:
# Create features: every column except the label, with get_dummies expanding
# the non-numeric columns into indicator columns
X = pd.get_dummies(sml_df.drop(columns='action_taken'))

# Create target: kept as a one-column DataFrame
y = sml_df[['action_taken']]


In [50]:
# Summary statistics for the encoded feature matrix
X.describe()

Unnamed: 0,loan_amount_000s,msamd,applicant_income_000s,population,minority_population,hud_median_family_income,tract_to_msamd_income,number_of_owner_occupied_units,number_of_1_to_4_family_units,loan_type_name_Conventional,...,purchaser_type_name_Affiliate institution,"purchaser_type_name_Commercial bank, savings bank or savings association",purchaser_type_name_Fannie Mae (FNMA),purchaser_type_name_Farmer Mac (FAMC),purchaser_type_name_Freddie Mac (FHLMC),purchaser_type_name_Ginnie Mae (GNMA),"purchaser_type_name_Life insurance company, credit union, mortgage bank, or finance company",purchaser_type_name_Loan was not originated or was not sold in calendar year covered by register,purchaser_type_name_Other type of purchaser,purchaser_type_name_Private securitization
count,1477033.0,1477033.0,1477033.0,1477033.0,1477033.0,1477033.0,1477033.0,1477033.0,1477033.0,1477033.0,...,1477033.0,1477033.0,1477033.0,1477033.0,1477033.0,1477033.0,1477033.0,1477033.0,1477033.0,1477033.0
mean,399.9415,34935.38,144.5669,5873.621,55.50147,75492.68,118.2309,1212.891,1732.202,0.8086509,...,0.007284874,0.05798313,0.1565686,1.354066e-05,0.09853402,0.0622613,0.05148971,0.5349054,0.02053509,0.01042428
std,901.4832,9458.675,462.2419,2891.405,24.61725,17054.02,48.82625,688.5472,873.4476,0.3933634,...,0.08504005,0.2337116,0.3633937,0.00367974,0.2980354,0.2416296,0.2209945,0.4987803,0.1418218,0.1015658
min,1.0,11244.0,1.0,0.0,0.0,47300.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,219.0,31084.0,68.0,4130.0,34.9,63200.0,83.07,752.0,1162.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,336.0,40140.0,103.0,5360.0,54.06,73900.0,111.22,1097.0,1576.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
75%,480.0,41740.0,160.0,6848.0,76.67,85600.0,145.67,1522.0,2106.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
max,475000.0,49700.0,442000.0,39454.0,100.0,131500.0,398.7,5229.0,7379.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [51]:
# Class balance of the binary target (0 = original code 1, 1 = any other code)
y.value_counts()

action_taken
0               810866
1               666167
dtype: int64

In [52]:

# Hold out a test partition; random_state pins the shuffle so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
)
y_test.shape

(369259, 1)

In [53]:
# Standardize the features: the scaler is fitted on the training split only,
# then the same fitted transform is applied to both splits
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train, X_test = X_scaler.transform(X_train), X_scaler.transform(X_test)

In [59]:
# SMOTE oversampling
smt = SMOTE(random_state=1)
X_resampled, y_resampled = smt.fit_resample(X_train, y_train)
# y_resampled is left exactly as fit_resample returned it. It is flattened only
# for the tally below: when y_train is a one-column DataFrame the resampled
# target comes back as a DataFrame too, which has no .ravel() method, and
# Counter(DataFrame) would count column names rather than class labels.
# np.ravel handles both a DataFrame and an already-flat array.
Counter(np.ravel(y_resampled))


KeyboardInterrupt: 

In [61]:
# Flatten the resampled target into the 1-D array the estimator expects.
# np.ravel accepts a one-column DataFrame as well as an already-flat array,
# so re-running this cell does not fail on the second pass.
y_resampled = np.ravel(y_resampled)

In [62]:
# Train logistic regression on the resampled training data, then predict the
# held-out split
model = LogisticRegression(solver='lbfgs', random_state=1).fit(X_resampled, y_resampled)
y_pred = model.predict(X_test)

In [64]:
# Confusion matrix for the current predictions, labelled for display
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(
    data=cm,
    index=['Actual 0', 'Actual 1'],
    columns=['Predicted 0', 'Predicted 1'],
)
cm_df.head()

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,154006,48820
Actual 1,19274,147159


In [65]:
# Title line followed by the imbalanced-learn classification report for the
# current predictions
print(
    'Imbalanced Classification Report - SMOTE Oversampling',
    classification_report_imbalanced(y_test, y_pred),
    sep='\n',
)

Imbalanced Classification Report - SMOTE Oversampling
                   pre       rec       spe        f1       geo       iba       sup

          0       0.89      0.76      0.88      0.82      0.82      0.66    202826
          1       0.75      0.88      0.76      0.81      0.82      0.68    166433

avg / total       0.83      0.82      0.83      0.82      0.82      0.67    369259



In [68]:
# EASY ENSEMBLE ADABOOST CLASSIFIER
# Seeded with the same random_state as the other stochastic steps in this
# notebook so the ensemble's random sampling gives the same result on every run.
eec_model = EasyEnsembleClassifier(random_state=1)
# np.ravel flattens y_train whether it is still the one-column DataFrame from
# the split or already a 1-D array, so this cell can be re-run without error.
y_train = np.ravel(y_train)
eec_model.fit(X_train, y_train)
y_pred = eec_model.predict(X_test)
# Tally of predicted labels
Counter(y_pred)

Counter({1: 188000, 0: 181259})

In [71]:
# Title line followed by the balanced accuracy of the current predictions
print(
    'Balanced Accuracy Score - Easy Ensemble AdaBoost Classifier',
    balanced_accuracy_score(y_test, y_pred),
    sep='\n',
)

Balanced Accuracy Score - Easy Ensemble AdaBoost Classifier
0.8236250364422771


In [72]:
# Heading first, then the labelled confusion matrix as the cell's output
print('Confusion Matrix - Easy Ensemble AdaBoost Classifier')
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(
    data=cm,
    index=['Actual 0', 'Actual 1'],
    columns=['Predicted 0', 'Predicted 1'],
)
cm_df.head()

Confusion Matrix - Easy Ensemble AdaBoost Classifier


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,158732,44094
Actual 1,22527,143906


In [73]:
# Title line followed by the imbalanced-learn classification report for the
# current predictions
print(
    'Imbalanced Classification Report - Easy Ensemble AdaBoost Classifier',
    classification_report_imbalanced(y_test, y_pred),
    sep='\n',
)

Imbalanced Classification Report - Easy Ensemble AdaBoost Classifier
                   pre       rec       spe        f1       geo       iba       sup

          0       0.88      0.78      0.86      0.83      0.82      0.67    202826
          1       0.77      0.86      0.78      0.81      0.82      0.68    166433

avg / total       0.83      0.82      0.83      0.82      0.82      0.68    369259

