In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [21]:
from sklearn.model_selection import train_test_split
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.ensemble import EasyEnsembleClassifier
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from sklearn.preprocessing import StandardScaler

# Read the CSV and Perform Basic Data Cleaning

In [4]:
# Load the data
encoded_file = Path('./Instructions/Resources/Loan_Stats_Encoded.csv')
encoded_df = pd.read_csv(encoded_file)

encoded_df.head()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,verification_status,loan_status,pymnt_plan,dti,delinq_2yrs,inq_last_6mths,...,debt_settlement_flag,home_ownership_ANY,home_ownership_MORTGAGE,home_ownership_OWN,home_ownership_RENT,issue_d_Feb-2019,issue_d_Jan-2019,issue_d_Mar-2019,next_pymnt_d_Apr-2019,next_pymnt_d_May-2019
0,10500.0,0.1719,375.35,66000.0,1,low_risk,0,27.24,0.0,0.0,...,0,0,0,0,1,0,0,1,0,1
1,25000.0,0.2,929.09,105000.0,2,low_risk,0,20.23,0.0,0.0,...,0,0,1,0,0,0,0,1,0,1
2,20000.0,0.2,529.88,56000.0,2,low_risk,0,24.26,0.0,0.0,...,0,0,1,0,0,0,0,1,0,1
3,10000.0,0.164,353.55,92000.0,2,low_risk,0,31.44,0.0,1.0,...,0,0,0,0,1,0,0,1,0,1
4,22000.0,0.1474,520.39,52000.0,0,low_risk,0,18.76,0.0,1.0,...,0,0,1,0,0,0,0,1,0,1


# Split the Data into Training and Testing

In [5]:
# Create our features
X = encoded_df.drop(columns="loan_status")

# Create our target
y = encoded_df["loan_status"].values.reshape(-1, 1)

In [6]:
X.describe()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,verification_status,pymnt_plan,dti,delinq_2yrs,inq_last_6mths,open_acc,...,debt_settlement_flag,home_ownership_ANY,home_ownership_MORTGAGE,home_ownership_OWN,home_ownership_RENT,issue_d_Feb-2019,issue_d_Jan-2019,issue_d_Mar-2019,next_pymnt_d_Apr-2019,next_pymnt_d_May-2019
count,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,...,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0
mean,16677.594562,0.127718,480.652863,88213.71,0.669994,0.0,21.778153,0.217766,0.497697,12.58734,...,0.0,0.009285,0.526309,0.106747,0.357659,0.371696,0.451066,0.177238,0.383161,0.616839
std,10277.34859,0.04813,288.062432,115580.0,0.719105,0.0,20.199244,0.718367,0.758122,6.022869,...,0.0,0.095914,0.499311,0.308793,0.479314,0.483261,0.497603,0.381873,0.486161,0.486161
min,1000.0,0.06,30.89,40.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,9000.0,0.0881,265.73,50000.0,0.0,0.0,13.89,0.0,0.0,8.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,15000.0,0.118,404.56,73000.0,1.0,0.0,19.76,0.0,0.0,11.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,24000.0,0.1557,648.1,104000.0,1.0,0.0,26.66,0.0,1.0,16.0,...,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0
max,40000.0,0.3084,1676.23,8797500.0,2.0,0.0,999.0,18.0,5.0,72.0,...,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [9]:
y[:5]

array([['low_risk'],
       ['low_risk'],
       ['low_risk'],
       ['low_risk'],
       ['low_risk']], dtype=object)

In [8]:
# Split the X and y into X_train, X_test, y_train, y_test
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [11]:
# Scale the Features Data
scaler = StandardScaler()

X_scaler = scaler.fit(X_train)

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Ensemble Learners

In this section, you will compare two ensemble algorithms to determine which algorithm results in the best performance. You will train a Balanced Random Forest Classifier and an Easy Ensemble AdaBoost classifier . For each algorithm, be sure to complete the folliowing steps:

1. Train the model using the training data. 
2. Calculate the balanced accuracy score from sklearn.metrics.
3. Print the confusion matrix from sklearn.metrics.
4. Generate a classication report using the `imbalanced_classification_report` from imbalanced-learn.
5. For the Balanced Random Forest Classifier onely, print the feature importance sorted in descending order (most important feature to least important) along with the feature score

Note: Use a random state of 1 for each algorithm to ensure consistency between tests

### Balanced Random Forest Classifier

In [12]:
# Balanced Random Forest Classifier
brf = BalancedRandomForestClassifier(n_estimators=100, random_state=1)

brf_model = brf.fit(X_train_scaled, y_train)

In [13]:
y_pred_brf = brf.predict(X_test_scaled)

In [14]:
# Calculated the balanced accuracy score
bac_brf = balanced_accuracy_score(y_test, y_pred_brf)

print(f"The Balanced Accuracy Score for the Balanced Random Forest Classifier is: {bac_brf}")

The Balanced Accuracy Score for the Balanced Random Forest Classifier is: 0.7866453565375247


In [15]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred_brf)

array([[   68,    33],
       [ 1710, 15394]], dtype=int64)

In [16]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred_brf))

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.04      0.67      0.90      0.07      0.78      0.59       101
   low_risk       1.00      0.90      0.67      0.95      0.78      0.62     17104

avg / total       0.99      0.90      0.67      0.94      0.78      0.62     17205



In [20]:
# List the features sorted in descending order by feature importance
importances_sorted = sorted(zip(brf.feature_importances_, X.columns), reverse=True)
importances_sorted

[(0.07560249426466197, 'total_rec_prncp'),
 (0.06924600991065172, 'total_pymnt_inv'),
 (0.06570223143777987, 'total_pymnt'),
 (0.06255445304910133, 'last_pymnt_amnt'),
 (0.05449363174821038, 'total_rec_int'),
 (0.03604756294400813, 'int_rate'),
 (0.02791019980305406, 'issue_d_Jan-2019'),
 (0.020157917539742828, 'installment'),
 (0.018100538209977916, 'dti'),
 (0.01656232946937654, 'max_bal_bc'),
 (0.016396210240536016, 'out_prncp_inv'),
 (0.014976619582856683, 'out_prncp'),
 (0.014929607722183514, 'issue_d_Mar-2019'),
 (0.01463413669203754, 'annual_inc'),
 (0.014621735727840694, 'total_bal_ex_mort'),
 (0.01441357820432573, 'mths_since_recent_inq'),
 (0.014382673833780052, 'tot_hi_cred_lim'),
 (0.014059401051987249, 'il_util'),
 (0.013723442569092066, 'total_rev_hi_lim'),
 (0.013512072591084157, 'mo_sin_old_il_acct'),
 (0.013411921556405628, 'total_bc_limit'),
 (0.013345757767477618, 'avg_cur_bal'),
 (0.013048984888385653, 'loan_amnt'),
 (0.01288177100829073, 'bc_util'),
 (0.01283922968

### Easy Ensemble AdaBoost Classifier

In [22]:
# Train the Classifier
eec = EasyEnsembleClassifier(n_estimators=100, random_state=1)

eec_model = eec.fit(X_train_scaled, y_train)

In [23]:
y_pred_eec = eec.predict(X_test_scaled)

In [24]:
# Calculated the balanced accuracy score
bac_eec = balanced_accuracy_score(y_test, y_pred_eec)

print(f"The Balanced Accuracy Score for the Balanced Random Forest Classifier is: {bac_eec}")

The Balanced Accuracy Score for the Balanced Random Forest Classifier is: 0.9328878543841288


In [25]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred_eec)

array([[   93,     8],
       [  941, 16163]], dtype=int64)

In [26]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred_eec))

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.09      0.92      0.94      0.16      0.93      0.87       101
   low_risk       1.00      0.94      0.92      0.97      0.93      0.87     17104

avg / total       0.99      0.94      0.92      0.97      0.93      0.87     17205

