# Ensemble Learning

## Initial Imports

In [39]:
import warnings

# Silence every warning category for the rest of the session
warnings.simplefilter('ignore')

In [40]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [41]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

## Read the CSV and Perform Basic Data Cleaning

In [42]:
# Read the LoanStats CSV into a DataFrame
file_path = Path("Resources/LoanStats_2019Q1.csv")
df = pd.read_csv(filepath_or_buffer=file_path)

# Show the first five rows as a quick sanity check
df.head(5)

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,dti,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,10500.0,0.1719,375.35,RENT,66000.0,Source Verified,Mar-2019,low_risk,n,27.24,...,85.7,100.0,0.0,0.0,65687.0,38199.0,2000.0,61987.0,N,N
1,25000.0,0.2,929.09,MORTGAGE,105000.0,Verified,Mar-2019,low_risk,n,20.23,...,91.2,50.0,1.0,0.0,271427.0,60641.0,41200.0,49197.0,N,N
2,20000.0,0.2,529.88,MORTGAGE,56000.0,Verified,Mar-2019,low_risk,n,24.26,...,66.7,50.0,0.0,0.0,60644.0,45684.0,7500.0,43144.0,N,N
3,10000.0,0.164,353.55,RENT,92000.0,Verified,Mar-2019,low_risk,n,31.44,...,100.0,50.0,1.0,0.0,99506.0,68784.0,19700.0,76506.0,N,N
4,22000.0,0.1474,520.39,MORTGAGE,52000.0,Not Verified,Mar-2019,low_risk,n,18.76,...,100.0,0.0,0.0,0.0,219750.0,25919.0,27600.0,20000.0,N,N


In [43]:
# Show the column labels of the raw frame
df.columns

Index(['loan_amnt', 'int_rate', 'installment', 'home_ownership', 'annual_inc',
       'verification_status', 'issue_d', 'loan_status', 'pymnt_plan', 'dti',
       'delinq_2yrs', 'inq_last_6mths', 'open_acc', 'pub_rec', 'revol_bal',
       'total_acc', 'initial_list_status', 'out_prncp', 'out_prncp_inv',
       'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int',
       'total_rec_late_fee', 'recoveries', 'collection_recovery_fee',
       'last_pymnt_amnt', 'next_pymnt_d', 'collections_12_mths_ex_med',
       'policy_code', 'application_type', 'acc_now_delinq', 'tot_coll_amt',
       'tot_cur_bal', 'open_acc_6m', 'open_act_il', 'open_il_12m',
       'open_il_24m', 'mths_since_rcnt_il', 'total_bal_il', 'il_util',
       'open_rv_12m', 'open_rv_24m', 'max_bal_bc', 'all_util',
       'total_rev_hi_lim', 'inq_fi', 'total_cu_tl', 'inq_last_12m',
       'acc_open_past_24mths', 'avg_cur_bal', 'bc_open_to_buy', 'bc_util',
       'chargeoff_within_12_mths', 'delinq_amnt', 'mo_si

In [44]:
# Print the per-column non-null counts and dtypes of the raw frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68817 entries, 0 to 68816
Data columns (total 86 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   loan_amnt                   68817 non-null  float64
 1   int_rate                    68817 non-null  float64
 2   installment                 68817 non-null  float64
 3   home_ownership              68817 non-null  object 
 4   annual_inc                  68817 non-null  float64
 5   verification_status         68817 non-null  object 
 6   issue_d                     68817 non-null  object 
 7   loan_status                 68817 non-null  object 
 8   pymnt_plan                  68817 non-null  object 
 9   dti                         68817 non-null  float64
 10  delinq_2yrs                 68817 non-null  float64
 11  inq_last_6mths              68817 non-null  float64
 12  open_acc                    68817 non-null  float64
 13  pub_rec                     688

In [45]:
# Inspect the home_ownership values (this column is one-hot encoded further down)
df.loc[:, "home_ownership"]

0            RENT
1        MORTGAGE
2        MORTGAGE
3            RENT
4        MORTGAGE
           ...   
68812        RENT
68813        RENT
68814    MORTGAGE
68815    MORTGAGE
68816    MORTGAGE
Name: home_ownership, Length: 68817, dtype: object

In [46]:
# Inspect the verification_status values (this column is one-hot encoded further down)
df.loc[:, "verification_status"]

0        Source Verified
1               Verified
2               Verified
3               Verified
4           Not Verified
              ...       
68812    Source Verified
68813       Not Verified
68814    Source Verified
68815           Verified
68816           Verified
Name: verification_status, Length: 68817, dtype: object

In [47]:
# Inspect the issue_d values (a date column that is dropped further down)
df.loc[:, "issue_d"]

0        Mar-2019
1        Mar-2019
2        Mar-2019
3        Mar-2019
4        Mar-2019
           ...   
68812    Jan-2019
68813    Jan-2019
68814    Jan-2019
68815    Jan-2019
68816    Jan-2019
Name: issue_d, Length: 68817, dtype: object

In [48]:
# Inspect the loan_status values (this column becomes the target further down)
df.loc[:, "loan_status"]

0        low_risk
1        low_risk
2        low_risk
3        low_risk
4        low_risk
           ...   
68812    low_risk
68813    low_risk
68814    low_risk
68815    low_risk
68816    low_risk
Name: loan_status, Length: 68817, dtype: object

In [49]:
# Inspect the pymnt_plan values (this column is one-hot encoded further down)
df.loc[:, "pymnt_plan"]

0        n
1        n
2        n
3        n
4        n
        ..
68812    n
68813    n
68814    n
68815    n
68816    n
Name: pymnt_plan, Length: 68817, dtype: object

In [50]:
# Inspect the initial_list_status values (this column is one-hot encoded further down)
df.loc[:, "initial_list_status"]

0        w
1        w
2        w
3        w
4        w
        ..
68812    w
68813    w
68814    w
68815    f
68816    w
Name: initial_list_status, Length: 68817, dtype: object

In [51]:
# Inspect the next_pymnt_d values (a date column that is dropped further down)
df.loc[:, "next_pymnt_d"]

0        May-2019
1        May-2019
2        May-2019
3        May-2019
4        May-2019
           ...   
68812    May-2019
68813    May-2019
68814    May-2019
68815    May-2019
68816    May-2019
Name: next_pymnt_d, Length: 68817, dtype: object

In [52]:
# Inspect the application_type values (this column is one-hot encoded further down)
df.loc[:, "application_type"]

0        Individual
1        Individual
2        Individual
3        Individual
4        Individual
            ...    
68812    Individual
68813    Individual
68814    Individual
68815    Individual
68816    Individual
Name: application_type, Length: 68817, dtype: object

In [53]:
# Inspect the hardship_flag values (this column is one-hot encoded further down)
df.loc[:, "hardship_flag"]

0        N
1        N
2        N
3        N
4        N
        ..
68812    N
68813    N
68814    N
68815    N
68816    N
Name: hardship_flag, Length: 68817, dtype: object

In [54]:
# Inspect the debt_settlement_flag values (this column is one-hot encoded further down)
df.loc[:, "debt_settlement_flag"]

0        N
1        N
2        N
3        N
4        N
        ..
68812    N
68813    N
68814    N
68815    N
68816    N
Name: debt_settlement_flag, Length: 68817, dtype: object

In [55]:
# Drop the two date columns before the features are built.
# Re-assigning (instead of inplace=True) and ignoring labels that are already
# gone keeps this cell safe to re-run: a second execution no longer raises
# KeyError because the columns were removed by the first one.
df = df.drop(columns=["issue_d", "next_pymnt_d"], errors="ignore")
df.head()

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,loan_status,pymnt_plan,dti,delinq_2yrs,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,10500.0,0.1719,375.35,RENT,66000.0,Source Verified,low_risk,n,27.24,0.0,...,85.7,100.0,0.0,0.0,65687.0,38199.0,2000.0,61987.0,N,N
1,25000.0,0.2,929.09,MORTGAGE,105000.0,Verified,low_risk,n,20.23,0.0,...,91.2,50.0,1.0,0.0,271427.0,60641.0,41200.0,49197.0,N,N
2,20000.0,0.2,529.88,MORTGAGE,56000.0,Verified,low_risk,n,24.26,0.0,...,66.7,50.0,0.0,0.0,60644.0,45684.0,7500.0,43144.0,N,N
3,10000.0,0.164,353.55,RENT,92000.0,Verified,low_risk,n,31.44,0.0,...,100.0,50.0,1.0,0.0,99506.0,68784.0,19700.0,76506.0,N,N
4,22000.0,0.1474,520.39,MORTGAGE,52000.0,Not Verified,low_risk,n,18.76,0.0,...,100.0,0.0,0.0,0.0,219750.0,25919.0,27600.0,20000.0,N,N


In [56]:
# One-hot encode the categorical (string-valued) feature columns;
# loan_status is left untouched because it is the target
df2 = pd.get_dummies(
    df,
    columns=[
        "home_ownership",
        "verification_status",
        "pymnt_plan",
        "application_type",
        "initial_list_status",
        "hardship_flag",
        "debt_settlement_flag",
    ],
)
df2.head()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,loan_status,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,...,verification_status_Not Verified,verification_status_Source Verified,verification_status_Verified,pymnt_plan_n,application_type_Individual,application_type_Joint App,initial_list_status_f,initial_list_status_w,hardship_flag_N,debt_settlement_flag_N
0,10500.0,0.1719,375.35,66000.0,low_risk,27.24,0.0,0.0,8.0,0.0,...,0,1,0,1,1,0,0,1,1,1
1,25000.0,0.2,929.09,105000.0,low_risk,20.23,0.0,0.0,17.0,1.0,...,0,0,1,1,1,0,0,1,1,1
2,20000.0,0.2,529.88,56000.0,low_risk,24.26,0.0,0.0,8.0,0.0,...,0,0,1,1,1,0,0,1,1,1
3,10000.0,0.164,353.55,92000.0,low_risk,31.44,0.0,1.0,10.0,1.0,...,0,0,1,1,1,0,0,1,1,1
4,22000.0,0.1474,520.39,52000.0,low_risk,18.76,0.0,1.0,14.0,0.0,...,1,0,0,1,1,0,0,1,1,1


In [57]:
# Print the per-column non-null counts and dtypes of the encoded frame
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68817 entries, 0 to 68816
Data columns (total 91 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   loan_amnt                            68817 non-null  float64
 1   int_rate                             68817 non-null  float64
 2   installment                          68817 non-null  float64
 3   annual_inc                           68817 non-null  float64
 4   loan_status                          68817 non-null  object 
 5   dti                                  68817 non-null  float64
 6   delinq_2yrs                          68817 non-null  float64
 7   inq_last_6mths                       68817 non-null  float64
 8   open_acc                             68817 non-null  float64
 9   pub_rec                              68817 non-null  float64
 10  revol_bal                            68817 non-null  float64
 11  total_acc                   

## Split the Data into Training and Testing

In [58]:
# Features: every encoded column except the loan_status label
# (drop returns a new frame, so df2 itself is left unchanged)
X = df2.drop(columns="loan_status")
X.head()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,verification_status_Not Verified,verification_status_Source Verified,verification_status_Verified,pymnt_plan_n,application_type_Individual,application_type_Joint App,initial_list_status_f,initial_list_status_w,hardship_flag_N,debt_settlement_flag_N
0,10500.0,0.1719,375.35,66000.0,27.24,0.0,0.0,8.0,0.0,1609.0,...,0,1,0,1,1,0,0,1,1,1
1,25000.0,0.2,929.09,105000.0,20.23,0.0,0.0,17.0,1.0,18368.0,...,0,0,1,1,1,0,0,1,1,1
2,20000.0,0.2,529.88,56000.0,24.26,0.0,0.0,8.0,0.0,13247.0,...,0,0,1,1,1,0,0,1,1,1
3,10000.0,0.164,353.55,92000.0,31.44,0.0,1.0,10.0,1.0,17996.0,...,0,0,1,1,1,0,0,1,1,1
4,22000.0,0.1474,520.39,52000.0,18.76,0.0,1.0,14.0,0.0,9091.0,...,1,0,0,1,1,0,0,1,1,1


In [59]:
# Summary statistics (count, mean, std, quartiles, min/max) for every feature column
X.describe()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,verification_status_Not Verified,verification_status_Source Verified,verification_status_Verified,pymnt_plan_n,application_type_Individual,application_type_Joint App,initial_list_status_f,initial_list_status_w,hardship_flag_N,debt_settlement_flag_N
count,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,...,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0,68817.0
mean,16677.594562,0.127718,480.652863,88213.71,21.778153,0.217766,0.497697,12.58734,0.12603,17604.142828,...,0.478007,0.373992,0.148001,1.0,0.86034,0.13966,0.123879,0.876121,1.0,1.0
std,10277.34859,0.04813,288.062432,115580.0,20.199244,0.718367,0.758122,6.022869,0.336797,21835.8804,...,0.49952,0.483865,0.355104,0.0,0.346637,0.346637,0.329446,0.329446,0.0,0.0
min,1000.0,0.06,30.89,40.0,0.0,0.0,0.0,2.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0
25%,9000.0,0.0881,265.73,50000.0,13.89,0.0,0.0,8.0,0.0,6293.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0
50%,15000.0,0.118,404.56,73000.0,19.76,0.0,0.0,11.0,0.0,12068.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0
75%,24000.0,0.1557,648.1,104000.0,26.66,0.0,1.0,16.0,0.0,21735.0,...,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0
max,40000.0,0.3084,1676.23,8797500.0,999.0,18.0,5.0,72.0,4.0,587191.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [60]:
# Target: the loan_status label of each loan
y = df2.loc[:, "loan_status"]
y.head()

0    low_risk
1    low_risk
2    low_risk
3    low_risk
4    low_risk
Name: loan_status, dtype: object

In [61]:
# Count the rows per class to see how imbalanced the target is
y.value_counts()

low_risk     68470
high_risk      347
Name: loan_status, dtype: int64

In [62]:
# Split features and target into train/test partitions.
# The split is stratified on y and uses a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=1)
X_train.shape

(51612, 90)

## Data Pre-Processing

Scale the training and testing data using the `StandardScaler` from `sklearn`. Remember that when scaling the data, you only scale the features data (`X_train` and `X_test`).

In [63]:
# Create the StandardScaler instance (default settings) used to standardise the features
scaler = StandardScaler()

In [64]:
# Fit the scaler on the training split only, so that no statistics
# from the test split leak into the scaling parameters
X_scaler = scaler.fit(X_train)

In [65]:
# Apply the training-fitted scaler to both splits
X_train_scaled, X_test_scaled = X_scaler.transform(X_train), X_scaler.transform(X_test)

## Ensemble Learners

In this section, you will compare two ensemble algorithms to determine which algorithm results in the best performance. You will train a Balanced Random Forest Classifier and an Easy Ensemble Classifier. For each algorithm, be sure to complete the following steps:

1. Train the model using the training data. 
2. Calculate the balanced accuracy score from sklearn.metrics.
3. Display the confusion matrix from sklearn.metrics.
4. Generate a classification report using `classification_report_imbalanced` from imbalanced-learn.
5. For the Balanced Random Forest Classifier only, print the feature importance sorted in descending order (most important feature to least important) along with the feature score

Note: Use a random state of 1 for each algorithm to ensure consistency between tests

### Balanced Random Forest Classifier

In [66]:
# Train a Balanced Random Forest (100 estimators, fixed seed) on the scaled training split
from imblearn.ensemble import BalancedRandomForestClassifier

brfc = BalancedRandomForestClassifier(random_state=1, n_estimators=100)
brfc.fit(X=X_train_scaled, y=y_train)

BalancedRandomForestClassifier(random_state=1)

In [67]:
# Predict on the scaled test split, then calculate the balanced accuracy score
y_pred_brfc = brfc.predict(X_test_scaled)
bas_brfc = balanced_accuracy_score(y_true=y_test, y_pred=y_pred_brfc)

In [68]:
# Confusion matrix for the Balanced Random Forest predictions.
# The class order is pinned with `labels=` so the matrix rows/columns are
# guaranteed to line up with the hand-written axis titles below, instead of
# relying on confusion_matrix's implicit sorted-label ordering.
cm_brfc = confusion_matrix(y_test, y_pred_brfc, labels=["high_risk", "low_risk"])
cm_brfc_df = pd.DataFrame(
    cm_brfc,
    index=["Actual high_risk", "Actual low_risk"],
    columns=["Predicted high_risk", "Predicted low_risk"],
)

In [69]:
# Build the imbalanced-learn classification report text for the Balanced Random Forest
cri_brfc = classification_report_imbalanced(y_true=y_test, y_pred=y_pred_brfc)

In [70]:
# Rank the features by the Balanced Random Forest's importance scores.
# Fetch the importance array once and reuse it below (it was previously
# assigned but never used, with the attribute read a second time).
importances = brfc.feature_importances_

# Pair each score with its column name from X and sort from most to least important
importances_sorted = sorted(zip(importances, X.columns), reverse=True)

# Show the ten highest-scoring features
importances_sorted[:10]

[(0.08201924196461595, 'total_rec_prncp'),
 (0.06303398109180933, 'last_pymnt_amnt'),
 (0.057469519822358366, 'total_rec_int'),
 (0.05601443488194615, 'total_pymnt_inv'),
 (0.04736200648261744, 'total_pymnt'),
 (0.02718177107333871, 'int_rate'),
 (0.02192798900805973, 'annual_inc'),
 (0.021444118545983253, 'dti'),
 (0.01967510006079375, 'installment'),
 (0.01932115010126018, 'all_util')]

In [71]:
# Display the Balanced Random Forest results computed in the cells above.
# The report text already stored in cri_brfc is printed as-is rather than
# recomputing the classification report a second time.
print("Confusion Matrix (Balanced Random Forest Classifier)")
display(cm_brfc_df)
print(f"Balanced Accuracy Score : {bas_brfc}")
print("Classification Report")
print(cri_brfc)
print("The Top 10 Most Important Features")
print(importances_sorted[:10])

Confusion Matrix (Balanced Random Forest Classifier)


Unnamed: 0,Predicted high_risk,Predicted low_risk
Actual high_risk,53,34
Actual low_risk,2773,14345


Balanced Accuracy Score : 0.7236010893957158
Classification Report
                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.02      0.61      0.84      0.04      0.71      0.50        87
   low_risk       1.00      0.84      0.61      0.91      0.71      0.52     17118

avg / total       0.99      0.84      0.61      0.91      0.71      0.52     17205

The Top 10 Most Important Features
[(0.08201924196461595, 'total_rec_prncp'), (0.06303398109180933, 'last_pymnt_amnt'), (0.057469519822358366, 'total_rec_int'), (0.05601443488194615, 'total_pymnt_inv'), (0.04736200648261744, 'total_pymnt'), (0.02718177107333871, 'int_rate'), (0.02192798900805973, 'annual_inc'), (0.021444118545983253, 'dti'), (0.01967510006079375, 'installment'), (0.01932115010126018, 'all_util')]


### Easy Ensemble Classifier

In [72]:
# Train an Easy Ensemble classifier (100 estimators, fixed seed) on the scaled training split
from imblearn.ensemble import EasyEnsembleClassifier

eec = EasyEnsembleClassifier(random_state=1, n_estimators=100)
eec.fit(X=X_train_scaled, y=y_train)

EasyEnsembleClassifier(n_estimators=100, random_state=1)

In [73]:
# Predict on the scaled test split, then calculate the balanced accuracy score
y_pred_eec = eec.predict(X_test_scaled)
bas_eec = balanced_accuracy_score(y_true=y_test, y_pred=y_pred_eec)

In [74]:
# Confusion matrix for the Easy Ensemble predictions.
# The class order is pinned with `labels=` so the matrix rows/columns are
# guaranteed to line up with the hand-written axis titles below, instead of
# relying on confusion_matrix's implicit sorted-label ordering.
cm_eec = confusion_matrix(y_test, y_pred_eec, labels=["high_risk", "low_risk"])
cm_eec_df = pd.DataFrame(
    cm_eec,
    index=["Actual high_risk", "Actual low_risk"],
    columns=["Predicted high_risk", "Predicted low_risk"],
)

In [75]:
# Build the imbalanced-learn classification report text for the Easy Ensemble classifier
cri_eec = classification_report_imbalanced(y_true=y_test, y_pred=y_pred_eec)

In [76]:
# Display the Easy Ensemble results computed in the cells above.
# The report text already stored in cri_eec is printed as-is rather than
# recomputing the classification report a second time.
print("Confusion Matrix (Easy Ensemble Classifier)")
display(cm_eec_df)
print(f"Balanced Accuracy Score : {bas_eec}")
print("Classification Report")
print(cri_eec)

Confusion Matrix (Easy Ensemble Classifier)


Unnamed: 0,Predicted high_risk,Predicted low_risk
Actual high_risk,61,26
Actual low_risk,3640,13478


Balanced Accuracy Score : 0.7442538807707959
Classification Report
                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.02      0.70      0.79      0.03      0.74      0.55        87
   low_risk       1.00      0.79      0.70      0.88      0.74      0.56     17118

avg / total       0.99      0.79      0.70      0.88      0.74      0.56     17205



### Final Questions

1. Which model had the best balanced accuracy score?

    Easy Ensemble Classifier (0.74)

2. Which model had the best recall score?

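    Balanced Random Forest Classifier (0.84 average recall). Note that for the `high_risk` class alone, the Easy Ensemble Classifier has the higher recall (0.70 vs. 0.61).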

3. Which model had the best geometric mean score?

    Easy Ensemble Classifier (0.74)

4. What are the top three features?

 - (0.08201924196461595, 'total_rec_prncp')
 - (0.06303398109180933, 'last_pymnt_amnt')
 - (0.057469519822358366, 'total_rec_int')