In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.ensemble import EasyEnsembleClassifier

from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [3]:
# Location of the input CSV, relative to the notebook's working directory.
path = Path('./data/numeric_database.csv')
# low_memory=False parses the file whole instead of in chunks, which avoids
# mixed-dtype inference on large files.
df = pd.read_csv(filepath_or_buffer=path, low_memory=False)

In [4]:
def fix_target_column(member):
    """Recode a raw target value into a binary label.

    Returns 0 when ``member`` equals 1, and 1 for every other value.
    """
    return 0 if member == 1 else 1


In [5]:
# Binarize the target in one vectorized pass: 1 -> 0, anything else -> 1
# (the same mapping fix_target_column applies element by element).
# NOTE: this overwrites the column in place, so running the cell a second
# time inverts the labels; re-run the data-loading cell first.
df['action_taken'] = df['action_taken'].ne(1).astype('int64')

In [6]:
# Class balance of the recoded target column.
target_counts = df['action_taken'].value_counts()
target_counts

0    828891
1    680132
Name: action_taken, dtype: int64

In [7]:
# Feature matrix: every column except the target and the sequence_number column.
# DataFrame.drop returns a new frame, so no explicit copy of df is needed,
# and columns= already implies axis=1.
X = df.drop(columns=['action_taken', 'sequence_number'])
X.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1509023 entries, 0 to 1509022
Data columns (total 21 columns):
 #   Column                          Non-Null Count    Dtype  
---  ------                          --------------    -----  
 0   population                      1509023 non-null  float64
 1   minority_population             1509023 non-null  float64
 2   hud_median_family_income        1509023 non-null  float64
 3   tract_to_msamd_income           1509023 non-null  float64
 4   number_of_owner_occupied_units  1509023 non-null  float64
 5   number_of_1_to_4_family_units   1509023 non-null  float64
 6   applicant_sex                   1509023 non-null  int64  
 7   co_applicant_sex                1509023 non-null  int64  
 8   applicant_ethnicity             1509023 non-null  int64  
 9   co_applicant_ethnicity          1509023 non-null  int64  
 10  applicant_race_1                1509023 non-null  int64  
 11  co_applicant_race_1             1509023 non-null  int64  
 12  

In [8]:
# Target labels as a 1-D NumPy array.
# Series.ravel() is deprecated as of pandas 2.2; to_numpy() is the supported way.
y = df['action_taken'].to_numpy()

In [9]:
#scale data
scaler = StandardScaler()
X_scaler = scaler.fit(X)
X_scaled_df = X_scaler.transform(X)

In [10]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled_df, y, random_state=3, stratify=y)

In [11]:
# Easy Ensemble Classifier
# random_state pins the random under-sampling of the ensemble so that
# re-running the notebook reproduces the reported scores.
eec_model = EasyEnsembleClassifier(random_state=3)
eec_model.fit(X_train, y_train)

y_pred = eec_model.predict(X_test)

In [12]:
# Balanced accuracy of the baseline model on the held-out split.
print('Balanced Accuracy Score: Easy Ensemble AdaBoost Classifier')
eec_balanced_accuracy = balanced_accuracy_score(y_test, y_pred)
print(eec_balanced_accuracy)

0.828865300368754


In [13]:
# Confusion matrix as a labelled frame: rows are true classes, columns are
# predicted classes.
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(
    data=cm,
    index=['Actual 0', 'Actual 1'],
    columns=['Predicted 0', 'Predicted 1'],
)
print('Confusion Matrix: Easy Ensemble AdaBoost Classifier')
cm_df.head()

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,165205,42018
Actual 1,23720,146313


In [14]:
# Per-class precision / recall / specificity / F1 / geometric mean / IBA.
# The title names the model actually evaluated here (Easy Ensemble).
print('Imbalanced Classification Report: Easy Ensemble AdaBoost Classifier')
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.87      0.80      0.86      0.83      0.83      0.68    207223
          1       0.78      0.86      0.80      0.82      0.83      0.69    170033

avg / total       0.83      0.83      0.83      0.83      0.83      0.69    377256



In [15]:
# Feature matrix without the hoepa_status column ("NH" = no HOEPA status).
# DataFrame.drop returns a new frame, so X itself is left untouched and no
# explicit copy is needed.
X_NH = X.drop(columns='hoepa_status')

In [16]:
# Same stratified split as the baseline run (same random_state), then
# standardize so this experiment differs from the baseline only in the
# dropped column, not in the preprocessing as well.
X_train_NH_raw, X_test_NH_raw, y_train_NH, y_test_NH = train_test_split(
    X_NH, y, random_state=3, stratify=y
)
# Fit on the training rows only so no test-set statistics reach the transform.
scaler_NH = StandardScaler().fit(X_train_NH_raw)
X_train_NH = scaler_NH.transform(X_train_NH_raw)
X_test_NH = scaler_NH.transform(X_test_NH_raw)

In [17]:
# Use a separate estimator for the no-HOEPA run: re-fitting eec_model would
# overwrite the baseline model, so re-running an earlier prediction cell
# would silently use the wrong fit. random_state keeps the run reproducible.
eec_model_NH = EasyEnsembleClassifier(random_state=3)
eec_model_NH.fit(X_train_NH, y_train_NH)

y_pred_NH = eec_model_NH.predict(X_test_NH)

In [18]:
# Balanced accuracy of the no-HOEPA model on the held-out split.
print('Balanced Accuracy Score: Easy Ensemble AdaBoost Classifier - NO HOEPA STATUS')
eec_balanced_accuracy_NH = balanced_accuracy_score(y_test_NH, y_pred_NH)
print(eec_balanced_accuracy_NH)

Balanced Accuracy Score: Easy Ensemble AdaBoost Classifier - NO HOEPA STATUS
0.8286977318525881


In [19]:
# Confusion matrix for the no-HOEPA model: rows are true classes, columns are
# predicted classes.
cm_NH = confusion_matrix(y_test_NH, y_pred_NH)
cm_df_NH = pd.DataFrame(
    data=cm_NH,
    index=['Actual 0', 'Actual 1'],
    columns=['Predicted 0', 'Predicted 1'],
)
print('Confusion Matrix: Easy Ensemble AdaBoost Classifier - NO HOEPA STAT')
cm_df_NH.head()

Confusion Matrix: Easy Ensemble AdaBoost Classifier - NO HOEPA STAT


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,164670,42553
Actual 1,23338,146695


In [21]:
# Per-class report for the no-HOEPA run.
# The title names the model and feature set actually evaluated here.
print('Imbalanced Classification Report: Easy Ensemble AdaBoost Classifier - NO HOEPA STATUS')
print(classification_report_imbalanced(y_test_NH, y_pred_NH))

Imbalanced Classification Report: Balanced Random Forest Classifier - REDUCED
                   pre       rec       spe        f1       geo       iba       sup

          0       0.88      0.79      0.86      0.83      0.83      0.68    207223
          1       0.78      0.86      0.79      0.82      0.83      0.69    170033

avg / total       0.83      0.83      0.83      0.83      0.83      0.69    377256

