In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [3]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [4]:
# Read the numeric loan dataset into memory.
# low_memory=False is passed through to pandas so the file is not parsed in
# internal chunks.
path = Path('./data/numeric_database.csv')
df = pd.read_csv(filepath_or_buffer=path, low_memory=False)

In [5]:
def fix_target_column(member):
    """Collapse a single ``action_taken`` code into a binary label.

    Parameters
    ----------
    member : scalar
        One value taken from the ``action_taken`` column.

    Returns
    -------
    int
        ``0`` when ``member`` compares equal to 1, ``1`` for any other value.
    """
    return 0 if member == 1 else 1


In [6]:
# Recode the target element-wise: fix_target_column turns 1 into 0 and every
# other code into 1.
# NOTE(review): the column is overwritten in place, so running this cell a
# second time on the same `df` flips the labels again -- reload `df` first.
df['action_taken'] = df['action_taken'].map(fix_target_column)

In [7]:
# Class balance of the recoded target, displayed as this cell's output.
target_counts = df['action_taken'].value_counts()
target_counts

0    828891
1    680132
Name: action_taken, dtype: int64

In [8]:
# Feature matrix: every column of `df` except the target and the
# `sequence_number` column. `drop` hands back a new frame, so `df` itself is
# left untouched.
X = df.drop(columns=['action_taken', 'sequence_number'])
X.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1509023 entries, 0 to 1509022
Data columns (total 21 columns):
 #   Column                          Non-Null Count    Dtype  
---  ------                          --------------    -----  
 0   population                      1509023 non-null  float64
 1   minority_population             1509023 non-null  float64
 2   hud_median_family_income        1509023 non-null  float64
 3   tract_to_msamd_income           1509023 non-null  float64
 4   number_of_owner_occupied_units  1509023 non-null  float64
 5   number_of_1_to_4_family_units   1509023 non-null  float64
 6   applicant_sex                   1509023 non-null  int64  
 7   co_applicant_sex                1509023 non-null  int64  
 8   applicant_ethnicity             1509023 non-null  int64  
 9   co_applicant_ethnicity          1509023 non-null  int64  
 10  applicant_race_1                1509023 non-null  int64  
 11  co_applicant_race_1             1509023 non-null  int64  
 12  

In [9]:
# Target vector as a plain 1-D NumPy array for scikit-learn.
# `Series.ravel()` is deprecated in recent pandas releases; `to_numpy()`
# yields the same array without the deprecation warning.
y = df['action_taken'].to_numpy()

In [10]:
# Standardise every feature column of X.
# NOTE(review): the scaler is fitted on all rows of X, i.e. before any
# train/test split, so held-out rows contribute to the fitted statistics.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled_df = scaler.fit_transform(X)
X_scaler = scaler  # the same fitted object that `scaler.fit(X)` hands back

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# train test split
# Split the unscaled features first (seed 3, stratified on the target), then
# fit the scaler on the training rows only so that no statistics from the
# test rows leak into preprocessing. Both splits are transformed with that
# training-fitted scaler.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, random_state=3, stratify=y)
train_scaler = StandardScaler().fit(X_train_raw)
X_train = train_scaler.transform(X_train_raw)
X_test = train_scaler.transform(X_test_raw)

In [18]:
# Easy Ensemble Classifier 
from imblearn.ensemble import EasyEnsembleClassifier
# Seed the ensemble so its random under-sampling (and therefore the reported
# scores) can be reproduced on a fresh run; 3 matches the seed used for the
# train/test split.
eec_model = EasyEnsembleClassifier(random_state=3)
eec_model.fit(X_train, y_train)

# NOTE(review): `y_pred` is also assigned by the balanced random forest cell,
# so the metric cells report whichever model was predicted last.
y_pred = eec_model.predict(X_test)

In [19]:
# Balanced accuracy for the predictions currently held in `y_pred`.
balanced_acc = balanced_accuracy_score(y_test, y_pred)
print(balanced_acc)

0.8287392760686865


In [22]:
# Confusion matrix for the current `y_pred`, wrapped in a labelled DataFrame
# (rows = actual class, columns = predicted class) for display.
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(
    data=cm,
    index=['Actual 0', 'Actual 1'],
    columns=['Predicted 0', 'Predicted 1'],
)
cm_df.head()

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,164647,42576
Actual 1,23305,146728


In [23]:
# Per-class report from imbalanced-learn for the current `y_pred`.
imbalanced_report = classification_report_imbalanced(y_test, y_pred)
print(imbalanced_report)

                   pre       rec       spe        f1       geo       iba       sup

          0       0.88      0.79      0.86      0.83      0.83      0.68    207223
          1       0.78      0.86      0.79      0.82      0.83      0.69    170033

avg / total       0.83      0.83      0.83      0.83      0.83      0.69    377256



In [12]:
# balanced random forest classifier 
from imblearn.ensemble import BalancedRandomForestClassifier
# Seed the forest so its bootstrap/under-sampling draws (and therefore the
# reported scores and feature importances) can be reproduced on a fresh run;
# 3 matches the seed used for the train/test split.
brfc = BalancedRandomForestClassifier(random_state=3)
brfc.fit(X_train, y_train)
# NOTE(review): `y_pred` is also assigned by the Easy Ensemble cell, so the
# metric cells report whichever model was predicted last.
y_pred = brfc.predict(X_test)

In [17]:
# Headline, then the balanced accuracy for the current `y_pred`.
print('Balanced Accuracy Score: Balanced Random Forest Classifier')
balanced_acc = balanced_accuracy_score(y_test, y_pred)
print(balanced_acc)

Balanced Accuracy Score: Balanced Random Forest Classifier
0.8258958861151321


In [15]:
# Confusion matrix for the current `y_pred`, wrapped in a labelled DataFrame
# (rows = actual class, columns = predicted class) for display.
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(
    data=cm,
    index=['Actual 0', 'Actual 1'],
    columns=['Predicted 0', 'Predicted 1'],
)
cm_df.head()

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,167845,39378
Actual 1,26896,143137


In [16]:
# Per-class report from imbalanced-learn for the current `y_pred`.
imbalanced_report = classification_report_imbalanced(y_test, y_pred)
print(imbalanced_report)

                   pre       rec       spe        f1       geo       iba       sup

          0       0.86      0.81      0.84      0.84      0.83      0.68    207223
          1       0.78      0.84      0.81      0.81      0.83      0.68    170033

avg / total       0.83      0.82      0.83      0.82      0.83      0.68    377256



In [18]:
# Pair each fitted importance with its column name from X and list the pairs
# from most to least important (ties fall back to the column name, since the
# sort compares whole tuples).
importances = brfc.feature_importances_
print('BRFC Column Importances')
ranked_features = list(zip(importances, X.columns))
ranked_features.sort(reverse=True)
ranked_features

BRFC Column Importances


[(0.403714828854716, 'purchaser_type'),
 (0.10023820371272817, 'loan_amount_000s'),
 (0.09702797425366244, 'applicant_income_000s'),
 (0.049970501201192966, 'tract_to_msamd_income'),
 (0.04884636748227927, 'minority_population'),
 (0.04653149460543755, 'population'),
 (0.046131734783810746, 'number_of_owner_occupied_units'),
 (0.04610597177973866, 'number_of_1_to_4_family_units'),
 (0.025322930103075864, 'applicant_race_1'),
 (0.0203805306286869, 'hud_median_family_income'),
 (0.019539225984863445, 'applicant_sex'),
 (0.019455708188232997, 'county_code'),
 (0.018587053237749195, 'applicant_ethnicity'),
 (0.01746359949145156, 'loan_purpose'),
 (0.009100557429951127, 'loan_type'),
 (0.009030066307430425, 'co_applicant_race_1'),
 (0.007566228414683993, 'co_applicant_sex'),
 (0.006748155508170132, 'co_applicant_ethnicity'),
 (0.005677244943304983, 'owner_occupancy'),
 (0.0023697479105204903, 'property_type'),
 (0.000191875178313258, 'hoepa_status')]