## **Model Types:** Explainable Boosting Machines (EBM) with grid search

### **Group 3** - Bethel Mandizha, Miranda Montenegro, Zeyu Wang

-----

### **Importing Necessary Libraries**

In [1]:
#Standard library
import datetime
import itertools
import operator
import os
import time

#Data manipulation, plotting and metrics
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import metrics

#Model modules
from interpret import show
from interpret.glassbox import ExplainableBoostingClassifier
from interpret.perf import ROC

#Notebook-wide settings: the random seed reused by later cells and the worker-thread count
SEED, NTHREAD = 12345, 4

#Seed numpy's global generator so that random draws repeat from run to run
np.random.seed(SEED)


#### Starting the Global Timer

In [2]:
#Wall-clock start of the whole notebook; the last cell subtracts this value to report total runtime
tic = time.time()

### **Importing Applicable Data**

In [3]:
#Importing pre-processed training data and showing first 5 lines of the dataset
#The folder holding the CSV can be overridden with the HMDA_DATA_DIR environment variable;
#when the variable is not set, the original download location is used
DATA_DIR = os.environ.get('HMDA_DATA_DIR', '/Users/Jing/Downloads')
train_data = pd.read_csv(os.path.join(DATA_DIR, 'hmda_train_preprocessed.csv'))
train_data.head()

Unnamed: 0,row_id,black,asian,white,amind,hipac,hispanic,non_hispanic,male,female,...,conforming,debt_to_income_ratio_missing,loan_amount_std,loan_to_value_ratio_std,no_intro_rate_period_std,intro_rate_period_std,property_value_std,income_std,debt_to_income_ratio_std,high_priced
0,0,,,,,,,,1.0,0.0,...,1,0,-0.514393,0.333922,0.244394,-0.215304,-0.535932,-0.040307,0.854601,0
1,1,0.0,0.0,1.0,0.0,0.0,0.0,1.0,,,...,1,0,-0.118642,0.268727,0.244394,-0.215304,-0.227585,-0.018133,-0.425131,0
2,2,,,,,,,,,,...,1,0,-0.778227,0.228996,-4.091747,4.610857,-0.720941,-0.032338,0.123326,0
3,3,0.0,0.0,1.0,0.0,0.0,0.0,1.0,,,...,1,0,-0.07467,-1.15024,0.244394,-0.215304,0.358276,-0.018133,-0.425131,0
4,4,0.0,0.0,1.0,0.0,0.0,1.0,0.0,,,...,1,0,-0.602338,0.55252,0.244394,-0.215304,-0.628437,-0.038228,0.763191,0


In [4]:
#Importing pre-processed testing data and showing first 5 lines of the dataset
#The folder holding the CSV can be overridden with the HMDA_DATA_DIR environment variable;
#when the variable is not set, the original download location is used
DATA_DIR = os.environ.get('HMDA_DATA_DIR', '/Users/Jing/Downloads')
test_data = pd.read_csv(os.path.join(DATA_DIR, 'hmda_test_preprocessed.csv'))
test_data.head()

Unnamed: 0,row_id,black,asian,white,amind,hipac,hispanic,non_hispanic,male,female,...,term_360,conforming,debt_to_income_ratio_missing,loan_amount_std,loan_to_value_ratio_std,no_intro_rate_period_std,intro_rate_period_std,property_value_std,income_std,debt_to_income_ratio_std
0,0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,...,1,1,0,-0.514393,-0.039293,0.244394,-0.215304,-0.474263,-0.017786,0.854601
1,1,0.0,0.0,1.0,0.0,0.0,,,,,...,1,1,0,-0.162614,0.12683,0.244394,-0.215304,-0.227585,-0.015014,-0.425131
2,2,,,,,,1.0,0.0,,,...,1,1,0,-0.64631,0.55252,0.244394,-0.215304,-0.659271,-0.033378,-0.425131
3,3,,,,,,,,,,...,1,0,0,3.662982,0.133614,0.244394,-0.215304,2.763389,0.025177,0.306144
4,4,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,...,1,1,0,-0.338504,0.55252,0.244394,-0.215304,-0.443428,-0.027834,0.306144


#### Assigning modeling roles for the data

In [5]:
#Modeling roles: the response column, the demographic indicator columns
#(kept out of the model inputs), and the columns the model is trained on
target = 'high_priced'

demographic_cols = [
    'black', 'asian', 'white', 'amind', 'hipac',
    'hispanic', 'non_hispanic',
    'male', 'female',
    'agegte62', 'agelt62',
]

features = [
    'intro_rate_period_std',
    'debt_to_income_ratio_std',
    'term_360',
    'property_value_std',
    'income_std',
    'debt_to_income_ratio_missing',
]

#Echo the chosen roles so they are recorded in the cell output
print('target =', target)
print('predictors =', features)

target = high_priced
predictors = ['intro_rate_period_std', 'debt_to_income_ratio_std', 'term_360', 'property_value_std', 'income_std', 'debt_to_income_ratio_missing']


### **Fitting the Data**

In [6]:
#Re-seed here so this cell yields the same partition every time it is run
np.random.seed(SEED)

#Share of rows that go to training; the remainder becomes the validation set (70/30)
split_ratio = 0.7

#One uniform draw per row: rows whose draw falls below the ratio are training rows
split = np.random.rand(len(train_data)) < split_ratio
train, valid = train_data[split], train_data[~split]

#Report the size of each partition
print('Train data rows = %d, columns = %d' % train.shape)
print('Validation data rows = %d, columns = %d' % valid.shape)

#Benchmark - Train data rows = 112253, columns = 23
#Benchmark - Validation data rows = 48085, columns = 23

Train data rows = 112253, columns = 23
Validation data rows = 48085, columns = 23


### **Setting up Model - EBM**

In [7]:
def ebm_grid(train, valid, features, target, gs_params=None, n_models=None, early_stopping_rounds=None, seed=None, cv=5):

    """ Performs a random grid search over n_models and gs_params.

    Configurations are drawn at random (with replacement) from the cartesian
    product of gs_params. Each distinct configuration is trained once on
    `train` and scored by AUC on `valid`; the highest-scoring model is returned.

    :param train: Training data as Pandas DataFrame.
    :param valid: Validation data as Pandas DataFrame.
    :param features: Names of input features.
    :param target: Name of target feature.
    :param gs_params: Dictionary of lists of potential EBM parameters over which to search.
    :param n_models: Number of random draws from the grid. A configuration drawn
                     more than once is trained only the first time.
    :param early_stopping_rounds: EBM early stopping rounds.
    :param seed: Random seed for reproducibility; used both for drawing the
                 configurations and as the EBM random_state. Falls back to the
                 notebook-level SEED when None.
    :param cv: Not used by this function; kept so existing calls stay valid.
    :return: Best candidate model from random grid search.
    :raises ValueError: If gs_params is missing/empty, contains an empty value
                        list, or n_models is not a positive integer.

    """

    #Fail early with a clear message instead of an opaque error further down
    if not gs_params:
        raise ValueError('gs_params must be a non-empty dictionary of hyperparameter value lists.')
    if n_models is None or int(n_models) < 1:
        raise ValueError('n_models must be a positive integer.')
    n_models = int(n_models)

    #Honor the caller's seed; only fall back to the notebook-level constant when none is given
    if seed is None:
        seed = SEED

    #Cartesian product of gs_params
    keys, values = zip(*gs_params.items())
    experiments = [dict(zip(keys, v)) for v in itertools.product(*values)]
    if not experiments:
        raise ValueError('Every entry of gs_params must list at least one value.')

    #Preserve exact reproducibility for this function
    #(note: this re-seeds numpy's global generator, as the function always has)
    np.random.seed(seed)

    #Select randomly from cartesian product space
    selected_experiments = np.random.choice(len(experiments), n_models)

    #Set global params for seed, etc.
    global_params = {'n_jobs': NTHREAD,
                     'early_stopping_rounds': early_stopping_rounds,
                     'random_state': seed}

    #Init grid search loop
    best_candidate = None
    best_score = float('-inf')  # any real score beats this, including 0
    evaluated = set()           # grid indices that have already been trained

    #Grid search loop
    for i, exp in enumerate(selected_experiments):

        exp = int(exp)
        print('Grid search run %d/%d:' % (i + 1, n_models))

        #Sampling is with replacement: retraining an identical configuration would
        #reproduce the same score, so skip it rather than pay for the fit again
        if exp in evaluated:
            print('Skipping parameters already evaluated in an earlier run:', experiments[exp])
            print('---------- ----------')
            continue
        evaluated.add(exp)

        #Fresh dict per run: global params overridden by the current grid run params
        params = {**global_params, **experiments[exp]}
        print('Training with parameters:', params)

        #Train
        ebm = ExplainableBoostingClassifier(**params)
        candidate = ebm.fit(train[features], train[target])

        #Calculate AUC
        ebm_perf = ROC(ebm.predict_proba).explain_perf(valid[features], valid[target])
        candidate_best_score = ebm_perf._internal_obj['overall']['auc']

        #Determine if current model is better than previous best
        if candidate_best_score > best_score:
            best_candidate = candidate
            best_score = candidate_best_score
            print('Grid search new best score discovered at iteration %d/%d: %.4f.' %
                  (i + 1, n_models, candidate_best_score))

        print('---------- ----------')

    return best_candidate

#### Fit EBM with random grid search of 150 models

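The grid search has been modified by evaluating a larger `number of models` (n=150), adding 1,024 and 2,048 as options for `max_bins`, adding 24 and 40 as options for `max_interaction_bins`, adding 7 as an option for `max_leaves`, and defining 5 `cross-validation folds` instead of the standard 3. Note that `cv=5` is only the default of the `ebm_grid` signature: the function body does not use it, so each candidate is scored on the single held-out validation set.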

In [None]:
#Candidate values for each EBM hyperparameter explored by the grid search
gs_params = dict(
    max_bins=[128, 256, 512, 1024, 2048],
    max_interaction_bins=[16, 24, 32, 40, 64],
    interactions=[5, 10, 15],
    outer_bags=[4, 8, 12],
    inner_bags=[0, 4],
    learning_rate=[0.001, 0.01, 0.05],
    validation_size=[0.1, 0.25, 0.5],
    min_samples_leaf=[1, 2, 5, 10],
    max_leaves=[1, 3, 5, 7],
)

#Time the search on its own, separately from the notebook-wide timer
ebm_tic = time.time()

#Random search: 150 draws from the grid, each fit with 100 early stopping rounds
best_ebm = ebm_grid(
    train, valid, features, target,
    gs_params=gs_params,
    n_models=150,
    early_stopping_rounds=100,
    seed=SEED,
)

#Report how long the search took
ebm_toc = time.time() - ebm_tic
print('EBM training completed in %.2f s.' % ebm_toc)

Grid search run 1/150:
Training with parameters: {'n_jobs': 4, 'early_stopping_rounds': 100, 'random_state': 12345, 'max_bins': 256, 'max_interaction_bins': 40, 'interactions': 5, 'outer_bags': 4, 'inner_bags': 4, 'learning_rate': 0.01, 'validation_size': 0.5, 'min_samples_leaf': 1, 'max_leaves': 5}
Grid search new best score discovered at iteration 1/150: 0.7810.
---------- ----------
Grid search run 2/150:
Training with parameters: {'n_jobs': 4, 'early_stopping_rounds': 100, 'random_state': 12345, 'max_bins': 128, 'max_interaction_bins': 64, 'interactions': 10, 'outer_bags': 8, 'inner_bags': 4, 'learning_rate': 0.01, 'validation_size': 0.5, 'min_samples_leaf': 2, 'max_leaves': 3}
Grid search new best score discovered at iteration 2/150: 0.7817.
---------- ----------
Grid search run 3/150:
Training with parameters: {'n_jobs': 4, 'early_stopping_rounds': 100, 'random_state': 12345, 'max_bins': 2048, 'max_interaction_bins': 64, 'interactions': 15, 'outer_bags': 12, 'inner_bags': 4, 'lea

#### Computing the Validation AUC for this Model

In [None]:
#AUC of the model returned by the grid search, on the held-out validation rows
best_ebm_perf = ROC(best_ebm.predict_proba).explain_perf(valid[features], valid[target])
validation_auc = best_ebm_perf._internal_obj['overall']['auc']
print('Validation AUC: %.4f.' % validation_auc)

#Same metric on the training rows, for comparison with the validation figure
best_ebm_train = ROC(best_ebm.predict_proba).explain_perf(train[features], train[target])
train_auc = best_ebm_train._internal_obj['overall']['auc']
print('Train AUC: %.4f.' % train_auc)

In [None]:
#List the value the winning model uses for every hyperparameter that was searched
print("Best hyperparameters from grid search:")
fitted_params = best_ebm.get_params()
for param in gs_params:
    print(f"{param}: {fitted_params[param]}")

#### Score Validation with Model

In [182]:
#Stop the global timer (tic) and report the total elapsed seconds
toc = time.time() - tic
print('All tasks completed in %.2f s.' % toc)

All tasks completed in 36335.79 s.
