# License 
***
Copyright (C) 2017 -- 2022 J. Patrick Hall, jphall@gwu.edu

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

## Installs and Imports

1. Standard Python imports

In [None]:
import pandas as pd # import pandas for easy data manipulation using data frames

from matplotlib import pyplot as plt # plotting
import seaborn as sns                # slightly better plotting 

# to upload local files
import io
from google.colab import files  

2. Install Java

In [None]:
# install Java for h2o backend
!apt-get install default-jre
!java -version

3. Install H2O

In [None]:
# install h2o
!pip install h2o 

4. Import h2o package and required classes

In [None]:
# import h2o and required classes
import h2o
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from h2o.grid.grid_search import H2OGridSearch

## Load and explore example data

5. Upload class example data

In [None]:
# special Google Colab command to upload a file from your computer
uploaded = files.upload() # REQUIRES STUDENT INPUT

In [None]:
# 6
uploaded.keys() # what is stored in that Python object?

7. Convert to Pandas DataFrame

In [None]:
# convert data to Pandas DataFrame
raw = pd.read_csv(io.StringIO(uploaded['loan_clean.csv'].decode('utf-8'))) # name in quotes here must match name in quotes directly above 

8. Summary and descriptive statistics

In [None]:
# summary and descriptive statistics
raw.describe()

9. View histograms

In [None]:
_ = raw[raw.columns].hist(bins=50, figsize=(15, 15)) # display histograms

10. View correlation heatmap

In [None]:
# correlation heatmap
corr = raw.corr()
plt.figure(figsize=(10, 10))
_ = sns.heatmap(corr, 
                xticklabels=corr.columns.values,
                yticklabels=corr.columns.values)

## Train penalized GLM model to predict loan default

11. Assign global constants

In [None]:
x_names = [] # REQUIRES STUDENT INPUT
y_name = '' # REQUIRES STUDENT INPUT

12. Start h2o server

In [None]:
# start h2o
h2o.init()

13. Function for penalized GLM training that selects good alpha and lambda

In [None]:
def glm_grid(x_names, y_name, htrain, hvalid=None, seed_=12345):

    """ Train a grid of binomial penalized GLMs over alpha and return the top model.

    Only alpha is part of the grid; lambda is searched inside each model
    because the base estimator is created with lambda_search=True.

    :param x_names: List of inputs.
    :param y_name: Name of target variable.
    :param htrain: Training H2OFrame.
    :param hvalid: Validation H2OFrame, default None.
    :param seed_: Random seed for better reproducibility, default 12345.
    :return: First H2OGeneralizedLinearEstimator of the grid returned by
             get_grid(), which is treated as the best model.
    """

    alpha_opts = []  # REQUIRES STUDENT INPUT

    # base estimator shared by every member of the grid
    base_glm = H2OGeneralizedLinearEstimator(family="binomial",
                                             lambda_search=True,
                                             seed=seed_)

    # the grid varies alpha only; lambda search is handled by the estimator
    search = H2OGridSearch(base_glm, hyper_params={'alpha': alpha_opts})

    # fit one model per alpha value
    search.train(x=x_names,
                 y=y_name,
                 training_frame=htrain,
                 validation_frame=hvalid,
                 seed=seed_)

    # the first entry of the grid is taken as the selected model
    return search.get_grid()[0]


14. Convert from Pandas DataFrame to H2OFrame

In [None]:
train = h2o.H2OFrame(raw) # load Pandas DataFrame in H2OFrame
train[y_name] = train[y_name].asfactor() # ensures h2o treats y/target as categorical and not numeric

15. Train model using `glm_grid` function

In [None]:
loan_glm = # REQUIRES STUDENT INPUT

16. Print learned coefficients

In [None]:
# print trained model parameters
print('Model parameters:')
for name, val in loan_glm.coef().items():
    print(name + ':', val)
print()

17. View elastic net trace plot

In [None]:
# view trace plot

# collect regularization paths from dict in DataFrame
# reg_path_dict['coefficients'] is a list holding one {coefficient name: value}
# dict per step of the path, so the whole list becomes a DataFrame in one call
# (one row per step). This replaces a row-by-row DataFrame.append loop:
# DataFrame.append was removed in pandas 2.0 and copied the frame on every call.
reg_path_dict = loan_glm.getGLMRegularizationPath(loan_glm)
reg_path_frame = pd.DataFrame(reg_path_dict['coefficients'])

# keep only the features whose coefficient in the last row of the path
# is larger than 0.0001 in absolute value
final_coefs = reg_path_frame.iloc[-1, :]
glm_selected = list(final_coefs[final_coefs.abs() > 0.0001].index)

# plot regularization paths
fig, ax_ = plt.subplots(figsize=(8, 6))
_ = reg_path_frame[glm_selected].plot(kind='line', ax=ax_, title='Penalized GLM Regularization Paths',
                                      colormap='gnuplot')
_ = ax_.set_xlabel('Iteration')
_ = ax_.set_ylabel('Coefficient Value')

# anchor the legend outside the axes, to the right of the plot
_ = plt.legend(bbox_to_anchor=(1.05, 0),
               loc=3,
               borderaxespad=0.)

18. Assign most important feature for evaluation below

In [None]:
# set most important feature 
most_important = '' # REQUIRES STUDENT INPUT

19. Print AUC statistic

In [None]:
# print AUC
print('AUC: %.4f' % loan_glm.auc())

20. View ten rows of actual vs. predicted

In [None]:
# score the training frame with the fitted model
preds = loan_glm.predict(train)

# copy the 'p1' prediction column into the Pandas DataFrame
# (presumably the predicted probability of class 1 -- confirm against h2o docs)
raw['P_bad_loan'] = preds['p1'].as_data_frame()

# view actual vs. predicted for the first ten rows
raw[[y_name, 'P_bad_loan']].head(n=10)

21. View actual vs. predicted for most important feature

In [None]:
# display y vs. yhat for visual evaluation
fig, ax_ = plt.subplots(figsize=(8, 6))
_ = raw.plot(kind='scatter', x=most_important, y=y_name, s=0.01, color='blue', ax=ax_)
_ = raw.plot(kind='scatter', x=most_important, y='P_bad_loan', s=0.01, color='orange', ax=ax_)

# add custom legend
from matplotlib.lines import Line2D 
custom_lines = [Line2D([0], [0], color='blue', lw=2),
                Line2D([0], [0], color='orange', lw=2)]
_ = ax_.legend(custom_lines, [y_name, 'P_bad_loan'])

22. Generate a prediction for a new customer

In [None]:
# predict on new data -- REQUIRES STUDENT INPUT
new_row = h2o.H2OFrame({
  "GRP_REP_home_ownership": ,
  "GRP_addr_state": ,
  "GRP_purpose": ,
  "GRP_verification_status": ,
  "STD_IMP_REP_annual_inc": ,
  "STD_IMP_REP_delinq_2yrs": ,
  "STD_IMP_REP_dti": ,
  "STD_IMP_REP_emp_length": ,
  "STD_IMP_REP_int_rate": ,
  "STD_IMP_REP_loan_amnt": ,
  "STD_IMP_REP_longest_credit_lengt": ,
  "STD_IMP_REP_revol_util": ,
  "STD_IMP_REP_term_length": ,
  "STD_IMP_REP_total_acc": 
}) 

# generate prediction -- REQUIRES STUDENT INPUT


23. Shutdown h2o

In [None]:
# shutdown h2o
h2o.cluster().shutdown()