<a href="https://colab.research.google.com/github/jphall663/GWU_ML/blob/main/notebook/lecture_10/Assignment_10.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# License 
***
Copyright (C) 2017-2022 J. Patrick Hall, jphall@gwu.edu

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

***
# Kaggle House Prices with GLRM Matrix Factorization Example

1. General imports and inits

In [None]:
import matplotlib as plt
%matplotlib inline
import numpy as np
import pandas as pd

# to upload local files
import io
from google.colab import files  

SEED = 12345 # for better reproducibility

2. Install Java for h2o 

In [None]:
# install Java for h2o backend
!apt-get install default-jre
!java -version

3. Install h2o

In [None]:
# install h2o
!pip install h2o 

4. Import and start h2o

In [None]:
# h2o is imported in this cell rather than with the general imports because
# it is only installed by the pip cell that runs just before this one
import h2o
from h2o.estimators.glrm import H2OGeneralizedLowRankEstimator
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from h2o.grid.grid_search import H2OGridSearch 
# start h2o; the frames, GLRMs and GLM grid used in later cells all go through it
h2o.init()

## Helper Functions

5. Determine data types

In [None]:
def get_type_lists(frame, rejects=['Id', 'SalePrice']):

    """Partition a frame's column names into numeric and categorical lists.

    A column counts as categorical when its entry in ``frame.types`` is
    'enum'; every other column that is not rejected counts as numeric.
    Both lists are also printed.

    :param frame: Frame whose ``types`` mapping (column name -> type) is read.
    :param rejects: Column names to leave out of both returned lists.
    :return: Tuple ``(nums, cats)`` of numeric and categorical column names,
             each in the order the columns appear in ``frame.types``.

    """

    # drop rejected columns once, then split what is left by type
    kept = [(name, kind) for name, kind in frame.types.items()
            if name not in rejects]

    cats = [name for name, kind in kept if kind == 'enum']
    nums = [name for name, kind in kept if kind != 'enum']

    print('Numeric =', nums)
    print()
    print('Categorical =', cats)

    return nums, cats

6. Impute with GLRM

In [None]:
def glrm_num_impute(role, frame, k=10, seed=None, x=None):

    """ Helper function for imputing numeric variables using GLRM.

    A new GLRM is initialized and trained on ``frame`` every time this
    function is called; the returned frame is that model's ``predict``
    output for ``frame``.

    :param role: Role of frame to be imputed; only used to label printed output.
    :param frame: H2OFrame to be imputed.
    :param k: Number of GLRM features to create. STUDENT INPUT: 10 is the
              starting value, tune it and check the score history plot.
    :param seed: Random seed for the GLRM. Defaults to the notebook-level SEED.
    :param x: Names of the numeric columns to train on. Defaults to the
              notebook-level ``original_nums`` list.
    :return: H2OFrame of imputed numeric features.

    """

    # fall back to the notebook-level settings so existing two-argument calls
    # keep working; resolved at call time, not at definition time
    if seed is None:
        seed = SEED
    if x is None:
        x = original_nums

    # count missing values in the numeric columns that are about to be imputed,
    # so this printout covers the same columns as the one after imputation
    print(role + ' missing:\n', [cnt for cnt in frame[x].nacnt() if cnt != 0.0])

    # initialize GLRM
    matrix_complete_glrm = H2OGeneralizedLowRankEstimator(
        k=k,                     # number of features to create
        transform='STANDARDIZE', # <- seems very important
        gamma_x=0.001,           # regularization on values in X
        gamma_y=0.05,            # regularization on values in Y
        seed=seed,               # fixed seed for reproducible factorization
        impute_original=True)

    # train GLRM
    matrix_complete_glrm.train(training_frame=frame, x=x)

    # plot iteration history to ensure convergence
    matrix_complete_glrm.score_history().plot(x='iterations', y='objective', title='GLRM Score History')

    # impute numeric inputs by multiplying the calculated xi and yj for the missing values in frame
    num_impute = matrix_complete_glrm.predict(frame)

    # count missing values in imputed set
    print('imputed ' + role + ' missing:\n', [cnt for cnt in num_impute.nacnt() if cnt != 0.0])

    return num_impute

7. Embed with GLRM 

In [None]:
def glrm_cat_embed(frame, k=10, seed=None, x=None):

    """ Helper function for embedding categorical variables using GLRM.

    A new GLRM is initialized and trained on ``frame`` every time this
    function is called; the returned frame is the representation frame that
    the trained model reports under ``output.representation_name``.

    :param frame: H2OFrame to be embedded.
    :param k: Number of embedded features to create. STUDENT INPUT: the
              default is only a starting point, tune it and check the score
              history plot.
    :param seed: Random seed for the GLRM. Defaults to the notebook-level SEED.
    :param x: Names of the categorical columns to train on. Defaults to the
              notebook-level ``cats`` list.
    :return: H2OFrame of embedded categorical features.

    """

    # fall back to the notebook-level settings so existing one-argument calls
    # keep working; resolved at call time, not at definition time
    if seed is None:
        seed = SEED
    if x is None:
        x = cats

    # initialize GLRM
    cat_embed_glrm = H2OGeneralizedLowRankEstimator(
        k=k,
        transform='STANDARDIZE',
        loss='Quadratic',
        regularization_x='Quadratic',
        regularization_y='L1',
        gamma_x=0.25,
        gamma_y=0.5,
        seed=seed)

    # train GLRM
    cat_embed_glrm.train(training_frame=frame, x=x)

    # plot iteration history to ensure convergence
    cat_embed_glrm.score_history().plot(x='iterations', y='objective', title='GLRM Score History')

    # extract embedded features: look up the frame whose key the model stores
    # as its representation name
    cat_embed = h2o.get_frame(cat_embed_glrm._model_json['output']['representation_name'])

    return cat_embed

8. Import train data

In [None]:
# special Google Colab command to upload a file from the local computer;
# the load step reads uploaded['train.csv'], so upload a file with that name
uploaded = files.upload() # REQUIRES STUDENT INPUT

In [None]:
# 9 
# the load step indexes this object by file name ('train.csv'), so the
# name shown here must match the one used there
uploaded.keys() # what is stored in that Python object?

10. Load train data

In [None]:
# decode the uploaded bytes to text; the key in quotes must match the file
# name uploaded in step 8 and listed by uploaded.keys() in step 9
csv_text = uploaded['train.csv'].decode('utf-8')

# parse the CSV text with pandas, then convert to h2o
train = h2o.H2OFrame(pd.read_csv(io.StringIO(csv_text)))

print(train.shape)

11. Determine input feature measurement levels

In [None]:
# these two names are read as notebook-level globals by the GLRM helpers
# (glrm_num_impute trains on original_nums, glrm_cat_embed trains on cats),
# so this cell must run before either helper is called
original_nums, cats = get_type_lists(train)

12. Split into train and validation (before doing data prep!!!)

In [None]:
# 70% of rows go to train, the remainder to valid; seeding the split with the
# notebook-level SEED makes the partition reproducible.
# NOTE: this rebinds `train`, so re-running this cell without reloading the
# data splits the already-split training frame again.
train, valid = train.split_frame([0.7], seed=SEED)
print(train.shape)
print(valid.shape)

13. Impute numeric missing using GLRM matrix completion in training data

In [None]:
# fit a GLRM on the training frame and keep its imputed numeric columns
train_num_impute = glrm_num_impute('training', train)

14. Inspect imputed training data 

In [None]:
# preview the first rows of the imputed numeric training frame
train_num_impute.head()

15. Impute numeric missing using GLRM matrix completion in validation data

In [None]:
# NOTE(review): glrm_num_impute initializes and trains a new GLRM on whatever
# frame it is given, so the validation data is imputed by its own model rather
# than by the one fit on the training data -- confirm this is intended
valid_num_impute = glrm_num_impute('validation', valid)

16. Embed categorical vars in training data with GLRM

In [None]:
# fit a GLRM on the training frame's categorical columns and keep the embedding
train_cat_embed = glrm_cat_embed(train) # runs for a long time

17. Embed categorical vars in validation data with GLRM

In [None]:
# NOTE(review): glrm_cat_embed trains a new GLRM per call, so the validation
# embedding comes from a different model than the training embedding; the two
# sets of embedded columns may not be directly comparable -- confirm intended
valid_cat_embed = glrm_cat_embed(valid) # runs for a long time

18. Merge imputed and embedded frames

In [None]:
# carry the row id and the target alongside the engineered features
id_and_target = ['Id', 'SalePrice']

# column-bind: id/target, then imputed numerics, then categorical embedding
imputed_embedded_train = (
    train[id_and_target]
    .cbind(train_num_impute)
    .cbind(train_cat_embed)
)
imputed_embedded_valid = (
    valid[id_and_target]
    .cbind(valid_num_impute)
    .cbind(valid_cat_embed)
)

19. Redefine numeric inputs and explore

In [None]:
# Keep the categorical result under its own name: rebinding the notebook-level
# `cats` here would change the column list glrm_cat_embed reads by default if
# the embedding cells are re-run after this one.
imputed_embedded_nums, imputed_embedded_cats = get_type_lists(imputed_embedded_train)

print('Imputed and encoded numeric training data:')
imputed_embedded_train.describe() 
print('--------------------------------------------------------------------------------')
print('Imputed and encoded numeric validation data:')
imputed_embedded_valid.describe() 


20. Log transform for supervised target

In [None]:
# The log is always taken from the untouched SalePrice column of train/valid
# (the frames the merged frames were column-bound from), never from the merged
# frames' own SalePrice. That keeps this cell idempotent: re-running it does
# not take the log of an already-logged target.
train_log_price = train['SalePrice'].log()
valid_log_price = valid['SalePrice'].log()

# Check log transform - looks good
train_log_price.as_data_frame().hist()

# Execute log transform
imputed_embedded_train['SalePrice'] = train_log_price
imputed_embedded_valid['SalePrice'] = valid_log_price
print(imputed_embedded_train[0:3, 'SalePrice'])

21. Train GLM on imputed, embedded inputs

In [None]:
alpha_opts = [0.01, 0.25, 0.5, 0.99] # always keep some L2
hyper_parameters = {"alpha":alpha_opts}

# initialize grid search: one gaussian GLM per alpha value, each with its own
# lambda search
grid = H2OGridSearch(
    H2OGeneralizedLinearEstimator(
        family="gaussian",
        lambda_search=True,
        seed=SEED),
    hyper_params=hyper_parameters)

# train grid; SalePrice was log-transformed in the previous step
grid.train(y='SalePrice',
           x=imputed_embedded_nums,
           training_frame=imputed_embedded_train,
           validation_frame=imputed_embedded_valid,
           seed=SEED)

# show grid search results (show() prints the table itself; wrapping it in
# print() would add a stray 'None' line)
grid.show()

# first model of the grid in its default ordering
# NOTE(review): relies on get_grid()'s default sort putting the best model
# first -- confirm, or pass an explicit sort_by
best = grid.get_grid()[0]
print(best)

# print actual vs. predicted target for the first validation rows
yhat_frame = imputed_embedded_valid.cbind(best.predict(imputed_embedded_valid))
print(yhat_frame[0:10, ['SalePrice', 'predict']])

# plot sorted predictions: order rows by predicted value and renumber them so
# the x axis of the plot is the prediction rank
yhat_frame_df = (
    yhat_frame[['SalePrice', 'predict']]
    .as_data_frame()
    .sort_values(by='predict')
    .reset_index(drop=True)
)
_ = yhat_frame_df.plot(title='Ranked Predictions Plot')

In [None]:
# 22
# Shutdown H2O - this will erase all your unsaved frames and models in H2O
# prompt=True is passed so the shutdown asks for confirmation first
h2o.cluster().shutdown(prompt=True)