In [1]:
# Set up the H2O client connection. The scoring function defined later in this
# notebook calls h2o.H2OFrame and h2o.import_mojo, so this cell must run first.
import h2o
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321. connected.
Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html


0,1
H2O_cluster_uptime:,3 hours 20 mins
H2O_cluster_timezone:,America/Chicago
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.42.0.2
H2O_cluster_version_age:,4 months and 15 days
H2O_cluster_name:,H2O_from_python_MAHDAVIM_a5wz5b
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.284 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


In [2]:
# Read the CSV to be scored into a DataFrame; the path is relative to the
# notebook's working directory. `data` is the input passed to
# project_2_scoring in the final cell.
import pandas as pd
data = pd.read_csv("SBA_loans_project_2_holdout_students_valid.csv")

In [3]:
def project_2_scoring(data: "pd.DataFrame") -> "pd.DataFrame":
    """Score loan records with the saved H2O GBM MOJO model.

    The input is copied, cleaned, enriched with engineered features, converted
    to an H2OFrame and scored with the MOJO stored in
    ``GBM_model_python_1702227374224_18.zip`` (resolved relative to the current
    working directory). An H2O connection must already be initialised
    (``h2o.init()``) before calling this function.

    Parameters
    ----------
    data : pandas.DataFrame
        Records to score. The code reads the columns ``index``, ``LowDoc``,
        ``DisbursementGross``, ``RevLineCr``, ``NewExist``, ``UrbanRural``,
        ``Zip``, ``NAICS``, ``SBA_Appv``, ``GrAppv``, ``NoEmp`` and
        ``RetainedJob``; every column except ``index`` is passed to the model.
        The caller's frame is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per input row, in input order, with columns ``index`` (copied
        from ``data``), ``label`` (int16, 1 when the class-1 probability is at
        or above the decision threshold), ``probability_0`` and
        ``probability_1`` (both rounded to two decimals).
    """
    # Import Libraries-----------------------------------
    # Kept local so the function does not depend on notebook-level globals.
    import h2o
    import numpy as np
    import pandas as pd

    X = data.copy()

    # Data Cleaning--------------------------------------
    valid_flags = ['Y', 'N']

    # 'LowDoc' Feature: anything other than Y/N (including NaN) is re-derived
    # from the disbursed amount. Rows whose DisbursementGross is NaN match
    # neither comparison and are left as they are.
    lowdoc_invalid = ~X['LowDoc'].isin(valid_flags)
    X.loc[lowdoc_invalid & (X['DisbursementGross'] > 150000), 'LowDoc'] = 'N'
    X.loc[lowdoc_invalid & (X['DisbursementGross'] <= 150000), 'LowDoc'] = 'Y'

    # 'RevLineCr' Feature: collapse every non Y/N value (including NaN) into
    # 'Missing'. Assigned back explicitly: an in-place replace on the chained
    # selection X['RevLineCr'] does not update X under pandas Copy-on-Write.
    X['RevLineCr'] = X['RevLineCr'].where(X['RevLineCr'].isin(valid_flags), 'Missing')

    # 'NewExist' Feature (assigned back for the same Copy-on-Write reason)
    X['NewExist'] = X['NewExist'].replace({1.0: 'Existing', 2.0: 'New', 0.0: 'Missing'})

    # 'UrbanRural' Feature
    X['UrbanRural'] = X['UrbanRural'].replace({1.0: 'Urban', 2.0: 'Rural', 0.0: 'Undefined'})

    # Filling Missing Values---------------------------------
    # Fill missing values for categorical features with 'Missing'
    object_cols = X.select_dtypes(include=['object']).columns
    object_cols_missing = object_cols[X[object_cols].isnull().any()]
    X[object_cols_missing] = X[object_cols_missing].fillna('Missing')

    # Fill missing values for numerical features with their mean.
    # NOTE(review): the mean comes from the batch being scored, so the imputed
    # value depends on which rows are scored together - confirm this matches
    # how the model's training data was prepared.
    num_cols = X.select_dtypes(include=[np.number]).columns
    num_cols_missing = num_cols[X[num_cols].isnull().any()]
    for col in num_cols_missing:
        X[col] = X[col].fillna(X[col].mean())

    # Feature Engineering-------------------------------------
    # Create 'Region' Feature from the first character of the zip value.
    # NOTE(review): a zip stored as a number has lost any leading zero, so the
    # first character may not be the true first ZIP digit. The mapping is kept
    # exactly as written so features stay consistent with the saved model.
    region_by_first_digit = {
        '0': 'Northeast', '1': 'Northeast', '2': 'Northeast',
        '3': 'Midwest', '4': 'Midwest', '5': 'Midwest',
        '6': 'South', '7': 'South',
        '8': 'West', '9': 'West',
    }

    def custom_zipcode_binning(zipcode):
        # Values that do not start with a digit fall back to 'Unknown'
        # instead of raising.
        return region_by_first_digit.get(str(zipcode)[:1], 'Unknown')

    X['Region'] = X['Zip'].apply(custom_zipcode_binning)

    # Create 'NAICS' Features: 2-, 3- and 4-character prefixes of the code
    naics_text = X['NAICS'].map(str)
    for feature, width in (('NAICS_sector', 2),
                           ('NAICS_subsector', 3),
                           ('NAICS_industry_group', 4)):
        X[feature] = naics_text.str[:width]

    # Create 'SBA_Guarantee_Ratio' Feature
    X['SBA_Guarantee_Ratio'] = X['SBA_Appv'] / X['GrAppv']

    # Create 'LoanSizeCategory' Feature
    # NOTE(review): the bin edges are quantiles of the batch being scored, so
    # the category of a loan depends on the other rows, and pd.cut raises when
    # both quantiles coincide (e.g. a single-row batch). Fixed edges taken from
    # the training data would avoid this - confirm before changing.
    quantiles = X['DisbursementGross'].quantile([0.33, 0.66])
    X['LoanSizeCategory'] = pd.cut(X['DisbursementGross'],
                                   bins=[-np.inf, quantiles[0.33], quantiles[0.66], np.inf],
                                   labels=['Small', 'Medium', 'Large'])
    X['LoanSizeCategory'] = X['LoanSizeCategory'].astype(str)

    # Square Root Transformations
    for col in ('DisbursementGross', 'GrAppv', 'SBA_Appv'):
        X['sqrt_' + col] = np.sqrt(X[col])

    # Exponential Transformations
    # NOTE(review): np.exp overflows to inf for large NoEmp values (float64
    # overflows a little above 709), which is a likely source of the
    # RuntimeWarning seen when this runs. Left unchanged to keep features
    # consistent with the saved model.
    X['exp_NoEmp'] = np.exp(X['NoEmp'])

    # Interaction Terms
    X['NoEmp_RetainedJob'] = X['NoEmp'] * X['RetainedJob']

    # H2O Data Processing-----------------------------
    X_h2o = h2o.H2OFrame(X)

    column_types = X_h2o.types
    enum_cols = [col for col in column_types if column_types[col] == 'enum']
    X_h2o[enum_cols] = X_h2o[enum_cols].asfactor()

    # 'index' is a row identifier, not a model input.
    predictors = [col for col in X_h2o.columns if col != 'index']

    # GBM Model---------------------------------------
    modelfile = "GBM_model_python_1702227374224_18.zip"
    best_model = h2o.import_mojo(modelfile)

    y_pred_df = best_model.predict(X_h2o[predictors]).as_data_frame()
    probability_0 = y_pred_df['p0']  # Probability of class 0
    probability_1 = y_pred_df['p1']  # Probability of class 1

    # Decision threshold applied to the unrounded class-1 probability
    # (presumably tuned when the model was built - confirm).
    threshold = 0.30616414255568053
    y_pred = (probability_1 >= threshold).astype(np.int16)

    probability_0 = probability_0.round(2)
    probability_1 = probability_1.round(2)

    # The prediction Series carry a fresh 0..n-1 index. Pass the id column as a
    # plain array so rows are matched by position even when `data` has a
    # non-default index (label alignment would otherwise mismatch or add NaN).
    d = {
        "index": data["index"].to_numpy(),
        "label": y_pred,
        "probability_0": probability_0,
        "probability_1": probability_1,
    }
    return pd.DataFrame(d)

In [4]:
# Run the full scoring pipeline on the loaded data and preview the first five
# scored rows as a quick sanity check.
project_2_scoring(data).head(5)

  result = getattr(ufunc, method)(*inputs, **kwargs)


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
generic Model Build progress: |██████████████████████████████████████████████████| (done) 100%
generic prediction progress: |███████████████████████████████████████████████████| (done) 100%




Unnamed: 0,index,label,probability_0,probability_1
0,0,0,0.97,0.03
1,1,0,0.8,0.2
2,2,0,0.99,0.01
3,3,0,0.88,0.12
4,4,0,0.93,0.07
