1. Initialize the basic model and target function

In [2]:
%matplotlib widget
#coding=utf8
import h2o
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from skopt import gp_minimize
import pandas as pd
from skopt.plots import *
import math
from scipy.interpolate import SmoothBivariateSpline

# Start (or attach to) the H2O JVM. min_mem_size=12 is passed as a bare integer;
# the cluster banner reports the resulting free memory.
h2o.init(min_mem_size=12)

# Import the data and choose the base distribution.
# Only `model_name` has to be changed to switch to another model.
model_name = 'tp_bi_S'

predict_data = h2o.import_file(
    model_name + '.csv',
    col_types=["categorical", "categorical", "numeric", "numeric", "numeric"],
)

# A model name containing 'F' selects Poisson; any other name selects Gamma.
basic_dis = 'Poisson' if 'F' in model_name else 'Gamma'

# Sum of the third column per level of the first column, pulled into pandas
# and renamed to the columns ('levels', 'w') used when exporting coefficients.
e_data = (
    predict_data.group_by(by=predict_data.columns[0])
    .sum(predict_data.columns[2])
    .get_frame()
    .as_data_frame(use_pandas=True)
    .rename(columns={predict_data.columns[0]: 'levels',
                     'sum_' + predict_data.columns[2]: 'w'})
)

# The first two columns are the predictors.
predict_f = predict_data.columns[:2]

# Keep asfactor() out of the model/objective functions: per the original
# author's note, calling it inside them causes excessive GC memory use.
predict_data[predict_f] = predict_data[predict_f].asfactor()

'''define the model'''
def GLM_model_basic(data, family_dis, alp, lamb, nfolds=5, seed=1234):
    """Fit a log-link H2O GLM without intercept on ``data`` and return it.

    Columns of ``data`` are addressed by position:
    columns 0-1 are the predictors, column 2 the observation weights,
    column 3 the response and column 4 the offset.

    Parameters
    ----------
    data : H2OFrame
        Training frame with the column layout described above.
    family_dis : str
        Value passed as the GLM ``family``.
    alp : float
        Value passed as ``alpha``.
    lamb : float
        Value passed as ``lambda_``.
    nfolds : int, optional
        Number of cross-validation folds (default 5, the previous fixed value).
    seed : int, optional
        Seed passed to the estimator (default 1234, the previous fixed value).

    Returns
    -------
    H2OGeneralizedLinearEstimator
        The trained estimator.
    """
    # Take the predictors from the frame that was passed in rather than from
    # a notebook-level global, so the function works on any frame that has
    # the same column layout.
    predictors = data.columns[:2]
    response = data.columns[3]

    glm_model = H2OGeneralizedLinearEstimator(
        nfolds=nfolds,
        seed=seed,
        family=family_dis,
        offset_column=data.columns[4],
        weights_column=data.columns[2],
        link='Log',
        intercept=False,
        alpha=alp,
        lambda_=lamb
    )
    glm_model.train(predictors, response, training_frame=data)
    return glm_model

'''define the target function'''
def GLM_train_func(X):
    """Objective for the optimizer: cross-validated mean residual deviance.

    ``X[0]`` is passed to the model as alpha and ``X[1]`` is converted to
    lambda = 10 ** -X[1]. The model is fitted on the notebook-level
    ``predict_data`` with family ``basic_dis`` and removed from H2O again
    before the score is returned.
    """
    alpha_value = X[0]
    lambda_value = 10**-X[1]
    fitted = GLM_model_basic(predict_data, basic_dis, alpha_value, lambda_value)
    xval_deviance = fitted.mean_residual_deviance(xval=True)
    # Drop the model from the cluster; only the scalar score is needed.
    h2o.remove(fitted)
    return xval_deviance



Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O_cluster_uptime:,32 secs
H2O_cluster_timezone:,Asia/Shanghai
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.34.0.1
H2O_cluster_version_age:,10 days
H2O_cluster_name:,H2O_from_python_chenxuyi_s6hz0m
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,12 Gb
H2O_cluster_total_cores:,16
H2O_cluster_allowed_cores:,16


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


2. Get the best parameters

In [3]:
# Search space, one (low, high) pair per entry of the X vector that
# GLM_train_func receives.
params_space = [
    (0.0, 1.0),    # X[0]: used as alpha
    (0.0, 10.0),   # X[1]: exponent, lambda = 10 ** -X[1]
]

# Bayesian optimization: search for the parameters that minimize GLM_train_func.
best_params = gp_minimize(
    func=GLM_train_func,
    dimensions=params_space,
    random_state=1234,
)

glm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
glm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
glm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
glm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
glm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
glm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
glm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
glm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
glm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
glm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
glm Model Build progress: |███████████████████████

3. Plot or print the results

In [4]:
# Convergence plot of the search: best objective value found so far
# against the number of objective calls.
plot_convergence(best_params)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

<AxesSubplot:title={'center':'Convergence plot'}, xlabel='Number of calls $n$', ylabel='$\\min f(x)$ after $n$ calls'>

In [5]:
# Axis labels follow GLM_train_func: X[0] is passed as H2O's `alpha` and X[1]
# is the exponent in lambda = 10 ** -X[1]. H2O's alpha is the L1 share of the
# elastic-net penalty (alpha=1 is lasso, alpha=0 is ridge), and a larger X[1]
# means a smaller lambda, i.e. weaker regularization.
plot_objective(
    best_params,
    size=4,
    dimensions=['alpha (weight of L1 regularization)', '-log10(lambda)'],
)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

array([[<AxesSubplot:xlabel='weight of L2 regularization', ylabel='Partial dependence'>,
        <AxesSubplot:>],
       [<AxesSubplot:xlabel='weight of L2 regularization', ylabel='strength of regularization'>,
        <AxesSubplot:xlabel='strength of regularization', ylabel='Partial dependence'>]],
      dtype=object)

In [6]:
# Show where the optimizer sampled: a histogram per search dimension
# and a scatter of X0 against X1.
plot_evaluations(best_params)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

array([[<AxesSubplot:xlabel='$X_{0}$', ylabel='Number of samples'>,
        <AxesSubplot:>],
       [<AxesSubplot:xlabel='$X_{0}$', ylabel='$X_{1}$'>,
        <AxesSubplot:xlabel='$X_{1}$', ylabel='Number of samples'>]],
      dtype=object)

In [7]:
# Report the best point found, then refit the model at that point.
# The second coordinate is an exponent: lambda = 10 ** -x[1].
print(best_params.x)
glm_final = GLM_model_basic(
    predict_data,
    basic_dis,
    best_params.x[0],
    10**-best_params.x[1],
)

[0.0, 1.744264715604529]
glm Model Build progress: |██████████████████████████████████████████████████████| (done) 100%


4. Get the best model and output the coefficients

In [8]:
# Coefficients of the final model as a two-column frame: 'levels' (the
# coefficient name) and 'values'.
coef = (
    pd.DataFrame.from_dict(glm_final.coef(), orient='index', columns=['values'])
    .reset_index()
    .rename(columns={'index': 'levels'})
)
# The model uses a log link, so exponentiate to get multiplicative factors.
coef['values'] = coef['values'].apply(math.exp)
coef.to_csv(model_name + '_f_values.csv', index=False)

# Keep only the 'dpt' coefficients whose factor is not 1 after rounding to
# 6 decimals. Series.round is used so the cell does not rely on `np` being
# leaked into the namespace by `from skopt.plots import *`.
is_dpt = coef['levels'].str.contains('dpt')
is_not_neutral = coef['values'].round(6) != 1
coef_interplt = coef[is_dpt & is_not_neutral].copy()

# Drop the first 7 characters of the coefficient name so it matches the
# level names in e_data -- presumably the "<column name>." prefix H2O adds;
# TODO confirm the length if the column is ever renamed.
coef_interplt['levels'] = coef_interplt['levels'].str[7:]

# Attach the per-level weight sum `w` and export the plotting data.
coef_interplt = coef_interplt.merge(e_data, how='inner', on='levels')
coef_interplt.to_csv(model_name + '_pltdata.csv', index=False)

In [9]:
# Shut down the H2O cluster; the coefficients have already been copied
# into pandas frames and written to CSV above.
h2o.cluster().shutdown()

H2O session _sid_8ba6 closed.


In [10]:
# Display the exported plotting data (levels, factor values, weight sums).
coef_interplt

Unnamed: 0,levels,values,w
0,90129,0.990206,1
1,90131,1.019121,4
2,90150,1.089424,4
3,90162,1.005518,1
4,90163,1.035011,3
...,...,...,...
189,950EA,1.011112,9
190,95181,0.959882,5
191,95183,0.986227,5
192,951EA,1.022440,11
