# AutoML

h2o 是一个 AutoML 框架，本节我们尝试用 `h2o.automl.H2OAutoML` 完成 Kaggle 竞赛 [titanic](https://www.kaggle.com/competitions/titanic)

h2o doc: [docs.h2o.ai](https://docs.h2o.ai/h2o/latest-stable/h2o-docs/index.html)

In [1]:
# !pip install h2o

In [2]:
DIRECTORY = './data'
TRAIN_FILE='titanic/train.csv'
TEST_FILE='titanic/test.csv'
MODEL_FILE='model'
PREDICT_FILE='res.csv'

LABEL_COL='Survived'

In [3]:
import h2o
import pandas as pd
import warnings

import util

In [4]:
# 隐藏 warning，请谨慎使用
warnings.filterwarnings("ignore")

## 一、训练

In [5]:
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O_cluster_uptime:,1 day 11 hours 46 mins
H2O_cluster_timezone:,Asia/Shanghai
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.1
H2O_cluster_version_age:,1 month and 8 days
H2O_cluster_name:,H2O_from_python_changluo_rxywi6
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,1.124 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


In [6]:
train_path = util.gen_abspath(DIRECTORY, TRAIN_FILE)
data = h2o.import_file(train_path)
data

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803.0,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450.0,8.05,,S
6,0,3,"Moran, Mr. James",male,,0,0,330877.0,8.4583,,Q
7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463.0,51.8625,E46,S
8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909.0,21.075,,S
9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742.0,11.1333,,S
10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736.0,30.0708,,C


In [7]:
# 处理类别变量
factor_cols = ['Name', 'Sex']
for col in factor_cols:
    data[col] = data[col].asfactor()

In [8]:
# 分割训练集、验证集
train, valid = data.split_frame(ratios=[0.8], seed=377)

In [9]:
y = LABEL_COL
x = data.columns
x.remove('PassengerId')
x.remove('Survived')

x, y

(['Pclass',
  'Name',
  'Sex',
  'Age',
  'SibSp',
  'Parch',
  'Ticket',
  'Fare',
  'Cabin',
  'Embarked'],
 'Survived')

In [10]:
aml = h2o.automl.H2OAutoML(max_runtime_secs=1000)
aml.train(x=x, y=y, training_frame=data)

AutoML progress: |
15:45:05.82: _response param, We have detected that your response column has only 2 unique values (0/1). If you wish to train a binary model instead of a regression model, convert your target column to categorical before training.


15:45:06.459: _response param, We have detected that your response column has only 2 unique values (0/1). If you wish to train a binary model instead of a regression model, convert your target column to categorical before training.

███
15:45:10.71: _response param, We have detected that your response column has only 2 unique values (0/1). If you wish to train a binary model instead of a regression model, convert your target column to categorical before training.
15:45:11.320: _response param, We have detected that your response column has only 2 unique values (0/1). If you wish to train a binary model instead of a regression model, convert your target column to categorical before training.
15:45:11.578: _response param, We have detected 

key,value
Stacking strategy,cross_validation
Number of base models (used / total),2/6
# GBM base models (used / total),1/1
# XGBoost base models (used / total),1/1
# DeepLearning base models (used / total),0/1
# DRF base models (used / total),0/2
# GLM base models (used / total),0/1
Metalearner algorithm,GLM
Metalearner fold assignment scheme,Random
Metalearner nfolds,5

Unnamed: 0,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
aic,135.9343,29.51469,134.29366,162.11841,159.24384,88.47188,135.54369
loglikelihood,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mae,0.2464139,0.0210543,0.2487044,0.2568523,0.2741682,0.2202914,0.2320531
mean_residual_deviance,0.1206768,0.0199436,0.1191196,0.1344411,0.1440469,0.0923984,0.1133779
mse,0.1206768,0.0199436,0.1191196,0.1344411,0.1440469,0.0923984,0.1133779
null_deviance,42.327534,3.847708,42.58343,41.40595,38.167164,40.909733,48.571392
r2,0.482384,0.1047366,0.5080857,0.3950149,0.3574236,0.6114406,0.539955
residual_deviance,21.484598,3.4975882,20.845922,24.602726,24.199884,15.892531,21.881931
rmse,0.3464043,0.0291722,0.345137,0.3666621,0.3795352,0.3039711,0.3367163
rmsle,0.2447336,0.0217436,0.2419409,0.2645869,0.2682852,0.2170168,0.2318382


In [11]:
# 预测
y_pred = aml.predict(data).as_data_frame()['predict'].tolist()
y_true = data.as_data_frame()['Survived'].tolist()

stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%


In [12]:
y_label, threshold = util.eval_binary(y_true=y_true, y_pred=y_pred, n_trials=1000, ret=True)

threshold: 0.69676
accuracy: 1.00000
precision: 1.00000
recall: 1.00000
f1_score: 1.00000
auc: 1.00000
cross-entropy loss: 0.06207
True Positive (TP): 342
True Negative (TN): 549
False Positive (FP): 0
False Negative (FN): 0
confusion matrix:
[[549   0]
 [  0 342]]


## 二、评估

In [13]:
# 最佳模型的表现
aml.leader.model_performance(valid)

In [14]:
# 最佳模型的摘要信息
aml.leader.summary()

key,value
Stacking strategy,cross_validation
Number of base models (used / total),2/6
# GBM base models (used / total),1/1
# XGBoost base models (used / total),1/1
# DeepLearning base models (used / total),0/1
# DRF base models (used / total),0/2
# GLM base models (used / total),0/1
Metalearner algorithm,GLM
Metalearner fold assignment scheme,Random
Metalearner nfolds,5


In [15]:
# 最佳模型模型的超参
# aml.leader.params

In [16]:
# 特征的重要程度
aml.varimp()

Unnamed: 0,XGBoost_grid_1_model_10,GBM_grid_1_model_327,GBM_grid_1_model_698,GBM_grid_1_model_273,GBM_grid_1_model_410,GBM_grid_1_model_445,GBM_grid_1_model_65,GBM_grid_1_model_725,GBM_grid_1_model_162,GBM_grid_1_model_296,...,XGBoost_grid_1_model_3,XGBoost_grid_1_model_8,XGBoost_2,XGBoost_grid_1_model_9,XGBoost_grid_1_model_12,XGBoost_grid_1_model_4,XGBoost_grid_1_model_11,XGBoost_1,XGBoost_grid_1_model_6,XGBoost_grid_1_model_14
Sex,0.27402,0.055336,0.167896,0.174546,0.192084,0.139478,0.11482,0.120747,0.085039,0.196894,...,0.375306,0.395227,0.248592,0.260858,0.308589,0.343556,0.353041,0.26588,0.300947,0.464789
Pclass,0.071126,0.05293,0.065262,0.02233,0.022543,0.026175,0.03553,0.030586,0.026582,0.015224,...,0.094034,0.103691,0.061008,0.065278,0.087692,0.06808,0.087488,0.074655,0.068421,0.076083
Parch,0.012606,0.020539,0.01935,0.016996,0.017348,0.015384,0.021776,0.015559,0.013475,0.014495,...,0.008627,0.009027,0.028674,0.016126,0.015828,0.01242,0.011771,0.01792,0.01003,0.003753
Embarked,0.007831,0.00965,0.013258,0.016902,0.01003,0.013769,0.012371,0.012753,0.013529,0.006914,...,0.008088,0.018824,0.026718,0.035327,0.024494,0.016619,0.004408,0.013259,0.010179,0.015367
SibSp,0.031246,0.021589,0.035358,0.018349,0.022884,0.025749,0.041054,0.007859,0.023534,0.013164,...,0.040311,0.032102,0.028212,0.049566,0.050852,0.031455,0.024291,0.035775,0.03594,0.03042
Fare,0.0831,0.143482,0.109749,0.102534,0.144883,0.028349,0.095471,0.079031,0.08048,0.017286,...,0.140255,0.119274,0.197073,0.148737,0.172044,0.151814,0.125052,0.168578,0.157886,0.072385
Ticket,0.089689,0.039212,0.053695,0.089826,0.071886,0.076336,0.043382,0.078644,0.108573,0.020368,...,0.148888,0.145364,0.217301,0.16277,0.158795,0.158004,0.141738,0.19789,0.187044,0.101958
Age,0.101773,0.142185,0.131816,0.069803,0.039925,0.098367,0.123357,0.113265,0.08405,0.028918,...,0.152985,0.138154,0.172378,0.216895,0.161625,0.196455,0.204874,0.213198,0.199083,0.113254
Cabin,0.050968,0.101156,0.063952,0.029271,0.027464,0.165237,0.130459,0.132807,0.082497,0.228421,...,0.031506,0.038336,0.020043,0.044443,0.020079,0.021596,0.047337,0.012845,0.030469,0.033742
Name,0.277642,0.413921,0.339662,0.459444,0.450953,0.411156,0.381781,0.408749,0.482242,0.458318,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.088248


In [17]:
# 模型排行榜
aml.leaderboard

model_id,rmse,mse,mae,rmsle,mean_residual_deviance
StackedEnsemble_BestOfFamily_4_AutoML_2_20240421_154505,0.347228,0.120567,0.246114,0.245318,0.120567
StackedEnsemble_BestOfFamily_6_AutoML_2_20240421_154505,0.347449,0.120721,0.24656,0.244984,0.120721
StackedEnsemble_AllModels_6_AutoML_2_20240421_154505,0.350281,0.122697,0.265029,0.24724,0.122697
StackedEnsemble_AllModels_2_AutoML_2_20240421_154505,0.352403,0.124188,0.251904,0.249123,0.124188
StackedEnsemble_AllModels_4_AutoML_2_20240421_154505,0.352614,0.124337,0.275417,0.248981,0.124337
XGBoost_lr_search_selection_AutoML_2_20240421_154505_select_grid_model_5,0.352847,0.124501,0.261726,0.248529,0.124501
StackedEnsemble_AllModels_3_AutoML_2_20240421_154505,0.353049,0.124644,0.276233,0.249569,0.124644
StackedEnsemble_AllModels_1_AutoML_2_20240421_154505,0.353995,0.125313,0.257357,0.249851,0.125313
StackedEnsemble_BestOfFamily_3_AutoML_2_20240421_154505,0.355457,0.12635,0.256064,0.250785,0.12635
XGBoost_grid_1_AutoML_2_20240421_154505_model_14,0.357534,0.12783,0.251429,0.252759,0.12783


In [18]:
# dir(aml.leader)

## 三、保存

In [19]:
# save model DIRECTORY
model_dir = util.gen_abspath(DIRECTORY, MODEL_FILE)
model_path = h2o.save_model(model=aml.leader, path=model_dir, force=True)

In [20]:
# load model
saved_model = h2o.load_model(model_path)

## 四、预测

In [21]:
test_path = util.gen_abspath(DIRECTORY, TEST_FILE)
test_data = h2o.import_file(test_path)
test_data

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
892,3,"Kelly, Mr. James",male,34.5,0,0,330911.0,7.8292,,Q
893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272.0,7.0,,S
894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276.0,9.6875,,Q
895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154.0,8.6625,,S
896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101300.0,12.2875,,S
897,3,"Svensson, Mr. Johan Cervin",male,14.0,0,0,7538.0,9.225,,S
898,3,"Connolly, Miss. Kate",female,30.0,0,0,330972.0,7.6292,,Q
899,2,"Caldwell, Mr. Albert Francis",male,26.0,1,1,248738.0,29.0,,S
900,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",female,18.0,0,0,2657.0,7.2292,,C
901,3,"Davies, Mr. John Samuel",male,21.0,2,0,,24.15,,S


In [22]:
factor_cols = ['Name', 'Sex']
for col in factor_cols:
    test_data[col] = test_data[col].asfactor()
tmp = test_data.drop(['PassengerId'], axis=1)

df_predict = aml.predict(tmp)
# df_predict

stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%


In [23]:
res = pd.DataFrame({
    'PassengerId': test_data.as_data_frame()['PassengerId'].tolist(),
    'Survived': [1 if e > threshold else 0 for e in df_predict.as_data_frame()['predict'].tolist()]
})

res

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [24]:
res_path = util.gen_abspath(DIRECTORY, PREDICT_FILE)
res.to_csv(res_path, index=False)

In [25]:
# res_path = util.gen_abspath(DIRECTORY, PREDICT_FILE)
# h2o.export_file(res, res_path, force=True)