In [1]:
def generate_report(model):
    try:
        validation_metrics = {
                    "MSE": model.model_performance(valid=True).mse(),
                    "RMSE": model.model_performance(valid=True).rmse(),
                    #"LogLoss": model.model_performance(valid=True).logloss(),
                    #"Mean Per-Class Error": model.model_performance(valid=True).mean_per_class_error(),
                    "AUC":model.model_performance(valid=True).auc(),
                    "PR_AUC": model.model_performance(valid=True).pr_auc(),
                    #"Gini": model.model_performance(valid=True).gini()
                }
    except Exception as e:
        validation_metrics = {}

    try:
        xval_metrics= {
                    "MSE": model.model_performance(xval=True).mse(),
                    "RMSE": model.model_performance(xval=True).rmse(),
                    #"LogLoss": model.model_performance(xval=True).logloss(),
                    #"Mean Per-Class Error": model.model_performance(xval=True).mean_per_class_error(),
                    "AUC":model.model_performance(xval=True).auc(),
                    "PR_AUC": model.model_performance(xval=True).pr_auc(),
                    #"Gini": model.model_performance(xval=True).gini() 
        }
    except Exception as e:
        xval_metrics = {}
    
    return {
        'model_name': model.model_id,
        #'gains_lift': json.loads(model.gains_lift(xval=True).as_data_frame().to_json(orient="table",index=False))['data'],
        'training_metrics':{
                            "MSE": model.model_performance().mse(),
                            "RMSE": model.model_performance().rmse(),
                            #"LogLoss": model.model_performance().logloss(),
                            #"Mean Per-Class Error": model.model_performance().mean_per_class_error(),
                            "AUC":model.model_performance().auc(),
                            "PR_AUC": model.model_performance().pr_auc(),
                            #"Gini": model.model_performance().gini()
                        },
        'validation_metrics':validation_metrics,
        'xval_metrics':xval_metrics
    }

In [2]:
import h2o

In [3]:
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: java version "1.8.0_172"; Java(TM) SE Runtime Environment (build 1.8.0_172-b11); Java HotSpot(TM) 64-Bit Server VM (build 25.172-b11, mixed mode)
  Starting server from /anaconda2/envs/spike_basicoV3/lib/python3.6/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /var/folders/3z/5qxln1rj4zjblz0l0hvy0n540000gn/T/tmpeq_6sr5a
  JVM stdout: /var/folders/3z/5qxln1rj4zjblz0l0hvy0n540000gn/T/tmpeq_6sr5a/h2o_maravenag_started_from_python.out
  JVM stderr: /var/folders/3z/5qxln1rj4zjblz0l0hvy0n540000gn/T/tmpeq_6sr5a/h2o_maravenag_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321... successful.


0,1
H2O cluster uptime:,01 secs
H2O cluster timezone:,America/Santiago
H2O data parsing timezone:,UTC
H2O cluster version:,3.22.1.4
H2O cluster version age:,4 months and 14 days !!!
H2O cluster name:,H2O_from_python_maravenag_1iqg9k
H2O cluster total nodes:,1
H2O cluster free memory:,3.556 Gb
H2O cluster total cores:,0
H2O cluster allowed cores:,0


In [4]:
train = h2o.upload_file("data/train.csv", destination_frame="train")

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [5]:
data = h2o.upload_file("data/train.csv", destination_frame="data")

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [6]:
train = train.drop("PassengerId")
train = train.drop("Name")
train = train.drop("Cabin")
train = train.drop("Ticket")

In [7]:
train.head()

Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,22.0,1,0,7.25,S
1,1,female,38.0,1,0,71.2833,C
1,3,female,26.0,0,0,7.925,S
1,1,female,35.0,1,0,53.1,S
0,3,male,35.0,0,0,8.05,S
0,3,male,,0,0,8.4583,Q
0,1,male,54.0,0,0,51.8625,S
0,3,male,2.0,3,1,21.075,S
1,3,female,27.0,0,2,11.1333,S
1,2,female,14.0,1,0,30.0708,C




In [8]:
train['Survived'] = train['Survived'].asfactor()
train['Pclass'] = train['Pclass'].asfactor()
train['Sex'] = train['Sex'].asfactor()
train['Embarked'] = train['Embarked'].asfactor()

In [9]:
train, test = train.split_frame(ratios=[0.7])

In [10]:
from h2o.estimators.gbm import H2OGradientBoostingEstimator

In [11]:
model = H2OGradientBoostingEstimator(nfolds=5,seed = 1234)

In [12]:
train_cols = [x for x in train.col_names if x not in ['Survived']]

In [13]:
model.train(x=train_cols,y="Survived",training_frame=train, validation_frame=test)

gbm Model Build progress: |███████████████████████████████████████████████| 100%


In [14]:
from sklearn.utils import compute_class_weight

In [15]:
compute_class_weight(class_weight="balanced",y=data['Survived'].as_data_frame().values.flatten(),classes=[0,1])

array([0.81147541, 1.30263158])

In [16]:
import numpy as np

In [17]:
train = train.as_data_frame()
test = test.as_data_frame()

In [18]:
train['weights'] = train.apply(lambda x: 0.81147541 if x['Survived'] == 0 else 1.30263158, axis=1)
test['weights'] = test.apply(lambda x: 0.81147541 if x['Survived'] == 0 else 1.30263158, axis=1)

In [19]:
train = h2o.H2OFrame(train, destination_frame="train")
test = h2o.H2OFrame(test, destination_frame="test")

Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


In [20]:
test['Survived'] = test['Survived'].asfactor()
test['Pclass'] = test['Pclass'].asfactor()
test['Sex'] = test['Sex'].asfactor()
test['Embarked'] = test['Embarked'].asfactor()

train['Survived'] = train['Survived'].asfactor()
train['Pclass'] = train['Pclass'].asfactor()
train['Sex'] = train['Sex'].asfactor()
train['Embarked'] = train['Embarked'].asfactor()

In [21]:
train.head(4)

Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,weights
0,3,male,22,1,0,7.25,S,0.811475
1,1,female,38,1,0,71.2833,C,1.30263
1,3,female,26,0,0,7.925,S,1.30263
1,1,female,35,1,0,53.1,S,1.30263




In [22]:
model_ = H2OGradientBoostingEstimator(nfolds=5,seed = 1234)

In [23]:
model_.train(x=train_cols,y="Survived",training_frame=train, weights_column="weights", validation_frame=test)

gbm Model Build progress: |███████████████████████████████████████████████| 100%


In [24]:
generate_report(model)

{'model_name': 'GBM_model_python_1561905836437_1',
 'training_metrics': {'MSE': 0.07778793433823798,
  'RMSE': 0.2789048840343926,
  'AUC': 0.9600408149830486,
  'PR_AUC': 0.9484224837169406},
 'validation_metrics': {'MSE': 0.14460516875484294,
  'RMSE': 0.38026986306417043,
  'AUC': 0.8332456140350877,
  'PR_AUC': 0.7581140750212818},
 'xval_metrics': {'MSE': 0.13499045456018224,
  'RMSE': 0.36741047148956196,
  'AUC': 0.863730621111879,
  'PR_AUC': 0.8425867886396248}}

In [25]:
generate_report(model_)

{'model_name': 'GBM_model_python_1561905836437_620',
 'training_metrics': {'MSE': 0.08201718439371315,
  'RMSE': 0.28638642494663247,
  'AUC': 0.9617469251615622,
  'PR_AUC': 0.9638140981323172},
 'validation_metrics': {'MSE': 0.15951110833172838,
  'RMSE': 0.3993884178737891,
  'AUC': 0.8352339181286561,
  'PR_AUC': 0.8337328854958274},
 'xval_metrics': {'MSE': 0.1439616185756131,
  'RMSE': 0.3794227438828794,
  'AUC': 0.8628254501168476,
  'PR_AUC': 0.8878867513945812}}