# H2O workflow

## Imports

In [3]:
import datetime
import getpass
import json
import os
import sys
import uuid

# Put the directory two levels above the current working directory on the
# import path (presumably the repository root, so that the `mercury_ml`
# imports below resolve without the package being installed — confirm).
sys.path.append(os.path.dirname(os.path.dirname(os.getcwd())))

# Locations of the workflow config and of this notebook, resolved against the
# current working directory. They are substituted into the config placeholders
# "{config_filepath}" and "{notebook_filepath}" further down.
config_filepath = os.path.join(os.getcwd(), "config/fit_config_h2o.json")
notebook_filepath = os.path.join(os.getcwd(), "fit.ipynb")

from mercury_ml.common import tasks
from mercury_ml.common import utils
from mercury_ml.common import containers as common_containers
from mercury_ml.h2o import containers as h2o_containers

In [4]:
# For testing purposes only!
# Remove the ./example_results directory (the directory the storage steps below
# write to) so that the notebook starts from a clean slate.
if os.path.isdir("./example_results"):
    import shutil  # only needed when there is something to delete
    shutil.rmtree("./example_results")

## Helpers

These functions will help with the flow of this particular notebook

In [5]:
def print_data_bunch(data_bunch):
    """Print a two-level outline of ``data_bunch``.

    For every attribute of ``data_bunch`` a header line ``name <TypeName>`` is
    printed, followed by one indented ``name <TypeName>`` line for each
    attribute of that object, and then an empty line.
    """

    def _label(name, obj):
        # e.g. "train <DataSet>"
        return "%s <%s>" % (name, type(obj).__name__)

    for set_name, members in data_bunch.__dict__.items():
        print(_label(set_name, members))
        for wrapper_name, wrapper in members.__dict__.items():
            print("  " + _label(wrapper_name, wrapper))
        print()
        
def maybe_transform(data_bunch, pre_execution_parameters):
    """Transform ``data_bunch`` only when transformation parameters are given.

    Returns ``data_bunch`` itself when ``pre_execution_parameters`` is falsy
    (``None`` or empty); otherwise returns
    ``data_bunch.transform(**pre_execution_parameters)``.
    """
    if not pre_execution_parameters:
        # nothing configured for this step: hand back the very same object
        return data_bunch
    return data_bunch.transform(**pre_execution_parameters)
        
def print_dict(d):
    """Print ``d`` as JSON, indented by two spaces."""
    formatted = json.dumps(d, indent=2)
    print(formatted)

def get_installed_packages():
    """Return the installed packages as reported by pip's ``freeze``.

    Returns:
        list: one entry per line yielded by ``freeze.freeze()``,
        e.g. ``"attrs==18.2.0"``.

    Raises:
        ImportError: if pip's freeze operation cannot be imported.
    """
    # The freeze operation lives under ``pip._internal`` from pip 10 onwards;
    # fall back to the old location for earlier versions.
    try:
        from pip._internal.operations import freeze
    except ImportError:  # pip < 10.0
        from pip.operations import freeze

    return list(freeze.freeze())

## Config

#### Load config

In [6]:
# Load the workflow configuration from `config_filepath`; `config` is indexed like a nested dict below
config = utils.load_referenced_json_config(config_filepath)

In [7]:
# Show the raw config; placeholders such as "{session_id}" are not filled in yet
print_dict(config)

{
  "meta_info": {
    "ml_engine": "h2o",
    "model_purpose": "test_h2o",
    "session_id": "{session_id}",
    "model_object_name": "{model_purpose}__{session_id}",
    "data_bunch_name": "array_789",
    "notebook_filepath": "{notebook_filepath}",
    "config_filepath": "{config_filepath}"
  },
  "init": {
    "initiate_session": {
      "name": "get_or_create_h2o"
    },
    "read_source_data": {
      "name": "read_disk_pandas"
    },
    "model_definition": {
      "name": "rf",
      "params": {}
    },
    "fit": {
      "name": "fit"
    },
    "save_model": {
      "names": [
        "save_h2o_model",
        "save_json_details",
        "save_pojo",
        "save_mojo",
        "save_pojo_jar",
        "save_mojo_jar"
      ]
    },
    "copy_from_local_to_remote": {
      "name": "copy_from_disk_to_disk"
    },
    "evaluate": {
      "name": "evaluate"
    },
    "evaluate_threshold_metrics": {
      "name": "evaluate_threshold_metrics"
    },
    "predict": {
      "name

}


#### Set session_id

In [8]:
# Random 32-character hexadecimal identifier for this run (`.hex` is already a str)
session_id = uuid.uuid4().hex

In [9]:
# Show the identifier generated for this run
print(session_id)

818c0c6c1c404dddae908823f0d89946


#### Update config

The function `utils.recursively_update_config(config, string_formatting_dict)` allows us to use string formatting to replace placeholder strings with actual values.

for example: 

```python
>>> config = {"some_value": "some_string_{some_placeholder}"}
>>> string_formatting_dict = {"some_placeholder": "ABC"}
>>> utils.recursively_update_config(config, string_formatting_dict)
>>> print(config)
{"some_value": "some_string_ABC"}
```



First update `config["meta_info"]`

In [10]:
# Fill in the placeholders inside `meta_info` itself (session id and file paths).
# `model_purpose` is passed with its current value so that the "{model_purpose}"
# placeholder used in `model_object_name` can be substituted as well.
utils.recursively_update_config(config["meta_info"], {
    "session_id": session_id,
    "model_purpose": config["meta_info"]["model_purpose"],
    "config_filepath": config_filepath,
    "notebook_filepath": notebook_filepath
})

Then use `config["meta_info"]` to update the rest.

In [11]:
# Use the now-resolved `meta_info` values as the formatting dictionary for the whole config
utils.recursively_update_config(config, config["meta_info"])

## Session

Create a small dictionary with the session information. This will later be stored as a dictionary artifact with all the key run information.

In [12]:
# Run metadata that is stored as a dictionary artifact further down.
session = {
    # UTC timestamp with millisecond precision, e.g. "2019-02-25T11:56:38.762Z".
    # `isoformat()` omits the fractional part when microsecond == 0, so slicing
    # off the last three characters would cut into the seconds; asking for
    # millisecond precision explicitly always yields the same format.
    "time_stamp": datetime.datetime.utcnow().isoformat(timespec="milliseconds") + "Z",
    # name of the user running the notebook
    "run_by": getpass.getuser(),
    # same object as config["meta_info"], not a copy
    "meta_info": config["meta_info"],
    # output of the `get_installed_packages` helper defined above
    "installed_packages": get_installed_packages()
}

In [13]:
# Show the session dictionary using the notebook's JSON pretty-printer
print("Session info")
print_dict(session)

Session info
{
  "time_stamp": "2019-02-25T11:56:38.762Z",
  "run_by": "karl.schriek",
  "meta_info": {
    "ml_engine": "h2o",
    "model_purpose": "test_h2o",
    "session_id": "818c0c6c1c404dddae908823f0d89946",
    "model_object_name": "test_h2o__818c0c6c1c404dddae908823f0d89946",
    "data_bunch_name": "array_789",
    "notebook_filepath": "C:\\Users\\karl.schriek\\PycharmProjects\\mercury-ml-github\\examples\\h2o\\fit.ipynb",
    "config_filepath": "C:\\Users\\karl.schriek\\PycharmProjects\\mercury-ml-github\\examples\\h2o\\config/fit_config_h2o.json"
  },
  "installed_packages": [
    "absl-py==0.7.0",
    "astor==0.7.1",
    "atomicwrites==1.3.0",
    "attrs==18.2.0",
    "backcall==0.1.0",
    "bleach==3.1.0",
    "boto3==1.9.86",
    "botocore==1.12.86",
    "cachetools==3.1.0",
    "certifi==2018.11.29",
    "chardet==3.0.4",
    "colorama==0.4.1",
    "decorator==4.3.2",
    "defusedxml==0.5.0",
    "docutils==0.14",
    "entrypoints==0.3",
    "future==0.17.1",
    "gast==

## Initialization

These are the functions or classes we will be using in this workflow. We get / instantiate them all at the beginning using parameters under `config["init"]`.

Here we mainly use `getattr` to fetch them via the `containers` module based on a string input in the config file. Providers could however also be fetched directly. The following three methods are all equivalent:

```python
# 1. (what we are using in this notebook)
from mercury_ml.common import containers as common_containers
source_reader=getattr(common_containers.SourceReaders, "read_pandas_data_set")

# 2. 
from mercury_ml.common import containers as common_containers
source_reader=common_containers.SourceReaders.read_pandas_data_set

# 3.
from mercury_ml.common.providers.source_reading import read_pandas_data_set
source_reader=read_pandas_data_set
```


### Helpers

These helper functions will instantiate class providers (`create_and_log`) or fetch function providers (`get_and_log`) based on the parameters provided.

In [14]:
def create_and_log(container, class_name, params):
    """Instantiate a class provider and log what was created.

    Looks up ``class_name`` on ``container``, calls it with ``**params`` and
    then prints ``<container name>.<class_name>`` followed by the parameters
    as indented JSON.

    Returns the newly created instance.
    """
    provider_class = getattr(container, class_name)
    instance = provider_class(**params)
    print("%s.%s" % (container.__name__, class_name))
    params_as_json = json.dumps(params, indent=2)
    print("params: ", params_as_json)
    return instance

def get_and_log(container, function_name):
    """Fetch a function provider from a container and log which one was chosen.

    Looks up ``function_name`` on ``container``, prints
    ``<container name>.<function_name>`` and returns the attribute found.
    """
    resolved = getattr(container, function_name)
    qualified_name = "%s.%s" % (container.__name__, function_name)
    print(qualified_name)
    return resolved

### Common

These are providers that are universally relevant, regardless of which Machine Learning engine is used.

In [15]:
# Provider: a function for storing dictionary artifacts on local disk
store_artifact_locally = get_and_log(
    container=common_containers.LocalArtifactStorers,
    function_name=config["init"]["store_artifact_locally"]["name"],
)

LocalArtifactStorers.store_dict_json


In [16]:
# Provider: a function for storing data-frame-like artifacts on local disk
store_prediction_artifact_locally = get_and_log(
    container=common_containers.LocalArtifactStorers,
    function_name=config["init"]["store_prediction_artifact_locally"]["name"],
)

LocalArtifactStorers.store_h2o_frame


In [17]:
# Provider: a function for copying artifacts from local disk to a remote store
copy_from_local_to_remote = get_and_log(
    container=common_containers.ArtifactCopiers,
    function_name=config["init"]["copy_from_local_to_remote"]["name"],
)

ArtifactCopiers.copy_from_disk_to_disk


In [18]:
# Provider: a function for reading source data. It is handed to
# `tasks.read_train_valid_test_data_bunch` in the "Get source data" section.
# NOTE(review): the previous comment said a call returns a DataBunch, while the
# variable name suggests a single data set — confirm against mercury_ml.
read_source_data_set = get_and_log(
    container=common_containers.SourceReaders,
    function_name=config["init"]["read_source_data"]["name"],
)

SourceReaders.read_disk_pandas


In [19]:
# Custom metric functions, keyed by the provider names listed in the config
custom_metrics_dict = dict(
    (metric_name, get_and_log(common_containers.CustomMetrics, metric_name))
    for metric_name in config["init"]["custom_metrics"]["names"]
)


CustomMetrics.evaluate_numpy_auc
CustomMetrics.evaluate_numpy_micro_auc


In [20]:
# Custom label metric functions, keyed by the provider names listed in the config
custom_label_metrics_dict = dict(
    (metric_name, get_and_log(common_containers.CustomLabelMetrics, metric_name))
    for metric_name in config["init"]["custom_label_metrics"]["names"]
)


CustomLabelMetrics.evaluate_numpy_accuracy
CustomLabelMetrics.evaluate_numpy_confusion_matrix


### H2O

In [21]:
# Provider: a function that initiates the h2o (or h2o sparkling) session
initiate_session = get_and_log(
    container=h2o_containers.SessionInitiators,
    function_name=config["init"]["initiate_session"]["name"],
)

SessionInitiators.get_or_create_h2o


In [22]:
# Fetch the built-in h2o model definition named in the config and call it
# with the configured parameters to obtain the (not yet fitted) model
model = get_and_log(
    container=h2o_containers.ModelDefinitions,
    function_name=config["init"]["model_definition"]["name"],
)(**config["init"]["model_definition"]["params"])

ModelDefinitions.rf


In [23]:
# Provider: a function that fits an h2o model
fit = get_and_log(
    container=h2o_containers.ModelFitters,
    function_name=config["init"]["fit"]["name"],
)

ModelFitters.fit


In [24]:
# Functions that save h2o models in various formats, keyed by saver name
save_model_dict = dict(
    (saver_name, get_and_log(h2o_containers.ModelSavers, saver_name))
    for saver_name in config["init"]["save_model"]["names"]
)


ModelSavers.save_h2o_model
ModelSavers.save_json_details
ModelSavers.save_pojo
ModelSavers.save_mojo
ModelSavers.save_pojo_jar
ModelSavers.save_mojo_jar


In [25]:
# Provider: a function that generates metrics from an h2o model
evaluate = get_and_log(
    container=h2o_containers.ModelEvaluators,
    function_name=config["init"]["evaluate"]["name"],
)

ModelEvaluators.evaluate


In [26]:
# Provider: a function that generates threshold metrics from an h2o model
evaluate_threshold_metrics = get_and_log(
    container=h2o_containers.ModelEvaluators,
    function_name=config["init"]["evaluate_threshold_metrics"]["name"],
)

ModelEvaluators.evaluate_threshold_metrics


In [27]:
# Provider: a function that produces predictions using an h2o model
predict = get_and_log(
    container=h2o_containers.PredictionFunctions,
    function_name=config["init"]["predict"]["name"],
)

PredictionFunctions.predict


## Execution

Here we use the providers defined above to execute various tasks

### Save (formatted) config

In [28]:
# Store the formatted config as an artifact, using the local storer and the
# local-to-remote copier fetched above; target locations come from the config
tasks.store_artifacts(store_artifact_locally, copy_from_local_to_remote, config,
                      **config["exec"]["save_formatted_config"]["params"])

In [29]:
# Show where the formatted config was stored
print("Config stored with following parameters")
print_dict(config["exec"]["save_formatted_config"]["params"])

Config stored with following parameters
{
  "local_dir": "./example_results/local/818c0c6c1c404dddae908823f0d89946/session",
  "remote_dir": "./example_results/remote/818c0c6c1c404dddae908823f0d89946/session",
  "filename": "config_formatted"
}


### Save Session

##### Save session info

In [30]:
# Store the session dictionary as an artifact, with the same storer/copier pair as the config
tasks.store_artifacts(store_artifact_locally, copy_from_local_to_remote, session,
                                 **config["exec"]["save_session"]["params"])

In [31]:
# Show where the session dictionary was stored
print("Session dictionary stored with following parameters")
print_dict(config["exec"]["save_session"]["params"])

Session dictionary stored with following parameters
{
  "local_dir": "./example_results/local/818c0c6c1c404dddae908823f0d89946/session",
  "remote_dir": "./example_results/remote/818c0c6c1c404dddae908823f0d89946/session",
  "filename": "session"
}


##### Save session artifacts

In [32]:
# Copy every configured session artifact (a file on disk) into the local
# artifact store first, and from there on to the remote artifact store.
for artifact_dict in config["exec"]["save_session_artifacts"]["artifacts"]:

    # split the artifact path into its directory and its file name
    artifact_dir, artifact_filename = os.path.split(artifact_dict["artifact_path"])

    # save to local artifact store
    common_containers.ArtifactCopiers.copy_from_disk_to_disk(
        source_dir=artifact_dir,
        target_dir=artifact_dict["local_dir"],
        filename=artifact_filename,
        overwrite=False,
        delete_source=False,
    )

    # copy to remote artifact store
    copy_from_local_to_remote(
        source_dir=artifact_dict["local_dir"],
        target_dir=artifact_dict["remote_dir"],
        filename=artifact_filename,
        overwrite=False,
        delete_source=False,
    )
    

In [34]:
# Show which session artifacts were stored and where
print("Session artifacts stored with following parameters")
print_dict(config["exec"]["save_session_artifacts"])

Session artifacts stored with following parameters
{
  "artifacts": [
    {
      "artifact_path": "C:\\Users\\karl.schriek\\PycharmProjects\\mercury-ml-github\\examples\\h2o\\config/fit_config_h2o.json",
      "local_dir": "./example_results/local/818c0c6c1c404dddae908823f0d89946/session",
      "remote_dir": "./example_results/remote/818c0c6c1c404dddae908823f0d89946/session"
    },
    {
      "artifact_path": "C:\\Users\\karl.schriek\\PycharmProjects\\mercury-ml-github\\examples\\h2o\\fit.ipynb",
      "local_dir": "./example_results/local/818c0c6c1c404dddae908823f0d89946/session",
      "remote_dir": "./example_results/remote/818c0c6c1c404dddae908823f0d89946/session"
    }
  ]
}


### Start H2O

In [35]:
# Call the session initiator with the configured parameters; being the cell's
# last expression, its return value is displayed below
initiate_session(**config["exec"]["initiate_session"]["params"])

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O cluster uptime:,37 secs
H2O cluster timezone:,Europe/Berlin
H2O data parsing timezone:,UTC
H2O cluster version:,3.22.1.1
H2O cluster version age:,1 month and 27 days
H2O cluster name:,karl.schriek
H2O cluster total nodes:,1
H2O cluster free memory:,3.531 Gb
H2O cluster total cores:,4
H2O cluster allowed cores:,4


<module 'h2o' from 'c:\\users\\karl.schriek\\appdata\\local\\conda\\conda\\envs\\mercury_ml\\lib\\site-packages\\h2o\\__init__.py'>

### Get source data

In [36]:
# Read the source data into `data_bunch_source` using the configured reader,
# then show the reader parameters that were used
data_bunch_source = tasks.read_train_valid_test_data_bunch(read_source_data_set,**config["exec"]["read_source_data"]["params"] )
print("Source data read using following parameters: \n")
print_dict(config["exec"]["read_source_data"]["params"])

Source data read using following parameters: 

{
  "train_params": {
    "path": "./example_data/array_789/train.csv",
    "input_format": ".csv",
    "full_data_columns": [
      "ID",
      "ID2",
      "field1_num",
      "field2_num",
      "field3_num",
      "field4_factor",
      "field5_factor",
      "field6_target"
    ],
    "index_columns": [
      "ID",
      "ID2"
    ],
    "features_columns": [
      "field4_factor",
      "field5_factor",
      "field1_num",
      "field2_num",
      "field3_num"
    ],
    "targets_columns": [
      "field6_target"
    ]
  },
  "valid_params": {
    "path": "./example_data/array_789/valid.csv",
    "input_format": ".csv",
    "full_data_columns": [
      "ID",
      "ID2",
      "field1_num",
      "field2_num",
      "field3_num",
      "field4_factor",
      "field5_factor",
      "field6_target"
    ],
    "index_columns": [
      "ID",
      "ID2"
    ],
    "features_columns": [
      "field4_factor",
      "field5_factor",
     

In [37]:
# Outline the data sets and data wrappers that were read
print("Read data_bunch consists of: \n")
print_data_bunch(data_bunch_source)

Read data_bunch consists of: 

train <DataSet>
  full_data <PandasDataWrapper>
  index <PandasDataWrapper>
  features <PandasDataWrapper>
  targets <PandasDataWrapper>

valid <DataSet>
  full_data <PandasDataWrapper>
  index <PandasDataWrapper>
  features <PandasDataWrapper>
  targets <PandasDataWrapper>

test <DataSet>
  full_data <PandasDataWrapper>
  index <PandasDataWrapper>
  features <PandasDataWrapper>
  targets <PandasDataWrapper>



### Fit model

##### Transform data

In [38]:
# Apply the transformation configured for the fit step, if any
# (`.get` yields None when the key is absent, which leaves the data untouched)
data_bunch_fit = maybe_transform(data_bunch_source, config["exec"]["fit"].get("pre_execution_transformation"))

print("Data transformed with following parameters: \n")
print_dict(config["exec"]["fit"].get("pre_execution_transformation"))

Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%
Data transformed with following parameters: 

{
  "data_set_names": [
    "train",
    "valid",
    "test"
  ],
  "transform_then_slice": true,
  "params": {
    "transform_to": "h2o",
    "full_data_wrapper_params": {
      "factor_columns_list": [
        "field4_factor",
        "field5_factor",
        "field6_target"
      ]
    },
    "data_wrapper_names": [
      "index",
      "features",
      "targets"
    ]
  }
}


In [39]:
# Outline the data bunch that will be used for fitting
print("Transformed data_bunch consists of: \n")
print_data_bunch(data_bunch_fit)

Transformed data_bunch consists of: 

train <DataSet>
  full_data <H2ODataWrapper>
  index <H2ODataWrapper>
  features <H2ODataWrapper>
  targets <H2ODataWrapper>

valid <DataSet>
  full_data <H2ODataWrapper>
  index <H2ODataWrapper>
  features <H2ODataWrapper>
  targets <H2ODataWrapper>

test <DataSet>
  full_data <H2ODataWrapper>
  index <H2ODataWrapper>
  features <H2ODataWrapper>
  targets <H2ODataWrapper>



##### Perform fitting

In [40]:
# Fit the model on the transformed data with the configured parameters.
# Note: `model` is rebound to whatever `fit` returns.
model = fit(model = model,
            data_bunch = data_bunch_fit,
            **config["exec"]["fit"]["params"])

drf Model Build progress: |███████████████████████████████████████████████| 100%


### Save model

In [41]:
# Save the model once per configured saver; each saver gets its own parameter
# block, looked up in config["exec"]["save_model"] under the saver's name
for model_format, save_model in save_model_dict.items():
    
    tasks.store_model(save_model=save_model,
                      model=model,
                      copy_from_local_to_remote = copy_from_local_to_remote,
                      **config["exec"]["save_model"][model_format]
                      )

C:\Users\karl.schriek\PycharmProjects\mercury-ml-github\examples\h2o\example_results\local\818c0c6c1c404dddae908823f0d89946\models


In [42]:
# Show the storage parameters used for each model format
print("Model saved with following paramters: \n")
print_dict(config["exec"]["save_model"])

Model saved with following paramters: 

{
  "save_h2o_model": {
    "local_dir": "./example_results/local/818c0c6c1c404dddae908823f0d89946/models",
    "remote_dir": "./example_results/remote/818c0c6c1c404dddae908823f0d89946/models",
    "filename": "test_h2o__818c0c6c1c404dddae908823f0d89946__h2o",
    "extension": "",
    "overwrite_remote": true
  },
  "save_json_details": {
    "local_dir": "./example_results/local/818c0c6c1c404dddae908823f0d89946/models",
    "remote_dir": "./example_results/remote/818c0c6c1c404dddae908823f0d89946/models",
    "filename": "test_h2o__818c0c6c1c404dddae908823f0d89946__json_details",
    "extension": ".json",
    "overwrite_remote": true
  },
  "save_pojo": {
    "local_dir": "./example_results/local/818c0c6c1c404dddae908823f0d89946/models",
    "remote_dir": "./example_results/remote/818c0c6c1c404dddae908823f0d89946/models",
    "filename": "test_h2o__818c0c6c1c404dddae908823f0d89946__pojo",
    "extension": ".java",
    "overwrite_remote": true
  }

### Evaluate metrics

##### Transform data

In [43]:
# Apply the transformation configured for the evaluate step, if any; without
# one, `data_bunch_metrics` is the same object as `data_bunch_fit`
data_bunch_metrics = maybe_transform(data_bunch_fit, config["exec"]["evaluate"].get("pre_execution_transformation"))

print("Data transformed with following parameters: \n")
print_dict(config["exec"]["evaluate"].get("pre_execution_transformation"))

Data transformed with following parameters: 

null


In [44]:
# Outline the data bunch that will be used for evaluation
print("Transformed data_bunch consists of: \n")
print_data_bunch(data_bunch_metrics)

Transformed data_bunch consists of: 

train <DataSet>
  full_data <H2ODataWrapper>
  index <H2ODataWrapper>
  features <H2ODataWrapper>
  targets <H2ODataWrapper>

valid <DataSet>
  full_data <H2ODataWrapper>
  index <H2ODataWrapper>
  features <H2ODataWrapper>
  targets <H2ODataWrapper>

test <DataSet>
  full_data <H2ODataWrapper>
  index <H2ODataWrapper>
  features <H2ODataWrapper>
  targets <H2ODataWrapper>



##### Calculate metrics

In [45]:
# Evaluate the model on every configured data set; results are keyed by data set name
metrics = {}
for data_set_name in config["exec"]["evaluate"]["data_set_names"]:
    data_set = getattr(data_bunch_metrics, data_set_name)
    metrics[data_set_name] = evaluate(model, data_set, data_set_name, **config["exec"]["evaluate"]["params"])

In [46]:
# Show the metrics per data set
print("Resulting metrics: \n")
print_dict(metrics)

Resulting metrics: 

{
  "test": {
    "logloss": 0.026988748739291035,
    "mse": 0.0013250000031590453,
    "rmse": 0.03640054948979542,
    "auc": 1.0,
    "gini": 1.0,
    "r2": 0.9936904761754332
  }
}


##### Calculate threshold metrics

In [47]:
# Evaluate threshold metrics on every configured data set; results are keyed by data set name.
# NOTE(review): the data set names are taken from config["exec"]["evaluate"], while the
# parameters come from config["exec"]["evaluate_threshold_metrics"] — confirm this is intended.
threshold_metrics = {}
for data_set_name in config["exec"]["evaluate"]["data_set_names"]:
    data_set = getattr(data_bunch_metrics, data_set_name)
    threshold_metrics[data_set_name] = evaluate_threshold_metrics(model, data_set, data_set_name,
                                                                  **config["exec"]["evaluate_threshold_metrics"]["params"])

In [48]:
# Show the threshold metrics per data set
print("Resulting metrics: \n")
print_dict(threshold_metrics)

Resulting metrics: 

{
  "test": {
    "threshold": {
      "0": 0.9519999998807908,
      "1": 0.9344999998807907,
      "2": 0.0625,
      "3": 0.01449999988079076,
      "4": 0.0024999999999999467
    },
    "f1": {
      "0": 0.8,
      "1": 1.0,
      "2": 0.8571428571428571,
      "3": 0.6666666666666666,
      "4": 0.4615384615384615
    },
    "f2": {
      "0": 0.7142857142857142,
      "1": 1.0,
      "2": 0.9375,
      "3": 0.8333333333333334,
      "4": 0.6818181818181818
    },
    "f0point5": {
      "0": 0.9090909090909091,
      "1": 1.0,
      "2": 0.7894736842105263,
      "3": 0.5555555555555556,
      "4": 0.3488372093023256
    },
    "accuracy": {
      "0": 0.9,
      "1": 1.0,
      "2": 0.9,
      "3": 0.7,
      "4": 0.3
    },
    "precision": {
      "0": 1.0,
      "1": 1.0,
      "2": 0.75,
      "3": 0.5,
      "4": 0.3
    },
    "recall": {
      "0": 0.6666666666666666,
      "1": 1.0,
      "2": 1.0,
      "3": 1.0,
      "4": 1.0
    },
    "specific

### Save metrics

In [49]:
# Store the metrics of each configured data set as an artifact
for data_set_name, params in config["exec"]["save_metrics"]["data_sets"].items():
    tasks.store_artifacts(store_artifact_locally, copy_from_local_to_remote, metrics[data_set_name], **params)

In [50]:
# Store the threshold metrics of each configured data set as an artifact.
# The payload is `threshold_metrics`; `metrics` is already stored by the
# "save_metrics" step and would only be written a second time under the
# threshold-metrics file name.
for data_set_name, params in config["exec"]["save_threshold_metrics"]["data_sets"].items():
    tasks.store_artifacts(store_artifact_locally, copy_from_local_to_remote,
                          threshold_metrics[data_set_name], **params)

### Predict

##### Transform data

In [51]:
# Apply the transformation configured for the predict step, if any; without
# one, `data_bunch_predict` is the same object as `data_bunch_metrics`
data_bunch_predict = maybe_transform(data_bunch_metrics, config["exec"]["predict"].get("pre_execution_transformation"))
    
print("Data transformed with following parameters: \n")
print_dict(config["exec"]["predict"].get("pre_execution_transformation"))

Data transformed with following parameters: 

null


In [52]:
# Outline the data bunch that will be used for prediction
print("Transformed data_bunch consists of: \n")
print_data_bunch(data_bunch_predict)

Transformed data_bunch consists of: 

train <DataSet>
  full_data <H2ODataWrapper>
  index <H2ODataWrapper>
  features <H2ODataWrapper>
  targets <H2ODataWrapper>

valid <DataSet>
  full_data <H2ODataWrapper>
  index <H2ODataWrapper>
  features <H2ODataWrapper>
  targets <H2ODataWrapper>

test <DataSet>
  full_data <H2ODataWrapper>
  index <H2ODataWrapper>
  features <H2ODataWrapper>
  targets <H2ODataWrapper>



##### Perform prediction

In [53]:
# Predict for every configured data set and attach the result to that data set
# as a new `predictions` attribute (the data set object is modified in place)
for data_set_name in config["exec"]["predict"]["data_set_names"]:
    data_set = getattr(data_bunch_predict, data_set_name)
    data_set.predictions = predict(model=model, data_set=data_set, **config["exec"]["predict"]["params"])

drf prediction progress: |████████████████████████████████████████████████| 100%


In [54]:
# Show the parameters that were passed to the prediction function
print("Data predicted with following parameters: \n")
print_dict(config["exec"]["predict"].get("params"))


Data predicted with following parameters: 

{
  "return_columns": [
    "p1"
  ]
}


In [55]:
# Display the `underlying` object of the test set's predictions wrapper
# (presumably the raw H2O frame — confirm); note the hard-coded `test` data set
data_bunch_predict.test.predictions.underlying

p1
0.0625
0.0025
0.0025
0.0025
0.0025
0.0145
0.0145
0.952
0.952
0.9345




### Evaluate custom metrics

##### Transform data

In [56]:
# Apply the transformation configured for the custom-metrics step, if any
data_bunch_custom_metrics = maybe_transform(data_bunch_predict, 
                                            config["exec"]["evaluate_custom_metrics"].get("pre_execution_transformation"))

In [57]:
# Show the transformation parameters used for the custom-metrics step
print("Data transformed with following parameters: \n")
print_dict(config["exec"]["evaluate_custom_metrics"].get("pre_execution_transformation"))

Data transformed with following parameters: 

{
  "data_set_names": [
    "test"
  ],
  "params": {
    "transform_to": "numpy",
    "data_wrapper_params": {
      "predictions": {},
      "index": {},
      "targets": {}
    }
  }
}


In [58]:
# Outline the data bunch that will be used for the custom metrics
print("Transformed data_bunch consists of: \n")
print_data_bunch(data_bunch_custom_metrics)


Transformed data_bunch consists of: 

test <DataSet>
  predictions <NumpyDataWrapper>
  index <NumpyDataWrapper>
  targets <NumpyDataWrapper>



##### Calculate custom metrics


In [59]:
# Evaluate all custom metric functions on every configured data set; results are keyed by data set name
custom_metrics = {}
for data_set_name in config["exec"]["evaluate_custom_metrics"]["data_set_names"]:
    data_set = getattr(data_bunch_custom_metrics, data_set_name)
    custom_metrics[data_set_name]  = tasks.evaluate_metrics(data_set, custom_metrics_dict)


In [60]:
# Show the custom metrics per data set
print("Resulting custom metrics: \n")
print_dict(custom_metrics)


Resulting custom metrics: 

{
  "test": {
    "evaluate_numpy_auc": 1.0,
    "evaluate_numpy_micro_auc": 1.0
  }
}


##### Calculate custom label metrics

In [61]:
# Evaluate all custom label metric functions on every configured data set; results are keyed by data set name
custom_label_metrics = {}
for data_set_name in config["exec"]["evaluate_custom_label_metrics"]["data_set_names"]:
    data_set = getattr(data_bunch_custom_metrics, data_set_name)
    custom_label_metrics[data_set_name] = tasks.evaluate_label_metrics(data_set, custom_label_metrics_dict)

In [62]:
# Show the custom label metrics per data set
print("Resulting custom label metrics: \n")
print_dict(custom_label_metrics)

Resulting custom label metrics: 

{
  "test": {
    "Accuracy": {
      "field6_target": 1.0
    },
    "ConfMat_Count_field6_target": {
      "field6_target": 20
    },
    "ConfMat_Rate_field6_target": {
      "field6_target": 1.0
    }
  }
}


In [63]:
# Store the custom metrics of each configured data set as an artifact
for data_set_name, params in config["exec"]["save_custom_metrics"]["data_sets"].items():
    tasks.store_artifacts(store_artifact_locally, copy_from_local_to_remote,
                          custom_metrics[data_set_name], **params)

In [64]:
# Show where the custom metrics were stored
print("Custom metrics saved with following parameters: \n")
print_dict(config["exec"]["save_custom_metrics"])

Custom metrics saved with following parameters: 

{
  "data_sets": {
    "test": {
      "local_dir": "./example_results/local/818c0c6c1c404dddae908823f0d89946/metrics/test",
      "remote_dir": "./example_results/remote/818c0c6c1c404dddae908823f0d89946/metrics/test",
      "filename": "test_h2o__818c0c6c1c404dddae908823f0d89946__test__custom_metrics"
    }
  }
}


In [65]:
# Store the custom label metrics of each configured data set as an artifact
for data_set_name, params in config["exec"]["save_custom_label_metrics"]["data_sets"].items():
    tasks.store_artifacts(store_artifact_locally, copy_from_local_to_remote,
                          custom_label_metrics[data_set_name], **params)

In [66]:
# Show where the custom label metrics were stored
print("Custom label metrics saved with following parameters: \n")
print_dict(config["exec"]["save_custom_label_metrics"])

Custom label metrics saved with following parameters: 

{
  "data_sets": {
    "test": {
      "local_dir": "./example_results/local/818c0c6c1c404dddae908823f0d89946/metrics/test",
      "remote_dir": "./example_results/remote/818c0c6c1c404dddae908823f0d89946/metrics/test",
      "filename": "test_h2o__818c0c6c1c404dddae908823f0d89946__test__custom_label_metrics"
    }
  }
}


### Prepare predictions for storage

##### Transform data

In [67]:
# Apply the transformation configured for the prediction-preparation step, if any.
# The input is `data_bunch_predict`, not the numpy bunch used for the custom metrics.
data_bunch_prediction_preparation = maybe_transform(data_bunch_predict, 
                                                    config["exec"]["prepare_predictions_for_storage"].get("pre_execution_transformation"))

In [68]:
# Outline the data bunch before the storage wrappers are added
print("Transformed data_bunch consists of: \n")
print_data_bunch(data_bunch_prediction_preparation)

Transformed data_bunch consists of: 

train <DataSet>
  full_data <H2ODataWrapper>
  index <H2ODataWrapper>
  features <H2ODataWrapper>
  targets <H2ODataWrapper>

valid <DataSet>
  full_data <H2ODataWrapper>
  index <H2ODataWrapper>
  features <H2ODataWrapper>
  targets <H2ODataWrapper>

test <DataSet>
  full_data <H2ODataWrapper>
  index <H2ODataWrapper>
  features <H2ODataWrapper>
  targets <H2ODataWrapper>
  predictions <H2ODataWrapper>



##### Prepare predictions and targets

In [69]:
# For every configured data set, add two data wrappers built via concatenation:
# one from the "predictions" parameter block and one from the "targets" block
# (they show up as `predictions_for_storage` / `targets_for_storage` in the outline below)
for data_set_name in config["exec"]["prepare_predictions_for_storage"]["data_set_names"]:
    data_set = getattr(data_bunch_prediction_preparation, data_set_name)
    data_set.add_data_wrapper_via_concatenate(**config["exec"]["prepare_predictions_for_storage"]["params"]["predictions"])
    data_set.add_data_wrapper_via_concatenate(**config["exec"]["prepare_predictions_for_storage"]["params"]["targets"])

In [70]:
# Outline the data bunch again to show the newly added wrappers
print_data_bunch(data_bunch_prediction_preparation)

train <DataSet>
  full_data <H2ODataWrapper>
  index <H2ODataWrapper>
  features <H2ODataWrapper>
  targets <H2ODataWrapper>

valid <DataSet>
  full_data <H2ODataWrapper>
  index <H2ODataWrapper>
  features <H2ODataWrapper>
  targets <H2ODataWrapper>

test <DataSet>
  full_data <H2ODataWrapper>
  index <H2ODataWrapper>
  features <H2ODataWrapper>
  targets <H2ODataWrapper>
  predictions <H2ODataWrapper>
  predictions_for_storage <H2ODataWrapper>
  targets_for_storage <H2ODataWrapper>



### Save predictions

##### Transform data

In [71]:
# Apply the transformation configured for the save-predictions step, if any
data_bunch_prediction_storage = maybe_transform(data_bunch_prediction_preparation, 
                                                config["exec"]["save_predictions"].get("pre_execution_transformation"))

In [72]:
# Outline the data bunch that will be used for storing predictions and targets
print("Transformed data_bunch consists of: \n")
print_data_bunch(data_bunch_prediction_storage)

Transformed data_bunch consists of: 

train <DataSet>
  full_data <H2ODataWrapper>
  index <H2ODataWrapper>
  features <H2ODataWrapper>
  targets <H2ODataWrapper>

valid <DataSet>
  full_data <H2ODataWrapper>
  index <H2ODataWrapper>
  features <H2ODataWrapper>
  targets <H2ODataWrapper>

test <DataSet>
  full_data <H2ODataWrapper>
  index <H2ODataWrapper>
  features <H2ODataWrapper>
  targets <H2ODataWrapper>
  predictions <H2ODataWrapper>
  predictions_for_storage <H2ODataWrapper>
  targets_for_storage <H2ODataWrapper>



##### Save predictions

In [73]:
# For each configured data set, pick the data wrapper named in the config and
# store its `underlying` object with the prediction-artifact storer
for data_set_name, data_set_params in config["exec"]["save_predictions"]["data_sets"].items():
    data_set = getattr(data_bunch_prediction_storage, data_set_name)
    data_wrapper = getattr(data_set, data_set_params["data_wrapper_name"])
    
    data_to_store = data_wrapper.underlying
   
    tasks.store_artifacts(store_prediction_artifact_locally, copy_from_local_to_remote,
                          data_to_store, **data_set_params["params"])

Export File progress: |███████████████████████████████████████████████████| 100%


In [74]:
# Show where the predictions were stored
print("Predictions saved with following parameters: \n")
print_dict(config["exec"]["save_predictions"])

Predictions saved with following parameters: 

{
  "data_sets": {
    "test": {
      "data_wrapper_name": "predictions_for_storage",
      "params": {
        "local_dir": "./example_results/local/818c0c6c1c404dddae908823f0d89946/predictions/test",
        "remote_dir": "./example_results/remote/818c0c6c1c404dddae908823f0d89946/predictions/test",
        "filename": "test_h2o__818c0c6c1c404dddae908823f0d89946__test__predictions"
      }
    }
  }
}


##### Save targets

In [75]:
# Same procedure as for the predictions, driven by the "save_targets" config:
# pick the configured data wrapper per data set and store its `underlying` object
for data_set_name, data_set_params in config["exec"]["save_targets"]["data_sets"].items():
    data_set = getattr(data_bunch_prediction_storage, data_set_name)
    data_wrapper = getattr(data_set, data_set_params["data_wrapper_name"])
    
    data_to_store = data_wrapper.underlying
   
    tasks.store_artifacts(store_prediction_artifact_locally, copy_from_local_to_remote,
                          data_to_store, **data_set_params["params"])

Export File progress: |███████████████████████████████████████████████████| 100%


In [76]:
# Show where the targets were stored
print("Targets saved with following parameters: \n")
print_dict(config["exec"]["save_targets"])

Targets saved with following parameters: 

{
  "data_sets": {
    "test": {
      "data_wrapper_name": "targets_for_storage",
      "params": {
        "local_dir": "./example_results/local/818c0c6c1c404dddae908823f0d89946/predictions/test",
        "remote_dir": "./example_results/remote/818c0c6c1c404dddae908823f0d89946/predictions/test",
        "filename": "test_h2o__818c0c6c1c404dddae908823f0d89946__test__targets"
      }
    }
  }
}
