### Install the Kubeflow Pipelines SDK and import libraries

In [None]:
# Install release 0.1.17 of the Kubeflow Pipelines SDK with the interpreter
# that runs this kernel (sys.executable), rather than whichever `pip` is on PATH.
import sys
!{sys.executable}  -m pip install https://storage.googleapis.com/ml-pipeline/release/0.1.17/kfp.tar.gz

In [None]:

# NOTE(review): same install command as the previous cell; it relies on `sys`
# having been imported there, so it only works when that cell ran first.
!{sys.executable} -m pip install https://storage.googleapis.com/ml-pipeline/release/0.1.17/kfp.tar.gz

Python 3.5.3 (default, Sep 27 2018, 17:25:39) 
[GCC 6.3.0 20170516] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>> 

In [1]:
import kfp
from kfp import compiler
import kfp.dsl as dsl
import kfp.gcp as gcp
import kfp.notebook

### Create image to be used by components. 
Using TensorFlow's py3 image as the base and installing the required libraries (automl, storage, pandas)

In [2]:
# Build settings shared by the cells below.
# PROJECT_NAME: GCP project whose container registry receives the images.
PROJECT_NAME = 'sandbox-235500'
# OUTPUT_DIR: GCS path handed to %%docker and used as staging_gcs_path for builds.
OUTPUT_DIR = 'gs://jksandbox/pipelinestest/out'
# EF_IMAGE: tag of the base image that every component image is built from.
EF_IMAGE = 'gcr.io/%s/automltables:dev' % (PROJECT_NAME,)

In [3]:
%%docker {EF_IMAGE} {OUTPUT_DIR}
# Base image for every component built in this notebook: TensorFlow's Python 3
# image plus the libraries the component functions import.
# NOTE(review): `latest-py3` is a floating tag, so rebuilds may pick up a different base.
FROM tensorflow/tensorflow:latest-py3
# pandas writes the CSV tables referenced by the pipeline UI metadata.
RUN pip3 install --upgrade pandas
# google-cloud-storage uploads and downloads the CSV and pickle artifacts.
RUN pip3 install --upgrade google-cloud-storage
# google-cloud-automl provides the automl_v1beta1 client used by the components.
RUN pip3 install --upgrade google-cloud-automl

2019-04-26 18:58:52:INFO:Checking path: gs://jksandbox/pipelinestest/out...
2019-04-26 18:58:52:INFO:Generate build files.
2019-04-26 18:58:52:INFO:Start a kaniko job for build.
2019-04-26 18:58:52:INFO:Found local kubernetes config. Initialized with kube_config.
2019-04-26 18:58:58:INFO:5 seconds: waiting for job to complete
2019-04-26 18:59:03:INFO:10 seconds: waiting for job to complete
2019-04-26 18:59:08:INFO:15 seconds: waiting for job to complete
2019-04-26 18:59:13:INFO:20 seconds: waiting for job to complete
2019-04-26 18:59:18:INFO:25 seconds: waiting for job to complete
2019-04-26 18:59:23:INFO:30 seconds: waiting for job to complete
2019-04-26 18:59:28:INFO:35 seconds: waiting for job to complete
2019-04-26 18:59:33:INFO:40 seconds: waiting for job to complete
2019-04-26 18:59:38:INFO:45 seconds: waiting for job to complete
2019-04-26 18:59:43:INFO:50 seconds: waiting for job to complete
2019-04-26 18:59:48:INFO:55 seconds: waiting for job to complete
2019-04-26 18:59:53:IN

### Create reusable components for running different steps in AutoML Tables

In [None]:
 def create_dataset(
     project_id: str,
     location: str,
     display_name: str) -> str:
     """Create a new AutoML Tables dataset and return its resource name.

     Args:
         project_id: Project component of the parent location path.
         location: Location component of the parent location path.
         display_name: Display name assigned to the new dataset.

     Returns:
         The ``name`` field of the dataset returned by the AutoML client.
     """
     # Imported locally: this function is handed to build_python_component on
     # its own, so it presumably cannot rely on notebook-level imports.
     from google.cloud import automl_v1beta1

     automl = automl_v1beta1.AutoMlClient()
     parent = automl.location_path(project_id, location)

     dataset_spec = {
         'display_name': display_name,
         'tables_dataset_metadata': {},
     }
     created = automl.create_dataset(parent, dataset_spec)
     return created.name
    
# Package create_dataset as a pipeline component built on top of EF_IMAGE; the
# resulting spec file is the one the pipeline later loads by name.
compiler.build_python_component(
    component_func=create_dataset,
    base_image=EF_IMAGE,
    staging_gcs_path=OUTPUT_DIR,
    target_image='gcr.io/' + PROJECT_NAME + '/component-create-dataset:latest',
    target_component_file='component-create-dataset.yaml')

In [None]:
 def import_data(
     dataset_name: str,
     source: str,
     input_uri: str) -> str:
     """Import data into a dataset and return the dataset name once done.

     Args:
         dataset_name: Dataset passed to ``client.import_data``.
         source: Key of the input configuration (the sample pipeline passes
             ``'bigquery_source'``).
         input_uri: URI stored under ``source`` as ``input_uri``.

     Returns:
         ``dataset_name`` unchanged, so later steps can chain on this one.
     """
     # Imported locally: this function is handed to build_python_component on
     # its own, so it presumably cannot rely on notebook-level imports.
     from google.cloud import automl_v1beta1

     automl = automl_v1beta1.AutoMlClient()

     operation = automl.import_data(
         dataset_name,
         {source: {'input_uri': input_uri}})

     # Only called to wait on the operation; its result value is not used.
     operation.result()

     return dataset_name
    
# Package import_data as a pipeline component built on top of EF_IMAGE; the
# resulting spec file is the one the pipeline later loads by name.
compiler.build_python_component(
    component_func=import_data,
    base_image=EF_IMAGE,
    staging_gcs_path=OUTPUT_DIR,
    target_image='gcr.io/' + PROJECT_NAME + '/component-import-data:latest',
    target_component_file='component-import-data.yaml')

In [None]:
 def column_specs(
     dataset_name: str,
     bucket: str) -> str:
     """Export a dataset's column specs to GCS and return the pickle path.

     Reads the column specs of the dataset's first table spec, uploads a
     two-column CSV (column name, type name) for the pipeline UI, writes the
     UI metadata file pointing at that CSV, and uploads a pickle of the
     ``{display_name: column spec}`` mapping for later steps.

     Args:
         dataset_name: Dataset whose table and column specs are listed.
         bucket: GCS bucket that receives the CSV and pickle files.

     Returns:
         Blob name of the uploaded pickle inside ``bucket``.
     """
     # Imported locally: this function is handed to build_python_component on
     # its own, so it presumably cannot rely on notebook-level imports.
     import json
     import pickle

     import pandas as pd
     from google.cloud import automl_v1beta1
     from google.cloud import storage
     import google.cloud.automl_v1beta1.proto.data_types_pb2 as data_types

     automl = automl_v1beta1.AutoMlClient()

     # Only the first table spec returned for the dataset is inspected.
     first_table = list(automl.list_table_specs(dataset_name))[0]
     specs_by_name = {}
     for spec in automl.list_column_specs(first_table.name):
         specs_by_name[spec.display_name] = spec

     csv_blob_name = 'tables/column-specs/' + dataset_name + '.csv'
     pickle_blob_name = 'tables/column-specs/' + dataset_name + '.pk'

     # CSV of (column, type name); written without header or index because
     # the UI metadata below supplies the header.
     names = list(specs_by_name)
     type_names = [
         data_types.TypeCode.Name(specs_by_name[name].data_type.type_code)
         for name in names]
     type_table = pd.DataFrame({
         'Column': names,
         'Type': type_names,
     })
     type_table.to_csv('data.csv', header=None, index=None)

     gcs_bucket = storage.Client().bucket(bucket)
     gcs_bucket.blob(csv_blob_name).upload_from_filename('data.csv')

     ui_metadata = {
         'version': 1,
         'outputs': [{
             'type': 'table',
             'source': 'gs://' + bucket + '/' + csv_blob_name,
             'header': ['Column', 'Type'],
             'format': 'csv',
         }],
     }
     with open('/mlpipeline-ui-metadata.json', 'w') as metadata_file:
         json.dump(ui_metadata, metadata_file)

     # Persist the full spec objects so later steps can look them up by name.
     with open('data.pk', mode='wb') as pickle_file:
         pickle.dump(specs_by_name, pickle_file)
     gcs_bucket.blob(pickle_blob_name).upload_from_filename('data.pk')

     return pickle_blob_name
    
# Package column_specs as a pipeline component built on top of EF_IMAGE; the
# resulting spec file is the one the pipeline later loads by name.
compiler.build_python_component(
    component_func=column_specs,
    base_image=EF_IMAGE,
    staging_gcs_path=OUTPUT_DIR,
    target_image='gcr.io/' + PROJECT_NAME + '/component-column-specs:latest',
    target_component_file='component-column-specs.yaml')

In [None]:
 def update_column(
     bucket: str,
     column_specs_file: str,
     column_name: str,
     column_type: str) -> str:
     """Change one column's data type and re-export the column specs.

     Downloads the pickled ``{display_name: column spec}`` mapping, asks
     AutoML to set ``column_name`` to ``column_type``, then uploads a refreshed
     CSV (for the pipeline UI) and pickle (for later steps). Both are stored
     under the updated column spec's resource name.

     Args:
         bucket: GCS bucket holding the input pickle and receiving the outputs.
         column_specs_file: Blob name of the pickled column specs to start from.
         column_name: Display name of the column to update.
         column_type: Value sent as the new ``type_code``.

     Returns:
         Blob name of the newly uploaded pickle inside ``bucket``.
     """
     # Imported locally: this function is handed to build_python_component on
     # its own, so it presumably cannot rely on notebook-level imports.
     import json
     import pickle

     import pandas as pd
     from google.cloud import automl_v1beta1
     from google.cloud import storage
     import google.cloud.automl_v1beta1.proto.data_types_pb2 as data_types

     gcs_bucket = storage.Client().bucket(bucket)
     source_blob = gcs_bucket.blob(column_specs_file)

     with open('specs.pk', mode='wb') as local_copy:
         source_blob.download_to_file(local_copy)

     # NOTE(review): pickle.load trusts the bucket contents; unpickling can
     # execute arbitrary code if the blob was written by anyone else.
     with open('specs.pk', mode='rb') as local_copy:
         specs_by_name = pickle.load(local_copy)

     automl = automl_v1beta1.AutoMlClient()

     requested_change = {
         "name": specs_by_name[column_name].name,
         "data_type": {"type_code": column_type},
     }
     # Keep the spec returned by the service so the exported files show it.
     specs_by_name[column_name] = automl.update_column_spec(requested_change)

     updated_spec_name = specs_by_name[column_name].name
     csv_blob_name = 'tables/column-specs/' + updated_spec_name + '.csv'
     pickle_blob_name = 'tables/column-specs/' + updated_spec_name + '.pk'

     names = list(specs_by_name)
     type_names = [
         data_types.TypeCode.Name(specs_by_name[name].data_type.type_code)
         for name in names]
     type_table = pd.DataFrame({
         'Column': names,
         'Type': type_names,
     })
     type_table.to_csv('data.csv', header=None, index=None)
     gcs_bucket.blob(csv_blob_name).upload_from_filename('data.csv')

     ui_metadata = {
         'version': 1,
         'outputs': [{
             'type': 'table',
             'source': 'gs://' + bucket + '/' + csv_blob_name,
             'header': ['Column', 'Type'],
             'format': 'csv',
         }],
     }
     with open('/mlpipeline-ui-metadata.json', 'w') as metadata_file:
         json.dump(ui_metadata, metadata_file)

     with open('data.pk', mode='wb') as pickle_file:
         pickle.dump(specs_by_name, pickle_file)
     gcs_bucket.blob(pickle_blob_name).upload_from_filename('data.pk')

     return pickle_blob_name
    
# Package update_column as a pipeline component built on top of EF_IMAGE; the
# resulting spec file is the one the pipeline later loads by name.
compiler.build_python_component(
    component_func=update_column,
    base_image=EF_IMAGE,
    staging_gcs_path=OUTPUT_DIR,
    target_image='gcr.io/' + PROJECT_NAME + '/component-update-column:latest',
    target_component_file='component-update-column.yaml')

In [None]:
 def update_dataset(
     dataset_name: str,
     bucket: str,
     column_specs_file: str,
     label_column: str,
     split_column: str) -> str:
     """Set the dataset's target column and ML-use (split) column.

     Args:
         dataset_name: Name of the dataset to update.
         bucket: GCS bucket holding the pickled column specs.
         column_specs_file: Blob name of the pickled
             ``{display_name: column spec}`` mapping.
         label_column: Display name of the column used as the target.
         split_column: Display name of the column used as the ML-use column.

     Returns:
         ``column_specs_file`` unchanged, so later steps can chain on it.
     """
     # Imported locally: this function is handed to build_python_component on
     # its own, so it presumably cannot rely on notebook-level imports.
     import pickle

     from google.cloud import automl_v1beta1
     from google.cloud import storage

     gcs_bucket = storage.Client().bucket(bucket)
     source_blob = gcs_bucket.blob(column_specs_file)

     with open('specs.pk', mode='wb') as local_copy:
         source_blob.download_to_file(local_copy)

     # NOTE(review): pickle.load trusts the bucket contents; unpickling can
     # execute arbitrary code if the blob was written by anyone else.
     with open('specs.pk', mode='rb') as local_copy:
         specs_by_name = pickle.load(local_copy)

     automl = automl_v1beta1.AutoMlClient()

     # The dataset metadata takes the spec id, i.e. the last path segment of
     # the column spec's resource name.
     target_id = specs_by_name[label_column].name.rsplit('/', 1)[-1]
     ml_use_id = specs_by_name[split_column].name.rsplit('/', 1)[-1]

     automl.update_dataset({
         'name': dataset_name,
         'tables_dataset_metadata': {
             'target_column_spec_id': target_id,
             'ml_use_column_spec_id': ml_use_id,
         },
     })

     return column_specs_file
    
# Package update_dataset as a pipeline component built on top of EF_IMAGE; the
# resulting spec file is the one the pipeline later loads by name.
compiler.build_python_component(
    component_func=update_dataset,
    base_image=EF_IMAGE,
    staging_gcs_path=OUTPUT_DIR,
    target_image='gcr.io/' + PROJECT_NAME + '/component-update-dataset:latest',
    target_component_file='component-update-dataset.yaml')

In [None]:
 def create_model(
     project_id: str,
     location: str,
     display_name: str,
     train_hours: str,
     optimization_objective: str,
     columns_to_ignore: str,
     dataset_name: str,
     bucket: str,
     column_specs_file: str,
     label_column: str,
     split_column: str) -> str:
     """Train an AutoML Tables model on a dataset and return the model name.

     The input features are every column in the pickled column specs except
     the label column, the split column and the columns listed in
     ``columns_to_ignore``; their original order is kept.

     Args:
         project_id: Project component of the parent location path.
         location: Location component of the parent location path.
         display_name: Display name assigned to the model.
         train_hours: Training budget in node hours, given as text. It is
             converted to milli node hours; fractional values such as
             ``'1.5'`` are accepted.
         optimization_objective: Value sent as ``optimization_objective``.
         columns_to_ignore: JSON array of column display names to leave out
             of the features. An empty string or JSON ``null`` means
             "ignore nothing"; repeated names are tolerated.
         dataset_name: Dataset resource name; its last path segment is sent
             as ``dataset_id``.
         bucket: GCS bucket holding the pickled column specs.
         column_specs_file: Blob name of the pickled
             ``{display_name: column spec}`` mapping.
         label_column: Display name of the target column.
         split_column: Display name of the split column.

     Returns:
         The ``name`` of the model returned by the finished create operation.

     Raises:
         ValueError: If the label column, the split column or an ignored
             column is not present in the column specs.
     """
     # Imported locally: this function is handed to build_python_component on
     # its own, so it presumably cannot rely on notebook-level imports.
     from google.cloud import automl_v1beta1
     from google.cloud import storage
     import pickle
     import json

     gcs_bucket = storage.Client().bucket(bucket)
     blob = gcs_bucket.blob(column_specs_file)

     with open('specs.pk', mode='wb') as fp:
         blob.download_to_file(fp)

     # NOTE(review): pickle.load trusts the bucket contents; unpickling can
     # execute arbitrary code if the blob was written by anyone else.
     with open('specs.pk', mode='rb') as fp:
         column_specs = pickle.load(fp)

     # A blank value or JSON null means there is nothing extra to ignore.
     ignored_columns = []
     if columns_to_ignore and columns_to_ignore.strip():
         ignored_columns = json.loads(columns_to_ignore) or []

     excluded = [label_column, split_column] + list(ignored_columns)

     # Fail early with the offending names instead of the opaque
     # "list.remove(x): x not in list" that chained removals would raise.
     known_columns = list(column_specs)
     missing = []
     for column in excluded:
         if column not in known_columns and column not in missing:
             missing.append(column)
     if missing:
         raise ValueError(
             'Columns not found in column specs: ' +
             ', '.join(repr(column) for column in missing))

     # Filtering (rather than removing one by one) keeps the original column
     # order and tolerates names that appear more than once in `excluded`.
     feat_list = [name for name in known_columns if name not in excluded]

     client = automl_v1beta1.AutoMlClient()
     location_path = client.location_path(project_id, location)

     model_dict = {
         'display_name': display_name,
         'dataset_id': dataset_name.rsplit('/', 1)[-1],
         'tables_model_metadata': {
             # round() guards against float artefacts such as 1099.9999.
             'train_budget_milli_node_hours':
                 int(round(float(train_hours) * 1000)),
             'optimization_objective': optimization_objective,
             'target_column_spec': column_specs[label_column],
             'input_feature_column_specs': [
                 column_specs[name] for name in feat_list],
         },
     }

     create_model_response = client.create_model(location_path, model_dict)
     # The model name is read from the operation's result.
     create_model_result = create_model_response.result()
     return create_model_result.name


# Package create_model as a pipeline component built on top of EF_IMAGE; the
# resulting spec file is the one the pipeline later loads by name.
compiler.build_python_component(
    component_func=create_model,
    base_image=EF_IMAGE,
    staging_gcs_path=OUTPUT_DIR,
    target_image='gcr.io/' + PROJECT_NAME + '/component-create-model:latest',
    target_component_file='component-create-model.yaml')

In [None]:
 def evaluate_model(
     model_name: str,
     bucket: str) -> str:
     """Publish a model's feature importances and return its metrics as text.

     Uploads a CSV of (feature, importance), sorted by descending importance,
     to ``bucket`` and writes the pipeline UI metadata file that points at it.
     The return value is the text form of the regression metrics taken from
     the last evaluation listed for the model.

     Args:
         model_name: Resource name of the model to inspect.
         bucket: GCS bucket that receives the feature-importance CSV.

     Returns:
         ``str()`` of the last evaluation's ``regression_evaluation_metrics``,
         so the value matches the declared ``str`` return type.
     """
     # Imported locally: this function is handed to build_python_component on
     # its own, so it presumably cannot rely on notebook-level imports.
     from google.cloud import automl_v1beta1
     from google.cloud import storage
     import pandas as pd
     import json

     client = automl_v1beta1.AutoMlClient()

     file_blob = 'tables/evaluate-model/' + model_name + '.csv'

     ui_out = {
         'version': 1,
         'outputs': [
             {
                 'type': 'table',
                 'source': 'gs://' + bucket + '/' + file_blob,
                 'header': ['Feature', 'Importance'],
                 'format': 'csv'}]}

     model = client.get_model(model_name)

     # (importance, name) tuples sort by importance first; ties fall back to
     # the column name, also in descending order.
     feature_list = sorted(
         ((info.feature_importance, info.column_display_name)
          for info in model.tables_model_metadata.tables_model_column_info),
         reverse=True)

     # `columns` pins the CSV column order instead of relying on dict ordering.
     feature_importance = pd.DataFrame(
         {
             'Feature': [name for _, name in feature_list],
             'Importance': [importance for importance, _ in feature_list],
         },
         columns=['Feature', 'Importance'])

     # No header or index: the UI metadata above supplies the header.
     feature_importance.to_csv('data.csv', header=False, index=False)
     gcs_bucket = storage.Client().bucket(bucket)
     blob = gcs_bucket.blob(file_blob)
     blob.upload_from_filename('data.csv')

     with open('/mlpipeline-ui-metadata.json', 'w') as fp:
         json.dump(ui_out, fp)

     # NOTE(review): only regression metrics are read, and the last listed
     # evaluation is assumed to be the one of interest; confirm for other
     # model types.
     metrics = list(client.list_model_evaluations(model_name))[-1]
     return str(metrics.regression_evaluation_metrics)
    
# Package evaluate_model as a pipeline component built on top of EF_IMAGE; the
# resulting spec file is the one the pipeline later loads by name.
compiler.build_python_component(
    component_func=evaluate_model,
    base_image=EF_IMAGE,
    staging_gcs_path=OUTPUT_DIR,
    target_image='gcr.io/' + PROJECT_NAME + '/component-evaluate-model:latest',
    target_component_file='component-evaluate-model.yaml')

### Sample pipeline (Energy Price Forecasting) that uses the AutoML Tables components to build a model

In [None]:
@dsl.pipeline(
    name='AutoML Tables',
    description='AutoML Tables Pipeline')
def automl_tables_pipeline(
    project_id = dsl.PipelineParam(
        'project_id',
        value='energy-forecasting'),
    location = dsl.PipelineParam(
        'location',
        value='us-central1'),
    bucket = dsl.PipelineParam(
        'bucket',
        value='energy-forecasting'),
    dataset_display_name = dsl.PipelineParam(
        'dataset_display_name',
        value='testdataset'),
    data_source = dsl.PipelineParam(
        'data_source',
        value='bigquery_source'),
    data_input_uri = dsl.PipelineParam(
        'data_input_uri',
        value='bq://energy-forecasting.Energy.automldata'),
    column_to_update_name = dsl.PipelineParam(
        'column_to_update_name',
        value='hour'),
    column_to_update_type = dsl.PipelineParam(
        'column_to_update_type',
        value='CATEGORY'),
    label_column = dsl.PipelineParam(
        'label_column',
        value='price'),
    split_column = dsl.PipelineParam(
        'split_column',
        value='split'),
    model_display_name = dsl.PipelineParam(
        'model_display_name',
        value='testmodel'),
    model_train_hours = dsl.PipelineParam(
        'model_train_hours',
        value='1'),
    model_optimization_objective = dsl.PipelineParam(
        'model_optimization_objective',
        value='MINIMIZE_MAE'),
    model_columns_to_ignore = dsl.PipelineParam(
        'model_columns_to_ignore',
        value='["date_utc"]'),
):
    """Chain the AutoML Tables components into one training pipeline.

    Steps, each fed by the previous step's output: create dataset, import
    data, export column specs, update one column's type, set the label and
    split columns, train a model, evaluate it.
    """

    def with_gcp_credentials(op):
        """Apply the 'user-gcp-sa' GCP secret to a pipeline step."""
        return op.apply(gcp.use_gcp_secret('user-gcp-sa'))

    # Component factories come from the spec files written by the build cells.
    load_component = kfp.components.load_component
    create_dataset_component = load_component('component-create-dataset.yaml')
    import_data_component = load_component('component-import-data.yaml')
    column_specs_component = load_component('component-column-specs.yaml')
    update_column_component = load_component('component-update-column.yaml')
    update_dataset_component = load_component('component-update-dataset.yaml')
    create_model_component = load_component('component-create-model.yaml')
    evaluate_model_component = load_component('component-evaluate-model.yaml')

    dataset_step = with_gcp_credentials(create_dataset_component(
        project_id,
        location,
        dataset_display_name))

    # import_data returns the dataset name, so later steps read it from here.
    import_step = with_gcp_credentials(import_data_component(
        dataset_step.output,
        data_source,
        data_input_uri))

    specs_step = with_gcp_credentials(column_specs_component(
        import_step.output,
        bucket))

    retype_step = with_gcp_credentials(update_column_component(
        bucket,
        specs_step.output,
        column_to_update_name,
        column_to_update_type))

    # Uses the column specs produced after the column type update.
    targets_step = with_gcp_credentials(update_dataset_component(
        import_step.output,
        bucket,
        retype_step.output,
        label_column,
        split_column))

    training_step = with_gcp_credentials(create_model_component(
        project_id,
        location,
        model_display_name,
        model_train_hours,
        model_optimization_objective,
        model_columns_to_ignore,
        import_step.output,
        bucket,
        targets_step.output,
        label_column,
        split_column))

    # Last step; nothing consumes its output inside the pipeline.
    with_gcp_credentials(evaluate_model_component(
        training_step.output,
        bucket))
    
    
# Compile the pipeline function into the package file 'automl-tables-pipeline.tar.gz'.
compiler.Compiler().compile(automl_tables_pipeline, 'automl-tables-pipeline.tar.gz')