# Orchestrate BigQuery and AutoML Tables with Kubeflow Pipelines

In [47]:
import kfp
import kfp.dsl as dsl
import kfp.gcp as gcp
import kfp.notebook

from typing import NamedTuple

In [48]:
# Project id; it is also the registry path segment of the component image.
PROJECT_NAME = 'sandbox-235500'

# Image passed as `base_image` to the Python component defined below.
BASE_IMAGE = 'gcr.io/%s/automltablesbase:dev' % (PROJECT_NAME,)

In [62]:
@kfp.dsl.python_component(name='Delete table', base_image=BASE_IMAGE)
def delete_table(table_id: str):
    """Deletes BigQuery table"""
    # NOTE(review): the docstring above is left untouched on purpose; kfp may
    # surface it as the component description -- confirm before rewording.
    #
    # table_id is handed to bigquery.Client.delete_table unchanged.
    # Imports are local to the function body so that the function is
    # self-contained when it is turned into a container op.
    from google.cloud import bigquery
    import logging

    logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')

    # not_found_ok=True: a table that does not exist is not treated as an
    # error, so the log line below is emitted in that case as well.
    bq_client = bigquery.Client()
    bq_client.delete_table(table_id, not_found_ok=True)

    logging.info("Deleted table: {}".format(table_id))
    
    

The Jupyter `docker` magic (provided by `kfp.notebook`) is used to start a Kaniko job. The magic uses the default Kubernetes configuration.

## Create and run a pipeline

### Define a pipeline

In [63]:
from google.cloud import bigquery
import json

# Names kept at module level; neither is referenced by the cells visible
# in this notebook section.
AUTOML_DATASET_NAME = 'CLVFeatures'
FEATURES_TABLE_ID = 'clv_features'

# Factory for the lightweight Python component defined above.
delete_table_op = kfp.components.func_to_container_op(delete_table)

# Factory for the data preparation component, read from the local checkout.
# Alternative source (published copy of the same component spec):
#   kfp.components.load_component_from_url(
#       'https://raw.githubusercontent.com/jarokaz/CLVPipelines/master/components/prep_data/component.yaml')
data_preprocessing_op = kfp.components.load_component_from_file(
    '../components/prep_data/component.yaml'
)


@dsl.pipeline(
    name='CLVTrainingPipeline',
    description='CLV Training Pipeline'
)
def clv_pipeline(
    project_id='',
    dataset_id='',
    transactions_table_id='',
    features_table_name='',
    threshold_date='',
    predict_end='',
    max_monetary=15000,
    automl_dataset_location='us-central1'
):
    """Run data preprocessing, then delete the table it reports as output."""
    # automl_dataset_location is accepted but not consumed by any step below.

    # Every other pipeline parameter is forwarded unchanged to the
    # preprocessing component.
    prep_inputs = dict(
        project_id=project_id,
        dataset_id=dataset_id,
        transactions_table_id=transactions_table_id,
        features_table_name=features_table_name,
        threshold_date=threshold_date,
        predict_end=predict_end,
        max_monetary=max_monetary,
    )
    prep_step = data_preprocessing_op(**prep_inputs)

    # Consuming the 'output_table_id' output makes the delete step depend on
    # the preprocessing step. ignore_type() drops the type information on the
    # output before it is bound to the `str`-annotated table_id input.
    produced_table_id = prep_step.outputs['output_table_id']
    delete_table_op(table_id=produced_table_id.ignore_type())
  

    


### Compile a pipeline

In [64]:
# The package is written to the working directory and named after the
# pipeline function: "<function name>.tar.gz".
pipeline_func = clv_pipeline
pipeline_filename = pipeline_func.__name__ + '.tar.gz'

kfp.compiler.Compiler().compile(
    pipeline_func=pipeline_func,
    package_path=pipeline_filename,
)

### Submit the pipeline for execution

In [65]:
# Endpoint of the pipelines API and the experiment that groups the runs.
# NOTE(review): the URL looks like a Kubernetes API-server proxy path served
# on local port 8082 -- confirm how the proxy is started.
HOST = 'http://localhost:8082/api/v1/namespaces/kubeflow/services/ml-pipeline:8888/proxy'
EXPERIMENT_NAME = 'CLV_TRAINING'

# Argument values for the run, keyed by pipeline parameter name.
# All values are given as strings, including the numeric max_monetary.
arguments = {
    'project_id': 'sandbox-235500',
    'dataset_id': 'CLVDataset',
    'transactions_table_id': 'sandbox-235500.CLVDataset.transactions',
    'features_table_name': '',
    'threshold_date': '2011-08-08',
    'predict_end': '2011-12-12',
    'max_monetary': '15000',
}

client = kfp.Client(host=HOST)
experiment = client.create_experiment(EXPERIMENT_NAME)

# Submit the compiled package as a new run inside the experiment.
run_name = pipeline_func.__name__ + ' run'
run_result = client.run_pipeline(
    experiment_id=experiment.id,
    job_name=run_name,
    pipeline_package_path=pipeline_filename,
    params=arguments,
)
print(run_result)

{'created_at': datetime.datetime(2019, 5, 10, 16, 38, 34, tzinfo=tzlocal()),
 'description': None,
 'error': None,
 'finished_at': None,
 'id': '0d88d39c-7342-11e9-bf64-42010a800073',
 'metrics': None,
 'name': 'clv_pipeline run',
 'pipeline_spec': {'parameters': [{'name': 'threshold-date',
                                   'value': '2011-08-08'},
                                  {'name': 'max-monetary', 'value': '15000'},
                                  {'name': 'features-table-name',
                                   'value': None},
                                  {'name': 'project-id',
                                   'value': 'sandbox-235500'},
                                  {'name': 'dataset-id', 'value': 'CLVDataset'},
                                  {'name': 'transactions-table-id',
                                   'value': 'sandbox-235500.CLVDataset.transactions'},
                                  {'name': 'predict-end',
                                   'valu