# Orchestrate BigQuery and AutoML Tables with Kubeflow Pipelines

In [1]:
import kfp
import kfp.dsl as dsl
import kfp.gcp as gcp
import kfp.notebook

## Create a base image to be used by lightweight components
The image is built by Kaniko running on Kubernetes. It contains the libraries required to interface with the BigQuery, Cloud Storage, and AutoML Tables services.

In [38]:
# Project name that is embedded in the base image path
PROJECT_NAME = 'sandbox-235500'
# Staging directory handed to Kaniko for the image build
STAGING_DIR = 'gs://jksandbox/pipelinestest/out'
# Full name (registry path and tag) of the base image
EF_IMAGE = 'gcr.io/{}/automltablesbase:dev'.format(PROJECT_NAME)

The `%%docker` Jupyter cell magic is used to start a Kaniko job. The magic uses the default Kubernetes config.

In [None]:
%%docker {EF_IMAGE} {STAGING_DIR}
# Dockerfile for the base image; the magic line above receives the target
# image name (EF_IMAGE) and the staging directory (STAGING_DIR).
# The base tag `latest-py3` is not pinned to a specific TensorFlow version.
FROM tensorflow/tensorflow:latest-py3
# Install (or upgrade to the newest available) extra Python packages.
# NOTE(review): no BigQuery client library is installed here although the
# notebook text mentions BigQuery - confirm whether one is needed.
RUN pip3 install --upgrade pandas
RUN pip3 install --upgrade google-cloud-storage
RUN pip3 install --upgrade google-cloud-automl

### Create Python lightweight components

In [45]:
from typing import NamedTuple
def automl_create_dataset(
    project_id: str,
    dataset_id: str,
    table_id: str) -> NamedTuple('DatasetInfo',
                               [('name', str),
                                ('total_rows', int),
                                ('total_columns', int)]):
    """Placeholder for the AutoML dataset-creation step.

    The current implementation only echoes its inputs and returns fixed
    values; it does not call any external service.

    Args:
        project_id: Project identifier; printed only.
        dataset_id: Dataset identifier; printed only.
        table_id: Table identifier; printed only.

    Returns:
        A ``DatasetInfo`` namedtuple ``(name, total_rows, total_columns)``,
        currently always ``('test', 1000, 10)``. The field names match the
        declared return annotation.
    """
    # Kept as a function-local import: this function is converted with
    # func_to_container_op elsewhere in the notebook, and lightweight
    # components need their imports inside the function body.
    from collections import namedtuple

    print("Project ID:", project_id)
    print("Dataset ID:", dataset_id)
    print("Table ID:", table_id)

    # Field names must agree with the NamedTuple in the return annotation.
    dataset_info = namedtuple('DatasetInfo', ['name', 'total_rows', 'total_columns'])
    return dataset_info('test', 1000, 10)
    

In [46]:
# Project used as the pipeline's default project_id and in the local test call
PROJECT_ID = 'sandbox-235500'
# Default SQL for the pipeline's query parameter: 10 rows from a public table
QUERY = (
    'SELECT * FROM `bigquery-public-data.stackoverflow.posts_questions` '
    'LIMIT 10'
)

In [47]:
# Call the plain Python function directly in the notebook kernel to check
# its output before it is wrapped as a pipeline op.
result = automl_create_dataset(
    project_id=PROJECT_ID,
    dataset_id='kfp_tmp',
    table_id='test',
)
print(result)

Project ID: sandbox-235500
Dataset ID: kfp_tmp
Table ID: test
DatasetInfo(name='test', total_rows=1000, total_colums=10)


In [34]:
# Wrap the Python function as a container op factory. The base image is passed
# explicitly so this op is built the same way as the one created inside
# `clv_pipeline`, instead of falling back to the SDK's default image.
AutomlCreateDatasetOp = kfp.components.func_to_container_op(
    automl_create_dataset, base_image=EF_IMAGE)

## Create and run a pipeline

### Define a pipeline

In [51]:
@dsl.pipeline(
    name='CLVPipeline',
    description='CLV Pipeline'
)
def clv_pipeline(
    query=QUERY,
    project_id=PROJECT_ID,
    dataset_id='',
    table_id='',
    dataset_location='US',
    job_config=''
):
    """Define a two-step pipeline: a BigQuery query, then dataset creation.

    Args:
        query: SQL text passed to the BigQuery query component.
        project_id: Project passed to both steps.
        dataset_id: Dataset identifier passed to both steps.
        table_id: Table identifier passed to both steps.
        dataset_location: Location passed to the BigQuery query component.
        job_config: Job configuration forwarded to the BigQuery query
            component; the default ``''`` keeps the previous behavior.
    """
    # BigQuery query component, loaded from a URL pinned to a specific commit.
    BqQueryOp = kfp.components.load_component_from_url(
        'https://raw.githubusercontent.com/kubeflow/pipelines/e8524eefb138725fc06600d1956da0f4dd477178/components/gcp/bigquery/query/component.yaml')

    # Lightweight component built from the Python function, run on the base image.
    AutomlCreateDatasetOp = kfp.components.func_to_container_op(
        automl_create_dataset, base_image=EF_IMAGE)

    # Forward the pipeline's job_config parameter instead of a literal '' so
    # that a value supplied at run time actually reaches the query step.
    query_op = BqQueryOp(
        query=query,
        project_id=project_id,
        dataset_id=dataset_id,
        table_id=table_id,
        output_gcs_path='',
        dataset_location=dataset_location,
        job_config=job_config).apply(gcp.use_gcp_secret('user-gcp-sa'))

    cd_op = AutomlCreateDatasetOp(
        project_id=project_id,
        dataset_id=dataset_id,
        table_id=table_id).apply(gcp.use_gcp_secret('user-gcp-sa'))

    # cd_op consumes no output of query_op, so the ordering is declared explicitly.
    cd_op.after(query_op)
    
    

### Compile a pipeline

In [52]:
import kfp.compiler as compiler

# The compiled package is named after the pipeline function.
pipeline_func = clv_pipeline
pipeline_filename = '{}.zip'.format(pipeline_func.__name__)

compiler.Compiler().compile(
    pipeline_func=pipeline_func,
    package_path=pipeline_filename,
)

### Submit the pipeline for execution

In [53]:
# Pipeline API endpoint and the experiment that groups the runs
HOST = 'http://localhost:8082/api/v1/namespaces/kubeflow/services/ml-pipeline:8888/proxy'
EXPERIMENT_NAME = 'CLV Pipeline Runs'

# Pipeline argument values; none are supplied for this run
arguments = {}

client = kfp.Client(HOST)
experiment = client.create_experiment(EXPERIMENT_NAME)

# Submit a pipeline run named after the pipeline function
run_name = '{} run'.format(pipeline_func.__name__)
run_result = client.run_pipeline(
    experiment_id=experiment.id,
    job_name=run_name,
    pipeline_package_path=pipeline_filename,
    params=arguments,
)
print(run_result)

{'created_at': datetime.datetime(2019, 4, 27, 23, 36, 42, tzinfo=tzlocal()),
 'description': None,
 'error': None,
 'finished_at': None,
 'id': '4fadd2f0-6945-11e9-b783-42010a800156',
 'metrics': None,
 'name': 'clv_pipeline run',
 'pipeline_spec': {'parameters': None,
                   'pipeline_id': None,
                   'pipeline_manifest': None,
                   'workflow_manifest': '{"apiVersion": '
                                        '"argoproj.io/v1alpha1", "spec": '
                                        '{"templates": [{"inputs": '
                                        '{"parameters": [{"name": '
                                        '"dataset-id"}, {"name": '
                                        '"project-id"}, {"name": '
                                        '"table-id"}]}, "container": {"args": '
                                        '["{{inputs.parameters.project-id}}", '
                                        '"{{inputs.parameters.dataset-id}}", '
 

In [28]:
# Display the base image name that is passed as base_image to the component
EF_IMAGE

'gcr.io/sandbox-235500/automltablesbase:dev'