# Orchestrate BigQuery and AutoML tables with Kubeflow pipelines

In [6]:
import kfp
from kfp import compiler
import kfp.dsl as dsl
import kfp.gcp as gcp
import kfp.notebook

## Create a base image to be used by components
The image is created by Kaniko Kubernetes service. The image contains the libraries required to interface with BigQuery, Storage and AutoML tables services.

In [7]:
# Configure a staging directory for Kaniko
STAGING_DIR = 'gs://jksandbox/pipelinestest/out'
PROJECT_NAME = 'sandbox-235500'
# Set the base image name
EF_IMAGE='gcr.io/%s/automltablesbase:dev' % PROJECT_NAME

Jupyter docker magic is used to start a Kaniko job. The magic uses a default Kubernetes config.

In [9]:
%%docker {EF_IMAGE} {STAGING_DIR}
FROM tensorflow/tensorflow:latest-py3
RUN pip3 install --upgrade pandas
RUN pip3 install --upgrade google-cloud-storage
RUN pip3 install --upgrade google-cloud-automl

2019-04-26 20:32:26:INFO:Checking path: gs://jksandbox/pipelinestest/out...
2019-04-26 20:32:26:INFO:Generate build files.
2019-04-26 20:32:27:INFO:Start a kaniko job for build.
2019-04-26 20:32:27:INFO:Found local kubernetes config. Initialized with kube_config.
2019-04-26 20:32:32:INFO:5 seconds: waiting for job to complete
2019-04-26 20:32:37:INFO:10 seconds: waiting for job to complete
2019-04-26 20:32:42:INFO:15 seconds: waiting for job to complete
2019-04-26 20:32:47:INFO:20 seconds: waiting for job to complete
2019-04-26 20:32:52:INFO:25 seconds: waiting for job to complete
2019-04-26 20:32:57:INFO:30 seconds: waiting for job to complete
2019-04-26 20:33:02:INFO:35 seconds: waiting for job to complete
2019-04-26 20:33:07:INFO:40 seconds: waiting for job to complete
2019-04-26 20:33:12:INFO:45 seconds: waiting for job to complete
2019-04-26 20:33:17:INFO:50 seconds: waiting for job to complete
2019-04-26 20:33:22:INFO:55 seconds: waiting for job to complete
2019-04-26 20:33:27:IN

In [18]:
from typing import NamedTuple
@dsl.python_component(
    name='BQ access',
    description='Run BQ query',
    base_image=EF_IMAGE)
def bquery(
    query: str,
    project_id: str,
    dataset_id: str,
    table_id: str,
    output: str) -> NamedTuple('BQResult', 
                               [('dest_table', str), 
                                ('total_rows', int),
                                ('total_bytes_processed', int),
                                ('schema', str)]):
    
    from google.cloud import bigquery
    
    client = bigquery.Client(project=project_id)
    
    job_config = bigquery.QueryJobConfig()
    table_ref = client.dataset(dataset_id).table(table_id)
    job_config.create_disposition = bigquery.job.CreateDisposition.CREATE_IF_NEEDED
    job_config.write_disposition = bigquery.job.WriteDisposition.WRITE_TRUNCATE
    job_config.destination = table_ref
    
    query_job = client.query(query, job_config=job_config)
    
    job_result = query_job.result()
    
    from collections import namedtuple
    result = namedtuple('BQResult', ['dest_table', 'total_rows', 'total_bytes_processed', 'schema'])
    return result(table_ref.path, job_result.total_rows, query_job.total_bytes_processed, [f.to_api_repr() for f in job_result.schema])
    

In [19]:
QUERY = 'SELECT * FROM `bigquery-public-data.stackoverflow.posts_questions` LIMIT 10'
PROJECT_ID = 'sandbox-235500'
GCS_WORKING_DIR = 'gs://jksandbox/sink' 
OUTPUT_PATH = '{}/bigquery/query/questions.csv'.format(GCS_WORKING_DIR)

In [20]:
result = bquery(QUERY, PROJECT_ID, dataset_id='', table_id='', output=OUTPUT_PATH)

BadRequest: 400 POST https://www.googleapis.com/bigquery/v2/projects/sandbox-235500/jobs: Invalid dataset ID "". Dataset IDs must be alphanumeric (plus underscores and dashes) and must be at most 1024 characters long.