# Orchestrate BigQuery and AutoML tables with Kubeflow pipelines

In [1]:
import kfp
import kfp.dsl as dsl
import kfp.gcp as gcp
import kfp.notebook


## Create a base image to be used by lightweight components
The image is built by a Kaniko job running on Kubernetes. It contains the libraries required to interface with the BigQuery, Cloud Storage and AutoML Tables services.

In [2]:
# GCP project that hosts the container images
PROJECT_NAME = 'sandbox-235500'
# GCS location Kaniko uses as its staging directory
STAGING_DIR = 'gs://jksandbox/staging'

# Image shared by the lightweight components, and the image built on top of it
BASE_IMAGE = 'gcr.io/%s/automltablesbase:dev' % (PROJECT_NAME,)
TARGET_IMAGE = 'gcr.io/%s/automltablescreate:dev' % (PROJECT_NAME,)

Jupyter docker magic is used to start a Kaniko job. The magic uses a default Kubernetes config.

In [None]:
%%docker {BASE_IMAGE} {STAGING_DIR}
# Base image for the lightweight components: TensorFlow (Python 3) plus the
# pandas, Cloud Storage, AutoML and BigQuery Python packages installed below.
# NOTE(review): the `latest-py3` tag is not pinned, so rebuilds may pick up a different TensorFlow version.
FROM tensorflow/tensorflow:latest-py3
# Each package is installed/upgraded in its own layer.
RUN pip3 install --upgrade pandas
RUN pip3 install --upgrade google-cloud-storage
RUN pip3 install --upgrade google-cloud-automl
RUN pip3 install --upgrade google-cloud-bigquery

### Create python lightweight components

#### Load data to BigQuery

In [3]:
from typing import NamedTuple

@dsl.python_component(
    name='automl_create_dataset',
    description='AutoML create dataset',
    base_image=BASE_IMAGE
)
def automl_import_dataset(
    project_id: str,
    dataset_id: str,
    table_id: str) -> NamedTuple('DatasetInfo',
                               [('name', str),
                                ('total_rows', int),
                                ('total_columns', int)]):
    """Placeholder component for creating an AutoML dataset.

    The current body only prints its inputs and returns fixed dummy values;
    it does not call any AutoML or BigQuery API.

    Args:
        project_id: Project identifier (printed only).
        dataset_id: Dataset identifier (printed only).
        table_id: Table identifier (printed only).

    Returns:
        A ``DatasetInfo`` namedtuple ``(name, total_rows, total_columns)``,
        currently always ``('test', 1000, 10)``.
    """
    # Imported inside the body so the function stays self-contained when it is
    # converted into a container op.
    from collections import namedtuple

    print("Project ID:", project_id)
    print("Dataset ID:", dataset_id)
    print("Table ID:", table_id)

    # Field names match the NamedTuple declared in the return annotation.
    result = namedtuple('DatasetInfo', ['name', 'total_rows', 'total_columns'])
    return result('test', 1000, 10)
    

In [4]:
# Small sample query against a public BigQuery table.
QUERY = (
    'SELECT * FROM `bigquery-public-data.stackoverflow.posts_questions` LIMIT 10'
)


In [8]:
# Turn the lightweight Python function into a pipeline op and write its
# component spec to comp.yaml. The function defined in this notebook is
# `automl_import_dataset`; no `automl_create_dataset` exists.
AutomlCreateDatasetOp = kfp.components.func_to_container_op(
    automl_import_dataset, output_component_file='comp.yaml')

In [6]:
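# The return value (here "AutomlCreateDatasetOp") represents a step that can be used directly in a pipeline function.
# The call is left commented out: it raised the "Output type not supported" exception recorded in this cell's output.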
#AutomlCreateDatasetOp = kfp.compiler.build_python_component(
#    component_func=automl_create_dataset,
#    staging_gcs_path=STAGING_DIR,
#    target_image=TARGET_IMAGE)

2019-04-29 22:05:16:INFO:Build an image that is based on gcr.io/sandbox-235500/automltablesbase:dev and push the image to gcr.io/sandbox-235500/automltablescreate:dev
2019-04-29 22:05:16:INFO:Checking path: gs://jksandbox/staging...
2019-04-29 22:05:16:INFO:Generate entrypoint and serialization codes.


Exception: Output type not supported and supported types are [int, float, str, bool]

In [None]:
# BigQuery query component spec, pinned to a specific commit of kubeflow/pipelines.
_BQ_QUERY_COMPONENT_URL = 'https://raw.githubusercontent.com/kubeflow/pipelines/e8524eefb138725fc06600d1956da0f4dd477178/components/gcp/bigquery/query/component.yaml'

BqQueryOp = kfp.components.load_component_from_url(_BQ_QUERY_COMPONENT_URL)


## Create and run a pipeline

### Define a pipeline

In [None]:
from google.cloud import bigquery
import json

@dsl.pipeline(
    name='CLVPipeline',
    description='CLV Pipeline'
)
def clv_pipeline(
    project_id='',
    dataset_id='',
    table_id='',
    clean_data_query='',
    engineer_features_query='',
    target='',
    feature_list='',
    dataset_location='US'
):
    """Two-step BigQuery pipeline: clean the data, then engineer features.

    Both steps use the BigQuery query component and run with the
    'user-gcp-sa' secret; the feature-engineering step is ordered after the
    cleaning step.

    Args:
        project_id: Project passed to both BigQuery steps.
        dataset_id: Dataset passed to both BigQuery steps.
        table_id: Table passed to both BigQuery steps.
        clean_data_query: SQL text for the cleaning step. Defaults to an
            empty string, so it must be supplied when the run is submitted.
        engineer_features_query: SQL text for the feature-engineering step.
            Defaults to an empty string, so it must be supplied when the run
            is submitted.
        target: Currently unused.
        feature_list: Currently unused.
        dataset_location: Dataset location passed to both BigQuery steps.
    """
    BqQueryOp = kfp.components.load_component_from_url(
        'https://raw.githubusercontent.com/kubeflow/pipelines/e8524eefb138725fc06600d1956da0f4dd477178/components/gcp/bigquery/query/component.yaml')

    def run_query(query):
        # Both steps share every setting except the query text.
        return BqQueryOp(
            query=query,
            project_id=project_id,
            dataset_id=dataset_id,
            table_id=table_id,
            output_gcs_path='',
            dataset_location=dataset_location,
            job_config='').apply(gcp.use_gcp_secret('user-gcp-sa'))

    clean_data_op = run_query(clean_data_query)

    # Uses the feature-engineering query (previously the cleaning query was passed here again).
    engineer_features_op = run_query(engineer_features_query)

    engineer_features_op.after(clean_data_op)
    
    
    
    

### Compile a pipeline

In [None]:


# Compile the pipeline into a package named after the pipeline function.
pipeline_func = clv_pipeline
pipeline_filename = ''.join([pipeline_func.__name__, '.tar.gz'])

kfp.compiler.Compiler().compile(
    pipeline_func,
    pipeline_filename,
)

### Submit the pipeline for execution

In [None]:
# BigQuery SQL for the data-cleaning step. It aggregates
# `sandbox-235500.CLVDataset.data_source` per customer and order date, keeps
# customers with more than one positive order value before the threshold date,
# and filters on recency and consistent returns.
# NOTE(review): the text contains the placeholders <<threshold_date>> and
# <<predict_date>>; no substitution is visible in this notebook -- confirm they
# are filled in before the query is submitted.
CLEAN_DATA_QUERY = '''
SELECT
  customer_id,
  order_date,
  order_value,
  order_qty_articles
FROM
(
  SELECT
    CustomerID AS customer_id,
    PARSE_DATE("%m/%d/%y", SUBSTR(InvoiceDate, 0, 8)) AS order_date,
    ROUND(SUM(UnitPrice * Quantity), 2) AS order_value,
    SUM(Quantity) AS order_qty_articles,
    (
      SELECT
        MAX(PARSE_DATE("%m/%d/%y", SUBSTR(InvoiceDate, 0, 8)))
      FROM
        `sandbox-235500.CLVDataset.data_source` tl
      WHERE
        tl.CustomerID = t.CustomerID
    ) latest_order
  FROM
    `sandbox-235500.CLVDataset.data_source` t
  GROUP BY
      CustomerID,
      order_date
) a

INNER JOIN (
  -- Only customers with more than one positive order values before threshold.
  SELECT
    CustomerID
  FROM (
    -- Customers and how many positive order values  before threshold.
    SELECT
      CustomerID,
      SUM(positive_value) cnt_positive_value
    FROM (
      -- Customer with whether order was positive or not at each date.
      SELECT
        CustomerID,
        (
          CASE
            WHEN SUM(UnitPrice * Quantity) > 0 THEN 1
            ELSE 0
          END ) positive_value
      FROM
        `sandbox-235500.CLVDataset.data_source`
      WHERE
        PARSE_DATE("%m/%d/%y", SUBSTR(InvoiceDate, 0, 8)) < DATE("<<threshold_date>>")
      GROUP BY
        CustomerID,
        SUBSTR(InvoiceDate, 0, 8) )
    GROUP BY
      CustomerID )
  WHERE
    cnt_positive_value > 1
  ) b
ON
  a.customer_id = b. CustomerID
--[START common_clean]
WHERE
  -- Bought in the past 3 months
  DATE_DIFF(DATE("<<predict_date>>"), latest_order, DAY) <= 90
  -- Make sure returns are consistent.
  AND (
    (order_qty_articles > 0 and order_Value > 0) OR
    (order_qty_articles < 0 and order_Value < 0)
  )
'''


In [None]:
# TODO: stub only -- the feature-engineering SQL has not been written yet.
ENGINEER_FEATURES_QUERY = '\nSELECT\n'



In [None]:
# Specify pipeline argument values
DATASET_ID = 'mykfpdataset'

# The SQL constants defined in this section are passed explicitly; without
# them the pipeline steps receive no query text from this run.
arguments = {
    'project_id': PROJECT_NAME,
    'dataset_id': DATASET_ID,
    'clean_data_query': CLEAN_DATA_QUERY,
    'engineer_features_query': ENGINEER_FEATURES_QUERY
}

# Pipelines API endpoint, here reached through a local proxy on port 8082.
HOST = 'http://localhost:8082/api/v1/namespaces/kubeflow/services/ml-pipeline:8888/proxy'
EXPERIMENT_NAME = 'CLV'

client = kfp.Client(HOST)
experiment = client.create_experiment(EXPERIMENT_NAME)

# Submit a pipeline run
run_name = pipeline_func.__name__ + ' run'
run_result = client.run_pipeline(experiment.id, run_name, pipeline_filename, arguments)
print(run_result)