# Orchestrating AutoML Tables training workflow

In [4]:
import kfp


## Prepare lab environment


### Create a BigQuery dataset

In [26]:
_project_id = 'jk-caip'
_dataset_name = 'lab_301'

!bq --location=US --project_id=$_project_id mk --dataset $_dataset_name

BigQuery error in mk operation: Dataset 'jk-caip:lab_301' already exists.


### Load sales transaction data into BigQuery

In [75]:
_source_file = '../datasets/clv/transactions.csv'
_schema = 'customer_id:STRING,order_date:DATE,quantity:INTEGER,unit_price:FLOAT'
_table_name = 'transactions'

# Load data
!bq --project_id=$_project_id --dataset_id=$_dataset_name load \
--source_format=CSV \
--skip_leading_rows=1 \
--replace \
$_table_name \
../datasets/clv/transactions.csv \
$_schema

Upload complete.
Waiting on bqjob_r7200178f755c53d1_0000016e48677673_1 ... (3s) Current status: DONE   


## Create the KFP training pipeline

### Create component factories for the pre-defined GCP components

In [10]:
# Component specs are resolved against a pinned KFP release so that the
# pipeline definition does not change when upstream components are updated.
_url_search_prefix = 'https://raw.githubusercontent.com/kubeflow/pipelines/0.1.33/components/gcp/'

component_store = kfp.components.ComponentStore(
    local_search_paths=None,
    url_search_prefixes=[_url_search_prefix])

# Factories for the pre-defined GCP components (paths are relative to the prefix above).
automl_create_dataset_op = component_store.load_component('automl/create_dataset_for_tables')
automl_import_data_from_bq_op = component_store.load_component('automl/import_data_from_bigquery')
automl_create_model_op = component_store.load_component('automl/create_model_for_tables')
# Backward-compatible alias for the original, misspelled (double underscore) name.
automl_create_model__op = automl_create_model_op
automl_split_dataset_table_column_names_op = component_store.load_component('automl/split_dataset_table_column_names')
bq_query_op = component_store.load_component('bigquery/query')

### Create custom components

### Define a feature engineering query

In [76]:
# Template for the BigQuery feature-engineering query. The `{{ ... }}`
# placeholders are: data_source_id (fully qualified transactions table),
# threshold_date (orders on or before it feed the features, orders after it
# feed the target), predict_end (customers must have ordered within 90 days
# before it), and max_monetary (upper bound on the `monetary` feature).
# The query returns one row per customer_id with the columns monetary,
# frequency, recency, T, time_between, avg_basket_value, avg_basket_size,
# cnt_returns and target_monetary.
_query_template = '''
WITH
  order_summaries as (
    SELECT
      a.customer_id,
      a.order_date,
      a.order_value,
      a.order_qty_articles
    FROM
    (
      SELECT
        customer_id,
        order_date,
        ROUND(SUM(unit_price * quantity), 2) AS order_value,
        SUM(quantity) AS order_qty_articles,
        (
          SELECT
            MAX(order_date)
          FROM
            `{{ data_source_id }}` tl
          WHERE
            tl.customer_id = t.customer_id
        ) latest_order
      FROM
        `{{ data_source_id }}` t
      GROUP BY
          customer_id,
          order_date
    ) a

    INNER JOIN (
      -- Only customers with more than one positive order values before threshold.
      SELECT
        customer_id
      FROM (
        -- Customers and how many positive order values  before threshold.
        SELECT
          customer_id,
          SUM(positive_value) cnt_positive_value
        FROM (
          -- Customer with whether order was positive or not at each date.
          SELECT
            customer_id,
            (
              CASE
                WHEN SUM(unit_price * quantity) > 0 THEN 1
                ELSE 0
              END ) positive_value
          FROM
            `{{ data_source_id }}`
          WHERE
            order_date < DATE("{{ threshold_date }}")
          GROUP BY
            customer_id,
            order_date)
        GROUP BY
          customer_id )
      WHERE
        cnt_positive_value > 1
      ) b
    ON
      a.customer_id = b.customer_id
    --[START common_clean]
    WHERE
      -- Bought in the past 3 months
      DATE_DIFF(DATE("{{ predict_end }}"), latest_order, DAY) <= 90
      -- Make sure returns are consistent.
      AND (
        (order_qty_articles > 0 and order_Value > 0) OR
        (order_qty_articles < 0 and order_Value < 0)
      ))
          
SELECT
  tf.customer_id,
  -- For training period
  -- Copying the calculations from Lifetimes where first orders are ignored
  -- See https://github.com/CamDavidsonPilon/lifetimes/blob/master/lifetimes/utils.py#L246
--[START features_target]
  ROUND(tf.monetary, 2) as monetary,
  tf.cnt_orders AS frequency,
  tf.recency,
  tf.T,
  ROUND(tf.recency/cnt_orders, 2) AS time_between,
  ROUND(tf.avg_basket_value, 2) AS avg_basket_value,
  ROUND(tf.avg_basket_size, 2) AS avg_basket_size,
  tf.cnt_returns,
  -- Target calculated for overall period
  ROUND(tt.target_monetary, 2) as target_monetary
--[END features_target]
FROM
  -- This SELECT uses only data before threshold to make features.
  (
    SELECT
      customer_id,
      SUM(order_value) AS monetary,
      DATE_DIFF(MAX(order_date), MIN(order_date), DAY) AS recency,
      DATE_DIFF(DATE('{{ threshold_date }}'), MIN(order_date), DAY) AS T,
      COUNT(DISTINCT order_date) AS cnt_orders,
      AVG(order_qty_articles) avg_basket_size,
      AVG(order_value) avg_basket_value,
      SUM(CASE
          WHEN order_value < 1 THEN 1
          ELSE 0 END) AS cnt_returns
    FROM
      order_summaries a
    WHERE
      order_date <= DATE('{{ threshold_date }}')
    GROUP BY
      customer_id) tf,

  -- This SELECT uses data after threshold to calculate the target )
  (
    SELECT
      customer_id,
      SUM(order_value) target_monetary
    FROM
      order_summaries
      WHERE order_date > DATE('{{ threshold_date }}')
    GROUP BY
      customer_id) tt
WHERE
  tf.customer_id = tt.customer_id
  AND tf.monetary > 0
  AND tf.monetary <= {{ max_monetary }}
'''

In [77]:
from jinja2 import Template

# Render the feature-engineering SQL: substitute the fully qualified source
# table (project.dataset.table) and the date / monetary cut-offs.
_query = Template(_query_template).render(
    data_source_id='.'.join((_project_id, _dataset_name, _table_name)),
    threshold_date='2011-08-08',
    predict_end='2011-12-12',
    max_monetary=15000,
)

### Define the pipeline function

In [79]:
# Imported at the top of the cell (not inside the pipeline function) so the
# dependency is visible and resolved once, before the pipeline is defined.
from kfp.gcp import use_gcp_secret

# Default values for the pipeline parameters.
_features_table_name = 'features'

_aml_compute_region = 'us-central1'
_aml_dataset_name = 'clv_features'
_aml_model_name = 'clv_regression'
_target_column_name = 'target_monetary'
_train_budget = 1000
_optimization_objective = 'MINIMIZE_MAE'  # not referenced by the pipeline below yet
_primary_metric = 'mean_absolute_error'
_deployment_threshold = 900

@kfp.dsl.pipeline(
    name='CLV Training',
    description='CLV Training Pipeline using BigQuery for feature engineering and Automl Tables for model training'
)
def clv_train(
    project_id: str = _project_id,
    feature_engineering_query: str = _query,
    aml_compute_region: str = _aml_compute_region,
    features_table_name: str = _features_table_name,
    features_dataset: str = _dataset_name,
    aml_dataset_name: str = _aml_dataset_name,
    target_column_name: str = _target_column_name,
    aml_model_name: str = _aml_model_name,
    train_budget: 'Integer' = _train_budget,
    primary_metric: str = _primary_metric,
    deployment_threshold: 'Float' = _deployment_threshold
    ):
    """Define the CLV training pipeline.

    The pipeline currently contains a single step: the BigQuery query
    component, invoked with the feature engineering query, the project, and
    the features dataset and table names.

    The AutoML-related parameters (aml_compute_region, aml_dataset_name,
    target_column_name, aml_model_name, train_budget, primary_metric,
    deployment_threshold) are part of the pipeline interface but are not
    consumed by any step yet.
    """

    # Arguments are positional; presumably they map to the component's
    # query / project / dataset / table inputs - TODO confirm against the
    # bigquery/query component spec.
    engineer_features = bq_query_op(
        feature_engineering_query,
        project_id,
        features_dataset,
        features_table_name)

    # Register the 'user-gcp-sa' secret transformer on the pipeline configuration.
    kfp.dsl.get_pipeline_conf().add_op_transformer(use_gcp_secret('user-gcp-sa'))


### Compile the pipeline

In [80]:
# Compile the pipeline function into a local package file.
_pipeline_yaml = 'clv_training.yaml'
kfp.compiler.Compiler().compile(
    pipeline_func=clv_train,
    package_path=_pipeline_yaml)

### Upload the pipeline to KFP
Get GKE cluster credentials.

In [57]:
_cluster_name = 'mlops-cluster'
_cluster_zone = 'us-central1-a'

!gcloud config set project $_project_id
!gcloud container clusters get-credentials $_cluster_name --zone $_cluster_zone

Updated property [core/project].
Fetching cluster endpoint and auth data.
kubeconfig entry generated for mlops-cluster.


Use `kfp.Client()` to upload the pipeline.

In [82]:
# Register the compiled package with KFP under a fixed pipeline name.
_pipeline_name = 'clv_training_pipeline'
_client = kfp.Client()

_pipeline_ref = _client.upload_pipeline(
    pipeline_package_path=_pipeline_yaml,
    pipeline_name=_pipeline_name)

### Trigger a pipeline run

In [90]:
_experiment_name = 'CLV Training'
_run_name = 'Run 01'
_params = {}  # empty: the run uses the pipeline's default parameter values

# Reuse the experiment when it already exists, otherwise create it.
# The name has to be passed by keyword: the first positional parameter of
# get_experiment is the experiment ID, so a positional name is never matched.
try:
    _experiment_ref = _client.get_experiment(experiment_name=_experiment_name)
except Exception:
    # Lookup failed (e.g. no experiment with this name); fall back to creating it.
    # `Exception` rather than a bare `except:` so Ctrl-C / SystemExit still propagate.
    _experiment_ref = _client.create_experiment(_experiment_name)

# Start a run of the previously uploaded pipeline (referenced by ID, no local package).
_client.run_pipeline(
    _experiment_ref.id,
    _run_name,
    pipeline_package_path=None,
    params=_params,
    pipeline_id=_pipeline_ref.id)

{'created_at': datetime.datetime(2019, 11, 8, 0, 43, 42, tzinfo=tzlocal()),
 'description': None,
 'error': None,
 'finished_at': datetime.datetime(1970, 1, 1, 0, 0, tzinfo=tzlocal()),
 'id': '39ac1845-de0d-4133-b833-db6cf10fdfef',
 'metrics': None,
 'name': 'Run 01',
 'pipeline_spec': {'parameters': None,
                   'pipeline_id': '19d35fe8-4db2-4b4a-b2b1-2c047bf23e6f',
                   'pipeline_manifest': None,
                   'workflow_manifest': '{"kind":"Workflow","apiVersion":"argoproj.io/v1alpha1","metadata":{"generateName":"clv-training-","creationTimestamp":null,"annotations":{"pipelines.kubeflow.org/pipeline_spec":"{\\"description\\": '
                                        '\\"CLV Training Pipeline using '
                                        'BigQuery for feature engineering and '
                                        'Automl Tables for model training\\", '
                                        '\\"inputs\\": [{\\"default\\": '
                       

## Clean up

In [23]:
!bq --project_id=$PROJECT_ID rm -r -f -d $DATASET_NAME