# Orchestrate BigQuery and AutoML Tables with Kubeflow Pipelines

In [6]:
import kfp
import kfp.dsl as dsl
import kfp.gcp as gcp
import kfp.notebook

from typing import NamedTuple

The Jupyter Docker magic is used to start a Kaniko job. The magic uses the default Kubernetes config.

## Create and run a pipeline

### Define a pipeline

In [9]:
from google.cloud import bigquery
import json

# Identifiers that are not referenced by the cells shown here.
# NOTE(review): presumably the BigQuery features table and the AutoML dataset
# name used by later pipeline steps -- confirm before relying on or removing them.
FEATURES_TABLE_ID = 'clv_features'
AUTOML_DATASET_NAME = 'CLVFeatures'

# Build the data-preprocessing op factory from the component definition in the
# local checkout (the path is relative to the notebook's working directory).
#
# Alternative: load the same component definition from GitHub instead:
#   data_preprocessing_op = kfp.components.load_component_from_url(
#       'https://raw.githubusercontent.com/jarokaz/CLVPipelines/master/components/prep_data/component.yaml')
data_preprocessing_op = kfp.components.load_component_from_file(
    '../components/prep_data/component.yaml'
)


# CLV training pipeline definition.
#
# The pipeline currently consists of a single step: the data-preprocessing
# component, which receives every pipeline parameter except
# ``automl_dataset_location``. That parameter is declared but not yet consumed
# by any step in this cell.
#
# Documentation is kept in comments (no docstring, no type annotations) so the
# function object that the KFP SDK introspects stays exactly as it was.
@dsl.pipeline(name='CLVTrainingPipeline', description='CLV Training Pipeline')
def clv_pipeline(
    project_id='',
    dataset_id='',
    transactions_table_fqn='',
    summaries_table_name='',
    features_table_name='',
    threshold_date='',
    predict_end='',
    max_monetary=15000,
    automl_dataset_location='us-central1',
):
    # Gather the preprocessing inputs first; keyword order matches the
    # original call.
    prep_inputs = dict(
        project_id=project_id,
        dataset_id=dataset_id,
        transactions_table_fqn=transactions_table_fqn,
        summaries_table_name=summaries_table_name,
        features_table_name=features_table_name,
        threshold_date=threshold_date,
        predict_end=predict_end,
        max_monetary=max_monetary,
    )

    # Instantiating the op inside the pipeline function registers the step.
    prep_step = data_preprocessing_op(**prep_inputs)
  

    


### Compile a pipeline

In [10]:
# Compile the pipeline function into a package named after the function
# (function name + '.tar.gz'). Both names below are reused by the submit cell.
pipeline_func = clv_pipeline
pipeline_filename = ''.join((pipeline_func.__name__, '.tar.gz'))

kfp.compiler.Compiler().compile(
    pipeline_func,
    pipeline_filename,
)

### Submit the pipeline for execution

In [11]:
# Endpoint of the ML Pipeline API (reached through a local proxy on port 8082)
# and the experiment that the run is filed under.
HOST = 'http://localhost:8082/api/v1/namespaces/kubeflow/services/ml-pipeline:8888/proxy'
EXPERIMENT_NAME = 'CLV_TRAINING'

# Values for the pipeline parameters; keys match the `clv_pipeline` argument
# names. `automl_dataset_location` is not listed here, so the run keeps the
# pipeline's default for it.
arguments = {
    'project_id': 'sandbox-235500',
    'dataset_id': 'CLVDataset',
    'transactions_table_fqn': 'sandbox-235500.CLVDataset.transactions',
    'summaries_table_name': 'summaries',
    'features_table_name': 'features',
    'threshold_date': '2011-08-08',
    'predict_end': '2011-12-12',
    'max_monetary': '15000',
}

# Connect to the pipeline service and create the experiment.
client = kfp.Client(HOST)
experiment = client.create_experiment(EXPERIMENT_NAME)

# Submit the compiled package as a run named after the pipeline function,
# then print the run details returned by the service.
run_name = pipeline_func.__name__ + ' run'
run_result = client.run_pipeline(
    experiment.id,
    run_name,
    pipeline_filename,
    arguments,
)
print(run_result)

{'created_at': datetime.datetime(2019, 5, 10, 5, 16, 27, tzinfo=tzlocal()),
 'description': None,
 'error': None,
 'finished_at': None,
 'id': 'c350f5fd-72e2-11e9-bf64-42010a800073',
 'metrics': None,
 'name': 'clv_pipeline run',
 'pipeline_spec': {'parameters': [{'name': 'threshold-date',
                                   'value': '2011-08-08'},
                                  {'name': 'max-monetary', 'value': '15000'},
                                  {'name': 'transactions-table-fqn',
                                   'value': 'sandbox-235500.CLVDataset.transactions'},
                                  {'name': 'summaries-table-name',
                                   'value': 'summaries'},
                                  {'name': 'features-table-name',
                                   'value': 'features'},
                                  {'name': 'project-id',
                                   'value': 'sandbox-235500'},
                                  {'name': 'data