# Orchestrating BigQuery and AutoML Tables using Kubeflow Pipelines.
This notebook demonstrates how to implement and execute a Kubeflow pipeline that uses BigQuery for data pre-processing/feature engineering and AutoML Tables for model training.

## Defining a lightweight Python component to parametrize a BigQuery SQL query.
The pipeline utilizes the [**Submitting a query using BigQuery**](https://aihub.cloud.google.com/p/products%2F4700cd7e-2826-4ce9-a1ad-33f4a5bf7433) component from **AI Hub**. The component requires a BigQuery SQL query as one of its inputs. To avoid hardcoding table names and other query parameters, a KFP lightweight Python component is defined. The component dynamically generates the SQL query by substituting placeholders in a query template with the values passed as input parameters.


In [8]:
BASE_IMAGE = 'mirror.gcr.io/google/python'


@kfp.dsl.python_component(name='Prepare feature engineering query', base_image=BASE_IMAGE,target_component_file='clean_op.yaml')
def prepare_feature_engineering_query(
    project_id: str,
    source_table_id: str,
    threshold_date: str,
    predict_end: str,
    max_monetary: str,
    query_template_uri: str) -> str:
    """Build the feature engineering SQL query from a template stored in GCS.

    The template is downloaded from ``query_template_uri`` and rendered with
    ``str.format``. It may use the placeholders ``{data_source_id}``,
    ``{threshold_date}``, ``{predict_end}`` and ``{max_monetary}``; because
    ``str.format`` is used, any literal brace in the template must be doubled.

    Args:
        project_id: Project passed to ``storage.Client`` when reading the template.
        source_table_id: Value substituted for ``{data_source_id}``.
        threshold_date: Value substituted for ``{threshold_date}``.
        predict_end: Value substituted for ``{predict_end}``.
        max_monetary: Value substituted for ``{max_monetary}``.
        query_template_uri: Template location, in the form
            ``gs://<bucket>/<object path>``.

    Returns:
        The rendered SQL query text.

    Raises:
        ValueError: If ``query_template_uri`` is not a ``gs://`` URI with both
            a bucket and an object path.
    """

    # Kept function-local: the function is turned into a lightweight
    # component, so it should not rely on names from the enclosing notebook.
    from google.cloud import storage

    # Parse the URI explicitly. The previous regex split never checked the
    # scheme, so a URI such as "bucket/a/b" was silently mis-split.
    gcs_scheme = 'gs://'
    if not query_template_uri.startswith(gcs_scheme):
        raise ValueError(
            'query_template_uri must start with "gs://", got: {!r}'.format(
                query_template_uri))
    bucket_name, _, blob_name = query_template_uri[len(gcs_scheme):].partition('/')
    if not bucket_name or not blob_name:
        raise ValueError(
            'query_template_uri must look like gs://<bucket>/<object>, '
            'got: {!r}'.format(query_template_uri))

    # Read the query template from GCS
    blob = storage.Client(project_id).get_bucket(bucket_name).blob(blob_name)
    query_template = blob.download_as_string().decode('utf-8')

    # Substitute placeholders in the query template
    return query_template.format(
        data_source_id=source_table_id,
        threshold_date=threshold_date,
        predict_end=predict_end,
        max_monetary=max_monetary
    )

## Building the pipeline

In [10]:
import kfp

# GCS location of the SQL query template; passed to the query-preparation
# step of clv_pipeline as `query_template_uri`.
QUERY_TEMPLATE_URI = 'gs://sandbox-235500/sql-templates/create_features_template.sql'
# BigQuery query component spec, pinned to a specific kubeflow/pipelines commit.
BIGQUERY_COMPONENT_SPEC_URI = 'https://raw.githubusercontent.com/kubeflow/pipelines/3b938d664de35db9401c6d198439394a9fca95fa/components/gcp/bigquery/query/component.yaml'
# AutoML Tables component specs (dataset import and model training).
# NOTE(review): these point at the `master` branch rather than a pinned commit,
# so the specs can change underneath this notebook — consider pinning them.
AML_IMPORT_DATASET_SPEC_URI = 'https://raw.githubusercontent.com/jarokaz/CLVPipelines/master/components/automl_tables/aml-import-dataset.yaml'
AML_TRAIN_MODEL_SPEC_URI = 'https://raw.githubusercontent.com/jarokaz/CLVPipelines/master/components/automl_tables/aml-train-model.yaml'

@kfp.dsl.pipeline(
    name='CLVTrainingPipeline',
    description='CLV Training Pipeline'
)
def clv_pipeline(
    project_id='',
    source_table_id='',
    features_dataset_id='',
    features_table_id='',
    features_dataset_location='US',
    threshold_date='',
    predict_end='',
    max_monetary=15000,
    automl_compute_region='us-central1',
    automl_dataset_name='clv_features',
    model_name='clv_regression',
    train_budget='1000',
    target_name='target_monetary',
    features_to_exclude='monetary'
):
    """Define the CLV training pipeline.

    Steps, in execution order:
      1. Render the feature engineering SQL query from the template at
         QUERY_TEMPLATE_URI.
      2. Run the query with the BigQuery component, targeting
         ``features_dataset_id.features_table_id``.
      3. Import ``bq://<project>.<dataset>.<table>`` into an AutoML Tables
         dataset (explicitly ordered after step 2).
      4. Train an AutoML Tables model on the imported dataset.

    Args:
        project_id: GCP project passed to every step.
        source_table_id: Source table substituted into the query template.
        features_dataset_id: BigQuery dataset passed to the query step and
            used in the import URI.
        features_table_id: BigQuery table passed to the query step and used
            in the import URI.
        features_dataset_location: ``dataset_location`` of the query step.
        threshold_date: Substituted into the query template.
        predict_end: Substituted into the query template.
        max_monetary: Substituted into the query template.
        automl_compute_region: ``location`` of the import and training steps.
        automl_dataset_name: ``dataset_name`` of the import step.
        model_name: ``model_name`` of the training step.
        train_budget: ``train_budget`` of the training step.
        target_name: Target column, passed to both the import and the
            training step.
        features_to_exclude: ``features_to_exclude`` of the training step.
    """
    # `gcp` is not imported in the visible notebook cells; importing it here
    # keeps the pipeline definition self-sufficient (harmless if an earlier
    # cell already imported it).
    from kfp import gcp

    # Create component factories
    prepare_feature_engineering_query_op = kfp.components.func_to_container_op(prepare_feature_engineering_query)
    engineer_features_op = kfp.components.load_component_from_url(BIGQUERY_COMPONENT_SPEC_URI)
    # NOTE(review): the TypeError recorded in this cell's output ("references
    # non-existing input") looks like it is raised while loading a remote
    # component spec, i.e. it would need fixing in the YAML — confirm.
    import_dataset_op = kfp.components.load_component_from_url(AML_IMPORT_DATASET_SPEC_URI)
    train_model_op = kfp.components.load_component_from_url(AML_TRAIN_MODEL_SPEC_URI)

    # Define the training pipeline
    prepare_feature_engineering_query_task = prepare_feature_engineering_query_op(
        project_id=project_id,
        source_table_id=source_table_id,
        threshold_date=threshold_date,
        predict_end=predict_end,
        max_monetary=max_monetary,
        query_template_uri=QUERY_TEMPLATE_URI
    ).apply(gcp.use_gcp_secret('user-gcp-sa'))

    engineer_features_task = engineer_features_op(
        query=prepare_feature_engineering_query_task.output,
        project_id=project_id,
        dataset_id=features_dataset_id,
        table_id=features_table_id,
        output_gcs_path='',
        dataset_location=features_dataset_location,
        job_config=''
    ).apply(gcp.use_gcp_secret('user-gcp-sa'))

    import_dataset_task = import_dataset_op(
        project_id=project_id,
        location=automl_compute_region,
        dataset_name=automl_dataset_name,
        source_data_uri='bq://{}.{}.{}'.format(project_id, features_dataset_id, features_table_id),
        # Was `target_column_name=target_column_name`, an undefined name
        # (NameError); the pipeline parameter is `target_name`.
        target_column_name=target_name,
        weight_column_name='',
        ml_use_column_name=''
    ).apply(gcp.use_gcp_secret('user-gcp-sa'))
    # The import step consumes no output of the query step, so the ordering
    # has to be declared explicitly.
    import_dataset_task.after(engineer_features_task)

    # Honour the pipeline parameters; these four values were previously
    # hard-coded here, so the declared parameters had no effect.
    train_model_task = train_model_op(
        project_id=project_id,
        location=automl_compute_region,
        dataset_id=import_dataset_task.outputs['output_dataset_id'],
        model_name=model_name,
        train_budget=train_budget,
        optimization_objective='MINIMIZE_MAE',
        target_name=target_name,
        features_to_exclude=features_to_exclude
    ).apply(gcp.use_gcp_secret('user-gcp-sa'))

# Compile the pipeline into an archive named after the pipeline function.
pipeline_func = clv_pipeline
pipeline_filename = ''.join((pipeline_func.__name__, '.tar.gz'))

kfp.compiler.Compiler().compile(
    pipeline_func,
    pipeline_filename,
)


TypeError: Argument "InputValuePlaceholder(input_name='target_column_name')" references non-existing input.