In [9]:
import kfp
from kfp import compiler
import kfp.dsl as dsl
import kfp.gcp as gcp
import kfp.notebook

In [10]:
PROJECT_NAME = 'sandbox-235500'
BASE_IMAGE = 'gcr.io/{}/automltablesbase:dev'.format(PROJECT_NAME)
STAGING_GCS_PATH = 'gs://{}/staging'.format(PROJECT_NAME)

In [53]:
from google.cloud import storage

blob = storage.Client(PROJECT_NAME).get_bucket('sandbox-235500').blob('clv-sql-templates/creandbox-235500/clv-sql-templates/create_features.sql')


query_template = blob.download_as_string()

query_template

b'WITH\n  order_summaries as (\n    SELECT\n      a.customer_id,\n      a.order_date,\n      a.order_value,\n      a.order_qty_articles\n    FROM\n    (\n      SELECT\n        customer_id,\n        order_date,\n        ROUND(SUM(unit_price * quantity), 2) AS order_value,\n        SUM(quantity) AS order_qty_articles,\n        (\n          SELECT\n            MAX(order_date)\n          FROM\n            `{transactions_table_id}` tl\n          WHERE\n            tl.customer_id = t.customer_id\n        ) latest_order\n      FROM\n        `{transactions_table_id}` t\n      GROUP BY\n          customer_id,\n          order_date\n    ) a\n\n    INNER JOIN (\n      -- Only customers with more than one positive order values before threshold.\n      SELECT\n        customer_id\n      FROM (\n        -- Customers and how many positive order values  before threshold.\n        SELECT\n          customer_id,\n          SUM(positive_value) cnt_positive_value\n        FROM (\n          -- Customer wit

In [36]:


@kfp.dsl.python_component(name='Data preparation', base_image=BASE_IMAGE)
def prepare_features(
    project_id: str,
    data_source_id: str,
    threshold_date: str,
    predict_end: str,
    max_monetary: str,
    dest_dataset_id: str,
    dest_table_id: str,
    query_template_uri: str 
    ) -> str:
    
    import logging
    from google.cloud import bigquery
    from google.cloud import storage
    
    # Load query template
    blob = storage.Client(project_id).get_bucket('sandbox-235500').blob('clv-sql-templates/creandbox-235500/clv-sql-templates/create_features.sql')

    
    return

    client = bigquery.Client(project=project_id)
    
    # If table_id not passed create a unique table name
    if not table_id:
        table_id = 'output_{}'.format(uuid.uuid4().hex)
        
    # Configure BQ to write an output to a destination table
    job_config = bigquery.QueryJobConfig(
        destination=client.dataset(dataset_id).table(table_id),
        create_disposition=bigquery.job.CreateDisposition.CREATE_IF_NEEDED,
        write_disposition=bigquery.job.WriteDisposition.WRITE_TRUNCATE)
        
    # Execute the query and wait for the query to finish
    client.query(query, job_config).result()
    
    # Return the full ID of the destination table
    return "{}.{}.{}".format(
        project_id, 
        dataset_id, 
        table_id)

    


In [37]:
project_id = 'sandbox-235500'
data_source_id = 'sandbox-235500.CLVDataset.transactions'
dest_dataset_id = 'CLVDataset'
dest_table_id = 'test'
threshold_date = '2011-08-08'
predict_end = '2011-12-12'
max_monetary = '15000'
query_template_uri = 'gs://sandbox-235500/clv-sql-templates/create_features.sql'

prepare_features_result = prepare_features(
        project_id=project_id,
        data_source_id=data_source_id,
        threshold_date=threshold_date,
        predict_end=predict_end,
        max_monetary=max_monetary,
        dest_dataset_id=dest_dataset_id,
        dest_table_id=dest_table_id,
        query_template_uri=query_template_uri
        )

prepare_features_result

gs://sandbox-235500/clv-sql-templates/create_features.sql


In [None]:
compiler.build_python_component(
    component_func=prepare_features,
    staging_gcs_path=STAGING_GCS_PATH,
    target_component_file='component-prepare-features.yaml',
    target_image='gcr.io/{}/component-prepare-features:latest'.format(PROJECT_NAME))

In [33]:
QUERY_TEMPLATE_URI = 'gs://sandbox-235500/clv-sql-templates/create_features.sql'

@dsl.pipeline(
    name='CLVTrainingPipeline',
    description='CLV Training Pipeline'
)
def clv_pipeline(
    project_id='', 
    data_source_id='',
    dest_dataset_id='', 
    dest_table_id='',
    threshold_date='',
    predict_end='',
    max_monetary=15000,
    automl_dataset_location='us-central1'
):


    prepare_features_op = kfp.components.load_component('component-prepare-features.yaml')
    
    prepare_features_task = prepare_features_op(
        project_id=project_id,
        data_source_id=data_source_id,
        threshold_date=threshold_date,
        predict_end=predict_end,
        max_monetary=max_monetary,
        dest_dataset_id=dest_dataset_id,
        dest_table_id=dest_table_id,
        query_template_uri=QUERY_TEMPLATE_URI
        )
    
    

pipeline_func = clv_pipeline
pipeline_filename = pipeline_func.__name__ + '.tar.gz'

kfp.compiler.Compiler().compile(pipeline_func, pipeline_filename) 

In [34]:
#Specify pipeline argument values

query = """
    SELECT name, SUM(number) as total
    FROM `bigquery-public-data.usa_names.usa_1910_current`
    GROUP BY name
    ORDER BY total DESC
    LIMIT 10
"""

arguments = {
    'project_id': 'sandbox-235500',
    'data_source_id': 'sandbox-235500.CLVDataset.transactions',
    'dest_dataset_id': 'CLVDataset',
    'dest_table_id': 'test',
    'threshold_date': '2011-08-08',
    'predict_end': '2011-12-12',
    'max_monetary': '15000'
}


HOST = 'http://localhost:8082/api/v1/namespaces/kubeflow/services/ml-pipeline:8888/proxy'
EXPERIMENT_NAME = 'TEST_EXP'

client = kfp.Client(HOST)
experiment = client.create_experiment(EXPERIMENT_NAME)

#Submit a pipeline run
run_name = pipeline_func.__name__ + ' run'
run_result = client.run_pipeline(experiment.id, run_name, pipeline_filename, arguments)
print(run_result)

{'created_at': datetime.datetime(2019, 5, 10, 23, 12, 23, tzinfo=tzlocal()),
 'description': None,
 'error': None,
 'finished_at': None,
 'id': '11d800f1-7379-11e9-bf64-42010a800073',
 'metrics': None,
 'name': 'clv_pipeline run',
 'pipeline_spec': {'parameters': [{'name': 'dest-dataset-id',
                                   'value': 'CLVDataset'},
                                  {'name': 'project-id',
                                   'value': 'sandbox-235500'},
                                  {'name': 'max-monetary', 'value': '15000'},
                                  {'name': 'threshold-date',
                                   'value': '2011-08-08'},
                                  {'name': 'predict-end',
                                   'value': '2011-12-12'},
                                  {'name': 'data-source-id',
                                   'value': 'sandbox-235500.CLVDataset.transactions'},
                                  {'name': 'dest-table-id', 'valu