In [4]:
import kfp
from kfp import compiler
import kfp.dsl as dsl
import kfp.gcp as gcp

# Name of the KFP experiment the runs are filed under.
EXPERIMENT_NAME = 'CLV_TRAIN_DATAPROC'

# Dataproc component specs, all taken from the same kubeflow/pipelines revision.
_DATAPROC_COMPONENTS_ROOT = (
    'https://raw.githubusercontent.com/kubeflow/pipelines/'
    'd2f5cc92a46012b9927209e2aaccab70961582dc/components/gcp/dataproc'
)
CREATE_DATAPROC_SPEC_URI = '{}/create_cluster/component.yaml'.format(_DATAPROC_COMPONENTS_ROOT)
DELETE_DATAPROC_SPEC_URI = '{}/delete_cluster/component.yaml'.format(_DATAPROC_COMPONENTS_ROOT)
SUBMIT_PYSPARK_JOB_SPEC_URI = '{}/submit_pyspark_job/component.yaml'.format(_DATAPROC_COMPONENTS_ROOT)

# AutoML Tables component specs, read from the local project checkout.
_AUTOML_COMPONENTS_DIR = '/home/jupyter/projects/clv_kfp/components/automl_tables'
AML_IMPORT_DATASET_SPEC_URI = '{}/aml-import-dataset.yaml'.format(_AUTOML_COMPONENTS_DIR)
AML_TRAIN_MODEL_SPEC_URI = '{}/aml-train-model.yaml'.format(_AUTOML_COMPONENTS_DIR)

# PySpark script submitted to the Dataproc cluster.
CREATE_FEATURES_FILE_URI = 'gs://sandbox-235500/pyspark-scripts/create_features.py'

# Endpoint passed to kfp.Client.
HOST = 'http://localhost:8082'

@dsl.pipeline(
    name='CLVTrainingPipelineDataproc',
    description='CLV Training Pipeline using Dataproc/Spark for data preparation'
)
def clv_dataproc_pipeline(
    project_id='', 
    region='us-central1',
    source_gcs_path='',
    output_gcs_path='',
    threshold_date='',
    predict_end='',
    max_monetary=15000,
    max_partitions=2,
    automl_dataset_name='',
    features_dataset_id='',
    features_table_id=''):
    """Define the CLV training pipeline that prepares features on Dataproc.

    Steps wired up here:
      1. create a Dataproc cluster,
      2. submit the PySpark script at CREATE_FEATURES_FILE_URI to that cluster,
      3. delete the cluster once the PySpark job has finished,
      4. import a BigQuery table into an AutoML dataset.

    Args:
        project_id: Project passed to every component in the pipeline.
        region: Region for the Dataproc steps; also passed as the AutoML
            dataset location.
        source_gcs_path: Forwarded to the PySpark script as --source-gcs-path.
        output_gcs_path: Forwarded to the PySpark script as --output-gcs-path.
        threshold_date: Forwarded to the PySpark script as --threshold-date.
        predict_end: Forwarded to the PySpark script as --predict-end.
        max_monetary: Forwarded to the PySpark script as --max-monetary.
        max_partitions: Forwarded to the PySpark script as --max-partitions.
        automl_dataset_name: Name handed to the import-dataset component.
        features_dataset_id: BigQuery dataset part of the import source URI.
        features_table_id: BigQuery table part of the import source URI.
    """

    dataproc_create_cluster_op = kfp.components.load_component_from_url(CREATE_DATAPROC_SPEC_URI)    
    dataproc_delete_cluster_op = kfp.components.load_component_from_url(DELETE_DATAPROC_SPEC_URI)    
    dataproc_submit_pyspark_job_op = kfp.components.load_component_from_url(SUBMIT_PYSPARK_JOB_SPEC_URI)    
    # The import-dataset spec is a local file path, so it is loaded from file
    # rather than from a URL. Previously this op was referenced but never defined.
    import_dataset_op = kfp.components.load_component_from_file(AML_IMPORT_DATASET_SPEC_URI)

    # Script arguments serialized as a JSON list. No comma after the last
    # element: a trailing comma is rejected by strict JSON parsers.
    args = ('['
        '"--source-gcs-path={}",'
        '"--output-gcs-path={}",'
        '"--threshold-date={}",'
        '"--predict-end={}",'
        '"--max-monetary={}",'
        '"--max-partitions={}"'
        ']'
    ).format(
        source_gcs_path, 
        output_gcs_path,
        threshold_date,
        predict_end,
        max_monetary,
        max_partitions)

    dataproc_create_cluster_task = dataproc_create_cluster_op(
        project_id=project_id,
        region=region,
        name='',
        name_prefix='',
        initialization_actions='',
        config_bucket='',
        image_version='',
        cluster='',
        wait_interval='30'
    ) 

    # The create step's output is used as the cluster name by the job and
    # delete steps, which also makes them run after cluster creation.
    dataproc_submit_pyspark_job_task = dataproc_submit_pyspark_job_op(
        project_id=project_id,
        region=region,
        cluster_name=dataproc_create_cluster_task.output,
        main_python_file_uri = CREATE_FEATURES_FILE_URI,
        args=args,
        pyspark_job='{}',
        job='{}',
        wait_interval='30'
    )

    dataproc_delete_cluster_task = dataproc_delete_cluster_op(
        project_id=project_id,
        region=region,
        name=dataproc_create_cluster_task.output
    )

    # Only tear the cluster down once the PySpark job is done with it.
    # NOTE(review): if the job step fails this step is not reached and the
    # cluster stays up -- consider an exit handler.
    dataproc_delete_cluster_task.after(dataproc_submit_pyspark_job_task)
    
    # NOTE(review): this step has no ordering constraint against the PySpark
    # job. If the BigQuery table below is produced by that job, add
    # import_dataset_task.after(dataproc_submit_pyspark_job_task) -- confirm.
    import_dataset_task = import_dataset_op(
        project_id=project_id,
        location=region,
        dataset_name=automl_dataset_name,
        source_data_uri='bq://{}.{}.{}'.format(project_id, features_dataset_id, features_table_id)
    )
    
   


# Compile the pipeline into a package named after the pipeline function;
# the next cell submits this file.
pipeline_func = clv_dataproc_pipeline
pipeline_filename = '{}.tar.gz'.format(pipeline_func.__name__)

compiler.Compiler().compile(pipeline_func, pipeline_filename)




In [5]:
# Connect to the Kubeflow Pipelines endpoint and create the experiment for this run
client = kfp.Client(host=HOST)
experiment = client.create_experiment(EXPERIMENT_NAME)

# Pipeline parameter values passed for this run
arguments = dict(
    project_id='sandbox-235500',
    source_gcs_path='gs://sandbox-235500/clv_sales_transactions',
    output_gcs_path='gs://sandbox-235500/clv_training_dataset',
    threshold_date='2011-08-08',
    predict_end='2011-12-12',
)

# Submit the compiled package and print the run record that comes back
run_name = '{} run'.format(pipeline_func.__name__)
run_result = client.run_pipeline(
    experiment_id=experiment.id,
    job_name=run_name,
    pipeline_package_path=pipeline_filename,
    params=arguments,
)
print(run_result)


{'created_at': datetime.datetime(2019, 5, 19, 17, 34, 11, tzinfo=tzlocal()),
 'description': None,
 'error': None,
 'finished_at': datetime.datetime(1970, 1, 1, 0, 0, tzinfo=tzlocal()),
 'id': '501390db-7a5c-11e9-bf5c-42010a8a0028',
 'metrics': None,
 'name': 'clv_dataproc_pipeline run',
 'pipeline_spec': {'parameters': [{'name': 'project-id',
                                   'value': 'sandbox-235500'},
                                  {'name': 'threshold-date',
                                   'value': '2011-08-08'},
                                  {'name': 'predict-end',
                                   'value': '2011-12-12'},
                                  {'name': 'region', 'value': 'us-west2'},
                                  {'name': 'output-gcs-path',
                                   'value': 'gs://sandbox-235500/clv_training_dataset'},
                                  {'name': 'source-gcs-path',
                                   'value': 'gs://sandbox-235500/c