# Spark and XGBoost Pipeline

In [2]:
import kfp
from kfp import components
from kfp import dsl
from kfp import gcp

## Load reusable components

In [3]:
# Load two prebuilt components from their component.yaml definitions on GitHub.
# Both URLs reference the same kubeflow/pipelines commit
# (0b07e456b1f319d8b7a7301274f55c00fda9f537); pinning a commit presumably keeps
# the component definitions fixed between runs of this notebook.
# NOTE(review): loading by URL needs network access when this cell runs.
confusion_matrix_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/0b07e456b1f319d8b7a7301274f55c00fda9f537/components/local/confusion_matrix/component.yaml')
roc_op =              components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/0b07e456b1f319d8b7a7301274f55c00fda9f537/components/local/roc/component.yaml')


## Define cluster create and delete operations

In [4]:
def dataproc_create_cluster_op(
    project,
    region,
    staging,
    cluster_name='xgb-{{workflow.name}}'
):
    """Build the pipeline step that runs the Dataproc create-cluster image.

    Args:
        project: Value forwarded to the container's ``--project`` flag.
        region: Value forwarded to the ``--region`` flag.
        staging: Value forwarded to the ``--staging`` flag.
        cluster_name: Value forwarded to the ``--name`` flag. The default
            embeds a ``{{workflow.name}}`` placeholder (presumably expanded
            by the workflow engine at run time -- confirm).

    Returns:
        A ``dsl.ContainerOp`` whose single declared output, ``output``, is
        read from ``/output.txt`` inside the container.
    """
    container_args = [
        '--project', project,
        '--region', region,
        '--name', cluster_name,
        '--staging', staging,
    ]
    declared_outputs = {'output': '/output.txt'}

    return dsl.ContainerOp(
        name='Dataproc - Create cluster',
        image='gcr.io/ml-pipeline/ml-pipeline-dataproc-create-cluster:fe639f41661d8e17fcda64ff8242127620b80ba0',
        arguments=container_args,
        file_outputs=declared_outputs,
    )


def dataproc_delete_cluster_op(
    project,
    region,
    cluster_name='xgb-{{workflow.name}}'
):
    """Build the pipeline step that runs the Dataproc delete-cluster image.

    The step is created with ``is_exit_handler=True`` so it can be used as
    the ``exit_op`` of a ``dsl.ExitHandler``.

    Args:
        project: Value forwarded to the container's ``--project`` flag.
        region: Value forwarded to the ``--region`` flag.
        cluster_name: Value forwarded to the ``--name`` flag. The default
            embeds a ``{{workflow.name}}`` placeholder (presumably expanded
            by the workflow engine at run time -- confirm).

    Returns:
        A ``dsl.ContainerOp`` with no declared file outputs.
    """
    container_args = [
        '--project', project,
        '--region', region,
        '--name', cluster_name,
    ]

    return dsl.ContainerOp(
        name='Dataproc - Delete cluster',
        image='gcr.io/ml-pipeline/ml-pipeline-dataproc-delete-cluster:fe639f41661d8e17fcda64ff8242127620b80ba0',
        arguments=container_args,
        is_exit_handler=True,
    )

## Define data analysis and transformation operations

In [5]:
def dataproc_analyze_op(
    project,
    region,
    cluster_name,
    schema,
    train_data,
    output
):
    """Build the pipeline step that runs the Dataproc analyze image.

    Args:
        project: Value forwarded to the container's ``--project`` flag.
        region: Value forwarded to the ``--region`` flag.
        cluster_name: Value forwarded to the ``--cluster`` flag.
        schema: Value forwarded to the ``--schema`` flag.
        train_data: Value forwarded to the ``--train`` flag.
        output: Value forwarded to the ``--output`` flag.

    Returns:
        A ``dsl.ContainerOp`` whose single declared output, ``output``, is
        read from ``/output.txt`` inside the container.
    """
    container_args = [
        '--project', project,
        '--region', region,
        '--cluster', cluster_name,
        '--schema', schema,
        '--train', train_data,
        '--output', output,
    ]
    declared_outputs = {'output': '/output.txt'}

    return dsl.ContainerOp(
        name='Dataproc - Analyze',
        image='gcr.io/ml-pipeline/ml-pipeline-dataproc-analyze:fe639f41661d8e17fcda64ff8242127620b80ba0',
        arguments=container_args,
        file_outputs=declared_outputs,
    )


def dataproc_transform_op(
    project,
    region,
    cluster_name,
    train_data,
    eval_data,
    target,
    analysis,
    output
):
    """Build the pipeline step that runs the Dataproc transform image.

    Args:
        project: Value forwarded to the container's ``--project`` flag.
        region: Value forwarded to the ``--region`` flag.
        cluster_name: Value forwarded to the ``--cluster`` flag.
        train_data: Value forwarded to the ``--train`` flag.
        eval_data: Value forwarded to the ``--eval`` flag.
        target: Value forwarded to the ``--target`` flag.
        analysis: Value forwarded to the ``--analysis`` flag.
        output: Value forwarded to the ``--output`` flag.

    Returns:
        A ``dsl.ContainerOp`` with two declared outputs: ``train`` (read from
        ``/output_train.txt``) and ``eval`` (read from ``/output_eval.txt``).
    """
    # Flag order is kept as-is: --analysis precedes --target on the command line.
    container_args = [
        '--project', project,
        '--region', region,
        '--cluster', cluster_name,
        '--train', train_data,
        '--eval', eval_data,
        '--analysis', analysis,
        '--target', target,
        '--output', output,
    ]
    declared_outputs = {
        'train': '/output_train.txt',
        'eval': '/output_eval.txt',
    }

    return dsl.ContainerOp(
        name='Dataproc - Transform',
        image='gcr.io/ml-pipeline/ml-pipeline-dataproc-transform:fe639f41661d8e17fcda64ff8242127620b80ba0',
        arguments=container_args,
        file_outputs=declared_outputs,
    )

## Define training and prediction operations

In [6]:
def dataproc_train_op(
    project,
    region,
    cluster_name,
    train_data,
    eval_data,
    target,
    analysis,
    workers,
    rounds,
    output,
    is_classification=True
):
    """Build the pipeline step that runs the Dataproc XGBoost training image.

    Args:
        project: Value forwarded to the container's ``--project`` flag.
        region: Value forwarded to the ``--region`` flag.
        cluster_name: Value forwarded to the ``--cluster`` flag.
        train_data: Value forwarded to the ``--train`` flag.
        eval_data: Value forwarded to the ``--eval`` flag.
        target: Value forwarded to the ``--target`` flag.
        analysis: Value forwarded to the ``--analysis`` flag.
        workers: Value forwarded to the ``--workers`` flag.
        rounds: Value forwarded to the ``--rounds`` flag.
        output: Value forwarded to the ``--output`` flag.
        is_classification: When truthy, ``--conf`` points at
            ``trainconfcla.json``; otherwise at ``trainconfreg.json``.

    Returns:
        A ``dsl.ContainerOp`` whose single declared output, ``output``, is
        read from ``/output.txt`` inside the container.
    """
    # The choice is made in Python while the pipeline is being built, not at run time.
    config = (
        'gs://ml-pipeline-playground/trainconfcla.json'
        if is_classification
        else 'gs://ml-pipeline-playground/trainconfreg.json'
    )

    container_args = [
        '--project', project,
        '--region', region,
        '--cluster', cluster_name,
        '--train', train_data,
        '--eval', eval_data,
        '--analysis', analysis,
        '--target', target,
        '--package', 'gs://ml-pipeline-playground/xgboost4j-example-0.8-SNAPSHOT-jar-with-dependencies.jar',
        '--workers', workers,
        '--rounds', rounds,
        '--conf', config,
        '--output', output,
    ]
    declared_outputs = {'output': '/output.txt'}

    return dsl.ContainerOp(
        name='Dataproc - Train XGBoost model',
        image='gcr.io/ml-pipeline/ml-pipeline-dataproc-train:fe639f41661d8e17fcda64ff8242127620b80ba0',
        arguments=container_args,
        file_outputs=declared_outputs,
    )


def dataproc_predict_op(
    project,
    region,
    cluster_name,
    data,
    model,
    target,
    analysis,
    output
):
    """Build the pipeline step that runs the Dataproc XGBoost prediction image.

    Args:
        project: Value forwarded to the container's ``--project`` flag.
        region: Value forwarded to the ``--region`` flag.
        cluster_name: Value forwarded to the ``--cluster`` flag.
        data: Value forwarded to the ``--predict`` flag.
        model: Value forwarded to the ``--model`` flag.
        target: Value forwarded to the ``--target`` flag.
        analysis: Value forwarded to the ``--analysis`` flag.
        output: Value forwarded to the ``--output`` flag.

    Returns:
        A ``dsl.ContainerOp`` whose single declared output, ``output``, is
        read from ``/output.txt`` inside the container.
    """
    container_args = [
        '--project', project,
        '--region', region,
        '--cluster', cluster_name,
        '--predict', data,
        '--analysis', analysis,
        '--target', target,
        '--package', 'gs://ml-pipeline-playground/xgboost4j-example-0.8-SNAPSHOT-jar-with-dependencies.jar',
        '--model', model,
        '--output', output,
    ]
    declared_outputs = {'output': '/output.txt'}

    return dsl.ContainerOp(
        name='Dataproc - Predict with XGBoost model',
        image='gcr.io/ml-pipeline/ml-pipeline-dataproc-predict:fe639f41661d8e17fcda64ff8242127620b80ba0',
        arguments=container_args,
        file_outputs=declared_outputs,
    )


## Define the training pipeline

In [7]:
@dsl.pipeline(
    name='XGBoost Trainer',
    description='A trainer that does end-to-end distributed training for XGBoost models.'
)
def xgb_train_pipeline(
    output,
    project,
    region='us-central1',
    train_data='gs://ml-pipeline-playground/sfpd/train.csv',
    eval_data='gs://ml-pipeline-playground/sfpd/eval.csv',
    schema='gs://ml-pipeline-playground/sfpd/schema.json',
    target='resolution',
    rounds=200,
    workers=2,
    true_label='ACTION',
):
    """Wire the Dataproc steps into one training pipeline.

    Step order: create cluster -> analyze -> transform -> train -> predict,
    followed by the confusion-matrix and ROC components on the predictions.
    The delete-cluster step is registered as the exit handler around all of
    them. Every step has the 'user-gcp-sa' secret applied.

    Args:
        output: Base location; passed as the cluster's staging argument and
            used as the prefix of the per-step output path template.
        project: Forwarded to every Dataproc step.
        region: Forwarded to every Dataproc step.
        train_data: Training data passed to the analyze and transform steps.
        eval_data: Evaluation data passed to the transform step.
        schema: Schema passed to the analyze step.
        target: Target passed to the transform, train and predict steps.
        rounds: Forwarded to the train step.
        workers: Forwarded to the train step.
        true_label: Passed to the ROC component as both ``true_class`` and
            ``true_score_column``.
    """
    def with_gcp_secret(task):
        # Same per-step call the pipeline makes for every task.
        return task.apply(gcp.use_gcp_secret('user-gcp-sa'))

    # The {{...}} placeholders are left verbatim in the string built here.
    output_template = str(output) + '/{{workflow.uid}}/{{pod.name}}/data'

    cleanup_task = with_gcp_secret(
        dataproc_delete_cluster_op(project=project, region=region)
    )

    with dsl.ExitHandler(exit_op=cleanup_task):
        cluster_task = with_gcp_secret(
            dataproc_create_cluster_op(
                project=project,
                region=region,
                staging=output,
            )
        )
        # The create step's output is what later steps receive as cluster_name.
        cluster = cluster_task.output

        analyze_task = with_gcp_secret(
            dataproc_analyze_op(
                project=project,
                region=region,
                cluster_name=cluster,
                schema=schema,
                train_data=train_data,
                output=output_template,
            )
        )
        analysis = analyze_task.output

        transform_task = with_gcp_secret(
            dataproc_transform_op(
                project=project,
                region=region,
                cluster_name=cluster,
                train_data=train_data,
                eval_data=eval_data,
                target=target,
                analysis=analysis,
                output=output_template,
            )
        )

        train_task = with_gcp_secret(
            dataproc_train_op(
                project=project,
                region=region,
                cluster_name=cluster,
                train_data=transform_task.outputs['train'],
                eval_data=transform_task.outputs['eval'],
                target=target,
                analysis=analysis,
                workers=workers,
                rounds=rounds,
                output=output_template,
            )
        )

        predict_task = with_gcp_secret(
            dataproc_predict_op(
                project=project,
                region=region,
                cluster_name=cluster,
                data=transform_task.outputs['eval'],
                model=train_task.output,
                target=target,
                analysis=analysis,
                output=output_template,
            )
        )

        # Positional call kept: the loaded component's parameter names are
        # not visible in this notebook.
        confusion_matrix_task = with_gcp_secret(
            confusion_matrix_op(
                predict_task.output,
                output_template
            )
        )

        roc_task = with_gcp_secret(
            roc_op(
                predictions_dir=predict_task.output,
                true_class=true_label,
                true_score_column=true_label,
                output_dir=output_template
            )
        )

## Compile pipeline

In [8]:
# Compile the pipeline function into a package file. The file name is the
# function's __name__ with '.pipeline.zip' appended, given as a relative path.
# pipeline_func and pipeline_filename are reused by the submit cell.
pipeline_func = xgb_train_pipeline
pipeline_filename = pipeline_func.__name__ + '.pipeline.zip'
kfp.compiler.Compiler().compile(pipeline_func, pipeline_filename)

## Submit job

In [9]:
# Specify pipeline argument values. The keys are the two parameters of
# xgb_train_pipeline that have no default ('output' and 'project').
# NOTE(review): the bucket and project ID below are environment-specific;
# replace them with your own before running.
arguments = {'output': 'gs://kubeflow-trykube/kubeflow-pipeline/spark',
             'project': 'trykube-248403'}

# Obtain the experiment that the run will be filed under.
# NOTE(review): whether create_experiment reuses an existing experiment with
# the same name depends on the kfp client version -- confirm.
# NOTE(review): kfp is already imported in the first cell; this re-import is
# redundant but harmless.
import kfp
client = kfp.Client()
experiment = client.create_experiment('Spark-and-XGBoost')

# Submit a pipeline run using the package compiled in the previous cell.
run_name = pipeline_func.__name__ + ' run'
run_result = client.run_pipeline(experiment.id, run_name, pipeline_filename, arguments)