### What we will learn

- We will build a Kubeflow pipeline that:
  - creates a Dataset from a BigQuery TABLE/VIEW
  - runs AutoML training on the Tabular Dataset
  - creates a Vertex AI endpoint
  - deploys the model to the Vertex AI endpoint

We will use Vertex AI Kubeflow components

In [1]:
import kfp

In [2]:
from dotenv import load_dotenv

# Load the key/value pairs from the local `.env` file into the process
# environment; the configuration cell below reads them via os.environ.
load_dotenv(verbose=True, dotenv_path='.env')

True

In [3]:
import os


# --- BigQuery source table -------------------------------------------------
BIGQUERY_PROJECT_ID = os.environ.get('BIGQUERY_PROJECT_ID')
BIGQUERY_DATASET = os.environ.get('BIGQUERY_DATASET')
BIGQUERY_DATASET_REGION = os.environ.get('BIGQUERY_DATASET_REGION')
BIGQUERY_TABLE = os.environ.get('BIGQUERY_TABLE')

# --- Vertex AI target ------------------------------------------------------
VERTEXAI_PROJECT_ID = os.environ.get('VERTEXAI_PROJECT_ID')
VERTEXAI_REGION = os.environ.get('VERTEXAI_REGION')

# --- Cloud Storage bucket used for pipeline artifacts ----------------------
BUCKET_NAME = os.environ.get('BUCKET_NAME')
BUCKET_URI = os.environ.get('BUCKET_URI')
BUCKET_REGION = os.environ.get('BUCKET_REGION')

# Prefix interpolated into the display names created by the pipeline.
PREFIX = os.environ.get('PREFIX')


# Echo the resolved configuration. Any value printed as `None` means the
# corresponding variable is missing from the environment / .env file.
print("BIGQUERY_PROJECT_ID: ",BIGQUERY_PROJECT_ID)
print("BIGQUERY_DATASET: ",BIGQUERY_DATASET)
print("BIGQUERY_DATASET_REGION: ",BIGQUERY_DATASET_REGION)
print("BIGQUERY_TABLE: ",BIGQUERY_TABLE)

print("VERTEXAI_PROJECT_ID: ",VERTEXAI_PROJECT_ID)
print("VERTEXAI_REGION: ",VERTEXAI_REGION)

print("BUCKET_NAME: ",BUCKET_NAME)
# Fixed: these two lines previously printed BUCKET_NAME and VERTEXAI_REGION
# under the BUCKET_URI / BUCKET_REGION labels.
print("BUCKET_URI: ",BUCKET_URI)
print("BUCKET_REGION: ",BUCKET_REGION)


# GCS location passed to the PipelineJob as its pipeline root.
PIPELINE_ROOT = 'gs://{}/pipeline_root'.format(BUCKET_NAME)

BIGQUERY_PROJECT_ID:  datafusionsbox
BIGQUERY_DATASET:  dataset4ccc
BIGQUERY_DATASET_REGION:  us
BIGQUERY_TABLE:  df_for_model_ccc_with_weights
VERTEXAI_PROJECT_ID:  datafusionsbox
VERTEXAI_REGION:  us-central1
BUCKET_NAME:  gcp-demo-ccc-vertexai
BUCKET_URI:  gcp-demo-ccc-vertexai
BUCKET_REGION:  us-central1


In [4]:
# Runtime arguments for the pipeline; every key must match a parameter name
# in the pipeline function's signature.
PIPELINE_PARAMETERS = dict(
    in_prefix=PREFIX,
    in_bigquery_projectid=BIGQUERY_PROJECT_ID,
    in_bigquery_dataset=BIGQUERY_DATASET,
    in_bigquery_table=BIGQUERY_TABLE,
    in_vertexai_projectid=VERTEXAI_PROJECT_ID,
    in_vertexai_region=VERTEXAI_REGION,
)

# No labels are attached to the pipeline job.
LABELS = dict()

# Passed to the PipelineJob as its `enable_caching` flag.
ENABLE_CACHING = True

https://google-cloud-pipeline-components.readthedocs.io/en/google-cloud-pipeline-components-0.1.6/google_cloud_pipeline_components.aiplatform.html

https://cloud.google.com/vertex-ai/docs/pipelines/vertex-automl-component

![image.png](attachment:ecaab165-b906-458f-8f81-0cdd7c75c7cb.png)

In [5]:
from kfp import dsl
from google_cloud_pipeline_components import aiplatform as gcpc


# The decorator is referenced as `dsl.pipeline` so that the function below,
# which is itself named `pipeline`, does not shadow an imported name.
@dsl.pipeline(name="wf-ccc-ex2-automl-dataset")
def pipeline(
    in_prefix: str = "demo",
    in_bigquery_projectid: str = 'projektccc',
    in_bigquery_dataset: str = 'crm_cookie_match', 
    in_bigquery_table: str = 'crm_cookie_match',
    in_vertexai_projectid: str = 'defaultprojectid',
    in_vertexai_region: str = 'us-central1'
):
    """Create a tabular dataset from BigQuery, train AutoML on it, and deploy the model.

    Steps, in order: create a Vertex AI tabular dataset from the given
    BigQuery table/view, run an AutoML tabular classification training job on
    it, create an endpoint, and deploy the trained model to that endpoint.

    Args:
        in_prefix: Suffix/infix used in the training job, model and endpoint
            display names.
        in_bigquery_projectid: Project that owns the BigQuery source table.
        in_bigquery_dataset: BigQuery dataset containing the source table.
        in_bigquery_table: BigQuery table or view used as the training data.
        in_vertexai_projectid: Project in which the Vertex AI resources are
            created.
        in_vertexai_region: Region in which the dataset, training job and
            endpoint are created.
    """
    create_dataset = gcpc.TabularDatasetCreateOp(
        project=in_vertexai_projectid,
        # Previously omitted: the region parameter was accepted by the
        # pipeline but never forwarded, so the component default was used.
        location=in_vertexai_region,
        display_name="ccc-dataset",
        bq_source=f"bq://{in_bigquery_projectid}.{in_bigquery_dataset}.{in_bigquery_table}"
    )
    
    ### Runs the training job and returns a model
    start_training = gcpc.AutoMLTabularTrainingJobRunOp(
        project=in_vertexai_projectid,
        location=in_vertexai_region,
        dataset=create_dataset.outputs["dataset"],
        display_name=f"train-automl-{in_prefix}",
        optimization_prediction_type="classification",
        target_column="y_if_trans",
        optimization_objective="maximize-au-roc",
        model_display_name=f"promo-classification-automl-{in_prefix}",
        # 60/20/20 train/validation/test split.
        training_fraction_split=0.6,
        validation_fraction_split=0.2,
        test_fraction_split=0.2,
        # Training budget in milli node hours (2000 = 2 node hours).
        budget_milli_node_hours=2000,
        weight_column="weight"
    )
    
    # The endpoint does not consume any training output, so this step has no
    # data dependency on the training job.
    create_endpoint = gcpc.EndpointCreateOp(
        project=in_vertexai_projectid,
        location=in_vertexai_region,
        display_name = f"promo-classification-{in_prefix}-endpoint",
    )
    
    # NOTE(review): no dedicated_resources_machine_type is set here; confirm
    # the service default is acceptable for this model type.
    deploy_model_2_endpoint = gcpc.ModelDeployOp(
        model=start_training.outputs["model"],
        endpoint=create_endpoint.outputs['endpoint'],
        dedicated_resources_min_replica_count=1,
        dedicated_resources_max_replica_count=2,
        # Route all traffic to the model deployed by this operation ("0").
        traffic_split={"0":"100"},
    )

In [6]:
from kfp import compiler

# File that receives the compiled pipeline definition; the PipelineJob cell
# reads it back as its template.
dag_yaml_filename = "dag_ccc_ex2.yaml"

# Compile the `pipeline` function (input) into the YAML package (output).
compiler.Compiler().compile(
    package_path=dag_yaml_filename,
    pipeline_func=pipeline,
)

In [None]:
from google.cloud import aiplatform

# Service account the pipeline run executes as. It can be overridden through
# the VERTEXAI_SERVICE_ACCOUNT environment variable (e.g. in the .env file),
# like the rest of the notebook configuration; the fallback is the account
# that was previously hard-coded here.
PIPELINE_SERVICE_ACCOUNT = os.environ.get(
    "VERTEXAI_SERVICE_ACCOUNT",
    "339239659794-compute@developer.gserviceaccount.com",
)

job = aiplatform.PipelineJob(display_name = "kfp_pipeline_ccc_ex2",
                             template_path = dag_yaml_filename,
                             pipeline_root = PIPELINE_ROOT,
                             parameter_values = PIPELINE_PARAMETERS, ## Make sure PIPELINE_PARAMETERS collection does not include parameters that are unknown to pipeline
                             enable_caching = ENABLE_CACHING,
                             #encryption_spec_key_name = CMEK,
                             labels = LABELS,
                             project = VERTEXAI_PROJECT_ID,
                             location = VERTEXAI_REGION)

# Submit the job under the configured service account.
job.run(service_account=PIPELINE_SERVICE_ACCOUNT)

Creating PipelineJob
PipelineJob created. Resource name: projects/339239659794/locations/us-central1/pipelineJobs/wf-ccc-ex2-automl-dataset-20230213112830
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/339239659794/locations/us-central1/pipelineJobs/wf-ccc-ex2-automl-dataset-20230213112830')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/wf-ccc-ex2-automl-dataset-20230213112830?project=339239659794
PipelineJob projects/339239659794/locations/us-central1/pipelineJobs/wf-ccc-ex2-automl-dataset-20230213112830 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/339239659794/locations/us-central1/pipelineJobs/wf-ccc-ex2-automl-dataset-20230213112830 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/339239659794/locations/us-central1/pipelineJobs/wf-ccc-ex2-automl-dataset-20230213112830 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob