In [1]:
from kfp.v2 import dsl
from kfp.v2.dsl import (Output, Metrics, component)
from kfp.v2 import compiler

  from kfp.v2 import dsl


In [2]:
@component(
    # Only packages the component body actually needs. scikit-learn is listed
    # explicitly because the body imports it directly; gcsfs lets pandas read
    # the gs:// path used below.
    packages_to_install=["google-cloud-aiplatform", "gcsfs", "scikit-learn",
                         "pandas", "google-cloud-storage", "numpy"]
)
def custom_training_job_component(
    max_depth: int,
    n_estimators: int,
    metrics: Output[Metrics]
):
    """Train a random-forest regressor on hour.csv and log its test RMSE.

    The CSV is read from a fixed GCS path, preprocessed (column renames,
    one-hot encoding, log-transformed target), split 75/25, and a
    RandomForestRegressor is fitted with the given hyperparameters.

    Args:
        max_depth: Maximum tree depth passed to RandomForestRegressor.
        n_estimators: Number of trees passed to RandomForestRegressor.
        metrics: Output metrics artifact; receives a single "RMSE" entry.
            The value is computed on the log-transformed target, so it is
            not in the units of the raw count column.
    """
    # Imports live inside the function because the component body is executed
    # on its own in the component's container.
    import numpy as np
    import pandas as pd
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.metrics import mean_squared_error
    from sklearn.model_selection import train_test_split
    from sklearn.pipeline import make_pipeline

    # One seed for both the split and the forest so that runs differ only by
    # the hyperparameters being compared.
    RANDOM_STATE = 42

    def load_data(filename):
        """Read the CSV at `filename` into a DataFrame."""
        return pd.read_csv(filename)

    def one_hot_encoding(data, column):
        """Replace `column` with its dummy columns (first level dropped)."""
        dummies = pd.get_dummies(data[column], prefix=column, drop_first=True)
        data = pd.concat([data, dummies], axis=1)
        return data.drop(columns=[column])

    def preprocess_data(df):
        """Return (X, y): one-hot encoded features and the log of the count."""
        df = df.rename(columns={'weathersit': 'weather',
                                'yr': 'year',
                                'mnth': 'month',
                                'hr': 'hour',
                                'hum': 'humidity',
                                'cnt': 'count'})
        df = df.drop(columns=['instant', 'dteday', 'year'])
        cols = ['season', 'month', 'hour', 'holiday', 'weekday', 'workingday', 'weather']
        for col in cols:
            df[col] = df[col].astype('category')
        # NOTE(review): np.log assumes every count is > 0 — confirm against the data.
        df['count'] = np.log(df['count'])
        df_oh = df.copy()
        for col in cols:
            df_oh = one_hot_encoding(df_oh, col)
        X = df_oh.drop(columns=['atemp', 'windspeed', 'casual', 'registered', 'count'])
        y = df_oh['count']
        return X, y

    def train_model(x_train, y_train, max_depth, n_estimators):
        """Fit a RandomForestRegressor wrapped in a single-step sklearn pipeline."""
        model = RandomForestRegressor(
            max_depth=max_depth,
            n_estimators=n_estimators,
            random_state=RANDOM_STATE,
        )
        pipeline = make_pipeline(model)
        pipeline.fit(x_train, y_train)
        return pipeline

    filename = 'gs://udemy-gcp-mlops/bikeshare-model/hour.csv'
    df = load_data(filename)

    X, y = preprocess_data(df)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=RANDOM_STATE
    )

    pipeline = train_model(X_train, y_train, max_depth, n_estimators)
    y_pred = pipeline.predict(X_test)

    # Cast to a plain Python float so the logged value is not a NumPy scalar.
    rmse = float(np.sqrt(mean_squared_error(y_test, y_pred)))
    metrics.log_metric("RMSE", rmse)

  return component_factory.create_component_from_func(


In [4]:
# Single-step pipeline: both hyperparameters are forwarded unchanged to the
# training component. The compiled spec is written next to the notebook.
@dsl.pipeline(name="regression-hyperparam-experiment")
def pipeline(max_depth: int, n_estimators: int):
    custom_training_job_component(
        max_depth=max_depth,
        n_estimators=n_estimators,
    )


compiler.Compiler().compile(
    package_path="regression-hyperparam-experiment.json",
    pipeline_func=pipeline,
)

In [5]:
from google.cloud import aiplatform

EXPERIMENT_NAME = "regression-hyperparameter-experiment"
PIPELINE_ROOT = "gs://udemy-gcp-mlops/regression-model-experiment"

# Hyperparameter grid in submission order: depth 5 first, then depth 3,
# each combined with 50, 100 and 200 trees.
runs = [
    {'max_depth': depth, 'n_estimators': trees}
    for depth in (5, 3)
    for trees in (50, 100, 200)
]

# Submit one pipeline job per grid entry; the entry's index is appended to the
# display name and every job is submitted with the same experiment name.
for i, run in enumerate(runs):
    job = aiplatform.PipelineJob(
        display_name=f"{EXPERIMENT_NAME}-{i}",
        template_path="regression-hyperparam-experiment.json",
        pipeline_root=PIPELINE_ROOT,
        parameter_values=dict(run),
    )
    job.submit(experiment=EXPERIMENT_NAME)

Creating PipelineJob
PipelineJob created. Resource name: projects/936546808722/locations/us-central1/pipelineJobs/regression-hyperparam-experiment-20240909003517
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/936546808722/locations/us-central1/pipelineJobs/regression-hyperparam-experiment-20240909003517')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/regression-hyperparam-experiment-20240909003517?project=936546808722
Associating projects/936546808722/locations/us-central1/pipelineJobs/regression-hyperparam-experiment-20240909003517 to Experiment: regression-hyperparameter-experiment
Creating PipelineJob
PipelineJob created. Resource name: projects/936546808722/locations/us-central1/pipelineJobs/regression-hyperparam-experiment-20240909003527
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/936546808722/locations/us-central1/pipelineJobs/r