### Installation
Install the packages required for executing this notebook.

## Some of the source codes are based on
https://towardsdatascience.com/how-to-set-up-custom-vertex-ai-pipelines-step-by-step-467487f81cad 

In [1]:
# Install the packages
! pip3 install --user --no-cache-dir --upgrade "kfp>2" "google-cloud-pipeline-components>2" \
                                        google-cloud-aiplatform

Collecting kfp>2
  Downloading kfp-2.9.0.tar.gz (595 kB)
     ---------------------------------------- 0.0/595.6 kB ? eta -:--:--
     -------------------------------------- 595.6/595.6 kB 7.1 MB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting kfp-pipeline-spec==0.4.0 (from kfp>2)
  Downloading kfp_pipeline_spec-0.4.0-py3-none-any.whl.metadata (301 bytes)
Collecting kfp-server-api<2.4.0,>=2.1.0 (from kfp>2)
  Downloading kfp_server_api-2.3.0.tar.gz (84 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'


## Restart the kernel
Once you've installed the additional packages, you need to restart the notebook kernel so it can find the packages.

In [3]:
import os

if not os.getenv("IS_TESTING"):
    # Automatically restart kernel after installs
    import IPython

    app = IPython.Application.instance()
    app.kernel.do_shutdown(True)

Check the versions of the packages you installed. The KFP SDK version should be >=1.6.

In [29]:
! python3 -c "import kfp; print('KFP SDK version: {}'.format(kfp.__version__))"
! pip3 freeze | grep aiplatform
! python3 -c "import google_cloud_pipeline_components; print('google_cloud_pipeline_components version: {}'.format(google_cloud_pipeline_components.__version__))"

Python was not found; run without arguments to install from the Microsoft Store, or disable this shortcut from Settings > Manage App Execution Aliases.
'grep' is not recognized as an internal or external command,
operable program or batch file.
Python was not found; run without arguments to install from the Microsoft Store, or disable this shortcut from Settings > Manage App Execution Aliases.


In [3]:
import kfp
import typing
from typing import Dict
from typing import NamedTuple
from kfp import dsl
from kfp.dsl import (Artifact,
                        Dataset,
                        Input,
                        Model,
                        Output,
                        Metrics,
                        component, 
                        OutputPath, 
                        InputPath)
import google.cloud.aiplatform as aip
from google_cloud_pipeline_components.v1.model import ModelUploadOp
from google_cloud_pipeline_components.v1.endpoint import (EndpointCreateOp,ModelDeployOp)
from google_cloud_pipeline_components.types import artifact_types

#### Project and Pipeline Configurations

In [4]:
#The Google Cloud project that this pipeline runs in.
PROJECT_ID = "de2024-435209"
# The region that this pipeline runs in
REGION = "us-central1"
# Specify a Cloud Storage URI that your pipelines service account can access. The artifacts of your pipeline runs are stored within the pipeline root.
PIPELINE_ROOT = "gs://temp_assignment1_g4"   # e.g., gs://temp_de2024

#### Create Pipeline Components

We can create a component from Python functions (inline) and from a container. We will first try inline python functions. 
Refer to  https://www.kubeflow.org/docs/components/pipelines/v2/components/lightweight-python-components/ for more information.

#### Pipeline Component : Train and Test Split

In [5]:
@dsl.component(
    packages_to_install=["pandas", "scikit-learn==1.3.2"],
    base_image="python:3.10.7-slim"
)
def train_test_split(dataset: Input[Dataset], dataset_train: Output[Dataset], dataset_test: Output[Dataset]):
    '''train_test_split'''
    import pandas as pd
    import logging 
    import sys
    from sklearn.model_selection import train_test_split as tts
    
    logging.basicConfig(stream=sys.stdout, level=logging.INFO) 
    
    alldata = pd.read_csv(dataset.path, index_col=None)
    train, test = tts(alldata, test_size=0.25)
    train.to_csv(dataset_train.path + ".csv" , index=False, encoding='utf-8-sig')
    test.to_csv(dataset_test.path + ".csv" , index=False, encoding='utf-8-sig')

#### Pipeline Component : Training XGBoost

In [7]:
@dsl.component(
    packages_to_install=["pandas", "xgboost==1.6.0", "scikit-learn==1.3.2"],  # Install xgboost instead of scikit-learn
    base_image="python:3.10.7-slim"
)
def train_xgboost(features: Input[Dataset], model: Output[Model]):
    '''Train an XGBoost Regression model with specific parameters'''
    import pandas as pd
    from xgboost import XGBRegressor  # Import XGBoost Regressor
    import pickle 
    
    # Load the dataset from the CSV file
    data = pd.read_csv(features.path + ".csv")
    
    # Initialize an XGBoost Regressor model with the specified parameters
    model_xgb = XGBRegressor(
        alpha=10,            # Regularization term (L1)
        reg_lambda=20,       # Regularization term (L2)
        learning_rate=0.1,   # Step size shrinkage
        n_estimators=500     # Number of boosting rounds (trees)
    )
    
    # Train the model using all columns except 'Median_House_Value' as features
    model_xgb.fit(data.drop('Median_House_Value', axis=1), data['Median_House_Value'])
    
    # Add metadata to indicate the model is an XGBoost Regressor model
    model.metadata["framework"] = "XGBoost"

    # Save the trained model as a pickle file
    file_name = model.path + ".pkl"
    with open(file_name, 'wb') as file:  
        pickle.dump(model_xgb, file)

#### Pipeline Component : Training Random forest

In [9]:
@dsl.component(
    packages_to_install=["pandas", "scikit-learn==1.3.2"],  # Install scikit-learn for RandomForest
    base_image="python:3.10.7-slim"
)
def train_random_forest(features: Input[Dataset], model: Output[Model]):
    '''Train a Random Forest Regressor model with specific parameters'''
    import pandas as pd
    from sklearn.ensemble import RandomForestRegressor  # Import Random Forest Regressor
    import pickle 
    
    # Load the dataset from the CSV file
    data = pd.read_csv(features.path + ".csv")
    
    # Initialize a Random Forest Regressor model with the specified parameters
    model_rf = RandomForestRegressor(
        n_estimators=100,   # Number of trees in the forest
        max_depth=10        # Maximum depth of each tree
    )
    
    # Train the model using all columns except 'Median_House_Value' as features
    model_rf.fit(data.drop('Median_House_Value', axis=1), data['Median_House_Value'])
    
    # Add metadata to indicate the model is a Random Forest Regressor model
    model.metadata["framework"] = "RandomForest"

    # Save the trained model as a pickle file
    file_name = model.path + ".pkl"
    with open(file_name, 'wb') as file:  
        pickle.dump(model_rf, file)

#### Pipeline Component : Model Evaluation

See https://www.kubeflow.org/docs/components/pipelines/v2/data-types/parameters/ for NamedTuple

In [10]:
@dsl.component(
    packages_to_install = [
       "pandas", "scikit-learn==1.3.2", "numpy", "xgboost==1.6.0"
    ], base_image="python:3.10.7-slim"
)
def lr_model_evaluation(
    test_set: Input[Dataset],
    model: Input[Model],
    thresholds_dict_str: str,
    metrics: Output[Metrics],
    kpi: Output[Metrics]
) -> NamedTuple('outputs', [('approval', bool), ('r2', float)]):
    '''Evaluate the model and return approval and R² for comparison'''
  
    import pandas as pd
    import logging     
    from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
    import json
    import pickle
    import numpy as np
    
    logging.basicConfig(level=logging.INFO)
    
    # Load test data
    data = pd.read_csv(test_set.path + ".csv")
    
    # Load the saved model (XGBoost or RandomForest)
    m_filename = model.path + ".pkl"
    model = pickle.load(open(m_filename, 'rb'))
    
    # Prepare the test features and target variable
    X_test = data.drop(columns=["Median_House_Value"])
    y_true = data['Median_House_Value']
    
    # Make predictions
    y_pred = model.predict(X_test)
    
    # Calculate R² score
    r2 = r2_score(y_true, y_pred)

    logging.info(f"R²: {r2}")
    
    # Log metrics for future analysis
    metrics.log_metric("R2", r2)
    kpi.log_metric("R2", r2)
    
    # Parse the threshold dictionary for R² threshold
    thresholds_dict = json.loads(thresholds_dict_str)
    r2_threshold = thresholds_dict.get('r2', 0.7)  # Default R² threshold is 0.7 if not specified
    
    # Check if R² meets the threshold for approval
    approval_value = bool(r2 >= r2_threshold)
    logging.info(f"Approval based on R²: {approval_value} (R²: {r2}, Threshold: {r2_threshold})")
    
    # Return both approval status and R² for further use in the pipeline
    return approval_value, r2


#### Pipeline Component : Compare models

In [11]:
@dsl.component(
    base_image="python:3.10.7-slim"
)
def compare_model(xgb_r2: float, rf_r2: float) -> str:
    import logging
    import sys
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)
    
    logging.info(f"XGBoost R²: {xgb_r2}")
    logging.info(f"Random Forest R²: {rf_r2}")
    
    # Compare based on R²
    if xgb_r2 > rf_r2:
        return "XGB"
    else:
        return "RF"

### Upload Model and Metrics to Google Bucket 

In [15]:
@dsl.component(
    packages_to_install=["google-cloud-storage"],
    base_image="python:3.10.7-slim"
)
def upload_model_to_gcs(project_id: str, model_repo: str, model: Input[Model]):
    '''upload model to gsc'''
    from google.cloud import storage   
    import logging 
    import sys
    
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)    
  
    # upload the model to GCS
    client = storage.Client(project=project_id)
    bucket = client.bucket(model_repo)
    blob = bucket.blob('lr_model.pkl')
    source_file_name= model.path + '.pkl'
   
    blob.upload_from_filename(source_file_name)    
    
    print(f"File {source_file_name} uploaded to {model_repo}.")

### Trigger Another CI_CD Pipeline

In [None]:
# @dsl.component(
#     packages_to_install=["google-cloud-build"],
#     base_image="python:3.10.7-slim"
# )
# def run_build_trigger(project_id:str, trigger_id:str):
#     import sys
#     from google.cloud.devtools import cloudbuild_v1    
#     import logging 
#     logging.basicConfig(stream=sys.stdout, level=logging.INFO) 
    
#     # Create a client
#     client = cloudbuild_v1.CloudBuildClient()
#     name = f"projects/{project_id}/locations/us-central1/triggers/{trigger_id}"
#     # Initialize request argument(s)
#     request = cloudbuild_v1.RunBuildTriggerRequest(        
#         project_id=project_id,
#         trigger_id=trigger_id,
#         name=name
#     )

#     # Make the request
#     operation = client.run_build_trigger(request=request)
    
#     logging.info("Trigger the CI-CD Pipeline: " + trigger_id)


#### Define the Pipeline

_Watch out, use the documented definition statement when you want to automatically deploy the CICD pipeline_

In [23]:
# Define the workflow of the pipeline.

@kfp.dsl.pipeline(
    name="median_house_price_predictor_pipeline")
def pipeline(project_id: str, data_bucket: str, dataset_uri: str, model_repo: str, thresholds_dict_str:str):    
# def pipeline(project_id: str, data_bucket: str, dataset_uri: str, model_repo: str, thresholds_dict_str:str, trigger_id:str):     <-- Use this definition if the CICD pipeline is connected
    
    dataset_op = kfp.dsl.importer(
        artifact_uri=dataset_uri,
        artifact_class=Dataset,
        reimport=False,
    )
     
    train_test_split_op = train_test_split(dataset=dataset_op.output)
        
    training_xg_job_run_op = train_xgboost(features=train_test_split_op.outputs["dataset_train"])
    training_rf_job_run_op = train_random_forest(features=train_test_split_op.outputs["dataset_train"])
    
    
    model_evaluation_op_xg = lr_model_evaluation(
        test_set=train_test_split_op.outputs["dataset_test"],
        model=training_xg_job_run_op.outputs["model"],
        thresholds_dict_str=thresholds_dict_str, # I deploy the model anly if the model performance is above the threshold
    )
    
    model_evaluation_op_rf = lr_model_evaluation(
        test_set=train_test_split_op.outputs["dataset_test"],
        model=training_rf_job_run_op.outputs["model"],
        thresholds_dict_str=thresholds_dict_str, # I deploy the model anly if the model performance is above the threshold
    )
    
    # Compare models based on R²
    comp_model_op = compare_model(
        xgb_r2=model_evaluation_op_xg.outputs['r2'],
        rf_r2=model_evaluation_op_rf.outputs['r2']
    ).after(model_evaluation_op_xg, model_evaluation_op_rf)
    
    
    # Identify and approve the best model, then upload it to the GCS bucket if approved
    with dsl.If(comp_model_op.output == "XGB"):
        # XGBoost is better
        with dsl.If(model_evaluation_op_xg.outputs["approval"] == True):
            # Upload XGBoost model if it meets approval criteria
            upload_model_to_gc_op = upload_model_to_gcs(
                project_id=project_id,
                model_repo=model_repo,
                model=training_xg_job_run_op.outputs['model']
            )
    
    with dsl.If(comp_model_op.output == "RF"):
        # Random Forest is better
        with dsl.If(model_evaluation_op_rf.outputs["approval"] == True):
            # Upload Random Forest model if it meets approval criteria
            upload_model_to_gc_op = upload_model_to_gcs(
                project_id=project_id,
                model_repo=model_repo,
                model=training_rf_job_run_op.outputs['model']
            )  
        
#         trigger_model_deployment_cicd = run_build_trigger(
#             project_id=project_id,
#             trigger_id=trigger_id
#         ).after(upload_model_to_gc_op)  
      

#### Compile the pipeline into a JSON file

In [25]:
from kfp import compiler
compiler.Compiler().compile(pipeline_func=pipeline,
        package_path='median_house_price_predictor_pipeline.yaml')

#### Submit the pipeline run

In [27]:
import google.cloud.aiplatform as aip

# Before initializing, make sure to set the GOOGLE_APPLICATION_CREDENTIALS
# environment variable to the path of your service account.
aip.init(
    project=PROJECT_ID,
    location=REGION,
)

# Prepare the pipeline job
job = aip.PipelineJob(
    display_name="median_house_price_predictor_xgboost",
    enable_caching=False, # Make this False and True as necessary 
    template_path="median_house_price_predictor_pipeline.yaml",
    pipeline_root=PIPELINE_ROOT,
    location=REGION,
    parameter_values={
        'project_id': PROJECT_ID, # makesure to use your project id 
        'data_bucket': 'models_assignment1_de',  # makesure to use your data bucket name 
        'dataset_uri':'gs://data_de2024_assignment1_g4/California_Houses.csv',
        'model_repo':'models_assignment1_g4', # makesure to use your model bucket name 
        'thresholds_dict_str':'{"r2":0.8}',
        # 'trigger_id':'your trigger id' # makesure to use the id of the cloud build tigger deploying the model. For example, "id" field returned from runing gcloud builds triggers describe [your trigger name] --region=us-central1
    }
)

job.run()

DefaultCredentialsError: Your default credentials were not found. To set up Application Default Credentials, see https://cloud.google.com/docs/authentication/external/set-up-adc for more information.