In [None]:
# Install the SDKs used by this notebook. The first three commands pass
# USER_FLAG ("--user") to pip; the last three do not.
USER_FLAG = "--user"
# Vertex AI SDK, upgraded to the newest available release.
!pip3 install {USER_FLAG} google-cloud-aiplatform --upgrade
# Kubeflow Pipelines SDK plus the Google Cloud pipeline components.
!pip3 install {USER_FLAG} kfp google-cloud-pipeline-components
# fsspec/gcsfs (presumably so pandas can read gs:// URLs - TODO confirm) and scikit-learn.
!pip3 install {USER_FLAG} fsspec gcsfs scikit-learn
# NOTE(review): google-cloud-aiplatform and kfp are installed a second time
# below, without USER_FLAG - confirm whether the duplication is intentional.
!pip3 install -U google-cloud-aiplatform "shapely>2"
!pip3 install docstring-parser
!pip3 install kfp

In [None]:
import os

# Restart the kernel after the installs above. The restart is skipped when the
# IS_TESTING environment variable is set to a non-empty value.
if not os.getenv("IS_TESTING"):
    from IPython import Application

    Application.instance().kernel.do_shutdown(True)


In [None]:
# Sanity-check the installation in fresh interpreters: print the kfp and
# google_cloud_pipeline_components versions, then confirm that
# docstring_parser and kfp.pipeline_spec import without error.
!python3 -c "import kfp; print('KFP SDK version: {}'.format(kfp.__version__))"
!python3 -c "import google_cloud_pipeline_components; print('google_cloud_pipeline_components version: {}'.format(google_cloud_pipeline_components.__version__))"
!python3 -c "import docstring_parser"
!python3 -c "import kfp.pipeline_spec"

In [None]:
import os
PROJECT_ID = ""
if not os.getenv("IS_TESTING"):
    shell_output=!gcloud config list --format 'value(core.project)' 2>/dev/null
    PROJECT_ID = shell_output[0]
    BUCKET_NAME="gs://" + PROJECT_ID + "-bucket"
    REGION='us-central1'
    os.environ['PROJECT_ID'] = PROJECT_ID
    os.environ['BUCKET_NAME'] = BUCKET_NAME
    os.environ['REGION'] = REGION
    print("Project ID: ", PROJECT_ID)
    print("Bucket Name: ", BUCKET_NAME)
    print("Region: ", REGION)


In [None]:
# Download the large and small CSV datasets into the current working directory.
!wget https://storage.googleapis.com/partner-usecase-bucket/ucase011/dataset_large.csv
!wget https://storage.googleapis.com/partner-usecase-bucket/ucase011/dataset_small.csv

In [None]:
!mkdir data
!mv dataset_large.csv data
!mv dataset_small.csv data

In [None]:
# Copy both CSVs from ./data to the root of the bucket. BUCKET_NAME is set
# earlier in the notebook, both as a Python variable and in os.environ.
! gsutil cp ./data/dataset_large.csv $BUCKET_NAME
! gsutil cp ./data/dataset_small.csv $BUCKET_NAME

In [None]:
import os
import pprint as pp
import sys
import docstring_parser
import kfp.pipeline_spec
import pickle
import argparse

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from kfp import compiler, dsl
from kfp.dsl import pipeline, component, Artifact, Dataset, Input, Metrics, Model, Output, InputPath, OutputPath

from google.cloud import aiplatform

# We'll use this namespace for metadata querying
from google.cloud import aiplatform_v1


In [None]:
# Read the large dataset straight from the bucket and preview the first rows.
url = f"{BUCKET_NAME}/dataset_large.csv"
data = pd.read_csv(url)
data.head()

In [None]:
PATH=%env PATH
%env PATH={PATH}:/home/jupyter/.local/bin

PIPELINE_ROOT = f"{BUCKET_NAME}/pipeline_root/"
PIPELINE_ROOT

In [None]:
# Columns kept for modelling; "Class" is the label column that the training
# code pops off as the target.
modelling_columns = [
    "Area",
    "Perimeter",
    "MajorAxisLength",
    "MinorAxisLength",
    "AspectRation",
    "Eccentricity",
    "ConvexArea",
    "EquivDiameter",
    "Extent",
    "Solidity",
    "roundness",
    "Compactness",
    "ShapeFactor1",
    "ShapeFactor2",
    "ShapeFactor3",
    "ShapeFactor4",
    "Class",
]
data = data[modelling_columns]

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_curve
from sklearn.model_selection import train_test_split
from joblib import dump
from kfp.dsl import Metrics
import pandas as pd
import docstring_parser
import kfp.pipeline_spec
import kfp.dsl
df = pd.read_csv(url)
labels = df.pop("Class").tolist()
data = df.values.tolist()
x_train, x_test, y_train, y_test = train_test_split(data, labels)

skmodel = DecisionTreeClassifier()
skmodel.fit(x_train,y_train)
score = skmodel.score(x_test,y_test)
print('accuracy is:',score)

metrics = Metrics()
metrics.log_metric("accuracy",(score * 100.0))
metrics.log_metric("framework", "Scikit Learn")
metrics.log_metric("dataset_size", len(df))

MODEL_PATH=BUCKET_NAME+"/models/"
model_path = "./" + "model.pkl"
with open(model_path, 'wb') as file:  
    pickle.dump(skmodel, file)
    
#copy model artifacts to GCS storage
!gsutil cp "model.pkl" $MODEL_PATH


In [None]:
# Prediction containers list available at:
# https://cloud.google.com/vertex-ai/docs/predictions/pre-built-containers
serving_container_uri = "us-docker.pkg.dev/vertex-ai/prediction/sklearn-cpu.1-3:latest"

# GCS location holding the model artifacts.
artifact_uri = MODEL_PATH

# Upload the model to the Vertex AI Model Registry through the Python SDK.
model = aiplatform.Model.upload(
    display_name="vertex-metadata",
    artifact_uri=artifact_uri,
    serving_container_image_uri=serving_container_uri,
)


In [None]:
# Deploy the uploaded model to an endpoint through the Python SDK, pinned to a
# single n1-standard-4 replica.
endpoint = model.deploy(
    machine_type="n1-standard-4",
    min_replica_count=1,
    max_replica_count=1,
)

# This step can take up to 15 minutes to complete! Wait for the asterisk next
# to the cell to turn into a number before proceeding to the next cell!

In [None]:
@component(
    packages_to_install=["pandas", "pyarrow", "db-dtypes", "fsspec", "gcsfs", "docstring_parser", "kfp.pipeline_spec", "kfp.dsl"],
    base_image="python:3.9",
    output_component_file="create_dataset.yaml"
)
def get_dataframe(
    bucket_name: str,
    file_name: str,
    output_data_path: OutputPath("Dataset")
):
    """Copy a CSV file from a bucket into this component's Dataset output.

    Args:
        bucket_name: Bucket URL prefix; joined to ``file_name`` with "/".
        file_name: Name of the CSV object inside the bucket.
        output_data_path: Path supplied by KFP where the CSV is written.
    """
    import pandas as pd

    source_url = bucket_name + "/" + file_name
    df = pd.read_csv(source_url)
    # index=False is required here: by default pandas writes the row index as
    # an extra unnamed column, and the training component treats every column
    # except "Class" as a feature, so the index would leak into the model.
    df.to_csv(output_data_path, index=False)

In [None]:
@component(
    packages_to_install=["scikit-learn==1.4.2", "numpy==1.24.4", "pandas", "joblib", "db-dtypes", "google-cloud-storage", "docstring_parser", "kfp.pipeline_spec", "kfp.dsl"],
    base_image="python:3.9",
    output_component_file="beans_model_component.yaml",
)
def sklearn_train(
    project_id: str,
    bucket_name: str,
    dataset: Input[Dataset],
    metrics: Output[Metrics],
    model: Output[Model]
):
    """Train a decision tree on the input dataset and upload the pickled model.

    Args:
        project_id: Project used for the storage client; the upload bucket is
            derived from it as "<project_id>-bucket".
        bucket_name: Accepted for interface compatibility; not read here.
        dataset: CSV dataset artifact; must contain a "Class" label column.
        metrics: Output artifact receiving accuracy, framework and dataset size.
        model: Output model artifact. This component does not write to it; the
            pickle is uploaded to "models/model.pkl" in the bucket instead.
    """
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.model_selection import train_test_split
    from google.cloud import storage
    import pandas as pd
    import pickle

    df = pd.read_csv(dataset.path)
    labels = df.pop("Class").tolist()  # pop() also removes the label from df
    data = df.values.tolist()
    x_train, x_test, y_train, y_test = train_test_split(data, labels)

    # Kept under its own name so the `model` output artifact parameter is not
    # shadowed by the estimator.
    classifier = DecisionTreeClassifier()
    classifier.fit(x_train, y_train)
    score = classifier.score(x_test, y_test)
    print('accuracy is:', score)

    metrics.log_metric("accuracy", (score * 100.0))
    metrics.log_metric("framework", "Scikit Learn")
    metrics.log_metric("dataset_size", len(df))

    model_path = "./" + "model.pkl"
    with open(model_path, 'wb') as file:
        pickle.dump(classifier, file)

    # Copy the model artifact to GCS storage.
    client = storage.Client(project=project_id)
    bucket = client.get_bucket(f"{project_id}-bucket")
    blob = bucket.blob("models/model.pkl")
    blob.upload_from_filename(model_path)

    print("File uploaded to {}.".format(bucket.name))
    

In [None]:
@component(
    packages_to_install=["google-cloud-aiplatform", "scikit-learn==1.4.2", "numpy==1.24.4", "pandas", "joblib", "db-dtypes", "google-cloud-storage", "docstring_parser", "kfp.pipeline_spec", "kfp.dsl"],
    base_image="python:3.9",
    output_component_file="beans_deploy_component.yaml",
)
def deploy_model(
    model: Input[Model],
    project: str,
    region: str,
    bucket_name: str,
    vertex_endpoint: Output[Artifact],
    vertex_model: Output[Model]
):
    """Upload the model stored under the bucket's models/ folder and deploy it.

    Args:
        model: Upstream model artifact; not read in the body.
        project: Project passed to ``aiplatform.init``.
        region: Location passed to ``aiplatform.init``.
        bucket_name: Bucket URL prefix; "<bucket_name>/models/" is used as the
            artifact URI of the uploaded model.
        vertex_endpoint: Output artifact whose uri is set to the endpoint's
            resource name.
        vertex_model: Output artifact whose uri is set to the uploaded model's
            resource name.
    """
    from google.cloud import aiplatform

    aiplatform.init(project=project, location=region)

    artifact_dir = f"{bucket_name}/models/"
    uploaded = aiplatform.Model.upload(
        display_name="beans-model-pipeline",
        artifact_uri=artifact_dir,
        serving_container_image_uri="us-docker.pkg.dev/vertex-ai/prediction/sklearn-cpu.1-3:latest",
    )
    served_on = uploaded.deploy(machine_type="n1-standard-4")

    # Expose the created resources through the output artifacts.
    vertex_endpoint.uri = served_on.resource_name
    vertex_model.uri = uploaded.resource_name

In [None]:
PATH=%env PATH
%env PATH={PATH}:/home/jupyter/.local/bin
REGION="us-central1"

PIPELINE_ROOT = f"{BUCKET_NAME}/pipeline_root/"
PIPELINE_ROOT

In [None]:
# CSV object name inside the bucket. Read by the pipeline definition (as the
# default for `file_name`) and by the first PipelineJob's parameter_values.
FILE_NAME="dataset_small.csv"

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

from kfp import compiler, dsl
from kfp.dsl import pipeline, component, Artifact, Dataset, Input, Metrics, Model, Output, InputPath, OutputPath
import docstring_parser
import kfp.pipeline_spec
import kfp.dsl
from google.cloud import aiplatform

# We'll use this namespace for metadata querying
from google.cloud import aiplatform_v1

@pipeline(
    # Default pipeline root. You can override it when submitting the pipeline.
    pipeline_root=PIPELINE_ROOT,
    # A name for the pipeline.
    name="mlmd-pipeline",
)
def pipeline(
    bucket_name: str = BUCKET_NAME,
    file_name: str = FILE_NAME,
    output_data_path: str = "data.csv",
    project: str = PROJECT_ID,
    region: str = REGION
):
    """Load a CSV from the bucket, train a model on it and deploy the model.

    The steps are wired to the pipeline parameters rather than to the notebook
    globals. Globals are frozen into the compiled template as constants, which
    would make ``parameter_values`` supplied at submission time (for example a
    different ``file_name``) have no effect.

    Args:
        bucket_name: Bucket URL prefix holding the dataset and model artifacts.
        file_name: Name of the CSV object to load from the bucket.
        output_data_path: Not used by any step; kept so existing parameter
            sets remain valid.
        project: Project passed to the training and deployment steps.
        region: Location passed to the deployment step.
    """
    dataset_task = get_dataframe(bucket_name=bucket_name, file_name=file_name)

    model_task = sklearn_train(
        project_id=project,
        bucket_name=bucket_name,
        dataset=dataset_task.outputs["output_data_path"],
    )

    deploy_model(
        model=model_task.outputs["model"],
        project=project,
        region=region,
        bucket_name=bucket_name,
    )



In [None]:
# Compile the pipeline function into the template file that the PipelineJob
# definitions reference through template_path.
compiler.Compiler().compile(pipeline_func=pipeline, package_path="mlmd_pipeline.json")

In [None]:
from datetime import datetime

# Second-resolution timestamp (YYYYmmddHHMMSS) used to build the job ids.
TIMESTAMP = f"{datetime.now():%Y%m%d%H%M%S}"


In [None]:
# Job definition for the small-dataset run; FILE_NAME is read when this cell
# executes to fill in `file_name`.
run1 = aiplatform.PipelineJob(
    display_name="mlmd-pipeline",
    template_path="mlmd_pipeline.json",
    job_id=f"mlmd-pipeline-small-{TIMESTAMP}",
    parameter_values={
        "bucket_name": os.environ["BUCKET_NAME"],
        "file_name": str(FILE_NAME),
        "project": os.environ["PROJECT_ID"],
        "region": os.environ["REGION"],
    },
    enable_caching=True,
)


In [None]:
# Submit the small-dataset pipeline job.
run1.submit()

In [None]:
# Switch to the large dataset; read by the second PipelineJob's parameter_values.
FILE_NAME="dataset_large.csv"

In [None]:
# Job definition for the large-dataset run; FILE_NAME is read when this cell
# executes to fill in `file_name`.
run2 = aiplatform.PipelineJob(
    display_name="mlmd-pipeline",
    template_path="mlmd_pipeline.json",
    job_id=f"mlmd-pipeline-large-{TIMESTAMP}",
    parameter_values={
        "bucket_name": os.environ["BUCKET_NAME"],
        "file_name": str(FILE_NAME),
        "project": os.environ["PROJECT_ID"],
        "region": os.environ["REGION"],
    },
    enable_caching=True,
)


In [None]:
# Submit the large-dataset pipeline job.
run2.submit()

In [None]:
# Metadata service client bound to the endpoint of the configured region.
API_ENDPOINT = f"{REGION}-aiplatform.googleapis.com"
metadata_client = aiplatform_v1.MetadataServiceClient(
    client_options={"api_endpoint": API_ENDPOINT}
)

In [None]:
# List the artifacts in the default metadata store whose schema is system.Model.
MODEL_FILTER = 'schema_title = "system.Model"'
artifact_request = aiplatform_v1.ListArtifactsRequest(
    parent=f"projects/{PROJECT_ID}/locations/{REGION}/metadataStores/default",
    filter=MODEL_FILTER,
)
model_artifacts = metadata_client.list_artifacts(artifact_request)

In [None]:
# List artifacts created after 2021-08-10 that are in the LIVE state; the
# request is passed as a plain dict this time.
LIVE_FILTER = 'create_time > "2021-08-10T00:00:00-00:00" AND state = LIVE'
artifact_req = dict(
    parent=f"projects/{PROJECT_ID}/locations/{REGION}/metadataStores/default",
    filter=LIVE_FILTER,
)
live_artifacts = metadata_client.list_artifacts(artifact_req)


In [None]:
# Walk the listing once, collecting the fields of interest per artifact, then
# pivot them into the column-oriented dict that backs the summary table.
artifact_rows = [
    (artifact.uri, artifact.create_time, artifact.schema_title)
    for artifact in live_artifacts
]
data = {
    'uri': [row[0] for row in artifact_rows],
    'createTime': [row[1] for row in artifact_rows],
    'type': [row[2] for row in artifact_rows],
}

df = pd.DataFrame.from_dict(data)
df
