In [33]:
import os
import pprint
import tempfile

from typing import Dict, Text

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds

import kfp
import matplotlib.pyplot as plt
import pandas as pd
import requests

from kfp import dsl
from kfp.v2 import compiler
from kfp.v2.dsl import (Artifact, Dataset, Input, InputPath, Model, Output,
                        OutputPath, ClassificationMetrics, Metrics, component)

from google.cloud import aiplatform
from google.cloud import storage
from google.cloud.aiplatform import pipeline_jobs
from typing import NamedTuple

# We'll use this beta library for metadata querying
from google.cloud import aiplatform_v1beta1
from datetime import datetime

In [34]:
# Google Cloud project ID; passed to aiplatform.init(project=...) further down.
PROJECT_ID = 'qwiklabs-gcp-03-c2cbc3b6d290'
# Cloud Storage bucket name, without the gs:// prefix; PIPELINE_ROOT is derived from it.
BUCKET_NAME = 'qwiklabs-gcp-03-c2cbc3b6d290'

# Export the bucket name into the kernel's environment. The pipeline-definition
# cell reads it back with os.getenv("BUCKET_NAME"), so this line must run first.
%env BUCKET_NAME={BUCKET_NAME}

In [35]:
# Extra directory to expose on PATH (presumably where user-level pip installs
# place CLI entry points on this machine - confirm).
LOCAL_BIN = "/home/jupyter/.local/bin"

# PATH as it was when this cell started, kept under the original variable name.
PATH = os.environ.get("PATH", "")

# Append only when the entry is missing, so re-running the cell is idempotent.
# Appending unconditionally added the same directory again on every run.
if LOCAL_BIN not in PATH.split(os.pathsep):
    os.environ["PATH"] = f"{PATH}{os.pathsep}{LOCAL_BIN}" if PATH else LOCAL_BIN

# Region passed to aiplatform.init(location=...) when the pipeline is submitted.
REGION = "us-central1"

# Where all pipeline artifacts are sent. The bucket must be created ahead of time.
PIPELINE_ROOT = f'gs://{BUCKET_NAME}/kfp_tf'
PIPELINE_ROOT

env: PATH=/usr/local/cuda/bin:/opt/conda/bin:/opt/conda/condabin:/usr/local/bin:/usr/bin:/bin:/usr/local/games:/usr/games:/home/jupyter/.local/bin:/home/jupyter/.local/bin


'gs://qwiklabs-gcp-03-c2cbc3b6d290/kfp_tf'

In [36]:
@component(
    # Joins the ml-25m ratings and movies CSV files and stores the result as Parquet.
    # Only pandas plus its GCS (fsspec/gcsfs) and Parquet (pyarrow) backends are
    # installed; the body uses nothing else, which keeps container start-up short.
    packages_to_install=["pandas", "fsspec", "gcsfs", "pyarrow"],
    base_image="python:3.9",
    output_component_file="output_component/data_prep.yaml"
)
def data_prep():
    """Merge ratings with movie metadata and write the merged table to Cloud Storage.

    Reads ``ratings.csv`` and ``movies.csv`` from the ``ml-25m`` folder of the data
    bucket, left-joins them on ``movieId``, and writes the result to
    ``merged/df-rating-movie.parquet`` in the same folder. Takes no inputs and
    returns nothing; the Parquet file is the only output.
    """
    # Imported inside the function because a KFP component body is executed on its
    # own in the component container, without the notebook's module-level imports.
    import pandas as pd

    # Ratings data.
    ratings = pd.read_csv("gs://qwiklabs-gcp-03-c2cbc3b6d290-data/ml-25m/ratings.csv")
    # Features of all the available movies.
    movies = pd.read_csv("gs://qwiklabs-gcp-03-c2cbc3b6d290-data/ml-25m/movies.csv")

    # Left join: every rating row is kept even if its movieId is absent from movies.csv.
    df = ratings.merge(movies, how="left", on="movieId")
    df.to_parquet("gs://qwiklabs-gcp-03-c2cbc3b6d290-data/ml-25m/merged/df-rating-movie.parquet")

In [37]:
@component(
    # Trains the Keras embedding recommender and exports it as a SavedModel.
    # "scikit-learn" is the real PyPI distribution name; the "sklearn" alias is
    # deprecated and should not be used in install lists.
    packages_to_install=["scikit-learn", "tensorflow", "pandas", "fsspec", "gcsfs", "pyarrow"],
    base_image="python:3.9",
    output_component_file="output_component/train_and_fit.yaml"
)
def train_and_fit():
    """Train the rating-prediction model on the merged table and export it.

    Steps:
      1. Read the merged ratings/movies Parquet file from Cloud Storage.
      2. Re-index ``userId`` and ``movieId`` to contiguous 0-based positions.
      3. Split 80/20 into train/validation, stratified on ``rating``.
      4. Fit a model built from user/movie embeddings plus per-user and per-movie
         bias embeddings for 3 epochs (MSE loss, MAE metric).
      5. Save the model as a SavedModel under ``gs://.../kfp_tf/model/1``.

    Takes no inputs and returns nothing; the exported model is the only output.
    """
    # TODO: Move to train.py script
    # Imports live inside the function because a KFP component body is executed
    # on its own in the component container.
    import os

    import numpy as np
    import pandas as pd
    import tensorflow as tf

    from sklearn.model_selection import train_test_split
    from tensorflow.keras import Model
    from tensorflow.keras import optimizers as opt
    from tensorflow.keras.layers import Embedding, multiply, Flatten, Input, Dense

    df = pd.read_parquet("gs://qwiklabs-gcp-03-c2cbc3b6d290-data/ml-25m/merged/df-rating-movie.parquet")

    # Map raw ids to contiguous indices (ascending by raw id) so they can address
    # embedding rows. This is done on the full frame before splitting: train and
    # validation together contain exactly these ids, so the mapping is the same,
    # and no columns are assigned into the frames returned by train_test_split
    # (which pandas can flag with SettingWithCopyWarning).
    dict_movies = {raw_id: index for index, raw_id in enumerate(sorted(df["movieId"].unique()))}
    dict_users = {raw_id: index for index, raw_id in enumerate(sorted(df["userId"].unique()))}
    df["movieId"] = df["movieId"].map(dict_movies)
    df["userId"] = df["userId"].map(dict_users)

    for col in ["userId", "movieId", "rating"]:
        df[col] = df[col].astype(np.float32)

    num_unique_users = len(dict_users)
    num_unique_movies = len(dict_movies)

    # Fixed seed keeps the split reproducible; stratifying preserves the rating
    # distribution in both partitions.
    df_train, df_val = train_test_split(df, random_state=42, test_size=0.2, stratify=df.rating)

    # Embedding tables are sized with one spare row beyond the highest index.
    users_input = Input(shape=(1,), name="users_input")
    users_embedding = Embedding(num_unique_users + 1, 50, name="users_embeddings")(users_input)
    users_bias = Embedding(num_unique_users + 1, 1, name="users_bias")(users_input)

    movies_input = Input(shape=(1,), name="movies_input")
    movies_embedding = Embedding(num_unique_movies + 1, 50, name="movies_embedding")(movies_input)
    movies_bias = Embedding(num_unique_movies + 1, 1, name="movies_bias")(movies_input)

    # Element-wise product of the two embeddings; the 1-wide bias embeddings
    # broadcast across the 50 embedding dimensions when added.
    users_movies_product = multiply([users_embedding, movies_embedding])
    input_terms = users_movies_product + users_bias + movies_bias
    input_terms = Flatten(name="fl_inputs")(input_terms)
    output = Dense(1, activation="relu", name="output")(input_terms)
    model = Model(inputs=[users_input, movies_input], outputs=output)

    # `learning_rate` is the supported argument name; `lr` is a deprecated alias.
    opt_adam = opt.Adam(learning_rate=0.005)
    model.compile(optimizer=opt_adam, loss=['mse'], metrics=['mean_absolute_error'])

    model.fit(
        x=[df_train.userId, df_train.movieId],
        y=df_train.rating,
        batch_size=512,
        epochs=3,
        verbose=1,
        validation_data=([df_val.userId, df_val.movieId], df_val.rating),
    )

    OUTPUT_DIR = "gs://qwiklabs-gcp-03-c2cbc3b6d290/kfp_tf/model"
    # The model is written to a fixed version sub-directory "1".
    EXPORT_PATH = os.path.join(OUTPUT_DIR, "1")

    tf.saved_model.save(model, EXPORT_PATH)

In [38]:
# Print the project that the gcloud CLI is currently configured to use.
!(gcloud config get-value project)

# Scratch notes kept for reference (presumably from an earlier session - verify before reuse):
# 'gs://qwiklabs-gcp-03-c2cbc3b6d290/kfp_tf'
# projects/770243501005/locations/us-central1/models/6101273585412734976
# h904fd1f45d8737c3p-tp

qwiklabs-gcp-03-c2cbc3b6d290


In [45]:
@component(
    # Uploads the exported SavedModel to Vertex AI and deploys it to an endpoint.
    # The body only uses google-cloud-aiplatform, so nothing else is installed.
    packages_to_install=["google-cloud-aiplatform"],
    base_image="python:3.9",
    output_component_file="output_component/deploy_component.yaml"
)
def deploy(
    bucket_name: str,
    vertex_endpoint: Output[Artifact],
    vertex_model: Output[Model],
):
    """Register the trained model in Vertex AI and deploy it for online prediction.

    Args:
        bucket_name: Cloud Storage bucket name (no ``gs://`` prefix) that holds
            the SavedModel under ``kfp_tf/model/5``.
        vertex_endpoint: Output artifact supplied by the pipeline runtime; its
            ``uri`` is set to the resource name of the endpoint created here.
        vertex_model: Output artifact supplied by the pipeline runtime; its
            ``uri`` is set to the resource name of the uploaded model.

    The two output parameters are injected by KFP, so callers still pass only
    ``bucket_name``. Previously the body assigned to ``vertex_endpoint`` and
    ``vertex_model`` without declaring them, which raised ``NameError`` after
    the model had already been deployed.
    """
    from google.cloud import aiplatform

    # NOTE(review): this reads version directory "5", while train_and_fit in this
    # notebook exports to ".../kfp_tf/model/1" - confirm which version is intended.
    # NOTE(review): the serving image targets TF 2.3, but the training component
    # installs an unpinned tensorflow - confirm the SavedModel loads in this image.
    deployed_model = aiplatform.Model.upload(
        display_name='movie-recommender-keras',
        artifact_uri=f'gs://{bucket_name}/kfp_tf/model/5',
        serving_container_image_uri="us-docker.pkg.dev/vertex-ai/prediction/tf2-cpu.2-3:latest"
    )
    endpoint = deployed_model.deploy(machine_type="n1-standard-4")

    # Save data to the output params
    vertex_endpoint.uri = endpoint.resource_name
    vertex_model.uri = deployed_model.resource_name



In [46]:
# Serving container image passed to aiplatform.Model.upload for the recommender model.
SERVING_CONTAINER_IMAGE_URI = 'us-docker.pkg.dev/vertex-ai/prediction/tf2-cpu.2-3:latest'

In [47]:
# Pin the project and region before the first Vertex AI call. Otherwise, on a
# top-to-bottom run, the upload relies on whatever defaults the environment
# provides. aiplatform.init only sets SDK defaults, so calling it again later
# is harmless.
aiplatform.init(project=PROJECT_ID, location=REGION)

# Register the exported SavedModel (version directory "5") as a Vertex AI model.
uploaded_model = aiplatform.Model.upload(
    display_name='movie-recommender-keras',
    artifact_uri=f'gs://{BUCKET_NAME}/kfp_tf/model/5/',
    serving_container_image_uri=SERVING_CONTAINER_IMAGE_URI,
)

INFO:google.cloud.aiplatform.models:Creating Model
INFO:google.cloud.aiplatform.models:Create Model backing LRO: projects/770243501005/locations/us-central1/models/1858319786476306432/operations/8313482802880315392
INFO:google.cloud.aiplatform.models:Model created. Resource name: projects/770243501005/locations/us-central1/models/1858319786476306432
INFO:google.cloud.aiplatform.models:To use this Model in another session:
INFO:google.cloud.aiplatform.models:model = aiplatform.Model('projects/770243501005/locations/us-central1/models/1858319786476306432')


In [48]:
# Read back the bucket name that the configuration cell exported with %env.
BUCKET_NAME = os.getenv("BUCKET_NAME")
if not BUCKET_NAME:
    # Fail here with a clear message instead of compiling a pipeline whose
    # default bucket is None.
    raise RuntimeError(
        "BUCKET_NAME is not set in the environment; run the configuration cell first."
    )

@dsl.pipeline(
    # Default pipeline root. You can override it when submitting the pipeline.
    pipeline_root=PIPELINE_ROOT,
    # A name for the pipeline.
    name="pipeline-test",
    description='Movie Recommender Keras'
)
def pipeline(bucket: str = BUCKET_NAME):
    """Pipeline that uploads and deploys the recommender model.

    Args:
        bucket: Cloud Storage bucket name (no ``gs://`` prefix) that holds the
            exported model; defaults to the ``BUCKET_NAME`` environment value.
    """
    # Keyword form makes the binding explicit and does not depend on the
    # component accepting positional arguments.
    deploy(bucket_name=bucket)


In [42]:
# Compile the pipeline function into a job spec file that PipelineJob can load.
pipeline_compiler = compiler.Compiler()
pipeline_compiler.compile(pipeline_func=pipeline, package_path="pipeline.json")



In [43]:
# Compiled pipeline spec produced by the compile cell.
PIPELINE_JSON = "pipeline.json"

aiplatform.init(project=PROJECT_ID, location=REGION)

# Bound to `pipeline_job` rather than `pipeline` so the @dsl.pipeline function
# keeps its name. Reusing `pipeline` here replaced the function with a
# PipelineJob object, which made re-running the compile cell fail.
pipeline_job = aiplatform.PipelineJob(
    display_name="test-recommender-pipeline",
    template_path=PIPELINE_JSON,
    enable_caching=True,
)

# sync=False submits the run without blocking the notebook on its completion.
pipeline_job.run(sync=False)

INFO:google.cloud.aiplatform.pipeline_jobs:Creating PipelineJob
INFO:google.cloud.aiplatform.pipeline_jobs:PipelineJob created. Resource name: projects/770243501005/locations/us-central1/pipelineJobs/pipeline-test-20220316053126
INFO:google.cloud.aiplatform.pipeline_jobs:To use this PipelineJob in another session:
INFO:google.cloud.aiplatform.pipeline_jobs:pipeline_job = aiplatform.PipelineJob.get('projects/770243501005/locations/us-central1/pipelineJobs/pipeline-test-20220316053126')
INFO:google.cloud.aiplatform.pipeline_jobs:View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/pipeline-test-20220316053126?project=770243501005
INFO:google.cloud.aiplatform.pipeline_jobs:PipelineJob projects/770243501005/locations/us-central1/pipelineJobs/pipeline-test-20220316053126 current state:
PipelineState.PIPELINE_STATE_PENDING
INFO:google.cloud.aiplatform.pipeline_jobs:PipelineJob projects/770243501005/locations/us-central1/pipelineJobs/pipeline-test

In [44]:
# Disabled alternative: submit the compiled pipeline under a timestamped job ID
# with caching turned off.
# TIMESTAMP = datetime.now().strftime("%Y%m%d%H%M%S")

# run = pipeline_jobs.PipelineJob(
#     display_name="test-pipeline",
#     template_path="pipeline.json",
    
#     job_id="test-{0}".format(TIMESTAMP),
#     enable_caching=False
# )

# run.run()

INFO:google.cloud.aiplatform.pipeline_jobs:PipelineJob projects/770243501005/locations/us-central1/pipelineJobs/pipeline-test-20220316053126 current state:
PipelineState.PIPELINE_STATE_RUNNING
