### Imports


In [None]:
import kfp

from kfp.dsl import pipeline
from kfp.dsl import component
from kfp.dsl import OutputPath

from kfp import dsl

from kfp.v2.dsl import (
    Artifact,
    Dataset,
    Input,
    Model,
    Output,
    Metrics,
    component,
    Markdown,
    HTML,
)

from kfp import compiler

from google.cloud import aiplatform

import json

from rich import print

In [None]:
from dotenv import dotenv_values

# Load deployment settings from the local .env file. Each key is looked up
# with plain indexing, so a key absent from the file raises KeyError here.
config = dotenv_values(".env")
PROJECT_ID, PIPELINE_ROOT, LOCATION, SERVICE_ACCOUNT = (
    config[setting]
    for setting in ("PROJECT_ID", "PIPELINE_ROOT", "LOCATION", "SERVICE_ACCOUNT")
)

### Authentication


In [None]:
# Configure the Vertex AI SDK with the project, region and staging bucket
# read from the .env settings above.
aiplatform.init(project=PROJECT_ID, location=LOCATION, staging_bucket=PIPELINE_ROOT)

### Train Model


In [None]:
@component(
    packages_to_install=[
        "google-cloud-bigquery==3.15.0",
    ],
    base_image="python:3.10.6",
)
def run_mat_fact_model(credentials: dict):
    """Train the BigQuery ML matrix-factorization model ``aicamp_2024.cf_model``.

    The model is (re)created from ``aicamp_2024.cf_view`` with implicit
    feedback, using ``LOG(view_duration)`` as the rating column.

    Args:
        credentials: Google credentials as a JSON-serializable dict. They are
            written to ``credentials.json`` in the working directory and
            exposed through ``GOOGLE_APPLICATION_CREDENTIALS``.
    """
    # Imports are function-local so the component body is self-contained.
    import json
    import os

    from google.cloud import bigquery

    # Use an absolute path so the credentials stay resolvable even if the
    # working directory changes later in the process.
    credentials_path = os.path.abspath("credentials.json")
    with open(credentials_path, "w") as f:
        json.dump(credentials, f)
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = credentials_path

    client = bigquery.Client(location="EU")

    query = """
    CREATE OR REPLACE MODEL `aicamp_2024.cf_model`
        OPTIONS
        (model_type='matrix_factorization',
        feedback_type='implicit',
        user_col='user_id',
        item_col='item_id',
        rating_col='rating',
        l2_reg=30,
        num_factors=15) AS
        SELECT
        user_id,
        item_id,
        LOG(view_duration) AS rating,
        FROM `aicamp_2024.cf_view`
    """
    # Block until the training statement has finished.
    client.query_and_wait(query)

### Evaluate Model


In [None]:
from typing import NamedTuple


@component(
    packages_to_install=[
        "pandas==2.2.2",
        "google-cloud-bigquery==3.15.0",
        "pyarrow==12.0.1",
        "db-dtypes==1.1.1",
        "tabulate",
    ],
    base_image="python:3.10.6",
)
def evaluate_model(
    credentials: dict,
    evaluation_metrics: Output[Markdown],
    map_threshold: float = 0.1,
) -> NamedTuple("Outputs", [("deployment_decision", str)]):
    """Evaluate ``aicamp_2024.cf_model`` and decide whether to deploy it.

    Runs ``ML.EVALUATE`` on the model, writes the transposed metrics table as
    Markdown to ``evaluation_metrics``, and compares the first row's
    ``mean_average_precision`` with ``map_threshold``.

    Args:
        credentials: Google credentials as a JSON-serializable dict. They are
            written to ``credentials.json`` in the working directory and
            exposed through ``GOOGLE_APPLICATION_CREDENTIALS``.
        evaluation_metrics: Output artifact that receives the metrics table.
        map_threshold: Mean average precision that must be strictly exceeded
            for the decision to be ``"Deploy"``.

    Returns:
        A one-element tuple holding ``"Deploy"`` or ``"Do not deploy"``.

    Raises:
        ValueError: If ``ML.EVALUATE`` returns no rows.
    """
    # Imports are function-local so the component body is self-contained.
    import json
    import os

    from google.cloud import bigquery

    # Use an absolute path so the credentials stay resolvable even if the
    # working directory changes later in the process.
    credentials_path = os.path.abspath("credentials.json")
    with open(credentials_path, "w") as f:
        json.dump(credentials, f)
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = credentials_path

    client = bigquery.Client(location="EU")
    query = """
        SELECT
            *
        FROM
            ML.EVALUATE(MODEL `aicamp_2024.cf_model`)
    """
    df = client.query(query).to_dataframe()

    # Fail with a clear message instead of an IndexError on the lookup below.
    if df.empty:
        raise ValueError(
            "ML.EVALUATE returned no rows for model `aicamp_2024.cf_model`"
        )

    with open(evaluation_metrics.path, "w") as f:
        f.write(df.T.to_markdown())

    mean_average_precision = float(df["mean_average_precision"].iloc[0])
    if mean_average_precision > map_threshold:
        deployment_decision = "Deploy"
    else:
        deployment_decision = "Do not deploy"
    return (deployment_decision,)

### Create Table with Recommendations


In [None]:
@component(
    packages_to_install=[
        "pandas==2.2.2",
        "google-cloud-bigquery==3.15.0",
        "pyarrow==12.0.1",
        "db-dtypes==1.1.1",
        "tabulate",
    ],
    base_image="python:3.10.6",
)
def create_recommendation_table(
    credentials: dict,
    recommendation_table: Output[Dataset],
    sample_output: Output[Markdown],
):
    """Materialize the top-5 recommendations per user from ``aicamp_2024.cf_model``.

    Runs ``ML.RECOMMEND`` on the model, keeps for each ``user_id`` the five
    ``item_id`` values with the highest ``predicted_rating_confidence``,
    stores the result as Parquet, and writes a random sample of rows for
    visual inspection.

    Args:
        credentials: Google credentials as a JSON-serializable dict. They are
            written to ``credentials.json`` in the working directory and
            exposed through ``GOOGLE_APPLICATION_CREDENTIALS``.
        recommendation_table: Output artifact that receives the Parquet file.
        sample_output: Output artifact that receives up to five sample rows.
    """
    # Imports are function-local so the component body is self-contained.
    import json
    import os

    from google.cloud import bigquery

    # Use an absolute path so the credentials stay resolvable even if the
    # working directory changes later in the process.
    credentials_path = os.path.abspath("credentials.json")
    with open(credentials_path, "w") as f:
        json.dump(credentials, f)
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = credentials_path

    client = bigquery.Client(location="EU")

    query = """
    SELECT
        user_id,
        ARRAY_AGG(item_id
            ORDER BY predicted_rating_confidence DESC LIMIT 5) AS recommended_items
    FROM (
        SELECT
        *
        FROM
        ML.RECOMMEND(MODEL `aicamp_2024.cf_model`)
        )
    GROUP BY
        user_id
    """
    df = client.query(query).to_dataframe()

    df.to_parquet(recommendation_table.path)

    # DataFrame.sample raises ValueError when asked for more rows than exist,
    # so cap the sample size at the number of users returned.
    sample_size = min(5, len(df))
    # NOTE(review): the artifact is typed Markdown but receives HTML; confirm
    # the pipeline UI renders it as intended.
    with open(sample_output.path, "w") as f:
        f.write(df.sample(sample_size).to_html())

### Pipeline


In [None]:
@pipeline(
    name="collaborative_filtering_pipeline",
    # Join with exactly one "/" so PIPELINE_ROOT works with or without a
    # trailing slash (plain concatenation would otherwise corrupt the path).
    pipeline_root=PIPELINE_ROOT.rstrip("/") + "/collaborative_filtering_pipeline",
)
def collaborative_filtering_pipeline():
    """Train, evaluate and conditionally publish collaborative-filtering recommendations.

    Steps:
        1. ``run_mat_fact_model`` trains the matrix-factorization model.
        2. ``evaluate_model`` runs after training and emits ``deployment_decision``.
        3. ``create_recommendation_table`` runs only when the decision is ``"Deploy"``.

    Each task is limited to 1 CPU and 1G of memory.
    """
    # NOTE(review): the service-account key is read here and passed to the
    # components as a plain argument, so it presumably ends up in the compiled
    # pipeline definition. Confirm this is acceptable or switch to a secret
    # mechanism.
    with open("service_account.json", "r", encoding="utf-8") as f:
        raw_credential = json.load(f)

    run_mat_fact_model_task = (
        run_mat_fact_model(credentials=raw_credential)
        .set_cpu_limit("1")
        .set_memory_limit("1G")
    )

    # No data flows from training to evaluation, so the ordering is explicit.
    evaluate_model_task = (
        evaluate_model(credentials=raw_credential)
        .set_cpu_limit("1")
        .set_memory_limit("1G")
        .after(run_mat_fact_model_task)
    )

    with dsl.If(
        evaluate_model_task.outputs["deployment_decision"] == "Deploy",
        name="deploy_decision",
    ):
        create_recommendation_table_task = (
            create_recommendation_table(credentials=raw_credential)
            .set_cpu_limit("1")
            .set_memory_limit("1G")
            .after(evaluate_model_task)
        )

For more information about control flow, you can visit the following link:
https://www.kubeflow.org/docs/components/pipelines/v2/pipelines/control-flow/


In [None]:
# Compile the pipeline function into a JSON template that the PipelineJob
# cell below reads from the same path.
compiler.Compiler().compile(
    package_path="pipelines/collaborative_filtering_pipeline.json",
    pipeline_func=collaborative_filtering_pipeline,
)

In [None]:
# Create a Vertex AI pipeline job from the compiled template, with step
# caching enabled, and run it.
# NOTE(review): SERVICE_ACCOUNT is loaded from .env earlier in this file but
# is not passed to run(); confirm the default service account is intended.
job = aiplatform.PipelineJob(
    template_path="pipelines/collaborative_filtering_pipeline.json",
    display_name="collaborative_filtering_pipeline",
    enable_caching=True,
)
job.run()