### Imports


In [None]:
import kfp

from kfp.dsl import pipeline
from kfp.dsl import component
from kfp.dsl import OutputPath

from kfp.dsl import (
    Artifact,
    Dataset,
    Input,
    Model,
    Output,
    Metrics,
    component,
    Markdown,
    HTML,
)

from kfp import compiler

from google.cloud import aiplatform

import json

from rich import print

In [None]:
from dotenv import dotenv_values

# Read deployment settings from the local .env file; a missing key raises KeyError.
config = dotenv_values(".env")
PROJECT_ID, PIPELINE_ROOT, LOCATION, SERVICE_ACCOUNT = (
    config[setting]
    for setting in ("PROJECT_ID", "PIPELINE_ROOT", "LOCATION", "SERVICE_ACCOUNT")
)

### Authentication


In [None]:
# Configure the Vertex AI SDK with the project, region and bucket read from .env.
aiplatform.init(
    project=PROJECT_ID,
    location=LOCATION,
    staging_bucket=PIPELINE_ROOT,
)

### Download Data


In [None]:
@component(
    base_image="python:3.10.6",
    packages_to_install=[
        "pandas==2.2.2",
        "google-cloud-aiplatform==1.49.0",
        "google-cloud-bigquery==3.15.0",
        "pyarrow==12.0.1",
        "db-dtypes==1.1.1",
    ],
)
def download_data(table_id: str, credentials: dict, dataset: Output[Dataset]):
    """Export rows modified in the last 30 minutes from BigQuery to a CSV artifact.

    The service-account key is written to ``credentials.json`` in the working
    directory and exposed through ``GOOGLE_APPLICATION_CREDENTIALS`` before the
    BigQuery client is created.

    Args:
        table_id: Table or view to read; interpolated into the query text.
        credentials: Service-account key contents as a dict.
        dataset: Output artifact; the query result is saved as CSV at its path.
    """
    import json
    import os

    import pandas as pd
    from google.cloud import bigquery

    # Persist the key so the client can pick it up through the environment.
    with open("credentials.json", "w") as key_file:
        key_file.write(json.dumps(credentials))
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "./credentials.json"

    query = f"""
    SELECT * FROM `{table_id}`
    WHERE 
     DATETIME_DIFF( CURRENT_TIMESTAMP(), modified_date_time, MINUTE) < 30
    """
    bq_client = bigquery.Client(location="EU")
    recent_rows = bq_client.query(query).to_dataframe()

    recent_rows.to_csv(dataset.path, index=False)

### Process Data


In [None]:
@component(
    packages_to_install=[
        "pandas==2.2.2",
        "requests==2.28.1",
        "Pillow==9.2.0",
        "vertexai==1.49.0",
        "pyarrow==12.0.1",
    ],
    base_image="python:3.10.6",
)
def process_data(
    input_data: Input[Dataset],
    output_data: Output[Dataset],
    project_id: str,
    location: str,
):
    """Add image and text embeddings to the rows of the input dataset.

    Reads the CSV at ``input_data.path``, downloads each row's ``image_url``,
    embeds the image together with the row's ``description`` using the
    ``multimodalembedding`` model, and writes the result as Parquet to
    ``output_data.path``.

    Rows whose image cannot be downloaded are logged and left out of the
    output, so an embedding is never computed from another row's image file.

    Args:
        input_data: CSV dataset; the code reads its ``image_url`` and
            ``description`` columns.
        output_data: Parquet dataset with the added ``image_embedding`` and
            ``text_embedding`` columns.
        project_id: Project passed to ``vertexai.init``.
        location: Region passed to ``vertexai.init``.
    """
    import pandas as pd
    import requests

    # Import the submodule explicitly: a bare ``import PIL`` does not
    # guarantee that ``PIL.Image`` is loaded.
    import PIL.Image
    from vertexai.vision_models import Image, MultiModalEmbeddingModel
    import vertexai

    vertexai.init(project=project_id, location=location)

    model = MultiModalEmbeddingModel.from_pretrained("multimodalembedding")

    df = pd.read_csv(input_data.path)

    # Pre-fill with empty lists so both columns are object-typed and can hold
    # one embedding list per cell.
    df["image_embedding"] = [list() for _ in range(df.shape[0])]
    df["text_embedding"] = [list() for _ in range(df.shape[0])]

    image_embedding_column = df.columns.get_loc("image_embedding")
    text_embedding_column = df.columns.get_loc("text_embedding")

    thumbnail_path = "/tmp/thumbnail.jpg"

    def download_image(url, filename):
        """Save ``url`` to ``filename``; return True only on an HTTP 200 reply."""
        try:
            # A timeout keeps one unresponsive host from hanging the component.
            response = requests.get(url, timeout=30)
        except requests.RequestException as exc:
            print(f"Error downloading image {url}: {exc}")
            return False
        if response.status_code != 200:
            print(f"Error downloading image {url}")
            return False
        with open(filename, "wb") as f:
            f.write(response.content)
        return True

    failed_rows = []
    for i in range(df.shape[0]):
        print(i)
        if not download_image(df["image_url"].iloc[i], thumbnail_path):
            # The file on disk still holds the previous row's image (or does
            # not exist yet), so this row must not be embedded from it.
            failed_rows.append(i)
            continue

        # Re-encode WEBP downloads as JPEG before handing the file to the model.
        with PIL.Image.open(thumbnail_path) as img:
            rgb = img.convert("RGB") if img.format == "WEBP" else None
        if rgb is not None:
            rgb.save(thumbnail_path, "JPEG")

        image = Image.load_from_file(thumbnail_path)
        embeddings = model.get_embeddings(
            image=image,
            contextual_text=df["description"].iloc[i],
        )
        df.iat[i, image_embedding_column] = embeddings.image_embedding
        df.iat[i, text_embedding_column] = embeddings.text_embedding

    if failed_rows:
        print(f"Skipped {len(failed_rows)} row(s) whose image could not be downloaded")
        df = df.drop(index=df.index[failed_rows])

    df.to_parquet(output_data.path, index=False)

### Update BQ Table


In [None]:
@component(
    packages_to_install=[
        "pandas==2.2.2",
        "google-cloud-bigquery==3.15.0",
        "db-dtypes==1.1.1",
        "pyarrow==12.0.1",
    ],
    base_image="python:3.10.6",
)
def update_bq_table(input_data: Input[Dataset], credentials: dict):
    """Merge the processed rows into ``aicamp_2024.similar_table``.

    The new rows are concatenated with the table's current contents, reduced
    to the most recently modified row per ``item_id``, and written back with
    ``WRITE_TRUNCATE``. The function waits for the load job so that a failed
    load fails this step instead of going unnoticed.

    Args:
        input_data: Parquet dataset holding the rows to merge.
        credentials: Service-account key contents as a dict.
    """
    import pandas as pd
    from google.cloud import bigquery
    import os
    import json

    # Persist the key so the client can pick it up through the environment.
    with open("credentials.json", "w") as f:
        json.dump(credentials, f)
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "./credentials.json"

    table_id = "aicamp_2024.similar_table"

    client = bigquery.Client(location="EU")
    df = pd.read_parquet(input_data.path)

    query = f"""SELECT * FROM `{table_id}`"""
    existing_table = client.query(query).to_dataframe()

    # Keep only the newest version of every item across new and existing rows.
    df = pd.concat([df, existing_table], ignore_index=True)
    df["modified_date_time"] = pd.to_datetime(df["modified_date_time"])
    df["item_id"] = df["item_id"].astype(str)
    df = df.sort_values("modified_date_time", ascending=False).drop_duplicates(
        "item_id", keep="first"
    )

    job_config = bigquery.LoadJobConfig()
    job_config.write_disposition = "WRITE_TRUNCATE"
    job_config.create_disposition = "CREATE_IF_NEEDED"

    load_job = client.load_table_from_dataframe(df, table_id, job_config=job_config)
    # Block until the load finishes; result() raises if the job failed.
    load_job.result()

In [None]:
@component(
    packages_to_install=[
        "google-cloud-aiplatform==1.49.0",
        "google-cloud-bigquery==3.15.0",
        "db-dtypes==1.1.1",
        "pyarrow==12.0.1",
        "tabulate",
    ],
    base_image="python:3.10.6",
)
def get_similar_items(
    credentials: dict,
    output_sample_1: Output[Markdown],
    output_sample_2: Output[Markdown],
    output_sample_3: Output[Markdown],
):
    """Write Markdown previews of similar-item matches for up to three items.

    Runs a ``VECTOR_SEARCH`` of ``aicamp_2024.similar_table`` against itself on
    ``image_embedding``, picks up to three distinct items that have a match
    with distance below 0.2, and writes each item's matches as a Markdown
    table with the image URLs rendered as ``<img>`` tags. When fewer than
    three such items exist, the remaining outputs receive a short placeholder.

    Args:
        credentials: Service-account key contents as a dict.
        output_sample_1: Markdown artifact for the first sampled item.
        output_sample_2: Markdown artifact for the second sampled item.
        output_sample_3: Markdown artifact for the third sampled item.
    """
    from google.cloud import bigquery
    import os
    import json

    # Persist the key so the client can pick it up through the environment.
    with open("credentials.json", "w") as f:
        json.dump(credentials, f)
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "./credentials.json"

    client = bigquery.Client(location="EU")
    query = """
        SELECT 
            query.item_id, 
            base.item_id as similar_item_id,
            distance, 
            query.image_url as query_image_url,
            base.image_url as base_image_url,
        FROM
            VECTOR_SEARCH(
                TABLE `aicamp_2024.similar_table`, 
                'image_embedding',
                TABLE  `aicamp_2024.similar_table`, 
                query_column_to_search => 'image_embedding',
                top_k => 10, distance_type => 'COSINE'
            )
        WHERE query.item_id != base.item_id
        ORDER BY distance 
    """
    df = client.query(query).to_dataframe()

    print(df.head())

    # Sample from distinct item ids: an item has several result rows, so
    # sampling rows could pick the same item more than once, and asking for
    # more rows than exist raises ValueError.
    candidate_ids = df.loc[df.distance < 0.2, "item_id"].drop_duplicates()
    sample_item_ids = candidate_ids.sample(min(3, len(candidate_ids))).tolist()

    def as_img_tag(url):
        return f"<img src='{url}' width='100'>"

    outputs = (output_sample_1, output_sample_2, output_sample_3)
    # Pad with None so every output artifact is written exactly once.
    padded_ids = sample_item_ids + [None] * (len(outputs) - len(sample_item_ids))

    for output_sample, item_id in zip(outputs, padded_ids):
        if item_id is None:
            content = "No similar item sample available."
        else:
            sample = df[df.item_id == item_id].copy()
            sample["query_image_url"] = sample["query_image_url"].apply(as_img_tag)
            sample["base_image_url"] = sample["base_image_url"].apply(as_img_tag)
            content = sample.to_markdown(index=False)
        with open(output_sample.path, "w") as f:
            f.write(content)

### Pipeline


In [None]:
@pipeline(
    name="similar_pipeline",
    # Normalise the separator so the root is correct whether or not
    # PIPELINE_ROOT ends with a slash.
    pipeline_root=PIPELINE_ROOT.rstrip("/") + "/similar_pipeline",
)
def similar_pipeline():
    """Download recent items, embed them, update the table and preview matches.

    Steps run in order: ``download_data`` -> ``process_data`` ->
    ``update_bq_table`` -> ``get_similar_items``. The last step has no data
    dependency on the table update, so it is ordered explicitly with
    ``.after``.
    """
    # NOTE(review): the key contents are passed to the components as a literal
    # argument, so they presumably end up in the compiled pipeline file —
    # confirm, and consider a secret store instead.
    with open("service_account.json", "r") as f:
        raw_credential = json.load(f)

    download_data_job = (
        download_data(table_id="aicamp_2024.similar_view", credentials=raw_credential)
        .set_cpu_limit("2")
        .set_memory_limit("8G")
    )
    print(download_data_job.outputs)

    process_data_job = (
        process_data(
            input_data=download_data_job.outputs["dataset"],
            project_id=PROJECT_ID,
            location=LOCATION,
        )
        .set_cpu_limit("2")
        .set_memory_limit("8G")
    )
    print(process_data_job.outputs)

    update_bq_table_job = (
        update_bq_table(
            input_data=process_data_job.outputs["output_data"],
            credentials=raw_credential,
        )
        .set_cpu_limit("2")
        .set_memory_limit("16G")
    )
    print(update_bq_table_job.outputs)

    get_similar_items_job = (
        get_similar_items(credentials=raw_credential)
        .after(update_bq_table_job)
        .set_cpu_limit("2")
        .set_memory_limit("16G")
    )
    print(get_similar_items_job.outputs)

In [None]:
# Compile the pipeline function into a local JSON template.
pipeline_compiler = compiler.Compiler()
pipeline_compiler.compile(
    package_path="pipelines/similar_pipeline.json",
    pipeline_func=similar_pipeline,
)

In [None]:
# Submit the compiled template as a Vertex AI pipeline run.
# NOTE(review): SERVICE_ACCOUNT is loaded from .env but not passed here —
# confirm whether the run was meant to use it.
job = aiplatform.PipelineJob(
    template_path="pipelines/similar_pipeline.json",
    display_name="testing_similar_pipeline",
)
job.run()