In [None]:
 pip install kfp==1.8.22 protobuf==3.19.0

In [None]:
from google.cloud import aiplatform

In [None]:
# Region passed to the Vertex AI and Cloud Storage commands below.
REGION = "us-central1"
# The `!(...)` capture returns the command output as a list of lines;
# the first line is kept as the project ID string.
PROJECT_ID = !(gcloud config get-value project)
PROJECT_ID = PROJECT_ID[0]

In [None]:
# Set `PATH` to include the directory containing KFP CLI.
# The current PATH is read first, then the directory is prepended to it;
# re-running this cell prepends the directory again.
# NOTE(review): the home directory is hard-coded — confirm it matches the notebook user.
PATH = %env PATH
%env PATH=/home/desktop/.local/bin:{PATH}

In [None]:
# Assemble the gcr.io URI of the training image from project, image name and tag.
IMAGE_NAME = "detect_llm_trainer_image"
TAG = "latest"
TRAINING_CONTAINER_IMAGE_URI = f"gcr.io/{PROJECT_ID}/{IMAGE_NAME}:{TAG}"
# Bare expression: shown as the cell output.
TRAINING_CONTAINER_IMAGE_URI

In [None]:
# Display the training image URI again (same value as shown by the previous cell).
TRAINING_CONTAINER_IMAGE_URI

In [None]:
# List the working directory; the build cell below passes the relative path
# `detect_llm_vertex_trainer` as its source directory.
ls

In [None]:
# Submit the `detect_llm_vertex_trainer` directory to Cloud Build with a
# 15 minute timeout and tag the result with TRAINING_CONTAINER_IMAGE_URI.
!gcloud builds submit --timeout 15m --tag $TRAINING_CONTAINER_IMAGE_URI detect_llm_vertex_trainer

In [None]:

# Serving image URI, handed to the pipeline through the environment variable
# of the same name. Written as two adjacent literals (registry path, then
# image:tag) that Python joins into one string.
SERVING_CONTAINER_IMAGE_URI = (
    "us-docker.pkg.dev/vertex-ai/prediction/"
    "tf2-cpu.2-13:latest"
)


In [None]:
from pipeline_vertex.training_lightweight_component import train_and_deploy
from pipeline_vertex.tuning_lightweight_component import tune_hyperparameters

In [None]:
%%writefile ./pipeline_vertex/pipeline.py
# Copyright 2021 Google LLC

# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy of
# the License at

# https://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS"
# BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.
"""Kubeflow Detect LLM Pipeline."""
import os

from kfp import dsl
from training_lightweight_component import train_and_deploy
from tuning_lightweight_component import tune_hyperparameters

# Settings are read from the environment once, when this module is imported.
# String settings are None when the corresponding variable is not set.

# Project-level settings.
PROJECT_ID = os.environ.get("PROJECT_ID")
REGION = os.environ.get("REGION")
PIPELINE_ROOT = os.environ.get("PIPELINE_ROOT")

# Container images for the training and serving steps.
TRAINING_CONTAINER_IMAGE_URI = os.environ.get("TRAINING_CONTAINER_IMAGE_URI")
SERVING_CONTAINER_IMAGE_URI = os.environ.get("SERVING_CONTAINER_IMAGE_URI")

# Locations of the training, validation and test files.
TRAINING_FILE_PATH = os.environ.get("TRAINING_FILE_PATH")
VALIDATION_FILE_PATH = os.environ.get("VALIDATION_FILE_PATH")
TEST_FILE_PATH = os.environ.get("TEST_FILE_PATH")

# Numeric settings fall back to a default when the variable is not set.
THRESHOLD = float(os.environ.get("THRESHOLD", "0.8"))
MAX_TRIAL_COUNT = int(os.environ.get("MAX_TRIAL_COUNT", "1"))
PARALLEL_TRIAL_COUNT = int(os.environ.get("PARALLEL_TRIAL_COUNT", "1"))

@dsl.pipeline(
    name="detect-llm-kfp-pipeline",
    description="The pipeline training and deploying the detect_llm classifier",
    pipeline_root=PIPELINE_ROOT,
)
def detect_llm_train(
    training_container_uri: str = TRAINING_CONTAINER_IMAGE_URI,
    serving_container_uri: str = SERVING_CONTAINER_IMAGE_URI,
    training_file_path: str = TRAINING_FILE_PATH,
    validation_file_path: str = VALIDATION_FILE_PATH,
    test_file_path: str = TEST_FILE_PATH,
    auc_deployment_threshold: float = THRESHOLD,
    max_trial_count: int = MAX_TRIAL_COUNT,
    parallel_trial_count: int = PARALLEL_TRIAL_COUNT,
    pipeline_root: str = PIPELINE_ROOT,
):
    """Run hyperparameter tuning, then conditionally train and deploy.

    The tuning step always runs. The train-and-deploy step runs only when the
    tuning step's ``best_roc_auc`` output is at least
    ``auc_deployment_threshold``; it receives the tuning step's best
    hyperparameter outputs.

    Args:
        training_container_uri: Passed as ``container_uri`` to both steps.
        serving_container_uri: Passed to the train-and-deploy step.
        training_file_path: Training file path, passed to both steps.
        validation_file_path: Validation file path, passed to both steps.
        test_file_path: Test file path, passed to both steps.
        auc_deployment_threshold: Lower bound on ``best_roc_auc`` for the
            train-and-deploy step to run.
        max_trial_count: Passed to the tuning step.
        parallel_trial_count: Passed to the tuning step.
        pipeline_root: Base path; ``<pipeline_root>/staging`` is passed to
            both steps as ``staging_bucket``.
    """
    # Keyword arguments that the tuning and train-and-deploy steps share.
    shared_args = dict(
        project=PROJECT_ID,
        location=REGION,
        container_uri=training_container_uri,
        training_file_path=training_file_path,
        validation_file_path=validation_file_path,
        test_file_path=test_file_path,
        staging_bucket=f"{pipeline_root}/staging",
    )

    tuning_task = tune_hyperparameters(
        **shared_args,
        max_trial_count=max_trial_count,
        parallel_trial_count=parallel_trial_count,
    )
    best = tuning_task.outputs

    # Gate deployment on the best ROC AUC reported by the tuning step.
    deploy_gate = dsl.Condition(
        best["best_roc_auc"] >= auc_deployment_threshold,
        name="deploy_decision",
    )
    with deploy_gate:
        train_and_deploy(
            **shared_args,
            serving_container_uri=serving_container_uri,
            dropout=best["best_dropout"],
            embedding_dim=best["best_embedding_dim"],
            hidden_dim=best["best_hidden_dim"],
            max_features=best["best_max_features"],
            sequence_length=best["best_sequence_length"],
        )

In [None]:
# Cloud Storage layout: one artifact bucket per project, with `pipeline` and
# `data` prefixes underneath it.
ARTIFACT_STORE = f"gs://{PROJECT_ID}-kfp-artifact-store"
PIPELINE_ROOT = f"{ARTIFACT_STORE}/pipeline"
DATA_ROOT = f"{ARTIFACT_STORE}/data"

# CSV files for the three dataset splits under DATA_ROOT.
TRAINING_FILE_PATH = f"{DATA_ROOT}/training/train_df.csv"
VALIDATION_FILE_PATH = f"{DATA_ROOT}/validation/validation_df.csv"
TEST_FILE_PATH = f"{DATA_ROOT}/test/test_df.csv"      

# Export the settings as environment variables; pipeline_vertex/pipeline.py
# reads these names with os.getenv.
%env PIPELINE_ROOT={PIPELINE_ROOT}
%env PROJECT_ID={PROJECT_ID}
%env REGION={REGION}
%env SERVING_CONTAINER_IMAGE_URI={SERVING_CONTAINER_IMAGE_URI}
%env TRAINING_CONTAINER_IMAGE_URI={TRAINING_CONTAINER_IMAGE_URI}
%env TRAINING_FILE_PATH={TRAINING_FILE_PATH}
%env VALIDATION_FILE_PATH={VALIDATION_FILE_PATH}
%env TEST_FILE_PATH={TEST_FILE_PATH}

In [None]:
# Create the artifact bucket in REGION unless `gsutil ls` already lists it.
!gsutil ls | grep ^{ARTIFACT_STORE}/$ || gsutil mb -l {REGION} {ARTIFACT_STORE}

In [None]:
# Recursively copy everything under gs://<PROJECT_ID>/detect-llm/data/ into DATA_ROOT.
!gsutil -m cp -r gs://{PROJECT_ID}/detect-llm/data/* {DATA_ROOT}

In [None]:
# List DATA_ROOT to check what the copy produced.
!gsutil ls  {DATA_ROOT}

In [None]:
# Output file for the compiled pipeline; also used as the job's template_path.
PIPELINE_JSON = "detect_llm_kfp_pipeline.json"

In [None]:
# Compile pipeline_vertex/pipeline.py into PIPELINE_JSON with the dsl-compile-v2 CLI.
!dsl-compile-v2 --py pipeline_vertex/pipeline.py --output $PIPELINE_JSON

In [None]:
# Show the first lines of the compiled pipeline file.
!head {PIPELINE_JSON}

In [None]:
import os

In [None]:
# Configure the Vertex AI SDK with this notebook's project and region.
aiplatform.init(location=REGION, project=PROJECT_ID)

# Build a pipeline job from the compiled definition, with caching disabled.
pipeline = aiplatform.PipelineJob(
    template_path=PIPELINE_JSON,
    display_name="detect_llm_kfp_pipeline",
    enable_caching=False,
)

# Submit the job using run()'s default arguments.
pipeline.run()

In [None]:
from google.cloud import aiplatform
# Same REGION / PROJECT_ID setup as at the top of the notebook
# (presumably repeated so this section can be run on its own — confirm).
REGION = "us-central1"
# The `!(...)` capture returns a list of output lines; keep the first one.
PROJECT_ID = !(gcloud config get-value project)
PROJECT_ID = PROJECT_ID[0]

In [None]:
# Look up the project number for PROJECT_ID. The capture returns a list of
# output lines, so the first line is kept as the value.
PROJECT_NUMBER = !(gcloud projects describe $PROJECT_ID --format="value(projectNumber)")
PROJECT_NUMBER = PROJECT_NUMBER[0]

In [None]:
# List the Vertex AI endpoints in REGION.
!gcloud ai endpoints list --region=$REGION

In [None]:
# Look the endpoint up through the Vertex AI SDK, using REGION rather than a
# hard-coded region, and without relying on the line position of an ID in
# captured shell output.
endpoints = aiplatform.Endpoint.list(project=PROJECT_ID, location=REGION)
if not endpoints:
    # Fail with a clear message instead of an IndexError on an empty listing.
    raise RuntimeError(
        f"No Vertex AI endpoints found in project {PROJECT_ID}, region {REGION}."
    )

# Use the first endpoint returned; its `name` is the endpoint's resource ID,
# which is kept as an int as before.
endpoint_id = int(endpoints[0].name)
endpoint = aiplatform.Endpoint(f'projects/{PROJECT_NUMBER}/locations/{REGION}/endpoints/{endpoint_id}')

In [None]:
# Request a prediction for one instance; the instance is itself a list
# holding a single text string.
endpoint.predict([["Sample text to predict, this is not generated text but we need student article text to test."]])

In [None]:
# Clean up: undeploy all models from the endpoint.
endpoint.undeploy_all()