`gcloud init`

`gcloud compute ssh --zone us-central1-a <instance-name> --internal-ip`

In [None]:
# GCP initialization
PROJECT_ID = <your-project-id>
! gcloud config set project {PROJECT_ID}
REGION = "us-central1"
BUCKET_NAME = REPOSITORY_NAME = "lh-sandbox"
BUCKET_URI = f"gs://{BUCKET_NAME}"
! gsutil mb -l {REGION} -p {PROJECT_ID} {BUCKET_URI}
shell_output = !gcloud auth list
SERVICE_ACCOUNT = shell_output[2].replace("*", "").strip()
! gsutil iam ch service`Account:{SERVICE_ACCOUNT}:roles/storage.objectCreator $BUCKET_URI
! gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.objectViewer $BUCKET_URI
PIPELINE_IMAGE_URI = f"{REGION}-docker.pkg.dev/{PROJECT_ID}/{REPOSITORY_NAME}/gpt:latest"
DEPLOYMENT_IMAGE_URI = f"{REGION}-docker.pkg.dev/{PROJECT_ID}/{REPOSITORY_NAME}/gpt_deployment:latest"

In [None]:
# %%time
# # run once if you need to build images:
# # !gcloud auth configure-docker {REGION}-docker.pkg.dev # https://cloud.google.com/artifact-registry/docs/docker/authentication
# !docker build --no-cache -t $PIPELINE_IMAGE_URI . 
# !docker push $PIPELINE_IMAGE_URI

In [None]:
import os
import pickle

import kfp
import tiktoken_ext.openai_public
from kfp.dsl import Input, Output, Dataset, Model, Artifact # https://www.kubeflow.org/docs/components/pipelines/v2/data-types/artifacts/
from google.cloud import storage
from google.cloud import aiplatform

In [None]:
# Point the Vertex AI SDK at this project, region and staging bucket
aiplatform.init(
    project=PROJECT_ID,
    location=REGION,
    staging_bucket=BUCKET_URI,
)

In [None]:
# upload necessary dataset and artifacts to bucket
storage_client = storage.Client()
bucket = storage_client.get_bucket(BUCKET_NAME)

# The pickled configs below are written locally before being uploaded, so the
# directory must exist first (otherwise a fresh checkout fails with FileNotFoundError).
os.makedirs('./model_artifacts', exist_ok=True)

# upload dataset
blob_data = bucket.blob('data/shakespeare.txt')
blob_data.upload_from_filename('data/shakespeare.txt')

# upload tiktoken config
tiktoken_config = tiktoken_ext.openai_public.gpt2() # encoder/decoder definition to parse dataset
with open('./model_artifacts/tiktoken_config.pkl', 'wb') as f:
    pickle.dump(tiktoken_config, f)
blob_tiktoken_config = bucket.blob('model_artifacts/tiktoken_config.pkl')
blob_tiktoken_config.upload_from_filename('./model_artifacts/tiktoken_config.pkl')

# upload model config
model_config = dict(
    batch_size = 64, # N
    vocab_dim = tiktoken_config['explicit_n_vocab'],
    sequence_dim = 100, # L, S
    embed_dim = 78, # E
    num_heads = 13, # H
    num_layers = 4,
    dropout = 0.2,
    train_steps = 10000,
    lr = 1e-3, # learning rate
    seed = 78,
    device = 'cuda',
)
# The training component asserts this same constraint; checking it here fails fast,
# before a pipeline run is submitted.
assert model_config['embed_dim'] % model_config['num_heads'] == 0, "embed_dim must be divisible by num_heads"
with open('./model_artifacts/model_config.pkl', 'wb') as f:
    pickle.dump(model_config, f)
blob_model_config = bucket.blob('model_artifacts/model_config.pkl')
blob_model_config.upload_from_filename('./model_artifacts/model_config.pkl')

In [None]:
# component definitions
# https://www.kubeflow.org/docs/components/pipelines/v2/components/containerized-python-components/

@kfp.dsl.component(base_image=PIPELINE_IMAGE_URI)
def train_model(
    bucket_name: str,
    path_model_config: str,
    path_tiktoken_config: str,
    path_text_data: str,
    model_artifacts: Output[Artifact],
):
    """Train the GPT model on text read from a GCS bucket and save the resulting artifacts.

    Args:
        bucket_name: Name of the GCS bucket that holds the inputs.
        path_model_config: Blob path of the pickled model configuration dict.
        path_tiktoken_config: Blob path of the pickled tiktoken configuration.
        path_text_data: Blob path of the raw training text.
        model_artifacts: Output artifact; both configs and the model checkpoint
            are written underneath its ``path``.
    """
    # Imports are declared inside the function body, as in the original component.
    import os
    import pickle

    import torch
    from google.cloud import storage

    # Project modules; presumably shipped inside the component's base image.
    from model import GPT
    from dataset import WordDataset

    gcs_bucket = storage.Client().get_bucket(bucket_name)

    # Load the three inputs: raw text, tokenizer config, model config.
    # NOTE: pickle.load executes arbitrary code; only point this at a trusted bucket.
    with gcs_bucket.blob(path_text_data).open(mode='r') as text_file:
        text_data = text_file.read()
    with gcs_bucket.blob(path_tiktoken_config).open(mode='rb') as tiktoken_file:
        tiktoken_config = pickle.load(tiktoken_file)
    with gcs_bucket.blob(path_model_config).open(mode='rb') as config_file:
        model_config = pickle.load(config_file)
    assert model_config['embed_dim'] % model_config['num_heads'] == 0

    # Build the dataset and split it: first 95% for training, remaining 5% for validation.
    full_dataset = WordDataset(text_data, seq_len=model_config['sequence_dim'], tiktoken_config=tiktoken_config)
    total_items = len(full_dataset)
    split_at = int(.95 * total_items)
    dataset_train = torch.utils.data.Subset(full_dataset, list(range(0, split_at)))
    dataset_val = torch.utils.data.Subset(full_dataset, list(range(split_at, total_items)))

    # Seed before constructing the model so experiments are reproducible.
    torch.manual_seed(model_config['seed'])
    model = GPT(
        full_dataset.vocab_dim,
        model_config['sequence_dim'],
        model_config['embed_dim'],
        model_config['num_heads'],
        model_config['num_layers'],
        dropout=model_config['dropout'],
        device=model_config['device'],
    )
    optimizer = torch.optim.AdamW(model.parameters(), lr=model_config['lr'])

    # The total step budget is divided evenly across a fixed number of epochs.
    epochs = 10
    steps_per_epoch = model_config['train_steps'] // epochs
    batch_size = model_config['batch_size']

    def report_losses(epoch_label):
        # Evaluate on both splits and print one row of the loss table.
        loss_train, loss_val = model.evaluate([dataset_train, dataset_val], batch_size, steps_per_epoch)
        print(f"{epoch_label:^5} | {loss_train:>10.3f} | {loss_val:>10.3f}")

    print(f'{"Epoch":^5} | {"Train Loss":^10} | {"Val Loss":^10}')
    report_losses(0)  # losses before any training
    for epoch in range(1, epochs + 1):
        model.fit(dataset_train, optimizer, batch_size, steps_per_epoch)
        report_losses(epoch)

    # Persist both configs and the checkpoint under the output artifact directory.
    artifact_dir = model_artifacts.path
    os.makedirs(artifact_dir, exist_ok=True)
    pickled_outputs = (
        ('/tiktoken_config.pkl', tiktoken_config),
        ('/model_config.pkl', model_config),
    )
    for file_suffix, payload in pickled_outputs:
        with open(artifact_dir + file_suffix, 'wb') as out_file:
            pickle.dump(payload, out_file)
    model.save(artifact_dir + '/gpt.pth', optimizer_state_dict=optimizer.state_dict())


@kfp.dsl.component(base_image=PIPELINE_IMAGE_URI)
def deploy_model(
    project_id: str,
    deployment_image: str,
    model_artifacts: Input[Model],
    vertex_endpoint: Output[Artifact],
    vertex_model: Output[Model],
):
    """Upload the trained artifacts as a Vertex AI model and deploy it to an endpoint.

    Args:
        project_id: GCP project in which the model and endpoint are created.
        deployment_image: URI of the serving container image.
        model_artifacts: Input artifact whose ``uri`` points at the trained model files.
        vertex_endpoint: Output artifact; its ``uri`` is set to the endpoint resource name.
        vertex_model: Output artifact; its ``uri`` is set to the model resource name.
    """
    import os
    import pickle
    import logging

    from google.cloud import aiplatform
    from google.cloud.aiplatform.prediction import LocalModel

    from gcp_predictor import GPTPredictor

    logging.basicConfig(level=logging.DEBUG)

    # The serving image cannot be built here: docker cannot run inside the component's
    # container. Build it in deployment.ipynb instead. Kept for reference:
    # # https://cloud.google.com/python/docs/reference/aiplatform/latest/google.cloud.aiplatform.prediction.LocalModel
    # local_model = LocalModel.build_cpr_model(
    #     './', # everything here is copied to image
    #     deployment_image, # final output image
    #     predictor=GPTPredictor,
    #     requirements_path="./deploy_requirements.txt",
    # )
    # local_model.push_image()

    # Only the project is passed to init; no location is given here.
    aiplatform.init(project=project_id)

    # WARNING: make sure the serving image contains up-to-date source files.
    upload_settings = dict(
        display_name="gpt",
        artifact_uri=model_artifacts.uri,
        serving_container_image_uri=deployment_image,
    )
    uploaded_model = aiplatform.Model.upload(**upload_settings)

    # Machine types: https://cloud.google.com/vertex-ai/docs/predictions/configure-compute#machine-types
    serving_endpoint = uploaded_model.deploy(machine_type="n1-standard-4")

    # Expose the created resources to downstream steps through the output artifacts.
    vertex_endpoint.uri = serving_endpoint.resource_name
    vertex_model.uri = uploaded_model.resource_name

In [None]:
# pipeline definition
@kfp.dsl.pipeline
def pipeline(
    project_id: str,
    bucket_name: str,
    path_model_config: str,
    path_tiktoken_config: str,
    path_text_data: str,
    deployment_image: str,
):
    """Two-step pipeline: train the GPT model, then deploy the trained artifacts.

    Args:
        project_id: GCP project passed to the deployment step.
        bucket_name: GCS bucket that holds the training inputs.
        path_model_config: Blob path of the pickled model configuration.
        path_tiktoken_config: Blob path of the pickled tiktoken configuration.
        path_text_data: Blob path of the raw training text.
        deployment_image: URI of the serving container image.
    """
    # Training step with its compute requests.
    # Machine types: https://cloud.google.com/vertex-ai/docs/pipelines/machine-types
    # GPU list: https://cloud.google.com/compute/docs/gpus#gpus-list
    # GPU compatibility: https://cloud.google.com/vertex-ai/docs/training/configure-compute#gpu-compatibility-table
    train_task = (
        train_model(
            bucket_name=bucket_name,
            path_model_config=path_model_config,
            path_tiktoken_config=path_tiktoken_config,
            path_text_data=path_text_data,
        )
        .set_cpu_limit('4')
        .set_memory_limit('16G')
        .add_node_selector_constraint('NVIDIA_TESLA_V100')
        .set_gpu_limit('1')
    )

    # Deployment step; it consumes the artifacts produced by the training step.
    deploy_task = deploy_model(
        project_id=project_id,
        deployment_image=deployment_image,
        model_artifacts=train_task.outputs['model_artifacts'],
    )

In [None]:
# Compile the pipeline function into the gpt.yaml package
compiler = kfp.compiler.Compiler()
compiler.compile(pipeline_func=pipeline, package_path="gpt.yaml")

In [None]:
# Describe the Vertex AI pipeline job built from the compiled gpt.yaml template.
# Keys must match the argument names of the kfp.dsl.pipeline function.
pipeline_parameters = {
    'project_id': PROJECT_ID,
    'bucket_name': BUCKET_NAME,
    'path_model_config': 'model_artifacts/model_config.pkl',
    'path_tiktoken_config': 'model_artifacts/tiktoken_config.pkl',
    'path_text_data': 'data/shakespeare.txt',
    'deployment_image': DEPLOYMENT_IMAGE_URI,
}
job = aiplatform.PipelineJob(
    display_name="gpt",
    template_path="gpt.yaml",
    # where component outputs are stored during pipeline runs
    pipeline_root=f"{BUCKET_URI}/gpt",
    parameter_values=pipeline_parameters,
    # rerun pipeline tasks each time instead of using cache
    enable_caching=False,
)

In [None]:
# Run the pipeline job on Vertex AI
job.run()

In [None]:
# # Cleanup
# job.delete()
# ! gsutil rm -rf {BUCKET_URI}

https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/pipelines/kfp2_pipeline.ipynb