### How to Create Base Image for Components

---
Skip if unneeded:

0. `gcloud init`
1. `gcloud config set project <project-name>`
2. `gcloud compute ssh --zone us-central1-a <instance-name> --internal-ip`
3. `gcloud auth configure-docker us-central1-docker.pkg.dev` # (https://cloud.google.com/artifact-registry/docs/docker/authentication)
---

1. `vim requirements1.txt` and paste:
```
--index-url <can specify different index>
kfp
google-cloud-aiplatform
google-cloud-storage
```

2. `vim requirements2.txt` and paste:
```
torch
torchvision
torchaudio
torchserve
torch-model-archiver
torch-workflow-archiver
tiktoken
```

Versions at time of writing:
```raw
kfp                                      2.4.0
kfp-pipeline-spec                        0.2.2
kfp-server-api                           2.0.3
torch                                    2.1.1
torch-model-archiver                     0.9.0
torch-workflow-archiver                  0.2.11
torch-xla                                2.0
torchaudio                               2.1.1
torchserve                               0.9.0
torchvision                              0.16.1
```


3. `vim Dockerfile` and paste:
```
# https://cloud.google.com/vertex-ai/docs/predictions/pre-built-containers
FROM us-docker.pkg.dev/vertex-ai/prediction/pytorch-gpu.2-1:latest
COPY requirements1.txt requirements2.txt ./
RUN python -m pip install --upgrade pip -r requirements1.txt 
RUN python -m pip install -r requirements2.txt
COPY ./gpt/*.py ./
# WARNING: if you do `pip install -r r1.txt -r r2.txt`, pip will use the last arg's --index-url for both
```

4. Run
```
docker build --no-cache .
docker tag <image-name> "<name>:<tag>"
docker push "<name>:<tag>"
```

In [None]:
# GCP initialization
# NOTE: lines starting with `!` are notebook shell escapes; {NAME} and $NAME
# are substituted from the Python variables defined in this cell.
PROJECT_ID = "<project-id>"
# select the project for subsequent gcloud commands, then print its details
! gcloud config set project {PROJECT_ID}
! gcloud projects describe  $PROJECT_ID
REGION = "us-central1"
BUCKET_URI = "gs://<bucket-name>"
# create the bucket BUCKET_URI in REGION under PROJECT_ID
! gsutil mb -l {REGION} -p {PROJECT_ID} {BUCKET_URI}
SERVICE_ACCOUNT = "<12-digit-number>-compute@developer.gserviceaccount.com"
# grant the service account the objectCreator and objectViewer roles on the bucket
! gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.objectCreator $BUCKET_URI
! gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.objectViewer $BUCKET_URI
# container image used as base_image by every component and passed to the pipeline as `image`
BASE_IMAGE = "<name>:<tag>"

In [None]:
import kfp
from kfp.dsl import Input, Output, Dataset, Model, Artifact # https://www.kubeflow.org/docs/components/pipelines/v2/data-types/artifacts/
from google.cloud import aiplatform

In [None]:
# point the Vertex AI SDK at the project, region and staging bucket defined above
aiplatform.init(
    project=PROJECT_ID,
    location=REGION,
    staging_bucket=BUCKET_URI,
)

In [None]:
# component definitions
# https://www.kubeflow.org/docs/components/pipelines/v2/components/containerized-python-components/

@kfp.dsl.component(base_image=BASE_IMAGE) # TODO: replace with lightweight base_image
def get_config(config: dict) -> dict:
    """Validate the model configuration and pass it through unchanged.

    Args:
        config: Model configuration; must provide ``embed_dim`` and
            ``num_heads``.

    Returns:
        The same ``config`` dict, once it has passed validation.

    Raises:
        ValueError: If ``num_heads`` is not positive or does not evenly
            divide ``embed_dim``.
    """
    embed_dim = config['embed_dim']
    num_heads = config['num_heads']
    # Raise explicitly instead of asserting: asserts are stripped under `python -O`
    # and a zero num_heads would otherwise surface as a bare ZeroDivisionError.
    if num_heads <= 0 or embed_dim % num_heads != 0:
        raise ValueError(
            f"embed_dim ({embed_dim}) must be divisible by a positive num_heads ({num_heads})"
        )
    # TODO: read config from yaml
    return config

@kfp.dsl.component(base_image=BASE_IMAGE)
def get_data(bucket_data_path: str, artifact_data: Output[Dataset]):
    """Download one Cloud Storage object into the ``artifact_data`` output artifact.

    Args:
        bucket_data_path: Object URI of the form ``gs://<bucket>/<object path>``.
        artifact_data: Output dataset artifact; the object is written to its ``path``.

    Raises:
        ValueError: If ``bucket_data_path`` is not a ``gs://`` URI that names
            both a bucket and an object.
    """
    from google.cloud import storage

    scheme = 'gs://'
    if not bucket_data_path.startswith(scheme):
        raise ValueError(f"expected a gs:// URI, got {bucket_data_path!r}")
    # e.g. gs://<bucket-name>/data/shakespeare.txt -> ('<bucket-name>', 'data/shakespeare.txt')
    bucket_name, _, object_path = bucket_data_path[len(scheme):].partition('/')
    if not bucket_name or not object_path:
        raise ValueError(f"URI must name both a bucket and an object, got {bucket_data_path!r}")

    storage_client = storage.Client()
    # Client.bucket() only builds a local handle; get_bucket() would first make a
    # bucket-metadata request, which requires the storage.buckets.get permission.
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(object_path)
    blob.download_to_filename(artifact_data.path)
    
@kfp.dsl.component(base_image=BASE_IMAGE)
def train_model(config: dict, artifact_data: Input[Dataset], artifact_model_pth: Output[Model]):
    """Train a GPT model on the dataset artifact and save it as ``model.pth``.

    Args:
        config: Hyperparameters; reads ``seed``, ``sequence_dim``, ``embed_dim``,
            ``num_heads``, ``num_layers``, ``dropout``, ``device``, ``lr``,
            ``train_steps`` and ``batch_size``.
        artifact_data: Input dataset artifact; its ``path`` is handed to
            ``CharacterDataset``.
        artifact_model_pth: Output model artifact; its ``path`` is created as a
            directory that receives ``model.pth``.
    """
    import os
    import torch

    # for reproducible experiments; but may slow down model
    torch.manual_seed(config['seed'])

    from dataset import CharacterDataset
    full_data = CharacterDataset(artifact_data.path, seq_len=config['sequence_dim'])
    # first 95% of the samples train the model, the remainder validates it
    split_at = int(.95 * len(full_data))
    train_split = torch.utils.data.Subset(full_data, list(range(0, split_at)))
    val_split = torch.utils.data.Subset(full_data, list(range(split_at, len(full_data))))

    from model import GPT
    gpt = GPT(
        full_data.vocab_dim,
        config['sequence_dim'],
        config['embed_dim'],
        config['num_heads'],
        config['num_layers'],
        dropout=config['dropout'],
        device=config['device'],
    )
    optimizer = torch.optim.AdamW(gpt.parameters(), lr=config['lr'])

    num_epochs = 10
    batch_size = config['batch_size']
    # the total step budget is spread evenly over the epochs
    steps = config['train_steps'] // num_epochs

    def report(epoch):
        # evaluate on both splits and print one row of the loss table
        train_loss, val_loss = gpt.evaluate([train_split, val_split], batch_size, steps)
        print(f"{epoch:^5} | {train_loss:>10.3f} | {val_loss:>10.3f}")

    print(f'{"Epoch":^5} | {"Train Loss":^10} | {"Val Loss":^10}')
    # row 0 is the loss before any training
    report(0)
    for epoch in range(1, num_epochs + 1):
        gpt.fit(train_split, optimizer, batch_size, steps)
        report(epoch)

    # the artifact path is used as a directory containing model.pth
    os.makedirs(artifact_model_pth.path, exist_ok=True)
    torch.save(gpt, f"{artifact_model_pth.path}/model.pth")

# https://cloud.google.com/vertex-ai/docs/training/exporting-model-artifacts

# @kfp.dsl.container_component
# def prep_deployment(artifact_model_pth: Input[Model], artifact_model_mar: Output[Model]):
#     # Note: python code doesn't run here
#     return kfp.dsl.ContainerSpec(
#         image=BASE_IMAGE,
#         command=[
#             'sh', '-c', '''
#             echo $1\
#             && echo $2\
#             && mkdir -p $1\
#             && echo hello world\ 
#             && torch-model-archiver --model-name model --version 0.1 --serialized-file $2 --handler deployment_handler.py --export-path $1
#             '''
#         ],
#         args=[str(artifact_model_mar.path), str(artifact_model_pth.path) + '/model.pth']
#         # WARNING: invalid arguments will be skipped w/o error message!; $2 becomes $1, $3 becomes $2
#     )

@kfp.dsl.component(base_image=BASE_IMAGE)
def prep_deployment(artifact_model_pth: Input[Model], artifact_model_mar: Output[Model]):
    """Package the saved ``model.pth`` with ``torch-model-archiver``.

    Args:
        artifact_model_pth: Input model artifact; ``model.pth`` is read from
            the directory at its ``path``.
        artifact_model_mar: Output model artifact; its ``path`` is created as a
            directory and used as the archiver's ``--export-path``.

    Raises:
        subprocess.CalledProcessError: If ``torch-model-archiver`` exits with a
            non-zero status.
    """
    import os
    import subprocess

    os.makedirs(artifact_model_mar.path, exist_ok=True)
    # An argv list (no shell) keeps paths containing spaces or shell
    # metacharacters intact.
    cmd = [
        'torch-model-archiver',
        '--model-name', 'model',
        '--version', '0.1',
        '--serialized-file', f"{artifact_model_pth.path}/model.pth",
        '--handler', 'deployment_handler.py',
        '--export-path', str(artifact_model_mar.path),
    ]
    # check=True fails this task when archiving fails, instead of letting the
    # pipeline continue with a missing archive.
    subprocess.run(cmd, check=True)

@kfp.dsl.component(base_image=BASE_IMAGE)
def deploy_model(
    project_id: str,
    image: str,
    artifact_model_mar: Input[Model],
    artifact_vertex_endpoint: Output[Artifact],
    artifact_vertex_model: Output[Model],
):
    """Upload the archived model to Vertex AI and deploy it.

    Args:
        project_id: Project passed to ``aiplatform.init``.
        image: Serving container image URI for the uploaded model.
        artifact_model_mar: Input model artifact; its ``uri`` is used as the
            model's ``artifact_uri``.
        artifact_vertex_endpoint: Output artifact whose ``uri`` is set to the
            endpoint's resource name.
        artifact_vertex_model: Output artifact whose ``uri`` is set to the
            uploaded model's resource name.
    """
    from google.cloud import aiplatform

    aiplatform.init(project=project_id)

    uploaded_model = aiplatform.Model.upload(
        display_name="gpt",
        artifact_uri=artifact_model_mar.uri,
        serving_container_image_uri=image,
    )
    # https://cloud.google.com/vertex-ai/docs/predictions/configure-compute#machine-types
    endpoint = uploaded_model.deploy(machine_type="n1-standard-4")

    # record the created resources on the output artifacts
    artifact_vertex_model.uri = uploaded_model.resource_name
    artifact_vertex_endpoint.uri = endpoint.resource_name

In [None]:
# GCP-side settings: project, data location, image and compute resources
config_gcp = {
    'project_id': PROJECT_ID,
    'bucket_data_path': f"{BUCKET_URI}/data/shakespeare.txt",
    'image': BASE_IMAGE,
    'cpu_limit': '4',
    'memory_limit': '16G',
    'node_selector_constraint': 'NVIDIA_TESLA_V100',
    'gpu_limit': '1',
}

In [None]:
# pipeline definition
@kfp.dsl.pipeline
def pipeline(config_model: dict, bucket_data_path: str, project_id: str, image: str):
    """Validate the config, fetch the data, train, archive and deploy the model.

    Args:
        config_model: Model hyperparameters forwarded to ``get_config``.
        bucket_data_path: ``gs://`` URI of the training data, forwarded to ``get_data``.
        project_id: Project forwarded to ``deploy_model``.
        image: Serving container image URI forwarded to ``deploy_model``.
    """
    config_task = get_config(config=config_model)
    data_task = get_data(bucket_data_path=bucket_data_path)
    # set_accelerator_type / set_accelerator_limit replace the deprecated
    # add_node_selector_constraint / set_gpu_limit of earlier KFP releases.
    train_task = (
        train_model(config=config_task.output, artifact_data=data_task.outputs['artifact_data'])
        .set_cpu_limit('4')
        .set_memory_limit('16G')
        .set_accelerator_type('NVIDIA_TESLA_V100') # https://cloud.google.com/compute/docs/gpus#gpus-list
        .set_accelerator_limit('1') # https://cloud.google.com/vertex-ai/docs/training/configure-compute#gpu-compatibility-table
    ) # https://cloud.google.com/vertex-ai/docs/pipelines/machine-types
    archive_task = prep_deployment(artifact_model_pth=train_task.outputs['artifact_model_pth'])
    deploy_model(
        project_id=project_id,
        image=image,
        artifact_model_mar=archive_task.outputs['artifact_model_mar'],
    )

In [None]:
# compile the pipeline function into the "gpt.yaml" package file
compiler = kfp.compiler.Compiler()
compiler.compile(pipeline_func=pipeline, package_path="gpt.yaml")

In [None]:
# model hyperparameters; passed to the pipeline as `config_model`
config_model = {
    'batch_size': 64,  # N
    'sequence_dim': 100,  # L, S
    'embed_dim': 78,  # E
    'num_heads': 13,  # H
    'num_layers': 3,
    'dropout': 0.2,
    'train_steps': 5000,
    'lr': 1e-3,  # learning rate
    'seed': 78,
    'device': 'cuda',
}

In [None]:
# configure a Vertex AI pipeline job from the compiled "gpt.yaml" template
# TODO: research https://cloud.google.com/vertex-ai/docs/training/create-custom-job
job = aiplatform.PipelineJob(
    display_name="gpt",
    template_path="gpt.yaml",
    # where component outputs are stored during pipeline runs
    pipeline_root=f"{BUCKET_URI}/gpt",
    # what to pass into kfp.dsl.pipeline arguments
    parameter_values=dict(
        config_model=config_model,
        bucket_data_path=f"{BUCKET_URI}/data/shakespeare.txt",
        project_id=PROJECT_ID,
        image=BASE_IMAGE,
    ),
    # rerun pipeline tasks each time instead of using cache
    enable_caching=False,
)

In [None]:
# run the PipelineJob configured in the previous cell
job.run()

In [None]:
# # Cleanup
# job.delete()
# ! gsutil rm -rf {BUCKET_URI}

In [None]:
# Example to learn more:
# https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/official/pipelines/kfp2_pipeline.ipynb