### How to Create Base Image for Components

---
Skip if unneeded:

0. `gcloud init`
1. `gcloud config set project <project-name>`
2. `gcloud compute ssh --zone us-central1-a <instance-name> --internal-ip`
3. `gcloud auth configure-docker us-central1-docker.pkg.dev` # (https://cloud.google.com/artifact-registry/docs/docker/authentication)
---

1. `vim requirements1.txt` and paste:
```
--index-url <can specify different index>
kfp
```

2. `vim requirements2.txt` and paste:
```
torch
torchvision
torchaudio

```

3. `vim Dockerfile` and paste:
```
# https://cloud.google.com/vertex-ai/docs/predictions/pre-built-containers
FROM us-docker.pkg.dev/vertex-ai/prediction/pytorch-gpu.2-1:latest
COPY requirements1.txt requirements2.txt ./
RUN python -m pip install --upgrade pip -r requirements1.txt 
RUN python -m pip install -r requirements2.txt
COPY ./gpt/attention.py ./gpt/model.py ./gpt/dataset.py ./
# WARNING: if you do `pip install -r r1.txt -r r2.txt` it will use the last arg's --index-url for both
```

4. Run
```
docker build --no-cache .
docker tag <image-name> "<name>:<tag>"
docker push "<name>:<tag>"
```

In [None]:
# GCP initialization
# Fill in every <...> placeholder before running this cell. Lines starting
# with `!` are shell commands run from the notebook; {NAME} and $NAME both
# interpolate the Python variable NAME into the command.
PROJECT_ID = "<project-id>"
# Point the gcloud CLI at the project, then describe it as a sanity check.
! gcloud config set project {PROJECT_ID}
! gcloud projects describe  $PROJECT_ID
REGION = "us-central1"
BUCKET_URI = "gs://<bucket-name>"
# Make the bucket in REGION under PROJECT_ID.
! gsutil mb -l {REGION} -p {PROJECT_ID} {BUCKET_URI}
SERVICE_ACCOUNT = "<12-digit-number>-compute@developer.gserviceaccount.com"
# Grant the service account object-creator and object-viewer roles on the bucket.
! gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.objectCreator $BUCKET_URI
! gsutil iam ch serviceAccount:{SERVICE_ACCOUNT}:roles/storage.objectViewer $BUCKET_URI
# Container image used as `base_image` by the pipeline components in this notebook.
BASE_IMAGE = "<name>:<tag>"

In [None]:
import kfp
from kfp.dsl import Input, Output, Dataset, Model
from google.cloud import aiplatform

In [None]:
# Configure the Vertex AI SDK with the project, region and staging bucket
# chosen in the GCP initialization cell.
aiplatform.init(
    project=PROJECT_ID,
    location=REGION,
    staging_bucket=BUCKET_URI,
)

In [None]:
# component definitions
@kfp.dsl.component(base_image=BASE_IMAGE) # TODO: replace with lightweight base_image
def get_config(config: dict) -> dict:
    """Return the given configuration dict unchanged.

    Pass-through placeholder: the component currently only echoes its input,
    which makes the config available as a task output in the pipeline.

    Args:
        config: Configuration dictionary.

    Returns:
        The same ``config`` object that was passed in.
    """
    # TODO: read config from yaml
    return config

@kfp.dsl.component(base_image=BASE_IMAGE)
def get_data(bucket_data_path: str, data: Output[Dataset]):
    """Download one Cloud Storage object into the ``data`` output artifact.

    Args:
        bucket_data_path: Object URI of the form ``gs://<bucket>/<object path>``.
        data: Output artifact; the object is written to ``data.path``.

    Raises:
        ValueError: If ``bucket_data_path`` is not a ``gs://`` URI that names
            both a bucket and an object.
    """
    # Imported inside the body so the component function is self-contained.
    from google.cloud import storage

    scheme = 'gs://'
    # Explicit exception instead of `assert`, which is stripped under `python -O`.
    if not bucket_data_path.startswith(scheme):
        raise ValueError(
            f"bucket_data_path must start with {scheme!r}, got {bucket_data_path!r}"
        )

    # Everything up to the first '/' is the bucket, the rest is the object name.
    bucket_name, _, object_path = bucket_data_path[len(scheme):].partition('/')
    if not bucket_name or not object_path:
        raise ValueError(
            "bucket_data_path must look like 'gs://<bucket>/<object>', "
            f"got {bucket_data_path!r}"
        )

    storage_client = storage.Client()
    # bucket() builds a local handle without requesting bucket metadata
    # (get_bucket() issues a GET on the bucket itself), so only object-level
    # read access is required for the download.
    bucket = storage_client.bucket(bucket_name)
    bucket.blob(object_path).download_to_filename(data.path)
    

@kfp.dsl.component(base_image=BASE_IMAGE)
def train_model(config: dict, data: Input[Dataset]):
    """Train a GPT model on the text in ``data`` and print loss estimates.

    The dataset is split in order into the first 90% (training) and the
    remaining 10% (validation). About every tenth of the run, the mean loss
    over up to 100 batches of each split is printed.

    Args:
        config: Hyperparameters. Keys read: ``sequence_dim``, ``batch_size``,
            ``embed_dim``, ``num_heads``, ``num_layers``, ``dropout``,
            ``device``, ``lr``, ``train_steps`` and, optionally, ``seed``.
        data: Input artifact whose ``path`` is handed to ``CharacterDataset``.

    Raises:
        ValueError: If the training split yields no batches.
    """
    import itertools

    import torch

    # NOTE(review): `dataset` and `model` are not defined in this file;
    # presumably they are modules shipped in the base image -- confirm.
    from dataset import CharacterDataset
    from model import GPT

    device = config['device']

    # Seed before the loaders and the model are built so shuffling and weight
    # initialisation are reproducible. manual_seed expects an int.
    seed = config.get('seed')
    if seed is not None:
        torch.manual_seed(int(seed))

    dataset_shakespeare = CharacterDataset(data.path, seq_len=config['sequence_dim'])
    n = int(.9*len(dataset_shakespeare))
    data_train = torch.utils.data.Subset(dataset_shakespeare, list(range(0, n)))
    data_val = torch.utils.data.Subset(dataset_shakespeare, list(range(n, len(dataset_shakespeare))))
    dl_train = torch.utils.data.DataLoader(data_train, batch_size=config['batch_size'], shuffle=True)
    dl_val = torch.utils.data.DataLoader(data_val, batch_size=config['batch_size'], shuffle=True)
    if len(dl_train) == 0:
        raise ValueError("training split is empty; cannot train")

    model = GPT(
        dataset_shakespeare.vocab_dim,
        config['sequence_dim'],
        config['embed_dim'],
        config['num_heads'],
        config['num_layers'],
        dropout=config['dropout']
    ).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=config['lr'])

    @torch.no_grad()
    def estimate_loss(model, iters, device):
        """Return ``[train_loss, val_loss]``, each a mean over at most ``iters`` batches."""
        out = []
        model.to(device)
        model.eval()
        for dataloader in [dl_train, dl_val]:
            # Fresh list per loader: the mean covers exactly the batches that
            # were evaluated, with no zero padding and no values carried over
            # from the previous loader.
            batch_losses = []
            for x, y in itertools.islice(dataloader, iters):
                x = x.to(device)
                y = y.to(device)
                logits = model(x)
                batch_losses.append(model.get_loss(logits, y).item())
            if batch_losses:
                out.append(torch.tensor(batch_losses).mean())
            else:
                # Loader produced no batches (e.g. empty validation split).
                out.append(torch.tensor(float('nan')))
        model.train()
        return out

    def endless_batches(dataloader):
        """Yield batches indefinitely, starting a new pass whenever one ends."""
        while True:
            yield from dataloader

    model.train()
    train_steps = config['train_steps']
    # At least 1 so the modulo below never divides by zero for short runs.
    eval_every = max(1, train_steps // 10)
    # Cycling the loader lets train_steps exceed the number of batches in one
    # epoch without hitting StopIteration.
    batches = endless_batches(dl_train)
    for step in range(train_steps):
        x, y = next(batches)
        x = x.to(device)
        y = y.to(device)
        logits = model(x)
        loss = model.get_loss(logits, y)
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()
        if step % eval_every == 0:
            train_loss, val_loss = estimate_loss(model, 100, device)
            print(train_loss, val_loss)
    # NOTE(review): the trained weights are not written anywhere; the component
    # declares no output artifact.


In [None]:
# pipeline definition
@kfp.dsl.pipeline
def pipeline(config: dict, bucket_data_path: str):
    """Chain the components: config pass-through, data download, training.

    Args:
        config: Hyperparameter dict, routed through ``get_config`` into
            ``train_model``.
        bucket_data_path: Path handed to ``get_data``.
    """
    config_task = get_config(config=config)
    data_task = get_data(bucket_data_path=bucket_data_path)

    # Resource settings for the training step. References:
    #   machine types: https://cloud.google.com/vertex-ai/docs/pipelines/machine-types
    #   GPU list:      https://cloud.google.com/compute/docs/gpus#gpus-list
    #   GPU compat:    https://cloud.google.com/vertex-ai/docs/training/configure-compute#gpu-compatibility-table
    (
        train_model(
            config=config_task.output,
            data=data_task.outputs['data'],
        )
        .set_cpu_limit('4')
        .set_memory_limit('16G')
        .add_node_selector_constraint('NVIDIA_TESLA_V100')
        .set_gpu_limit('1')
    )

In [None]:
# Compile the pipeline function into the package file "gpt.yaml".
compiler = kfp.compiler.Compiler()
compiler.compile(pipeline_func=pipeline, package_path="gpt.yaml")

In [None]:
# Hyperparameters supplied to the pipeline as its `config` argument.
config = {
    'batch_size': 16,  # N
    'sequence_dim': 10,  # L, S
    'embed_dim': 13,  # E
    'num_heads': 1,  # H
    'num_layers': 1,
    'dropout': 0.2,
    'train_steps': 1000,
    'lr': 1e-3,  # learning rate
    'seed': 78,
    'device': 'cuda',
}
# assert embed_dim % num_heads == 0
# torch.manual_seed(78) # TODO

In [None]:
# Define the Vertex AI pipeline job from the compiled "gpt.yaml" template.
# TODO: research https://cloud.google.com/vertex-ai/docs/training/create-custom-job
job = aiplatform.PipelineJob(
    display_name='gpt',
    template_path='gpt.yaml',
    # where component outputs are stored during pipeline runs
    pipeline_root=f'{BUCKET_URI}/gpt',
    # values for the arguments of the kfp.dsl.pipeline function
    parameter_values={
        'config': config,
        'bucket_data_path': f'{BUCKET_URI}/data/shakespeare.txt',
    },
    # rerun pipeline tasks each time instead of using cache
    enable_caching=False,
)

In [None]:
# Run the pipeline job constructed in the previous cell.
job.run()

In [None]:
# Example to learn more:
# https://www.kubeflow.org/docs/components/pipelines/v2/components/containerized-python-components/