In [1]:
import os

# The Vertex AI Workbench Notebook product has specific requirements
IS_WORKBENCH_NOTEBOOK = os.getenv("DL_ANACONDA_HOME") and not os.getenv("VIRTUAL_ENV")
IS_USER_MANAGED_WORKBENCH_NOTEBOOK = os.path.exists(
    "/opt/deeplearning/metadata/env_version"
)

# Vertex AI Notebook requires dependencies to be installed with '--user'
USER_FLAG = ""
if IS_WORKBENCH_NOTEBOOK:
    USER_FLAG = "--user"

! pip3 install {USER_FLAG} --upgrade google-cloud-aiplatform -q
! pip3 install {USER_FLAG} --upgrade google-cloud-storage -q
! pip3 install {USER_FLAG} --upgrade pillow -q
! pip3 install {USER_FLAG} --upgrade numpy -q

[0m

In [2]:
# Google Cloud project passed to aiplatform.init(); when left empty or set to
# the "[your-project-id]" placeholder, the next cell falls back to the gcloud
# default project.
PROJECT_ID = "magicleap-1" 

In [3]:
if PROJECT_ID == "" or PROJECT_ID is None or PROJECT_ID == "[your-project-id]":
    # Get your GCP project id from gcloud
    shell_output = ! gcloud config list --format 'value(core.project)' 2>/dev/null
    PROJECT_ID = shell_output[0]
    print("Project ID:", PROJECT_ID)

In [4]:
# Region later passed to aiplatform.init() as the location.
REGION = "us-west1"  # @param {type: "string"}

# Swap the untouched template placeholder for a usable default region.
REGION = "us-central1" if REGION == "[your-region]" else REGION

In [5]:
from datetime import datetime

# Second-resolution timestamp (YYYYmmddHHMMSS) used to make job and model
# names unique per notebook session.
TIMESTAMP = f"{datetime.now():%Y%m%d%H%M%S}"

In [6]:
# Cloud Storage bucket used as the Vertex AI staging bucket.
BUCKET_NAME = "torch-training-ps1"
# Derive the URI from BUCKET_NAME so the two values cannot drift apart
# (the previous f-string had no placeholder and repeated the literal name).
BUCKET_URI = f"gs://{BUCKET_NAME}"

In [7]:
import os
import sys

from google.cloud import aiplatform

In [8]:
# Configure the Vertex AI SDK defaults (project, region, staging bucket) used
# by the calls in the following cells.
aiplatform.init(
    project=PROJECT_ID,
    location=REGION,
    staging_bucket=BUCKET_URI,
)

In [9]:
# Accelerator type and count requested for the training worker.
TRAIN_GPU = aiplatform.gapic.AcceleratorType.NVIDIA_TESLA_P100
TRAIN_NGPU = 1

# Accelerator type and count intended for the deployed model.
DEPLOY_GPU = aiplatform.gapic.AcceleratorType.NVIDIA_TESLA_P100
DEPLOY_NGPU = 1

In [10]:
# Prebuilt Vertex AI container that executes the training script.
TRAIN_IMAGE = "us-docker.pkg.dev/vertex-ai/training/pytorch-gpu.1-11:latest"
# Prebuilt Vertex AI *prediction* container attached to the uploaded model.
# The serving image has to come from the prediction repository; the training
# image used previously is not a model server.
# NOTE(review): the prebuilt PyTorch prediction images presumably expect a
# TorchServe model archive -- confirm the artifact format written by the
# training script before deploying to an endpoint.
DEPLOY_IMAGE = "us-docker.pkg.dev/vertex-ai/prediction/pytorch-gpu.1-11:latest"

In [11]:
# Machine family and vCPU count shared by the training and deployment
# machine types; the full type name is "<family>-<vcpus>".
MACHINE_TYPE = "n1-standard"
VCPU = "4"

TRAIN_COMPUTE = f"{MACHINE_TYPE}-{VCPU}"
print("Train machine type", TRAIN_COMPUTE)

DEPLOY_COMPUTE = "-".join((MACHINE_TYPE, VCPU))
print("Deploy machine type", DEPLOY_COMPUTE)

Train machine type n1-standard-4
Deploy machine type n1-standard-4


In [12]:
# Unique job name for this session, and a bucket path named after the job.
JOB_NAME = f"pytorch_custom_job_{TIMESTAMP}"
MODEL_DIR = f"{BUCKET_URI}/{JOB_NAME}"

# Number of epochs handed to the training script on its command line.
EPOCHS = 25

CMDARGS = [f"--epochs={EPOCHS}"]

In [17]:
%%writefile torchtrain.py

from __future__ import print_function, division

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import torch.backends.cudnn as cudnn
import numpy as np
import torchvision
from torchvision import datasets, models, transforms
import time
import os
import copy
from google.cloud import storage
import argparse

# Command-line arguments supplied by the training job.
parser = argparse.ArgumentParser()
parser.add_argument('--epochs', dest='epochs',
                    default=10, type=int,
                    help='Number of epochs.')
args = parser.parse_args()


# Local root directory for the dataset and the bucket it is copied from.
folder = 'data'
bucket_name = "hymenoptera-data"

# One object prefix per <split>/<class> directory read by ImageFolder later on.
DATA_PREFIXES = ("train/ants/", "train/bees/", "val/ants/", "val/bees/")


def download_prefix(bucket, prefix, destination_root):
    """Download every object directly under ``prefix`` into ``destination_root``.

    The object name is kept as the relative path, so ``train/ants/x.jpg`` ends
    up at ``<destination_root>/train/ants/x.jpg``.

    Args:
        bucket: Cloud Storage bucket object to list and download from.
        prefix: Object-name prefix ending in '/' (for example 'train/ants/').
        destination_root: Local directory that mirrors the bucket layout.

    Returns:
        The number of files downloaded.
    """
    # exist_ok=True keeps the script re-runnable when the directory is present.
    os.makedirs(os.path.join(destination_root, prefix), exist_ok=True)

    downloaded = 0
    for blob in bucket.list_blobs(prefix=prefix, delimiter='/'):
        # Skip "folder" placeholder objects (names ending in '/'): they have
        # no file content and their target path is a directory.
        if blob.name.endswith('/'):
            continue
        blob.download_to_filename('{}/{}'.format(destination_root, blob.name))
        downloaded += 1
    return downloaded


# download and load data
# A single client is enough; the project is fixed to the one owning the data.
client = storage.Client(project='magicleap-1')
bucket_md = client.get_bucket(bucket_name)

for data_prefix in DATA_PREFIXES:
    download_prefix(bucket_md, data_prefix, folder)

print('Data loaded')
    
# data transformation
# Per-channel mean / std handed to Normalize for both splits.
channel_mean = [0.485, 0.456, 0.406]
channel_std = [0.229, 0.224, 0.225]

# Training: random crop + horizontal flip, then tensor conversion and
# normalization.
train_pipeline = transforms.Compose([
    transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(channel_mean, channel_std)
])

# Validation: fixed resize + center crop, then tensor conversion and
# normalization.
val_pipeline = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(channel_mean, channel_std)
])

data_transforms = {'train': train_pipeline, 'val': val_pipeline}

print('Data Transformed')

# Build one dataset, one loader and one size entry per split.
data_dir = 'data'
image_datasets = {}
dataloaders = {}
dataset_sizes = {}
for split in ['train', 'val']:
    split_dataset = datasets.ImageFolder(os.path.join(data_dir, split),
                                         data_transforms[split])
    image_datasets[split] = split_dataset
    dataloaders[split] = torch.utils.data.DataLoader(split_dataset,
                                                     batch_size=4,
                                                     shuffle=True,
                                                     num_workers=4)
    dataset_sizes[split] = len(split_dataset)
class_names = image_datasets['train'].classes

# Use the first CUDA device when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")


def train_model(model, criterion, optimizer, scheduler, num_epochs=25):
    """Train ``model`` and return it loaded with its best validation weights.

    Every epoch runs a 'train' phase (gradients enabled, optimizer stepped per
    batch, scheduler stepped once after the phase) followed by a 'val' phase
    (gradients disabled). The state dict with the highest validation accuracy
    seen so far is kept and loaded back into the model before returning.

    Reads the module-level ``dataloaders``, ``dataset_sizes`` and ``device``.

    Args:
        model: Network to optimize; must already live on ``device``.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.
        optimizer: Optimizer updating the model parameters.
        scheduler: Learning-rate scheduler stepped once per training epoch.
        num_epochs: Number of epochs to run.

    Returns:
        The same ``model`` object, carrying the best validation weights.
    """
    since = time.time()

    # Start from the initial weights so there is always something to restore.
    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(num_epochs):
        print(f'Epoch {epoch}/{num_epochs - 1}')
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0

            # Iterate over data.
            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                # statistics
                # The loss is scaled by the batch size so that dividing by the
                # dataset size below yields a per-sample average.
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)
            if phase == 'train':
                scheduler.step()

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects.double() / dataset_sizes[phase]

            print(f'{phase} Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}')

            # deep copy the model
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())

        print()

    time_elapsed = time.time() - since
    print(f'Training complete in {time_elapsed // 60:.0f}m {time_elapsed % 60:.0f}s')
    # '.4f' (four decimals) matches the per-epoch lines; the previous '4f'
    # was a field width and printed six decimals.
    print(f'Best val Acc: {best_acc:.4f}')

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model


# Start from a ResNet-18 with pretrained weights and replace its final fully
# connected layer with one output per class found in the training folder.
model_ft = models.resnet18(pretrained=True)
num_ftrs = model_ft.fc.in_features
model_ft.fc = nn.Linear(num_ftrs, len(class_names))

model_ft = model_ft.to(device)

criterion = nn.CrossEntropyLoss()

# Observe that all parameters are being optimized
optimizer_ft = optim.SGD(model_ft.parameters(), lr=0.001, momentum=0.9)

# Decay LR by a factor of 0.1 every 7 epochs
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1)


# Honour the --epochs command-line argument; it was parsed but previously
# ignored in favour of a hard-coded 25.
model_ft = train_model(model_ft, criterion, optimizer_ft, exp_lr_scheduler,
                       num_epochs=args.epochs)
print('Training complete')

# Only the state dict (weights) is saved, not the full model object.
torch.save(model_ft.state_dict(), 'model.pickle')
print('Model saved to local dir')

# Upload model artifact to Cloud Storage
# AIP_MODEL_DIR is read from the environment of the training job.
model_directory = os.environ['AIP_MODEL_DIR']
mstorage_path = os.path.join(model_directory, 'model.pickle')
# Reuse the storage client created for the data download.
mblob = storage.blob.Blob.from_string(mstorage_path, client=client)
mblob.upload_from_filename('model.pickle')


Overwriting torchtrain.py


In [18]:
# Name under which the trained model is registered.
MODEL_DISPLAY_NAME = f"pytorch{TIMESTAMP}"

# Custom training job that runs torchtrain.py inside the training container
# and attaches the serving container to the resulting model.
job = aiplatform.CustomTrainingJob(
    display_name=JOB_NAME,
    script_path="torchtrain.py",
    container_uri=TRAIN_IMAGE,
    model_serving_container_image_uri=DEPLOY_IMAGE,
)

# Arguments common to GPU and CPU-only runs.
run_kwargs = dict(
    model_display_name=MODEL_DISPLAY_NAME,
    args=CMDARGS,
    replica_count=1,
    machine_type=TRAIN_COMPUTE,
)

# Request an accelerator only when one is configured.
if TRAIN_GPU:
    run_kwargs.update(
        accelerator_type=TRAIN_GPU.name,
        accelerator_count=TRAIN_NGPU,
    )
else:
    run_kwargs["accelerator_count"] = 0

# Start the training
model = job.run(**run_kwargs)

Training script copied to:
gs://torch-training-ps1/aiplatform-2022-08-09-01:42:28.217-aiplatform_custom_trainer_script-0.1.tar.gz.
Training Output directory:
gs://torch-training-ps1/aiplatform-custom-training-2022-08-09-01:42:28.370 
View Training:
https://console.cloud.google.com/ai/platform/locations/us-west1/training/5750415026942902272?project=563018788117
View backing custom job:
https://console.cloud.google.com/ai/platform/locations/us-west1/training/1462988181686190080?project=563018788117
CustomTrainingJob projects/563018788117/locations/us-west1/trainingPipelines/5750415026942902272 current state:
PipelineState.PIPELINE_STATE_RUNNING
CustomTrainingJob projects/563018788117/locations/us-west1/trainingPipelines/5750415026942902272 current state:
PipelineState.PIPELINE_STATE_RUNNING
CustomTrainingJob projects/563018788117/locations/us-west1/trainingPipelines/5750415026942902272 current state:
PipelineState.PIPELINE_STATE_RUNNING
CustomTrainingJob projects/563018788117/locations/u