1. Initialize script parameters

In [None]:
# Training mode switch: True selects the multi-node job cell, False selects the single-node job cell
MULTI_NODE = True

In [None]:
# Input your project id (replace the placeholder inside the quotes).
# The value must be a string: it is concatenated into resource names further down.
PROJECT_ID = "<PROJECT_ID>"  # @param {type:"string"}

In [None]:
# Input your GCP resource region (replace the placeholder inside the quotes).
# The value must be a string: it is passed as the location of the bucket,
# the tensorboard instance and the aiplatform SDK.
REGION = "<REGION>"  # @param {type: "string"}

In [None]:
# Input your Cloud Storage bucket URI, example: gs://<bucketname>
BUCKET_URI = ""  # @param {type:"string"}

In [None]:
# Input your custom training image URI (used as the container image of every worker pool)
IMAGE_URI = "" # @param {type:"string"}

In [None]:
# Input the display name for the Vertex AI tensorboard instance created in section 4
TENSORBOARD_DISPLAY_NAME = "llama2_tensorboard" # @param {type:"string"}

In [None]:
# Input the service account passed to the training job submission.
# Please create the service account before the training.
SVC_ACCOUNT = "" # @param {type:"string"}

2. Create Service Account (skip this section if you have an existing service account)

In [None]:
# Used as both the account id and the display name of the new service account
service_account_name = "llama2-finetune"

In [None]:
import os

from google.oauth2 import service_account  # type: ignore
import googleapiclient.discovery  # type: ignore

def create_service_account(project_id: str, name: str, display_name: str) -> str:
    """Create an IAM service account and return its email address.

    Credentials are loaded from the key file that the
    GOOGLE_APPLICATION_CREDENTIALS environment variable points to.

    Args:
        project_id: Project in which the account is created.
        name: Value sent as the new account's ``accountId``.
        display_name: Display name of the new account.

    Returns:
        The ``email`` field of the service account returned by the IAM API.

    Raises:
        KeyError: If GOOGLE_APPLICATION_CREDENTIALS is not set.
    """
    key_file = os.environ["GOOGLE_APPLICATION_CREDENTIALS"]
    creds = service_account.Credentials.from_service_account_file(
        filename=key_file,
        scopes=["https://www.googleapis.com/auth/cloud-platform"],
    )
    iam = googleapiclient.discovery.build("iam", "v1", credentials=creds)

    # Build the request first, then send it, so each step is easy to inspect.
    request_body = {"accountId": name, "serviceAccount": {"displayName": display_name}}
    create_request = iam.projects().serviceAccounts().create(
        name="projects/" + project_id,
        body=request_body,
    )
    created = create_request.execute()

    email = created["email"]
    print("Created service account: " + email)
    return email

In [None]:
# Create the account and keep its email for the job submission
SVC_ACCOUNT = create_service_account(
    project_id=PROJECT_ID,
    name=service_account_name,
    display_name=service_account_name,
)

3. Create Cloud Storage Bucket (skip this section if you have an existing Cloud Storage bucket)

In [None]:
# Name of the bucket to create; the project id is appended to a fixed prefix
bucket_name = f"llama2-finetune-bucket-{PROJECT_ID}"

In [None]:
from google.cloud import storage


def create_bucket_class_location(bucket_name):
    """Create a Cloud Storage bucket with the NEARLINE storage class.

    The bucket is created in the location given by the notebook-level
    ``REGION`` variable.

    Args:
        bucket_name: Name of the bucket to create.

    Returns:
        The bucket object returned by ``create_bucket`` (not a string).
    """
    client = storage.Client()

    # The storage class has to be set on the bucket object before creation.
    pending_bucket = client.bucket(bucket_name)
    pending_bucket.storage_class = "NEARLINE"
    created = client.create_bucket(pending_bucket, location=REGION)

    summary = "Created bucket {} in {} with storage class {}".format(
        created.name, created.location, created.storage_class
    )
    print(summary)
    return created

In [None]:
# create_bucket_class_location returns a bucket object, not a string, so the
# URI is built from the bucket's name (str + bucket object would raise TypeError).
BUCKET_URI = "gs://" + create_bucket_class_location(bucket_name=bucket_name).name

4. Create a Vertex AI TensorBoard instance

In [None]:
# Create tensorboard instance
from google.cloud import aiplatform
def create_tensorboard_sample(
    project: str,
    display_name: str,
    location: str,
):
    """Create a Vertex AI tensorboard instance and return its resource name.

    Args:
        project: Project passed to the aiplatform SDK and to the create call.
        display_name: Display name of the new tensorboard instance.
        location: Location passed to the aiplatform SDK and to the create call.

    Returns:
        The ``resource_name`` of the created tensorboard instance.
    """
    aiplatform.init(project=project, location=location)

    tb_instance = aiplatform.Tensorboard.create(
        display_name=display_name,
        project=project,
        location=location,
    )

    resource_name = tb_instance.resource_name
    print(resource_name)
    return resource_name

In [None]:
# Create a new tensorboard instance and keep its resource name.
# To reuse an existing instance instead, assign its resource name directly, example:
# projects/<project_number>/locations/<region>/tensorboards/<tensorboard_instance_id>
TENSORBOARD = create_tensorboard_sample(
    project=PROJECT_ID,
    display_name=TENSORBOARD_DISPLAY_NAME,
    location=REGION,
)

5. Build custom training image

6. Create Vertex AI custom training job

In [None]:
# Input cluster spec parameters for worker pool 0 (the primary replica):
# accelerators per node and number of replicas
HEAD_NODE_ACCELERATOR_COUNT = 1
HEAD_NODE_COUNT = 1

# For multi-node training, set the same two parameters for the second worker pool
WORKER_NODE_ACCELERATOR_COUNT = 1
WORKER_NODE_COUNT = 1

In [None]:
# Set up Python package dependencies
from datetime import datetime
from google.cloud import aiplatform

# Initialize the aiplatform SDK with the project, staging bucket and region chosen above
aiplatform.init(project=PROJECT_ID, staging_bucket=BUCKET_URI, location=REGION)

In [None]:
# Execute this cell to define a single-node training job.
# It only takes effect when MULTI_NODE is falsy; the job is submitted in the last cell.
if not MULTI_NODE:
    worker_pool_specs = [
        # `WorkerPoolSpec` for worker pool 0, primary replica, required
        {
            "machine_spec": {
                "machine_type": "a2-highgpu-1g",  # "a2-highgpu-1g","g2-standard-12","n1-standard-4"
                "accelerator_type": "NVIDIA_TESLA_A100",  # "NVIDIA_L4",
                "accelerator_count": HEAD_NODE_ACCELERATOR_COUNT,
            },
            "replica_count": HEAD_NODE_COUNT,
            "container_spec": {
                "image_uri": IMAGE_URI,
                "command": [],
                "args": [],
                "env": [],
            },
            "disk_spec": {
                "boot_disk_size_gb": 1000,
            },
        },
    ]

    # A timestamp suffix keeps the display name distinct between runs.
    TIMESTAMP = datetime.now().strftime("%Y%m%d%H%M%S")
    JOB_NAME = "llama2finetunecustomjob " + TIMESTAMP

    my_job = aiplatform.CustomJob(
        display_name=JOB_NAME,
        worker_pool_specs=worker_pool_specs,
        base_output_dir=BUCKET_URI,
    )

In [None]:
# Execute this cell to define a multi-node training job.
# It only takes effect when MULTI_NODE is truthy; the job is submitted in the last cell.
if MULTI_NODE:
    worker_pool_specs = [
        # `WorkerPoolSpec` for worker pool 0, primary replica, required
        {
            "machine_spec": {
                # The accelerator type must match the machine family:
                # g2-standard-* machines carry NVIDIA_L4 GPUs, while
                # NVIDIA_TESLA_A100 requires an a2-highgpu-* machine type.
                "machine_type": "g2-standard-12",  # "a2-highgpu-1g","g2-standard-12","n1-standard-4"
                "accelerator_type": "NVIDIA_L4",  # "NVIDIA_TESLA_A100" with "a2-highgpu-1g"
                "accelerator_count": HEAD_NODE_ACCELERATOR_COUNT,
            },
            "replica_count": HEAD_NODE_COUNT,
            "container_spec": {
                "image_uri": IMAGE_URI,
                "command": [],
                "args": [],
                "env": [],
            },
            # Same boot disk size as the worker pool and the single-node head node.
            "disk_spec": {
                "boot_disk_size_gb": 1000,
            },
        },
        # `WorkerPoolSpec` for worker pool 1, the additional worker nodes
        {
            "machine_spec": {
                "machine_type": "g2-standard-12",  # "a2-highgpu-1g","g2-standard-12","n1-standard-4"
                "accelerator_type": "NVIDIA_L4",  # "NVIDIA_TESLA_A100" with "a2-highgpu-1g"
                "accelerator_count": WORKER_NODE_ACCELERATOR_COUNT,
            },
            "replica_count": WORKER_NODE_COUNT,
            "container_spec": {
                "image_uri": IMAGE_URI,
                "command": [],
                "args": [],
                "env": [],
            },
            "disk_spec": {
                "boot_disk_size_gb": 1000,
            },
        },
    ]

    # A timestamp suffix keeps the display name distinct between runs.
    TIMESTAMP = datetime.now().strftime("%Y%m%d%H%M%S")
    JOB_NAME = "llama2finetunecustomjob " + TIMESTAMP

    my_job = aiplatform.CustomJob(
        display_name=JOB_NAME,
        worker_pool_specs=worker_pool_specs,
        base_output_dir=BUCKET_URI,
    )

In [None]:
# Submit the Vertex AI custom training job defined by whichever job cell ran above
my_job.submit(    
    enable_web_access=True, # For debugging
    service_account=SVC_ACCOUNT,
    tensorboard=TENSORBOARD,
)