In [None]:
#
# Copyright 2023 Google LLC
# 
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# 
#     https://www.apache.org/licenses/LICENSE-2.0
# 
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Deepspeed Chat on Vertex AI


NOTE: This is an example to test multi-node training with DeepSpeed on Vertex AI.

DeepSpeed-Chat has three steps: supervised fine-tuning (SFT), reward model training, and RLHF. This notebook runs only the SFT step.

## Setup Container

In [1]:
# Artifact Registry coordinates of the training image:
# repository name, image name, and image tag.
AR_REPO = "llama2"
IMG_NAME = "deepspeed-chat"
TAG = "vertex"

In [2]:
# Project information.
# PROJECT_ID and BUCKET hold placeholder values; replace them with your own
# project ID and Cloud Storage bucket before running.
PROJECT_ID = "PROJECT_ID"
LOCATION = "us-central1"
BUCKET = "gs://BUCKET_NAME"

# Full image URI on the regional Artifact Registry host:
# <location>-docker.pkg.dev/<project>/<repo>/<image>:<tag>
IMAGE_URI = (
    f"{LOCATION}-docker.pkg.dev/{PROJECT_ID}/{AR_REPO}/{IMG_NAME}:{TAG}"
)

In [7]:
# build dockerfile
EXAMPLE_DIR="deepspeed-chat"
DOCKERFILE=f"examples/{EXAMPLE_DIR}/{IMG_NAME}.Dockerfile"
!echo $DOCKERFILE
!docker build . -t $IMAGE_URI -f $DOCKERFILE

examples/deepspeed-chat/deepspeed-chat.Dockerfile
Sending build context to Docker daemon  4.584MB
Step 1/31 : FROM us-docker.pkg.dev/vertex-ai/training/pytorch-gpu.1-13.py310:latest
 ---> 78c144ecd81c
Step 2/31 : ENV PYTHONPATH=/opt/conda/lib/python3.10/site-packages:${PYTHONPATH}
 ---> Using cache
 ---> 5cd78fb02482
Step 3/31 : RUN chmod 666 /var/log-storage/output.log
 ---> Using cache
 ---> a19250deeb80
Step 4/31 : ENV STAGE_DIR=/tmp
 ---> Using cache
 ---> 626b4bc63513
Step 5/31 : RUN mkdir -p ${STAGE_DIR}
 ---> Using cache
 ---> 9d1aef5c48c3
Step 6/31 : RUN apt-get update  && apt-get install -y --no-install-recommends  openssh-client openssh-server  dnsutils iputils-ping  net-tools  libaio-dev cmake ninja-build pdsh  && rm -rf /var/lib/apt/lists/*
 ---> Using cache
 ---> 84d3bb091c45
Step 7/31 : ENV SSH_PORT=2222
 ---> Using cache
 ---> a10ccb585bb3
Step 8/31 : COPY config/sshd_config.sed /tmp
 ---> Using cache
 ---> 70245923db33
Step 9/31 : RUN sed -i -E -f /tmp/sshd_config.sed /

In [None]:
# If this throws error
# add "us-docker.pkg.dev": "gcloud" to /home/jupyter/.docker/config.json
!gcloud auth configure-docker

In [8]:
# Make sure the repo specified in $AR_REPO exists.
# Push the docker image
!echo $IMAGE_URI
!docker push $IMAGE_URI

us-central1-docker.pkg.dev/argolis-lsj-test/llama2/deepspeed-chat:1020
The push refers to repository [us-central1-docker.pkg.dev/argolis-lsj-test/llama2/deepspeed-chat]

[1Bb12a657e: Preparing 
[1Bdd9f5d1b: Preparing 
[1Bfb35c5a7: Preparing 
[1Bc835feb3: Preparing 
[1Becc297d5: Preparing 
[1B12f16e66: Preparing 
[1B9d07f065: Preparing 
[1Bee50daf9: Preparing 
[1B2839c42e: Preparing 
[1Bc1a7a815: Preparing 
[1B04c8cac9: Preparing 
[1B9b3321f2: Preparing 
[1B7f13fbf0: Preparing 
[1Bfe2a49dd: Preparing 
[1B64833029: Preparing 
[1B183c9ad8: Preparing 
[1B74c7df37: Preparing 
[1B09d25265: Preparing 
[1B225e8740: Preparing 
[1Bdc2fb310: Preparing 
[1Ba15e9e6d: Preparing 
[1Bb05b4a66: Preparing 
[1B03eb5103: Preparing 
[1B105d38de: Preparing 
[1B6867eca5: Preparing 
[1Beb8da3b6: Preparing 
[1B3741a401: Preparing 
[1B4314a1a9: Preparing 
[1Be1a4db2c: Preparing 
[1B2d93004e: Preparing 
[1Bc5d23056: Preparing 
[1B988466f1: Preparing 
[1Ba520fb4d: Preparing 
[1B1

## Test container with aiplatform.CustomJob

In [3]:
from datetime import datetime
from google.cloud import aiplatform

# Set the SDK defaults (project, region, staging bucket) used by the
# aiplatform calls below.
aiplatform.init(
    project=PROJECT_ID,
    location=LOCATION,
    staging_bucket=BUCKET,
)

### Configure the custom job



- DATA_PATHS - dataset name in huggingface or GCS path
- MODEL_PATH - model name in huggingface or GCS path
- DATA_SPLIT - "10,40,50" means 10% of the data is used for SFT. The DeepspeedChat code converts the string into fractions (data_utils.py). We do not demo it in this notebook.
- ZERO_STAGE - ZERO stage
- PER_DEVICE_BATCH_SIZE - training batch size

The demo shows how to fine-tune Llama2-7b-hf on the samsum dataset using SFT (*train_deepspeed_sft.sh*) or LoRA (*train_deepspeed_lora.sh*). To be compatible with the samsum dataset and users' customized datasets, the samsum interface and a customized dataset interface are implemented in *third_party/utils/data/data_utils.py* and *raw_datasets.py*.

Uploading the model to Vertex AI Model Registry is also implemented, in *main.py* and *model_registry.py*. If *serving_container_image_uri* is not None, the registration process will start.

In [4]:
# Configure machine specs and environment variables.
# Each entry describes one worker pool: machine shape, replica count, the
# training container, and the environment variables passed to it.
worker_pool_specs = [
    # `WorkerPoolSpec` for worker pool 0, primary replica, required
    {
        "machine_spec": {
            "machine_type": "g2-standard-96",
            "accelerator_type": "NVIDIA_L4",
            "accelerator_count": 8,
        },
        "replica_count": 1,
        "container_spec": {
            "image_uri": IMAGE_URI,
            "command": [],
            "args": [],
            # Expanded into the [{"name": ..., "value": ...}, ...] list form.
            "env": [
                {"name": env_name, "value": env_value}
                for env_name, env_value in {
                    "MODEL_PATH": "/gcs/deepspeed_repo/base_model/Llama-2-7b-hf/Llama-2-7b-hf",
                    "DATA_PATHS": "/gcs/deepspeed_repo/dataset/samsum",
                    "DATA_SPLIT": "10,0,0",
                    "ZERO_STAGE": "3",
                    "PER_DEVICE_BATCH_SIZE": "4",
                }.items()
            ],
        },
        "disk_spec": {
            "boot_disk_size_gb": 1000,
        },
    },
    # Additional worker pool (disabled); uncomment to add it.
    # {
    #     "machine_spec": {
    #         "machine_type": "g2-standard-48",
    #         "accelerator_type": "NVIDIA_L4",
    #         "accelerator_count": 4,
    #     },
    #     "replica_count": 1,
    #     "container_spec": {
    #         "image_uri": IMAGE_URI,
    #         "command": [],
    #         "args": [],
    #         "env": [
    #             {"name": "MODEL_PATH", "value": "/gcs/deepspeed_repo/base_model/Llama-2-7b-hf/Llama-2-7b-hf"},
    #             {"name": "DATA_PATHS", "value": "samsum"},
    #             {"name": "DATA_SPLIT", "value": "10,0,0"},
    #             {"name": "ZERO_STAGE", "value": "3"},
    #             {"name": "PER_DEVICE_BATCH_SIZE", "value": "4"},
    #         ],
    #     },
    #     "disk_spec": {
    #         "boot_disk_size_gb": 1000,
    #     },
    # },
]

# Timestamp suffix (YYYYmmddHHMMSS, from datetime.now()) so that each run gets
# its own job display name.
TIMESTAMP = f"{datetime.now():%Y%m%d%H%M%S}"
JOB_NAME = f"DeepSpeed Chat Test {TIMESTAMP}"

my_job = aiplatform.CustomJob(
    display_name=JOB_NAME,    
    worker_pool_specs=worker_pool_specs,
)

# Checking Service account that will launch the job
!gcloud config get account

703099487153-compute@developer.gserviceaccount.com


In [5]:
# Either create or reuse a Vertex AI TensorBoard instance.
#
# Option A - create a new instance:
#   tensorboard = aiplatform.Tensorboard.create(
#       display_name=JOB_NAME,
#   )
#
# Option B (used here) - reuse an existing instance by its ID.
# The value below is a placeholder; replace it with your TensorBoard ID.
tensorboard_name = "VERTEX AI TENSORBOARD ID"
tensorboard = aiplatform.Tensorboard(
    tensorboard_name=tensorboard_name,
)

# The resource name printed here is what gets passed to the job on submit.
print(tensorboard.resource_name)

projects/703099487153/locations/us-central1/tensorboards/2069606338916253696


### Running the CustomJob

Custom Service Account - https://cloud.google.com/vertex-ai/docs/general/custom-service-account. For custom service account, be sure to first grant the SA running this notebook the "Service Account User" role, otherwise you won't be able to launch the job with the custom service account.

Tensorboard - https://cloud.google.com/vertex-ai/docs/experiments/tensorboard-training. Your training script must be configured to write TensorBoard logs to the Cloud Storage bucket, the location of which the Vertex AI Training Service will automatically make available through a predefined environment variable AIP_TENSORBOARD_LOG_DIR.

In [6]:

# Submit the custom job.
# service_account holds a placeholder; replace it with the service account
# the job should run as.
my_job.submit(
    service_account="SERVICE ACCOUNT NAME",
    tensorboard=tensorboard.resource_name,
    enable_web_access=True,  # For debugging
)

Creating CustomJob
CustomJob created. Resource name: projects/703099487153/locations/us-central1/customJobs/2406458012601417728
To use this CustomJob in another session:
custom_job = aiplatform.CustomJob.get('projects/703099487153/locations/us-central1/customJobs/2406458012601417728')
View Custom Job:
https://console.cloud.google.com/ai/platform/locations/us-central1/training/2406458012601417728?project=703099487153
View Tensorboard:
https://us-central1.tensorboard.googleusercontent.com/experiment/projects+703099487153+locations+us-central1+tensorboards+2069606338916253696+experiments+2406458012601417728


While the DeepSpeed Chat team has auto-tuning on its roadmap, if you encounter CUDA OOM errors right now, their advice is:
- Reduce `--per_device_*_batch_size`,
- Increase `--zero_stage {0,1,2,3}` on multi-gpu setups,
- Enable `--gradient_checkpointing` or `--only_optimize_lora`,
- Increase `--gradient_accumulation_steps {#}`; a higher number reduces communication of gradients between steps

# Configurations backup

Testing facebook/opt-125m and Dahoas/synthetic-instruct-gptj-pairwise on 2 nodes (each an n1-standard-4 with 1 T4 GPU):

With PER_DEVICE_BATCH_SIZE = 8, each of the 2 nodes uses less than half of its T4's memory; with 32, memory usage is over 70%.

In [None]:
# opt-125m backup: environment variables for a small test run.
# Bound to a name so it can be used directly as the "env" value of a worker
# pool's container_spec ("env": OPT_125M_ENV) instead of being copy-pasted.
OPT_125M_ENV = [
    {"name": "MODEL_PATH", "value": "facebook/opt-125m"},
    {"name": "DATA_PATHS", "value": "Dahoas/synthetic-instruct-gptj-pairwise"},
    {"name": "DATA_SPLIT", "value": "10,40,50"},
    {"name": "ZERO_STAGE", "value": "3"},
    {"name": "PER_DEVICE_BATCH_SIZE", "value": "32"},
]
OPT_125M_ENV

In [None]:
# llama2 backup: environment variables for fine-tuning Llama 2 pulled from
# Hugging Face.
# If downloading Llama 2 from Hugging Face, first copy your Hugging Face read
# token into the token file.
# Bound to a name so it can be used directly as the "env" value of a worker
# pool's container_spec ("env": LLAMA2_HF_ENV) instead of being copy-pasted.
LLAMA2_HF_ENV = [
    {"name": "MODEL_PATH", "value": "meta-llama/Llama-2-7b-hf"},
    {"name": "DATA_PATHS", "value": "Dahoas/rm-static Dahoas/full-hh-rlhf Dahoas/synthetic-instruct-gptj-pairwise yitingxie/rlhf-reward-datasets"},
    {"name": "DATA_SPLIT", "value": "2,4,4"},
    {"name": "ZERO_STAGE", "value": "3"},
    {"name": "PER_DEVICE_BATCH_SIZE", "value": "4"},
]
LLAMA2_HF_ENV