In [1]:
! pip3 install --upgrade google-cloud-aiplatform

Collecting google-cloud-aiplatform
  Obtaining dependency information for google-cloud-aiplatform from https://files.pythonhosted.org/packages/f9/00/13c8a1c052d8205875c888281a34ea10a181d5ca7b6d2003c28fb1da1a03/google_cloud_aiplatform-1.36.4-py2.py3-none-any.whl.metadata
  Downloading google_cloud_aiplatform-1.36.4-py2.py3-none-any.whl.metadata (27 kB)
Downloading google_cloud_aiplatform-1.36.4-py2.py3-none-any.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m38.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: google-cloud-aiplatform
  Attempting uninstall: google-cloud-aiplatform
    Found existing installation: google-cloud-aiplatform 1.31.0
    Uninstalling google-cloud-aiplatform-1.31.0:
      Successfully uninstalled google-cloud-aiplatform-1.31.0
Successfully installed google-cloud-aiplatform-1.36.4


In [29]:
import json
import os
import posixpath
import sys
import time
from datetime import datetime
from typing import Optional

import google.auth
import google.auth.transport.grpc
import requests
from google.api_core import operations_v1
from google.auth.transport.requests import Request
from google.cloud import aiplatform, language, storage, aiplatform_v1
from google.longrunning import operations_pb2

In [30]:
# Cloud project id.
PROJECT_ID = "project-kangwe-poc"  # @param {type:"string"}

# Region for launching jobs.
REGION = "us-central1"  # @param {type:"string"}

# Cloud Storage bucket for storing experiments output.
# Start with gs:// prefix, e.g. gs://foo_bucket.
BUCKET_URI = "gs://llama2ft-project-kangwe-poc-unique"  # @param {type:"string"}

! gcloud config set project $PROJECT_ID
! gcloud services enable language.googleapis.com


STAGING_BUCKET = os.path.join(BUCKET_URI, "temporal")
EXPERIMENT_BUCKET = os.path.join(BUCKET_URI, "peft")
DATA_BUCKET = os.path.join(EXPERIMENT_BUCKET, "data")
BASE_MODEL_BUCKET = os.path.join(EXPERIMENT_BUCKET, "base_model")
MODEL_BUCKET = os.path.join(EXPERIMENT_BUCKET, "model")
PREDICTION_BUCKET = os.path.join(EXPERIMENT_BUCKET, "prediction")

# The service account looks like:
# '@.iam.gserviceaccount.com'
# Please go to https://cloud.google.com/iam/docs/service-accounts-create#iam-service-accounts-create-console
# and create service account with `Vertex AI User` and `Storage Object Admin` roles.
# The service account for deploying fine tuned model.
SERVICE_ACCOUNT = "llamafinetune@project-kangwe-poc.iam.gserviceaccount.com"  # @param {type:"string"}

Updated property [core/project].


In [31]:
# Initialize the Vertex AI SDK with the project, region and staging bucket
# defined in the configuration cell.
aiplatform.init(
    project=PROJECT_ID,
    location=REGION,
    staging_bucket=STAGING_BUCKET,
)

In [32]:
# Pre-built vLLM serving container image; used as the `imageUri` of the
# uploaded model's container spec.
VLLM_DOCKER_URI = (
    "us-central1-docker.pkg.dev/"
    "project-kangwe-poc/llmserve/"
    "vllm-runtime:1.0"
)

In [33]:
base_model_name = "llama2-7b-hf"  # @param ["llama2-7b-hf", "llama2-7b-chat-hf", "llama2-13b-hf", "llama2-13b-chat-hf", "llama2-70b-hf", "llama2-70b-chat-hf"]
# The base model lives directly under the bucket root. Cloud Storage URIs
# always use "/" separators, so join with posixpath rather than os.path.
base_model_id = posixpath.join(BUCKET_URI, base_model_name)
print(base_model_id)

gs://llama2ft-project-kangwe-poc-unique/llama2-7b-hf


In [35]:
def get_job_name_with_datetime(prefix: str) -> str:
    """Builds a job name by appending the current local timestamp to `prefix`.

    Args:
        prefix: Leading part of the job name.

    Returns:
        `prefix` followed by `_YYYYMMDD_HHMMSS`, taken from `datetime.now()`
        at call time.
    """
    timestamp_suffix = datetime.now().strftime("_%Y%m%d_%H%M%S")
    return "".join((prefix, timestamp_suffix))

def upload_model_vllm(
    model_name: str,
    model_id: Optional[str] = None,
    tensor_parallel_size: int = 2,
    poll_interval_secs: float = 30.0,
    timeout_secs: float = 1800.0,
) -> aiplatform.Model:
    """Uploads a vLLM-served model to Vertex AI through the REST API.

    Sends a `models:upload` request whose container spec launches the vLLM
    image (`VLLM_DOCKER_URI`), then polls the returned long-running operation
    until it finishes.

    Args:
        model_name: Used both as the model ID and as the display name.
        model_id: Value passed to the container's `--model` flag. When None,
            the notebook-level `base_model_id` is used (the original
            behavior).
        tensor_parallel_size: Value passed to the container's
            `--tensor-parallel-size` flag. `deploy_model_vllm` passes its
            `accelerator_count` here.
        poll_interval_secs: Seconds to wait between two operation checks.
        timeout_secs: Maximum number of seconds to wait for the upload
            operation before giving up.

    Returns:
        An `aiplatform.Model` handle for the uploaded model.

    Raises:
        RuntimeError: If the upload request is rejected or the operation
            finishes with an error.
        TimeoutError: If the operation is not done within `timeout_secs`.
    """
    if model_id is None:
        # Backward-compatible default: serve the notebook-level base model.
        model_id = base_model_id

    upload_url = (
        f"https://{REGION}-aiplatform.googleapis.com/v1/projects/{PROJECT_ID}"
        f"/locations/{REGION}/models:upload"
    )

    container_spec = {
        "imageUri": VLLM_DOCKER_URI,
        "command": ["python3", "/root/scripts/launcher.py"],
        "args": [
            "--host=0.0.0.0",
            "--port=7080",
            f"--model={model_id}",
            f"--tensor-parallel-size={tensor_parallel_size}",
            "--swap-space=16",
        ],
        "ports": [{"containerPort": 7080}],
        "predictRoute": "/generate",
        "healthRoute": "/ping",
        "sharedMemorySizeMb": "6000",
    }

    upload_request = {
        "modelId": model_name,
        "model": {
            "displayName": model_name,
            "containerSpec": container_spec,
            "modelSourceInfo": {"sourceType": "CUSTOM"},
        },
    }

    # Obtain an access token from the application default credentials.
    credentials, _ = google.auth.default()
    auth_request = Request()
    credentials.refresh(auth_request)

    headers = {
        "Authorization": f"Bearer {credentials.token}",
        "Content-Type": "application/json",
    }

    response = requests.post(
        upload_url,
        headers=headers,
        data=json.dumps(upload_request),
        timeout=60,
    )
    if not response.ok:
        # Surface the service's error body instead of failing later with a
        # KeyError on the missing operation name.
        raise RuntimeError(
            f"Model upload request failed with HTTP {response.status_code}: "
            f"{response.text}"
        )

    # The response describes a long-running operation; keep its name to poll it.
    operation_name = response.json()["name"]

    channel = google.auth.transport.grpc.secure_authorized_channel(
        credentials, auth_request, f"{REGION}-aiplatform.googleapis.com"
    )
    try:
        operations_client = operations_v1.OperationsClient(channel=channel)
        deadline = time.monotonic() + timeout_secs

        while True:
            operation = operations_client.get_operation(operation_name)
            if operation.done:
                break
            if time.monotonic() >= deadline:
                raise TimeoutError(
                    f"Model upload operation {operation_name} did not finish "
                    f"within {timeout_secs} seconds"
                )
            print('Operation still in progress')
            time.sleep(poll_interval_secs)
    finally:
        # Release the gRPC channel whether polling succeeded or not.
        channel.close()

    if operation.HasField('error'):
        print('Operation failed')
        raise RuntimeError(
            f"This is error when upload model: {operation.error.message}"
        )
    print('Operation completed successfully')

    model_resource_name = (
        f"projects/{PROJECT_ID}/locations/{REGION}/models/{model_name}"
    )
    return aiplatform.Model(model_name=model_resource_name)


def deploy_model_vllm(
    model_name: str,
    model_id: str,
    service_account: str,
    machine_type: str = "n1-standard-8",
    accelerator_type: str = "NVIDIA_TESLA_V100",
    accelerator_count: int = 1,
) -> tuple[aiplatform.Model, aiplatform.Endpoint]:
    """Deploys a model served by vLLM to a new Vertex AI endpoint.

    Creates an endpoint named `<model_name>-endpoint`, uploads the model with
    `upload_model_vllm`, and deploys it to that endpoint.

    Args:
        model_name: Model ID / display name; also the endpoint name prefix.
        model_id: Model location forwarded to the vLLM `--model` flag.
        service_account: Service account the deployment runs as.
        machine_type: Machine type of the serving nodes.
        accelerator_type: Accelerator type attached to each node.
        accelerator_count: Number of accelerators per node; also forwarded as
            the vLLM tensor-parallel size so that the two always agree.

    Returns:
        The uploaded model and the endpoint it was deployed to.
    """
    endpoint = aiplatform.Endpoint.create(display_name=f"{model_name}-endpoint")

    # Forward the caller's model location and accelerator count so that the
    # serving container is configured for the hardware requested below.
    model = upload_model_vllm(
        model_name=model_name,
        model_id=model_id,
        tensor_parallel_size=accelerator_count,
    )

    model.deploy(
        endpoint=endpoint,
        machine_type=machine_type,
        accelerator_type=accelerator_type,
        accelerator_count=accelerator_count,
        deploy_request_timeout=1800,
        service_account=service_account,
    )
    return model, endpoint

In [None]:
# Accelerators and regions supported by Vertex AI prediction are listed at
# https://cloud.google.com/vertex-ai/docs/predictions/configure-compute.
#
# Serving hardware options, as (machine_type, accelerator_type, accelerator_count):
#
# LLaMA2 7B
#   1 x V100 (16G): ("n1-standard-8", "NVIDIA_TESLA_V100", 1)
#       Better throughput and latency than L4 serving.
#   1 x L4 (24G):   ("g2-standard-8", "NVIDIA_L4", 1)
#       More cost efficient than V100 serving.
#
# LLaMA2 13B
#   1 x A100 (40G): ("a2-highgpu-1g", "NVIDIA_TESLA_A100", 1)
#   If A100 is not available, multiple V100s or L4s can be used instead;
#   keep in mind that this is less efficient than serving with 1 A100.
#   2 x V100 (16G): ("n1-standard-16", "NVIDIA_TESLA_V100", 2)
#       Better throughput and latency than L4 serving.
#   2 x L4 (24G):   ("g2-standard-24", "NVIDIA_L4", 2)
#       More cost efficient than V100 serving.
#
# LLaMA2 70B
#   4 x A100 (40G): ("a2-highgpu-4g", "NVIDIA_TESLA_A100", 4)
#   8 x L4 (24G):   ("g2-standard-96", "NVIDIA_L4", 8)
#       Alternative when 4 A100 (40G) GPUs are not accessible.

# Selected option: 2 x L4 (24G).
machine_type, accelerator_type, accelerator_count = "g2-standard-24", "NVIDIA_L4", 2

model_without_peft_vllm, endpoint_without_peft_vllm = deploy_model_vllm(
    get_job_name_with_datetime(prefix="llama2-serve-vllm"),
    base_model_id,
    SERVICE_ACCOUNT,
    machine_type=machine_type,
    accelerator_type=accelerator_type,
    accelerator_count=accelerator_count,
)

Creating Endpoint
Create Endpoint backing LRO: projects/725014442001/locations/us-central1/endpoints/6051133656162893824/operations/7675775816935931904
Endpoint created. Resource name: projects/725014442001/locations/us-central1/endpoints/6051133656162893824
To use this Endpoint in another session:
endpoint = aiplatform.Endpoint('projects/725014442001/locations/us-central1/endpoints/6051133656162893824')
Operation still in progress
Operation still in progress
Operation still in progress
Operation completed successfully
Deploying model to Endpoint : projects/725014442001/locations/us-central1/endpoints/6051133656162893824
Deploy Endpoint model backing LRO: projects/725014442001/locations/us-central1/endpoints/6051133656162893824/operations/8682611808629948416


In [None]:
# One generation request: the prompt plus its sampling parameters.
instance = dict(
    prompt="Hi, Google.",
    n=1,
    max_tokens=50,
    temperature=1.0,
    top_p=1.0,
    top_k=10,
)

# Send the request to the endpoint created by the deployment cell and show
# the first prediction.
response = endpoint_without_peft_vllm.predict(instances=[instance])
print(response.predictions[0])