# Llama2 end-to-end training and inference pipeline

This notebook shows how to create and set up an end-to-end training, deployment, and inference pipeline to demo LLMOps on Vertex AI. The model is Llama2; training is based on DeepSpeed-Chat with the torchrun launcher, and deployment is based on vLLM. Both training and serving run on Vertex AI. The pipeline also uses Firestore (Datastore) to store model and endpoint information for later retrieval.

The pipeline DAG is as below:

<div align="center"><img src="./pipeline-DAG.png" alt="Pipeline DAG" width="500" height="320"></div>

## Build serving docker image

In [None]:
%cd LLM-Tuning-On-GCP/Serve/vLLM-on-Vertex
PROJECT_ID="YOUR_PROJECT_ID"
LOCATION="us-central1"
AR_REPO="llama2"
IMAGE_NAME="llama2-serving"
TAG="vllm"
SERVE_IMAGE_NAME=f"{LOCATION}-docker.pkg.dev/{PROJECT_ID}/{AR_REPO}/{IMAGE_NAME}:{TAG}"

# You can use gcloud build to build the docker image directl in cloud
! gcloud builds submit --region=us-central1 --tag  . $SERVE_IMAGE_NAME

## Build training docker image

In [None]:
%cd LLM-Tuning-On-GCP/Train/Deepspeed/torchrun-launcher
AR_REPO="llama2"
IMG_NAME="deepspeed-chat"
TAG="vertex-torchrun"
TRAIN_IMAGE_URI=f"{LOCATION}-docker.pkg.dev/{PROJECT_ID}/{AR_REPO}/{IMG_NAME}:{TAG}"
EXAMPLE_DIR="deepspeed-chat"
DOCKERFILE=f"examples/{EXAMPLE_DIR}/{IMG_NAME}.Dockerfile"

# You can also build docker image locally and push it artifact registry
! gcloud auth configure-docker
! docker build . -t $TRAIN_IMAGE_URI -f $DOCKERFILE
! docker push $TRAIN_IMAGE_URI

## Install libraries


In [None]:
# Install/upgrade the Vertex AI SDK and the Google Cloud pipeline components
# quietly, then install the KFP SDK used to author and compile the pipeline.
! pip3 install google-cloud-aiplatform --upgrade -qq
! pip3 install google-cloud-pipeline-components --upgrade -qq
! pip3 install kfp

## Construct pipeline

In [4]:
# generate UUID for job name
import random
import string
from datetime import datetime

def generate_uuid(length: int = 8) -> str:
    """Return a random identifier made of lowercase letters and digits.

    Args:
        length: Number of characters to generate (default 8).

    Returns:
        A string of exactly ``length`` characters.
    """
    alphabet = string.ascii_lowercase + string.digits
    picked = random.choices(alphabet, k=length)
    return "".join(picked)


# Random suffix reused by the pipeline, model and job names defined in this notebook.
UUID = generate_uuid()

In [2]:
# import libraries
import google.cloud.aiplatform as aiplatform
import kfp
from kfp import compiler, dsl
from kfp.dsl import Artifact, Dataset, Input, Metrics, Model, Output, component

### Train component using Vertex Custom Training

1. This component only submits the job; it does not wait for the job to finish
2. The training job's resource name is returned as output, and the logging URI is stored in the artifact metadata

In [18]:
@component(
    base_image='python:3.10',
    packages_to_install=[
        "google-cloud-aiplatform",
        "google-cloud-datastore"
    ],
)
def train_llama2(
      project: str,
      location: str,
      stage_bucket: str,
      train_job_display_name: str,
      machine_type: str,
      accelerator_type: str,
      train_container_image_uri: str,
      llm_model_uri: str,
      data_uri: str,
      service_account: str,
      tensorboard_name: str,
      output_gcs_uri: str,
      log_uri: Output[Artifact],
      accelerator_count: int = 4,
      replica_count: int = 2,
      per_device_batch_size: int = 4,
) -> str:
    """Submit a Vertex AI custom training job and record it in Datastore.

    The job is submitted without waiting for completion. The job resource
    name and a Logs Explorer link are written to Datastore (kind ``train``,
    keys ``train_job_uri`` and ``train_log_uri``); the link is also stored in
    the ``log_uri`` artifact metadata under ``train_log_uri``.

    Args:
        project: Google Cloud project id.
        location: Vertex AI region.
        stage_bucket: Staging bucket passed to ``aiplatform.init``.
        train_job_display_name: Display name of the custom job.
        machine_type: Machine type used by every worker pool.
        accelerator_type: Accelerator type attached to every worker.
        train_container_image_uri: Training container image.
        llm_model_uri: Passed to the container as ``MODEL_PATH``.
        data_uri: Passed to the container as ``DATA_PATHS``.
        service_account: Service account the job runs as.
        tensorboard_name: Name/id of the Tensorboard instance to attach.
        output_gcs_uri: Base output directory of the custom job.
        log_uri: Output artifact that receives the log link in its metadata.
        accelerator_count: Accelerators per worker (also ``NUM_GPU_PER_NODE``).
        replica_count: Total number of workers; must be at least 1.
        per_device_batch_size: Passed to the container as ``PER_DEVICE_BATCH_SIZE``.

    Returns:
        The resource name of the submitted custom job.

    Raises:
        ValueError: If ``replica_count`` is smaller than 1.
    """
    # Imports live inside the function because KFP lightweight components
    # must be self-contained.
    from datetime import datetime, timezone

    import google.cloud.aiplatform as aiplatform
    from google.cloud import datastore

    # Previously a replica_count < 1 left worker_pool_specs unbound (NameError).
    if replica_count < 1:
        raise ValueError(f"replica_count must be >= 1, got {replica_count}")

    aiplatform.init(project=project, location=location, staging_bucket=stage_bucket)

    envs = [
        {"name": "MODEL_PATH", "value": llm_model_uri},
        {"name": "DATA_PATHS", "value": data_uri},
        {"name": "DATA_SPLIT", "value": "10,0,0"},
        {"name": "ZERO_STAGE", "value": "3"},
        {"name": "PER_DEVICE_BATCH_SIZE", "value": f"{per_device_batch_size}"},
        {"name": "NUM_GPU_PER_NODE", "value": f"{accelerator_count}"},
    ]

    def build_worker_pool_spec(replicas: int) -> dict:
        # All pools share the same machine, container and disk settings and
        # differ only in how many replicas they hold.
        return {
            "machine_spec": {
                "machine_type": machine_type,
                "accelerator_type": accelerator_type,
                "accelerator_count": accelerator_count,
            },
            "replica_count": replicas,
            "container_spec": {
                "image_uri": train_container_image_uri,
                "command": [],
                "args": [],
                "env": envs,
            },
            "disk_spec": {
                "boot_disk_size_gb": 1000,
            },
        }

    # The first pool always holds exactly one replica; any remaining replicas
    # go into a second pool.
    worker_pool_specs = [build_worker_pool_spec(1)]
    if replica_count > 1:
        worker_pool_specs.append(build_worker_pool_spec(replica_count - 1))

    # Build the training job and submit it (no waiting for completion).
    tensorboard = aiplatform.Tensorboard(tensorboard_name=tensorboard_name)
    my_job = aiplatform.CustomJob(
        display_name=train_job_display_name,
        worker_pool_specs=worker_pool_specs,
        base_output_dir=output_gcs_uri,
    )
    my_job.submit(
        enable_web_access=True,
        service_account=service_account,
        tensorboard=tensorboard.resource_name,
    )

    # The job resource name is the return value; the log link goes to the artifact.
    job_uri = my_job.to_dict()["name"]
    job_id = job_uri.split("/")[-1]
    # The trailing "Z" denotes UTC, so the timestamp must be taken in UTC.
    timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    # Link to the public Cloud Console Logs Explorer, filtered to this job.
    log_uri_value = f"https://console.cloud.google.com/logs/query;query=resource.labels.job_id%3D%22{job_id}%22%20timestamp%3E%3D%22{timestamp}%22;duration=PT3H?mods=-ai_platform_fake_service&project={project}"
    log_uri.metadata["train_log_uri"] = log_uri_value

    # Save both values to Datastore, creating the entity when it is missing.
    datastore_client = datastore.Client(project=project)

    def save_value(name: str, value: str) -> None:
        info_key = datastore_client.key("train", name)
        info = datastore_client.get(info_key)
        if info is None:
            info = datastore.Entity(key=info_key)
        info["value"] = value
        datastore_client.put(info)

    save_value("train_job_uri", job_uri)
    save_value("train_log_uri", log_uri_value)

    return job_uri


### Get training status

1. Poll the training job status in a loop
2. If the job succeeds, the component finishes successfully
3. If the job fails, the component fails as well

In [19]:
@component(
    base_image='python:3.10',
    packages_to_install=[
        "google-cloud-aiplatform",
    ],
)
def get_train_status(
      project: str,
      location: str,
      stage_bucket: str,
      job_name: str
):
    """Block until the given Vertex AI custom job reaches a terminal state.

    Polls the job every 10 seconds. Returns normally when the job succeeds
    and raises when it fails, is cancelled or expires, which makes the
    pipeline step fail.

    Args:
        project: Google Cloud project id.
        location: Vertex AI region.
        stage_bucket: Staging bucket passed to ``aiplatform.init``.
        job_name: Resource name of the custom job to watch.

    Raises:
        RuntimeError: If the job ends in FAILED, CANCELLED or EXPIRED state.
    """
    import logging
    import time

    import google.cloud.aiplatform as aiplatform

    aiplatform.init(project=project, location=location, staging_bucket=stage_bucket)
    job_instance = aiplatform.CustomJob.get(resource_name=job_name)
    logging.info("Start outputing training status here!")

    # Terminal states that can never turn into success; without the last two
    # a cancelled or expired job would be polled forever.
    failure_states = {
        "JOB_STATE_FAILED",
        "JOB_STATE_CANCELLED",
        "JOB_STATE_EXPIRED",
    }

    while True:
        # Read the state once per iteration so the log line and both checks
        # agree. NOTE(review): relies on the SDK re-fetching the job each time
        # `state` is accessed, as the original loop did - confirm.
        state_name = job_instance.state.name
        logging.info("%s: JobState.%s", job_name, state_name)
        if state_name == "JOB_STATE_SUCCEEDED":
            break
        if state_name in failure_states:
            # An explicit raise instead of `assert`, which is stripped under -O.
            raise RuntimeError(
                f"Training job {job_name} ended in state {state_name}"
            )
        time.sleep(10)

### Upload model to model registry

1. Check if model exists
2. If model doesn't exist, create a new model
3. If model exists, upload a new version and set the new version as default
4. Save model id to datastore

In [29]:
@component(
    base_image='python:3.10',
    packages_to_install=[
        "google-cloud-aiplatform",
        "google-auth",
        "google-api-core",
        "google-cloud-datastore"
    ],
)
def upload_model(
    project: str,
    location: str,
    model_display_name: str,
    model_uri: str,
    serving_container_uri: str,
) -> str:
    """Upload the trained model to the Vertex AI Model Registry.

    If no model with id ``model_display_name`` exists, a new model is created
    with that id. Otherwise a new version is uploaded under the existing
    model and given the ``default`` alias. The model resource name is saved
    to Datastore (kind ``model``, key ``model_display_name``).

    Args:
        project: Google Cloud project id.
        location: Vertex AI region.
        model_display_name: Display name, also used as the model id.
        model_uri: Location of the training output; ``{model_uri}/model`` is
            passed to the serving container as ``--model``.
        serving_container_uri: vLLM serving container image.

    Returns:
        The model resource name
        ``projects/{project}/locations/{location}/models/{model_display_name}``.

    Raises:
        RuntimeError: If the upload request is rejected by the API.
        Exception: If the long-running upload operation finishes with an error.
    """
    import json
    import logging
    import time

    import google.auth
    # Imported explicitly: the gRPC channel helper below lives in this
    # submodule and was previously reachable only through transitive imports.
    import google.auth.transport.grpc
    import google.auth.transport.requests
    import requests
    from google.api_core import exceptions as api_exceptions
    from google.api_core import operations_v1
    from google.cloud import aiplatform
    from google.cloud import datastore

    aiplatform.init(project=project, location=location)

    model_name = f"projects/{project}/locations/{location}/models/{model_display_name}"

    # Only "not found" means the model is missing; any other error (auth,
    # quota, network) must surface instead of silently creating a new model.
    try:
        aiplatform.Model(model_name=model_name)
        model_exists = True
    except api_exceptions.NotFound:
        model_exists = False

    # start upload model
    model_endpoint_url = f"https://{location}-aiplatform.googleapis.com/v1/projects/{project}/locations/{location}/models:upload"

    # if you have lora model, use model_gcs_uri and peft_model_gcs_uri in args
    # --model_gcs_uri=gcs_uri
    # --peft_model_gcs_uri=gcs_uri
    # remove --model
    model_container_spec = {
      "imageUri": serving_container_uri,
      "command": [
          "python3",
          "/root/scripts/launcher.py"
        ],
      "args": [
          "--host=0.0.0.0",
          "--port=7080",
          f"--model={model_uri}/model",
          "--tensor-parallel-size=2",
          "--swap-space=16"
      ],
      "ports": [
        {
          "containerPort": 7080
        }
      ],
      "predictRoute": "/generate",
      "healthRoute": "/ping",
      "sharedMemorySizeMb": "6000"
    }

    model_info = {
        "displayName": model_display_name,
        "containerSpec": model_container_spec,
        "modelSourceInfo": {"sourceType": "CUSTOM"},
    }

    # New model: pin the model id. Existing model: upload a new version under
    # it and move the default alias to that version.
    if model_exists:
        # versionAliases is a repeated string field, so it must be a list.
        model_info["versionAliases"] = ["default"]
        model_request = {"parentModel": model_name, "model": model_info}
    else:
        model_request = {"modelId": model_display_name, "model": model_info}

    # Get the default credentials and an access token for the REST call.
    credentials, _ = google.auth.default()
    auth_request = google.auth.transport.requests.Request()
    credentials.refresh(auth_request)
    headers = {
        "Authorization": f"Bearer {credentials.token}",
        "Content-Type": "application/json",
    }

    response = requests.post(
        model_endpoint_url,
        headers=headers,
        data=json.dumps(model_request),
        timeout=60,
    )
    # Fail with the server's message rather than a KeyError on 'name' below.
    if not response.ok:
        raise RuntimeError(
            f"Model upload request failed with HTTP {response.status_code}: {response.text}"
        )

    data = response.json()
    print(data)
    logging.info(data)
    # The response is a long-running operation; its name is used for polling.
    operation_name = data['name']

    channel = google.auth.transport.grpc.secure_authorized_channel(
        credentials, auth_request, f"{location}-aiplatform.googleapis.com")
    client = operations_v1.OperationsClient(channel=channel)

    # Poll every 30 seconds until the operation is done.
    while True:
        time.sleep(30)
        operation = client.get_operation(operation_name)

        if not operation.done:
            logging.info('Operation still in progress')
            continue
        if operation.HasField('error'):
            print('Operation failed')
            raise Exception(f"This is error when upload model: {operation.error.message}")
        # Done without an error is success; the loop must not keep spinning
        # on an operation that is done but carries no response payload.
        print('Operation completed successfully')
        break

    # Save the model resource name to Datastore, creating the entity if needed.
    datastore_client = datastore.Client(project=project)
    info_key = datastore_client.key("model", model_display_name)
    info = datastore_client.get(info_key)
    if info is None:
        info = datastore.Entity(key=info_key)
    info["value"] = model_name
    datastore_client.put(info)

    return model_name

### Create endpoint

1. Look up the endpoint by display name to check whether it exists
2. If it exists, endpoint creation is skipped
3. If it doesn't exist, the endpoint is created
4. Save the endpoint id to Datastore

In [30]:
@component(
    base_image='python:3.10',
    packages_to_install=[
        "google-cloud-aiplatform",
        "google-cloud-datastore",
        "google-auth",
        "google-api-core"
    ],
)
def create_endpoint(
    endpoint_display_name: str,
    project: str,
    location: str,
    stage_bucket: str
) -> str:
    """Return the endpoint with the given display name, creating it if needed.

    The endpoint resource name is also saved to Datastore (kind ``endpoint``,
    key ``endpoint_display_name``).

    Args:
        endpoint_display_name: Display name used to look up or create the endpoint.
        project: Google Cloud project id.
        location: Vertex AI region.
        stage_bucket: Staging bucket passed to ``aiplatform.init``.

    Returns:
        The resource name of the existing or newly created endpoint.

    Raises:
        RuntimeError: If listing endpoints is rejected by the API.
    """
    import logging

    import google.auth
    import requests
    from google.auth.transport.requests import Request
    from google.cloud import aiplatform
    from google.cloud import datastore

    list_url = f"https://{location}-aiplatform.googleapis.com/v1/projects/{project}/locations/{location}/endpoints"

    # Get the default credentials and an access token for the REST call.
    credentials, _ = google.auth.default()
    credentials.refresh(Request())
    headers = {
        "Authorization": f"Bearer {credentials.token}",
        "Content-Type": "application/json",
    }

    # Pass the filter through `params` so it is URL-encoded, and quote the
    # value so names with special characters are matched literally.
    response = requests.get(
        list_url,
        headers=headers,
        params={"filter": f'display_name="{endpoint_display_name}"'},
        timeout=60,
    )
    # Fail with the server's message rather than a KeyError on 'endpoints'.
    if not response.ok:
        raise RuntimeError(
            f"Listing endpoints failed with HTTP {response.status_code}: {response.text}"
        )

    data = response.json()
    logging.info(data)

    # Reuse the first matching endpoint; create one only when none matched.
    matches = data.get("endpoints") or []
    if matches:
        endpoint_id = matches[0]["name"]
    else:
        aiplatform.init(project=project, location=location, staging_bucket=stage_bucket)
        endpoint = aiplatform.Endpoint.create(display_name=endpoint_display_name)
        endpoint_id = endpoint.resource_name

    # Save the endpoint resource name to Datastore, creating the entity if needed.
    datastore_client = datastore.Client(project=project)
    info_key = datastore_client.key("endpoint", endpoint_display_name)
    info = datastore_client.get(info_key)
    if info is None:
        info = datastore.Entity(key=info_key)
    info["value"] = endpoint_id
    datastore_client.put(info)

    return endpoint_id

### Deploy model to endpoint

1. Check whether a model has already been deployed to the endpoint
2. If one is deployed and `if_force_undeploy` is True, undeploy it first and then deploy the new model
3. If one is deployed and `if_force_undeploy` is False, skip deployment and return directly
4. If nothing is deployed, deploy the model

In [59]:
@component(
    base_image='python:3.10',
    packages_to_install=[
        "google-cloud-aiplatform",
        "google-cloud-datastore",
        "google-auth",
        "google-api-core"
    ],
)
def deploy_model_vllm(
    project: str,
    location: str,
    stage_bucket: str,
    service_account: str,
    model_name: str,
    endpoint_name: str,
    if_force_undeploy: bool = True,
    machine_type: str = "n1-standard-8",
    accelerator_type: str = "NVIDIA_TESLA_V100",
    accelerator_count: int = 1,
) -> str:
    """Deploy a registered model to an endpoint, optionally replacing what is there.

    Args:
        project: Google Cloud project id.
        location: Vertex AI region.
        stage_bucket: Staging bucket passed to ``aiplatform.init``.
        service_account: Service account the deployed model runs as.
        model_name: Resource name of the model to deploy.
        endpoint_name: Resource name of the target endpoint.
        if_force_undeploy: When the endpoint already has deployed models,
            True undeploys all of them before deploying; False skips deployment.
        machine_type: Serving machine type.
        accelerator_type: Serving accelerator type.
        accelerator_count: Number of serving accelerators.

    Returns:
        ``"Deployed"``, ``"Existed and ignore deployment"`` or
        ``"Existed, undeploy and redeploy"`` depending on the path taken.

    Raises:
        RuntimeError: If fetching the endpoint is rejected by the API.
    """
    import logging

    import google.auth
    import requests
    from google.auth.transport.requests import Request
    from google.cloud import aiplatform

    # Fetch the endpoint to check whether any models are deployed to it.
    endpoint_url = f"https://{location}-aiplatform.googleapis.com/v1/{endpoint_name}"

    # Get the default credentials and an access token for the REST call.
    credentials, _ = google.auth.default()
    credentials.refresh(Request())
    headers = {
        "Authorization": f"Bearer {credentials.token}",
        "Content-Type": "application/json",
    }

    response = requests.get(endpoint_url, headers=headers, timeout=60)
    # An HTTP/auth error must not be mistaken for "nothing deployed".
    if not response.ok:
        raise RuntimeError(
            f"Fetching endpoint failed with HTTP {response.status_code}: {response.text}"
        )

    data = response.json()
    logging.info(data)
    has_deployed_models = bool(data.get("deployedModels"))

    aiplatform.init(project=project, location=location, staging_bucket=stage_bucket)

    model_instance = aiplatform.Model(model_name=model_name)
    endpoint_instance = aiplatform.Endpoint(endpoint_name=endpoint_name)

    status = "Deployed"
    if has_deployed_models:
        if not if_force_undeploy:
            return "Existed and ignore deployment"
        # Clear the endpoint first so the new model is the only one served.
        endpoint_instance.undeploy_all(sync=True)
        status = "Existed, undeploy and redeploy"

    model_instance.deploy(
        endpoint=endpoint_instance,
        machine_type=machine_type,
        accelerator_type=accelerator_type,
        accelerator_count=accelerator_count,
        deploy_request_timeout=1800,
        service_account=service_account,
    )
    return status

### Construct the pipeline based on all the components

In [60]:
@kfp.dsl.pipeline(name=f"train-endpoint-deploy{UUID}")
def pipeline(
      project: str,
      location: str,
      stage_bucket: str,
      train_job_display_name: str,
      train_machine_type: str,
      train_accelerator_type: str,
      train_accelerator_count: int,
      replica_count: int,
      train_container_image_uri: str,
      llm_model_uri: str,
      data_uri: str,
      per_device_batch_size: int,
      output_gcs_uri: str,
      service_account: str,
      tensorboard: str,
      model_display_name: str,
      endpoint_display_name: str,
      if_force_undeploy: bool,
      serving_container_image_uri: str,
      serving_machine_type: str,
      serving_accelerator_type: str,
      serving_accelerator_count: int,
):
    # Wiring: train -> wait for training -> upload model -> deploy.
    # Endpoint creation has no upstream dependency; only the deploy step
    # consumes its output.

    # Submit the custom training job.
    training_task = train_llama2(
        project=project,
        location=location,
        stage_bucket=stage_bucket,
        service_account=service_account,
        tensorboard_name=tensorboard,
        train_job_display_name=train_job_display_name,
        train_container_image_uri=train_container_image_uri,
        machine_type=train_machine_type,
        accelerator_type=train_accelerator_type,
        accelerator_count=train_accelerator_count,
        replica_count=replica_count,
        per_device_batch_size=per_device_batch_size,
        llm_model_uri=llm_model_uri,
        data_uri=data_uri,
        output_gcs_uri=output_gcs_uri,
    )

    # Watch the submitted job; the training task has two outputs, so its
    # return value is referenced by name.
    wait_task = get_train_status(
        project=project,
        location=location,
        stage_bucket=stage_bucket,
        job_name=training_task.outputs["Output"],
    ).after(training_task)

    # Register the model from the training output directory.
    model_task = upload_model(
        project=project,
        location=location,
        model_display_name=model_display_name,
        model_uri=output_gcs_uri,
        serving_container_uri=serving_container_image_uri,
    ).after(wait_task)

    # Look up or create the serving endpoint.
    endpoint_task = create_endpoint(
        project=project,
        location=location,
        stage_bucket=stage_bucket,
        endpoint_display_name=endpoint_display_name,
    )

    # Deploy the uploaded model to the endpoint.
    deploy_task = deploy_model_vllm(
        project=project,
        location=location,
        stage_bucket=stage_bucket,
        service_account=service_account,
        model_name=model_task.outputs["Output"],
        endpoint_name=endpoint_task.outputs["Output"],
        if_force_undeploy=if_force_undeploy,
        machine_type=serving_machine_type,
        accelerator_type=serving_accelerator_type,
        accelerator_count=serving_accelerator_count,
    ).after(model_task)
    

### Compile the pipeline template and save to local

In [61]:
from kfp import compiler

# Local file that receives the compiled pipeline template.
package_path = "llama2_deepspeed_pipeline.json"

pipeline_compiler = compiler.Compiler()
pipeline_compiler.compile(pipeline_func=pipeline, package_path=package_path)

## Config parameters and submit a pipeline job

In [66]:
# create working dir to pass to job spec
PROJECT_ID = "YOUR-PROJECT-ID"  # @param {type:"string"}
REGION = "us-central1"  # @param {type: "string"}
BUCKET_URI = "gs://BUCKET-NAME/pipeline"  # @param {type:"string"}
PIPELINE_ROOT = f"{BUCKET_URI}/pipeline_root/llama2"
TENSORBOARD_NAME = "TENSORBOARD-ID" # @param {type:"string"}
SERVICE_ACCOUNT = "SERVICE-ACCOUNT"  # @param {type:"string"}
# Per-run directory, passed to the pipeline as `output_gcs_uri`.
WORKING_DIR = f"{PIPELINE_ROOT}/{UUID}"

# Timestamp suffix is year-month-day-hour-minute-second; the hour (%H) was
# previously missing, so names could collide across hours of the same day.
train_job_display_name = f'llama2-custom-train-{datetime.today().strftime("%Y%m%d%H%M%S")}' # @param {type:"string"}
# No-op re-binding; raises NameError early if the training-image build cell was not run.
TRAIN_IMAGE_URI = TRAIN_IMAGE_URI
llm_model_uri = "/gcs/deepspeed_repo/base_model/Llama-2-7b-hf/Llama-2-7b-hf" # @param {type:"string"}
data_uri = "/gcs/deepspeed_repo/dataset/samsum" # @param {type:"string"}

TRAIN_ACCELERATOR_TYPE = "NVIDIA_L4"
TOTAL_GPU_NEEDED = 8
TRAIN_ACCELERATOR_COUNT = 4
# vCPUs per GPU, used to derive the g2 machine type name from the GPU count.
CPU_UNIT = 12
CPU_COUNT = CPU_UNIT * TRAIN_ACCELERATOR_COUNT
TRAIN_MACHINE_TYPE = f"g2-standard-{CPU_COUNT}"
# Number of training nodes needed to reach the total GPU count.
TRAIN_REPLICA_COUNT = TOTAL_GPU_NEEDED // TRAIN_ACCELERATOR_COUNT

MODEL_DISPLAY_NAME = f"llama2_pipeline_vllm_{UUID}"
ENDPOINT_DISPLAY_NAME = f"{MODEL_DISPLAY_NAME}-endpoint"
IF_FORCE_UNDEPLOY = True
# The serving-image build cell defines SERVE_IMAGE_NAME (SERVE_IMAGE_URI never existed).
SERVING_IMAGE_URI = SERVE_IMAGE_NAME
SERVING_MACHINE_TYPE="g2-standard-24"
SERVING_ACCELERATOR_TYPE="NVIDIA_L4"
SERVING_ACCELERATOR_COUNT=2

In [67]:
# Point the Vertex AI SDK at the project, region and staging bucket set in the config cell.
aiplatform.init(project=PROJECT_ID, location=REGION, staging_bucket=BUCKET_URI)

In [68]:
#tensorboard = aiplatform.Tensorboard(tensorboard_name=TENSORBOARD_NAME)
#tensorboard.resource_name

'projects/703099487153/locations/us-central1/tensorboards/2069606338916253696'

In [None]:
DISPLAY_NAME = f"llama2_deepspeed_pipeline_{UUID}"

# One runtime value for every parameter declared by the `pipeline` function.
pipeline_parameters = {
    # shared settings
    'project': PROJECT_ID,
    'location': REGION,
    'stage_bucket': BUCKET_URI,
    'service_account': SERVICE_ACCOUNT,
    'tensorboard': TENSORBOARD_NAME,
    # training
    'train_job_display_name': train_job_display_name,
    'train_container_image_uri': TRAIN_IMAGE_URI,
    'train_machine_type': TRAIN_MACHINE_TYPE,
    'train_accelerator_type': TRAIN_ACCELERATOR_TYPE,
    'train_accelerator_count': TRAIN_ACCELERATOR_COUNT,
    'replica_count': TRAIN_REPLICA_COUNT,
    'per_device_batch_size': 4,
    'llm_model_uri': llm_model_uri,
    'data_uri': data_uri,
    'output_gcs_uri': WORKING_DIR,
    # model registry, endpoint and serving
    'model_display_name': MODEL_DISPLAY_NAME,
    'endpoint_display_name': ENDPOINT_DISPLAY_NAME,
    'if_force_undeploy': IF_FORCE_UNDEPLOY,
    'serving_container_image_uri': SERVING_IMAGE_URI,
    'serving_machine_type': SERVING_MACHINE_TYPE,
    'serving_accelerator_type': SERVING_ACCELERATOR_TYPE,
    'serving_accelerator_count': SERVING_ACCELERATOR_COUNT,
}

# Create the pipeline job from the compiled template, then run it under the
# configured service account.
job = aiplatform.PipelineJob(
    display_name=DISPLAY_NAME,
    template_path=package_path,
    pipeline_root=PIPELINE_ROOT,
    parameter_values=pipeline_parameters,
    enable_caching=True,
)

job.run(service_account=SERVICE_ACCOUNT)

Creating PipelineJob
PipelineJob created. Resource name: projects/703099487153/locations/us-central1/pipelineJobs/train-endpoint-deploy0qehgs0f-20231201072301
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/703099487153/locations/us-central1/pipelineJobs/train-endpoint-deploy0qehgs0f-20231201072301')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/train-endpoint-deploy0qehgs0f-20231201072301?project=703099487153
PipelineJob projects/703099487153/locations/us-central1/pipelineJobs/train-endpoint-deploy0qehgs0f-20231201072301 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/703099487153/locations/us-central1/pipelineJobs/train-endpoint-deploy0qehgs0f-20231201072301 current state:
PipelineState.PIPELINE_STATE_RUNNING
PipelineJob projects/703099487153/locations/us-central1/pipelineJobs/train-endpoint-deploy0qehgs0f-20231201072301 current state:
PipelineState.PIPELINE_S

## Inference test

Get endpoint id from DataStore and test inference

In [None]:
from google.cloud import datastore

# The notebook-level project variable is PROJECT_ID; `project` only exists as
# a parameter inside the pipeline components.
datastore_client = datastore.Client(project=PROJECT_ID)
kind = "endpoint"
info_key = datastore_client.key(kind, ENDPOINT_DISPLAY_NAME)
info = datastore_client.get(info_key)
# `get` returns None for a missing key; fail with a clear message instead of
# a TypeError on info["value"] below.
if info is None:
    raise LookupError(
        f"No '{kind}' entry named '{ENDPOINT_DISPLAY_NAME}' found in Datastore"
    )

# Single request sent to the deployed endpoint.
instance = {
    "prompt": "Hi, Google.",
    "n": 1,
    "max_tokens": 50,
    "temperature": 1.0,
    "top_p": 1.0,
    "top_k": 10,
}
endpoint_instance = aiplatform.Endpoint(endpoint_name=info["value"])
response = endpoint_instance.predict(instances=[instance])
print(response.predictions[0])