### Imports

In [1]:
import os
from typing import Tuple
from google.cloud import aiplatform
import os 
from dotenv import load_dotenv
import vertexai
from vertexai import model_garden


### Load Environment Variables and Configure gcloud

In [None]:
# Load settings (typically from a local .env file) into the process environment.
load_dotenv()

# Required settings: indexing os.environ raises KeyError immediately when one
# is missing, which is the desired fail-fast behavior for these values.
PROJECT_ID = os.environ["PROJECT_ID"]
REGION = os.environ["REGION"]

# Optional: only needed for gated/private Hugging Face models. Resolves to
# None when unset. The default argument of .get() is evaluated eagerly, so
# using os.environ["HF_TOKEN"] as the fallback would raise KeyError for a
# missing token and defeat the purpose of .get().
HF_TOKEN = os.environ.get("HF_TOKEN")

HUGGING_FACE_MODEL_ID = os.environ["HUGGING_FACE_MODEL_ID"]
MACHINE_TYPE = os.environ["MACHINE_TYPE"]
ACCELERATOR_TYPE = os.environ["ACCELERATOR_TYPE"]
# Environment values are strings; fall back to a single accelerator.
ACCELERATOR_COUNT = int(os.environ.get("ACCELERATOR_COUNT", 1))

# Container image used for serving; both names refer to the same URI.
TGI_DOCKER_URI = os.environ["TGI_DOCKER_URI"]
SERVING_CONTAINER_IMAGE_URI = TGI_DOCKER_URI

# Any value other than "true" (case-insensitive) disables the dedicated endpoint.
USE_DEDICATED_ENDPOINT = os.environ.get("USE_DEDICATED_ENDPOINT", "True").lower() == "true"
# Key under which deployed resources are tracked in the `models`/`endpoints` dicts.
LABEL = os.environ.get("LABEL", "tgi")

# Authenticate the gcloud CLI with a user account (interactive).
! gcloud auth login
# Make PROJECT_ID the active gcloud project (sets the core/project property).
! gcloud config set project {PROJECT_ID}
# Record PROJECT_ID as the quota/billing project in Application Default Credentials.
# NOTE(review): this presumably requires ADC to already exist
# (`gcloud auth application-default login`) — confirm on a fresh machine.
! gcloud auth application-default set-quota-project {PROJECT_ID}
# Enable the Vertex AI and Compute Engine APIs for the active project.
! gcloud services enable aiplatform.googleapis.com compute.googleapis.com


Updated property [core/project].

Credentials saved to file: [C:\Users\Luis\AppData\Roaming\gcloud\application_default_credentials.json]

These credentials will be used by any library that requests Application Default Credentials (ADC).

Quota project "sacred-epigram-462314-r4" was added to ADC which can be used by Google client libraries for billing and quota. Note that some services may still bill the project owning the resource.
Operation "operations/acat.p2-198380589718-12f700ee-52c3-48b1-a980-a021e9ffa0fc" finished successfully.


### Initialize the Vertex AI SDK

In [3]:
# Registries of deployed resources, keyed by label, so the cleanup cell at the
# end of the notebook can find and delete everything that was created.
models = {}
endpoints = {}

# Point the SDK at the project and region chosen in the configuration cell.
vertexai.init(location=REGION, project=PROJECT_ID)

### [Option 1] Deploy with Model Garden SDK

In [5]:
# Look up the Hugging Face model through the Model Garden SDK.
model = model_garden.OpenModel(HUGGING_FACE_MODEL_ID)

# Deploy with the hardware chosen in the configuration cell. This call blocks
# until the deployment operation finishes.
endpoint = model.deploy(
    machine_type=MACHINE_TYPE,
    accelerator_type=ACCELERATOR_TYPE,
    accelerator_count=ACCELERATOR_COUNT,
    hugging_face_access_token=HF_TOKEN,
    use_dedicated_endpoint=USE_DEDICATED_ENDPOINT,
    accept_eula=True,
)

# Track the endpoint under LABEL so the predict and cleanup cells can find it.
endpoints[LABEL] = endpoint

Deploying model: nvidia/Nemotron-Research-Reasoning-Qwen-1.5B
LRO: projects/198380589718/locations/us-east1/operations/716479022615232512
Start time: 2025-07-25 10:00:59.501086
End time: 2025-07-25 10:24:14.924497
Endpoint: projects/sacred-epigram-462314-r4/locations/us-east1/endpoints/mg-endpoint-1753452059


### [Option 2] Deploy with customized configs

In [None]:
import importlib

# The helper module lives under directories whose names contain hyphens, which
# a regular `import` statement cannot express, so it is loaded by dotted path.
# NOTE(review): presumably requires the vertex-ai-samples repository to be
# checked out next to this notebook — confirm.
common_util = importlib.import_module(
    name="vertex-ai-samples.community-content.vertex_model_garden.model_oss.notebook_util.common_util"
)

def deploy_model_tgi(
    model_name: str,
    model_id: str,
    publisher: str,
    publisher_model_id: str,
    service_account: str = None,
    machine_type: str = "g2-standard-8",
    accelerator_type: str = "NVIDIA_L4",
    accelerator_count: int = 1,
    max_input_length: int = 2047,
    max_total_tokens: int = 2048,
    max_batch_prefill_tokens: int = 2048,
    use_dedicated_endpoint: bool = False,
) -> Tuple[aiplatform.Model, aiplatform.Endpoint]:
    """Create an endpoint, upload a TGI-served model, and deploy it on GPU.

    Reads the notebook-level globals ``TGI_DOCKER_URI`` (serving image) and,
    when defined and non-empty, ``HF_TOKEN`` (forwarded to the container).

    Args:
        model_name: Display name of the uploaded model; the endpoint is named
            ``"<model_name>-endpoint"``.
        model_id: Value passed to the container as ``MODEL_ID``.
        publisher: Publisher segment of the Model Garden source model name.
        publisher_model_id: Model segment of the Model Garden source model name.
        service_account: Optional service account; when truthy it is also
            exported to the container as ``SERVICE_ACCOUNT``.
        machine_type: Machine type used for the deployment.
        accelerator_type: Accelerator type used for the deployment.
        accelerator_count: Number of accelerators; also passed to the container
            as ``NUM_SHARD``.
        max_input_length: Passed to the container as ``MAX_INPUT_LENGTH``.
        max_total_tokens: Passed to the container as ``MAX_TOTAL_TOKENS``.
        max_batch_prefill_tokens: Passed to the container as
            ``MAX_BATCH_PREFILL_TOKENS``.
        use_dedicated_endpoint: Whether the endpoint is created with
            ``dedicated_endpoint_enabled`` set.

    Returns:
        The uploaded model and the endpoint it was deployed to.
    """
    tgi_endpoint = aiplatform.Endpoint.create(
        dedicated_endpoint_enabled=use_dedicated_endpoint,
        display_name=f"{model_name}-endpoint",
    )

    # Container settings are passed as strings via environment variables.
    container_env = {
        "MODEL_ID": model_id,
        "NUM_SHARD": str(accelerator_count),
        "MAX_INPUT_LENGTH": str(max_input_length),
        "MAX_TOTAL_TOKENS": str(max_total_tokens),
        "MAX_BATCH_PREFILL_TOKENS": str(max_batch_prefill_tokens),
        "DEPLOY_SOURCE": "notebook",
    }

    # HF_TOKEN is optional and the global may not even exist in this kernel.
    try:
        hf_token = HF_TOKEN
    except NameError:
        hf_token = None
    if hf_token:
        container_env["HF_TOKEN"] = hf_token

    if service_account:
        container_env["SERVICE_ACCOUNT"] = service_account

    source_model_name = f"publishers/{publisher}/models/{publisher_model_id}"
    uploaded_model = aiplatform.Model.upload(
        display_name=model_name,
        serving_container_image_uri=TGI_DOCKER_URI,
        serving_container_ports=[8080],
        serving_container_environment_variables=container_env,
        serving_container_shared_memory_size_mb=16 * 1024,  # 16 GB
        model_garden_source_model_name=source_model_name,
    )

    # Labels attached to the deployment to identify where it originated.
    deployment_labels = {
        "NOTEBOOK_NAME": "model_garden_huggingface_tgi_deployment.ipynb",
        "NOTEBOOK_ENVIRONMENT": common_util.get_deploy_source(),
    }
    uploaded_model.deploy(
        endpoint=tgi_endpoint,
        machine_type=machine_type,
        accelerator_type=accelerator_type,
        accelerator_count=accelerator_count,
        deploy_request_timeout=1800,
        service_account=service_account,
        system_labels=deployment_labels,
    )
    return uploaded_model, tgi_endpoint


# Register the results under LABEL, the same key the Model Garden deployment
# and the predict cell use, so a custom LABEL does not leave the predict cell
# looking up a key that was never written. LABEL defaults to "tgi".
# NOTE(review): publisher/publisher_model_id are fixed to google/gemma2
# regardless of HUGGING_FACE_MODEL_ID — confirm this is intended.
models[LABEL], endpoints[LABEL] = deploy_model_tgi(
    model_name=common_util.get_job_name_with_datetime(prefix=HUGGING_FACE_MODEL_ID),
    model_id=HUGGING_FACE_MODEL_ID,
    publisher="google",
    publisher_model_id="gemma2",
    service_account="",
    machine_type=MACHINE_TYPE,
    accelerator_type=ACCELERATOR_TYPE,
    accelerator_count=ACCELERATOR_COUNT,
    use_dedicated_endpoint=USE_DEDICATED_ENDPOINT,
)

### Predict

In [None]:
prompt = "Tell me something interesting about yourself as a predictive model"

# A single user turn carrying the prompt.
user_message = {"role": "user", "content": prompt}

# One instance in the chat completions request format, with sampling settings.
instances = [
    {
        "@requestFormat": "chatCompletions",
        "messages": [user_message],
        "temperature": 0.8,
        "max_new_tokens": -1,
        "top_k": 50,
        "top_p": 1.0,
        "repetition_penalty": 1.2,
    }
]

# Query the endpoint registered under LABEL.
response = endpoints[LABEL].predict(
    use_dedicated_endpoint=USE_DEDICATED_ENDPOINT,
    instances=instances,
)

# Pull the generated text out of the first returned choice.
first_choice = response.predictions['choices'][0]
text_content = first_choice['message']['content']
print(text_content)

Okay, so I need to tell someone something interesting about myself being a predictive model. Hmm, where do I start? Well, first of all, I should think about what makes me unique in the predictive modeling field.

Predictive models are like these mathematical or statistical equations that predict future outcomes based on past data. So maybe instead of just stating it's an AI or some software tool, perhaps talk about my methodology?

Wait, but the user asked for something interesting about myself as a predictive model. Maybe I can say how versatile I am or how impactful my work has been?

But wait another thought: Perhaps focus more on my role and achievements beyond just being a model. Like, have I contributed significantly to certain fields? Or did I win awards for having good predictions?

Alternatively, maybe mention something about personal growth through working on predictive models. But since they specifically mentioned "predicative model," which seems off.

Or perhaps explain how

### Delete the models and endpoints

In [35]:
# Delete endpoints. force=True undeploys any models still on the endpoint
# before deleting it. Each entry is removed from the registry once its
# resource is gone, so re-running this cell (or resuming after a partial
# failure) never calls delete on an already-deleted resource. Iterating over
# labels also avoids rebinding the notebook-level `endpoint` / `model` names.
for tracked_label in list(endpoints):
    endpoints[tracked_label].delete(force=True)
    del endpoints[tracked_label]

# Delete models.
for tracked_label in list(models):
    models[tracked_label].delete()
    del models[tracked_label]

Undeploying Endpoint model: projects/198380589718/locations/us-east1/endpoints/mg-endpoint-1753452059
Undeploy Endpoint model backing LRO: projects/198380589718/locations/us-east1/endpoints/mg-endpoint-1753452059/operations/5280877244955230208
Endpoint model undeployed. Resource name: projects/198380589718/locations/us-east1/endpoints/mg-endpoint-1753452059
Deleting Endpoint : projects/198380589718/locations/us-east1/endpoints/mg-endpoint-1753452059
Endpoint deleted. . Resource name: projects/198380589718/locations/us-east1/endpoints/mg-endpoint-1753452059
Deleting Endpoint resource: projects/198380589718/locations/us-east1/endpoints/mg-endpoint-1753452059
Delete Endpoint backing LRO: projects/198380589718/locations/us-east1/operations/2975034235741536256
Endpoint resource projects/198380589718/locations/us-east1/endpoints/mg-endpoint-1753452059 deleted.
