In [6]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Gemma deployment to GKE using vLLM on GPU



## Overview

This notebook demonstrates downloading and deploying Gemma, a family of open models from Google DeepMind. This guide uses L4 GPUs, but it should also work with A100 (40 GB), A100 (80 GB), and H100 (80 GB) GPUs.


### Objective

Deploy and run inference for serving Gemma with vLLM on GPUs.

### GPUs

GPUs let you accelerate specific workloads running on your nodes such as machine learning and data processing. GKE provides a range of machine type options for node configuration, including machine types with NVIDIA H100, L4, and A100 GPUs.

Before you use GPUs in GKE, we recommend that you complete the following learning path:

Learn about [current GPU version availability](https://cloud.google.com/compute/docs/gpus)

Learn about [GPUs in GKE](https://cloud.google.com/kubernetes-engine/docs/concepts/gpus)




### Prerequisites
- Install Google Cloud CLI
https://cloud.google.com/sdk/docs/install-sdk  (note: pre-installed in Cloud Shell)
- Install kubectl (note: pre-installed in Cloud Shell)
- Create a .env file with the following values


```
PROJECT_ID
REGION 
HF_TOKEN # Optional if you want to download model from Hugging Face
KSA_NAME 
PROJECT_NUMBER
CLUSTER_NAME 

# Assuming the model is saved at : gs://{MODEL_BUCKET}/{MODEL_NAME}/{MODEL_VERSION}
MODEL_BUCKET 
MODEL_NAME
MODEL_VERSION
IMAGE_NAME
```




## Create a GKE cluster and node pool

In [None]:
!pip install py



**Restart the runtime session**

In [17]:
import os

from dotenv import load_dotenv


load_dotenv()  # Loads the variables from .env into the process environment.

PROJECT_ID = os.getenv("PROJECT_ID")
REGION = os.getenv("REGION")
PROJECT_NUMBER = os.getenv("PROJECT_NUMBER")
KSA_NAME = os.getenv("KSA_NAME")
CLUSTER_NAME = os.getenv("CLUSTER_NAME")

# Optional: only needed when the model is downloaded from Hugging Face.
HF_TOKEN = os.getenv("HF_TOKEN")

# Assuming the model is saved at : gs://{MODEL_BUCKET}/{MODEL_NAME}/{MODEL_VERSION}
MODEL_BUCKET = os.getenv("MODEL_BUCKET")
MODEL_NAME = os.getenv("MODEL_NAME")
MODEL_VERSION = os.getenv("MODEL_VERSION")
IMAGE_NAME = os.getenv("IMAGE_NAME")

# Fail fast: these values are interpolated into gcloud/kubectl commands and the
# Kubernetes manifest later in the notebook, where a missing value would only
# surface as a confusing command-line error.
_required_settings = {
    "PROJECT_ID": PROJECT_ID,
    "REGION": REGION,
    "PROJECT_NUMBER": PROJECT_NUMBER,
    "KSA_NAME": KSA_NAME,
    "CLUSTER_NAME": CLUSTER_NAME,
    "MODEL_BUCKET": MODEL_BUCKET,
    "MODEL_NAME": MODEL_NAME,
    "MODEL_VERSION": MODEL_VERSION,
}
_missing_settings = [name for name, value in _required_settings.items() if not value]
if _missing_settings:
    raise ValueError(
        "Missing required values in .env: " + ", ".join(_missing_settings)
    )


In [None]:
# The HuggingFace token used to download models.

# assert HF_TOKEN, "Set Hugging Face access token in `HF_TOKEN`."

# Set up gcloud.
!gcloud config set project {PROJECT_ID}
!gcloud services enable container.googleapis.com



[1;31mERROR:[0m (gcloud.config.set) unrecognized arguments: -kokiri-dev 

To search the help text of gcloud commands, run:
  gcloud help -- SEARCH_TERMS


In [16]:
import datetime
import os


# # Create a unique cluster name to avoid conflicts.
# now = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
# CLUSTER_NAME=f"gke-gemma-cluster-test-{now}"

print(f"Creating cluster: {CLUSTER_NAME}")

! gcloud container clusters create {CLUSTER_NAME} \
    --project={PROJECT_ID} \
    --region={REGION} \
    --subnetwork="default" \
    --workload-pool={PROJECT_ID}.svc.id.goog \
    --release-channel=rapid \
    --num-nodes=4 \
    --enable-shielded-nodes \
    --shielded-secure-boot\
    --shielded-integrity-monitoring \
    --addons=GcsFuseCsiDriver

! gcloud container node-pools create gpupool \
    --accelerator=type=nvidia-l4,count=2,gpu-driver-version=latest \
    --project={PROJECT_ID} \
    --location={REGION} \
    --node-locations={REGION}-a \
    --cluster={CLUSTER_NAME} \
    --machine-type=g2-standard-24 \
    --num-nodes=1 \
    --shielded-secure-boot \
    --shielded-integrity-monitoring

! gcloud container clusters get-credentials {CLUSTER_NAME} --location {REGION}


Creating cluster: crashai-lab
Note: The Kubelet readonly port (10255) is now deprecated. Please update your workloads to use the recommended alternatives. See https://cloud.google.com/kubernetes-engine/docs/how-to/disable-kubelet-readonly-port for ways to check usage and for migration instructions.
Note: Your Pod address range (`--cluster-ipv4-cidr`) can accommodate at most 1008 node(s).
[1;31mERROR:[0m (gcloud.container.clusters.create) ResponseError: code=409, message=Already exists: projects/prj-kokiri-dev/locations/us-central1/clusters/crashai-lab.
Note: Machines with GPUs have certain limitations which may affect your workflow. Learn more at https://cloud.google.com/kubernetes-engine/docs/how-to/gpus
Note: Starting in GKE 1.30.1-gke.115600, if you don't specify a driver version, GKE installs the default GPU driver for your node's GKE version.
[1;31mERROR:[0m (gcloud.container.node-pools.create) ResponseError: code=409, message=Already exists: projects/prj-kokiri-dev/locations/

In [12]:
# Generate the kubeconfig entry for the cluster so kubectl commands target it.
# NOTE: kubectl also needs the gke-gcloud-auth-plugin installed to authenticate
# against the cluster.
! gcloud container clusters get-credentials {CLUSTER_NAME} --location {REGION}

client [kubectl]. To install, run
  $ gcloud components install kubectl

Fetching cluster endpoint and auth data.
[1;31mCRITICAL: ACTION REQUIRED: gke-gcloud-auth-plugin, which is needed for continued use of kubectl, was not found or is not executable. Install gke-gcloud-auth-plugin for use with kubectl by following https://cloud.google.com/kubernetes-engine/docs/how-to/cluster-access-for-kubectl#install_plugin[0m
kubeconfig entry generated for crashai-lab.


### (Optional, only needed when loading the model from Hugging Face) Create a Kubernetes secret for Hugging Face credentials

In [2]:
# Create a Kubernetes secret for Hugging Face credentials.
# Uncomment the commands below only when the model is loaded from Hugging Face;
# they require HF_TOKEN to be defined. Note that the generated hf-secret.yaml
# file contains the token, so do not commit or share it.
#! kubectl create secret generic hf-secret \
#    --from-literal=hf_api_token={HF_TOKEN} \
#    --dry-run=client -o yaml > hf-secret.yaml
#
#! kubectl apply -f hf-secret.yaml

In [6]:
!kubectl create serviceaccount {KSA_NAME}


serviceaccount/crashai-lab created


In [7]:
# Grant roles/container.clusterViewer on the project to the Workload Identity
# principal of the Kubernetes service account {KSA_NAME} in the "default"
# namespace. --condition=None adds the binding without an IAM condition.
! gcloud projects add-iam-policy-binding projects/{PROJECT_ID} \
    --role=roles/container.clusterViewer \
    --member=principal://iam.googleapis.com/projects/{PROJECT_NUMBER}/locations/global/workloadIdentityPools/{PROJECT_ID}.svc.id.goog/subject/ns/default/sa/{KSA_NAME} \
    --condition=None

Updated IAM policy for project [prj-kokiri-dev].
bindings:
- members:
  - serviceAccount:service-933718959305@gcp-sa-aiplatform-cc.iam.gserviceaccount.com
  role: roles/aiplatform.customCodeServiceAgent
- members:
  - serviceAccount:service-933718959305@gcp-sa-aiplatform.iam.gserviceaccount.com
  role: roles/aiplatform.serviceAgent
- members:
  - serviceAccount:service-933718959305@gcp-sa-vertex-telemetry.iam.gserviceaccount.com
  role: roles/aiplatform.telemetryServiceAgent
- members:
  - serviceAccount:933718959305-compute@developer.gserviceaccount.com
  - serviceAccount:service-933718959305@gcp-sa-alloydb.iam.gserviceaccount.com
  role: roles/aiplatform.user
- members:
  - group:cloud-developers@mbychkowski.altostrat.com
  role: roles/alloydb.databaseUser
- members:
  - serviceAccount:service-933718959305@gcp-sa-alloydb.iam.gserviceaccount.com
  role: roles/alloydb.serviceAgent
- members:
  - serviceAccount:service-933718959305@gcp-sa-servicemesh.iam.gserviceaccount.com
  role: role

### (Optional) Copy Gemma3 model data to your bucket

In [16]:
# Source model location: gs://<source_bucket>/<model_name>/<model_version>
# The rest of the notebook assumes the model is stored at:
#   gs://{MODEL_BUCKET}/{MODEL_NAME}/{MODEL_VERSION}
# NOTE(review): confirm the copied directory layout matches MODEL_NAME and
# MODEL_VERSION from .env before deploying.

#! gcloud storage cp --recursive gs://the-fine-tuners/gemma3-1b-vertex gs://{MODEL_BUCKET}

In [8]:
# Grant roles/storage.objectViewer on the model bucket to the Workload Identity
# principal of the Kubernetes service account {KSA_NAME} in the "default"
# namespace, so the pod running under that account can read the model files.
# --condition=None adds the binding without an IAM condition.
! gcloud storage buckets add-iam-policy-binding gs://{MODEL_BUCKET} \
    --role=roles/storage.objectViewer \
    --member=principal://iam.googleapis.com/projects/{PROJECT_NUMBER}/locations/global/workloadIdentityPools/{PROJECT_ID}.svc.id.goog/subject/ns/default/sa/{KSA_NAME} \
    --condition=None

bindings:
- members:
  - projectEditor:prj-kokiri-dev
  - projectOwner:prj-kokiri-dev
  role: roles/storage.legacyBucketOwner
- members:
  - projectViewer:prj-kokiri-dev
  role: roles/storage.legacyBucketReader
- members:
  - projectEditor:prj-kokiri-dev
  - projectOwner:prj-kokiri-dev
  role: roles/storage.legacyObjectOwner
- members:
  - projectViewer:prj-kokiri-dev
  role: roles/storage.legacyObjectReader
- members:
  - principal://iam.googleapis.com/projects/933718959305/locations/global/workloadIdentityPools/prj-kokiri-dev.svc.id.goog/subject/ns/default/sa/crashai
  - principal://iam.googleapis.com/projects/933718959305/locations/global/workloadIdentityPools/prj-kokiri-dev.svc.id.goog/subject/ns/default/sa/crashai-lab
  role: roles/storage.objectViewer
etag: CAU=
kind: storage#policy
resourceId: projects/_/buckets/prj-kokiri-dev-the-fine-tuners
version: 1


In [18]:

# Short aliases for the configuration values; the deployment cell interpolates
# them into the Kubernetes manifest.
# NOTE(review): V_IMAGE_NAME is assigned here but does not appear in the
# manifest templates — confirm whether the container image should use it.
(
    V_MODEL_BUCKET,
    V_MODEL_NAME,
    V_MODEL_VERSION,
    V_IMAGE_NAME,
    V_KSA,
) = (
    MODEL_BUCKET,
    MODEL_NAME,
    MODEL_VERSION,
    IMAGE_NAME,
    KSA_NAME,
)

In [20]:
# @title Deploy Gemma3

# @markdown This section deploys Gemma.

# @markdown Select one of the following model version and size options:

# The size of the model to launch


K8S_YAML_GCS=f"""apiVersion: apps/v1
kind: Deployment
metadata:
  name: vllm-gemma3-deployment
spec:
  replicas: 1
  selector:
    matchLabels:
      app: gemma-server
  template:
    metadata:
      labels:
        app: gemma-server
      annotations:
        gke-gcsfuse/volumes: "true"
    spec:
      containers:
      - name: inference-server
        command: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
        args:
        - --model=$(MODEL)
        - --tensor-parallel-size=1
        - --host=0.0.0.0
        - --port=8000
        env:
        - name: MODEL
          value: /gcs/{V_MODEL_NAME}/{V_MODEL_VERSION}
        - name: VLLM_ATTENTION_BACKEND
          value: FLASHINFER
        image: us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-vllm-serve
        readinessProbe:
          failureThreshold: 3
          httpGet:
            path: /health
            port: 8000
            scheme: HTTP
          initialDelaySeconds: 240
          periodSeconds: 5
          successThreshold: 1
          timeoutSeconds: 1
        resources:
          requests:
            cpu: "2"
            memory: "25Gi"
            nvidia.com/gpu: "1"
          limits:
            cpu: "2"
            memory: "25Gi"
            nvidia.com/gpu: "1"
        volumeMounts:
        - mountPath: /dev/shm
          name: dshm
        - name: gcs-fuse-csi-ephemeral
          mountPath: /gcs
          readOnly: true
      nodeSelector:
        cloud.google.com/gke-accelerator: nvidia-l4
        cloud.google.com/gke-gpu-driver-version: latest
      serviceAccountName: {V_KSA}
      tolerations:
      - key: "nvidia.com/gpu"
        operator: "Exists"
        effect: "NoSchedule"
      - key: "on-demand"
        value: "true"
        operator: "Equal"
        effect: "NoSchedule"
      volumes:
      - name: dshm
        emptyDir:
            medium: Memory
      - name: gcs-fuse-csi-ephemeral
        csi:
          driver: gcsfuse.csi.storage.gke.io
          volumeAttributes:
            bucketName: {V_MODEL_BUCKET}
            mountOptions: "implicit-dirs,file-cache:enable-parallel-downloads:true,file-cache:max-parallel-downloads:-1"
            fileCacheCapacity: "20Gi"
            fileCacheForRangeRead: "true"
            metadataStatCacheCapacity: "-1"
            metadataTypeCacheCapacity: "-1"
            metadataCacheTTLSeconds: "-1"
---
apiVersion: v1
kind: Service
metadata:
  name: llm-service
spec:
  selector:
    app: gemma-server
  type: ClusterIP
  ports:
    - protocol: TCP
      port: 8000
      targetPort: 8000

"""

K8S_YAML_HF="""apiVersion: apps/v1
kind: Deployment
metadata:
  name: vllm-gemma-deployment
spec:
  replicas: 1
  selector:
    matchLabels:
      app: gemma-server
  template:
    metadata:
      labels:
        app: gemma-server
        ai.gke.io/model: gemma-3-1b-it
        ai.gke.io/inference-server: vllm
        examples.ai.gke.io/source: user-guide
    spec:
      containers:
      - name: inference-server
        image: us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-vllm-serve:20250312_0916_RC01
        resources:
          requests:
            cpu: "4"
            memory: "30Gi"
            ephemeral-storage: "30Gi"
            nvidia.com/gpu: "1"
          limits:
            cpu: "4"
            memory: "30Gi"
            ephemeral-storage: "30Gi"
            nvidia.com/gpu: "1"
        command: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
        args:
        - --model=$(MODEL_ID)
        - --tensor-parallel-size=1
        - --host=0.0.0.0
        - --port=8000
        env:
        - name: MODEL_ID
          value: google/gemma-3-1b-it
        - name: HUGGING_FACE_HUB_TOKEN
          valueFrom:
            secretKeyRef:
              name: hf-secret
              key: hf_api_token
        volumeMounts:
        - mountPath: /dev/shm
          name: dshm
      volumes:
      - name: dshm
        emptyDir:
            medium: Memory
      nodeSelector:
        cloud.google.com/gke-accelerator: nvidia-l4
        cloud.google.com/gke-gpu-driver-version: latest
---
apiVersion: v1
kind: Service
metadata:
  name: llm-service
spec:
  selector:
    app: gemma-server
  type: ClusterIP
  ports:
    - protocol: TCP
      port: 8000
      targetPort: 8000
"""

with open("vllm-3-1b-it-ft.yaml", "w") as f:
    f.write(K8S_YAML_GCS)

! kubectl apply -f vllm-3-1b-it-ft.yaml

# Wait for container to be created.
import time

print("Waiting for container to be created...\n")
while True:
    shell_output = ! kubectl get pod
    container_status = "\n".join(shell_output)
    if "1/1" in container_status or "Running" in container_status:
        break
    time.sleep(5)

print(container_status)

# Wait for downloading artifacts.
print("\nDownloading artifacts...")
while True:
    shell_output = ! kubectl logs -l app=gemma-server
    logs = "\n".join(shell_output)
    if "Application startup complete" in logs:
        break
    time.sleep(5)

print("Server is up and running.")

deployment.apps/vllm-gemma3-deployment created
service/llm-service unchanged
Waiting for container to be created...

NAME                                      READY   STATUS        RESTARTS   AGE
vllm-gemma3-deployment-67cdb4c9db-dv9b5   1/2     Terminating   0          7m7s
vllm-gemma3-deployment-67cdb4c9db-tnmdh   1/2     Running       0          6m59s

Downloading artifacts...
Server is up and running.


### Follow this to test the model
Connect to the cluster

```
gcloud container clusters get-credentials {CLUSTER_NAME} --location {REGION}
```

Forward the port

```
kubectl port-forward service/llm-service 8000:8000
```

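Open another terminal and send a request through the forwarded port. The `model` value is the path passed to `--model`, for example `/gcs/<MODEL_NAME>/<MODEL_VERSION>`:
```
curl http://localhost:8000/v1/chat/completions \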
-X POST \
-H "Content-Type: application/json" \
-d '{
    "model": "<your model name>",
    "messages": [
        {
          "role": "user",
          "content": "what is the capital of USA?"
        }
    ]
}'
```


reference: https://cloud.google.com/kubernetes-engine/docs/tutorials/serve-gemma-gpu-vllm#serve-model

## Clean up resources

In [None]:
# @markdown  Delete the experiment models and endpoints to recycle the resources
# @markdown  and avoid unnecessary continouous charges that may incur.

! kubectl delete deployments vllm-gemma3-deployment
! kubectl delete services llm-service
! kubectl delete secrets hf-secret

DELETE_CLUSTER = False # @param {type: "boolean"}

if DELETE_CLUSTER:
  ! gcloud container clusters delete {CLUSTER_NAME} \
    --region={REGION} \
    --quiet