In [None]:
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

<a href="https://colab.research.google.com/github/moficodes/ai-on-gke/blob/main/tutorials/finetune-gemma-2b-on-l4/finetune-gemma-on-gke.ipynb" target="_blank"><img height="40" alt="Run your own notebook in Colab" src = "https://colab.research.google.com/assets/colab-badge.svg"></a>

# Finetune Gemma on GKE using GPUs

## Overview

This notebook demonstrates downloading and fine-tuning Gemma, a family of open models from Google DeepMind, using PyTorch and Hugging Face libraries. In this notebook we will fine-tune a Gemma model and publish it on Hugging Face. This guide specifically uses L4 GPUs, but it should also work with A100 (40 GB), A100 (80 GB), and H100 (80 GB) GPUs.


### Objective

Finetune and publish Gemma with Transformers and LoRA on GPUs.

### GPUs

GPUs let you accelerate specific workloads running on your nodes such as machine learning and data processing. GKE provides a range of machine type options for node configuration, including machine types with NVIDIA H100, L4, and A100 GPUs.

Before you use GPUs in GKE, we recommend that you complete the following learning path:

Learn about [current GPU version availability](https://cloud.google.com/compute/docs/gpus)

Learn about [GPUs in GKE](https://cloud.google.com/kubernetes-engine/docs/concepts/gpus)



## Before you begin

### Configure Environment

Set the following variables for the experiment environment.

In [None]:
# --- Google Cloud settings ----------------------------------------------------

# ID of the Google Cloud project to work in.
PROJECT_ID = "<YOUR_PROJECT_ID>"  # @param {type:"string"}

# Region in which clusters are launched.
REGION = "us-central1"  # @param {type:"string"}

# Name of the GKE cluster to create.
CLUSTER_NAME = "gke-gemma-cluster"  # @param {type:"string"}

# Number of GPUs to run on.
GPU_COUNT = 8

# --- Model settings -----------------------------------------------------------

# Hugging Face token used to download models.
# The token must have write permission.
HF_TOKEN = "<YOUR_HF_TOKEN>"  # @param {type:"string"}

# Size of the Gemma model to launch.
MODEL_SIZE = "2b"  # @param ["2b", "7b"]

In [None]:
# Authenticate and select the project used by all later gcloud commands.
! gcloud auth login
! gcloud config set project "$PROJECT_ID"

# Enable every API this notebook relies on up front: GKE for the cluster,
# Artifact Registry for the image repository and Cloud Build for the image build.
# Enabling them here avoids an interactive "enable API?" prompt in later cells.
! gcloud services enable container.googleapis.com artifactregistry.googleapis.com cloudbuild.googleapis.com

# Add kubectl to the set of available tools.
! mkdir -p /tools/google-cloud-sdk/.install
! gcloud components install kubectl --quiet

### Create a GKE cluster and a node pool

The command below creates the following resource, using the `CLUSTER_NAME`, `PROJECT_ID`, and `REGION` variables set above.

- Autopilot cluster

If you already have a cluster, you can skip to `Use an existing GKE cluster` instead.

In [None]:
# Create a GKE Autopilot cluster named CLUSTER_NAME in REGION, enrolled in the
# rapid release channel and requesting GKE version 1.29.
! gcloud container clusters create-auto {CLUSTER_NAME} \
  --project={PROJECT_ID} \
  --region={REGION} \
  --release-channel=rapid \
  --cluster-version=1.29

### Use an existing GKE cluster

In [None]:
# Fetch credentials for CLUSTER_NAME in REGION so that the kubectl commands in
# the following cells are sent to this cluster.
! gcloud container clusters get-credentials {CLUSTER_NAME} --location {REGION}

### Create Kubernetes secret for Hugging Face credentials

Create a Kubernetes Secret that contains the Hugging Face token.

In [None]:
# Store the Hugging Face token in a Secret named `hf-secret` under the key
# `hf_api_token`. The manifest is rendered client-side (--dry-run=client -o yaml)
# and piped into `kubectl apply`, so re-running this cell updates the Secret
# instead of failing because it already exists.
! kubectl create secret generic hf-secret \
--from-literal=hf_api_token={HF_TOKEN} \
--dry-run=client -o yaml | kubectl apply -f -

## The Dataset
We use LoRA to quickly finetune Gemma with the `b-mc2/sql-create-context` dataset.

This dataset has the following structure.

| Answer                                              | Question                                                                      | Context                                                           |
|-----------------------------------------------------|-------------------------------------------------------------------------------|-------------------------------------------------------------------|
| SELECT COUNT(*) FROM head WHERE age > 56            | How many heads of the departments are older than 56 ?                         | CREATE TABLE head (age INTEGER)                                   |
| SELECT name, born_state, age FROM head ORDER BY age | List the name, born state and age of the heads of departments ordered by age. | CREATE TABLE head (name VARCHAR, born_state VARCHAR, age VARCHAR) |

We will finetune `google/gemma-2b` model to get SQL queries based on questions and context.

## Finetuning Gemma on GKE using GPU with Pytorch

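In this demo we will use PyTorch and Hugging Face libraries to finetune Gemma. Save the following script as `finetune.py` in the notebook's working directory; the Dockerfile below copies it into the container image.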

```python
import os
import torch
from datasets import load_dataset, Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel

from trl import SFTTrainer

# Hugging Face Hub id of the base model to fine-tune (override with MODEL_NAME)
model_name = os.environ.get("MODEL_NAME", "google/gemma-2b")

# Name given to the fine-tuned model (override with NEW_MODEL)
new_model = os.environ.get("NEW_MODEL", "gemma-2b-sql")

# Instruction dataset used for training
dataset_name = "b-mc2/sql-create-context"

################################################################################
# QLoRA parameters
################################################################################

# Rank of the LoRA update matrices (override with LORA_R)
lora_r = int(os.environ.get("LORA_R", "4"))

# Scaling factor alpha for LoRA (override with LORA_ALPHA)
lora_alpha = int(os.environ.get("LORA_ALPHA", "8"))

# Dropout probability applied to the LoRA layers
lora_dropout = 0.1

################################################################################
# bitsandbytes parameters
################################################################################

# Load the base model in 4-bit precision
use_4bit = True

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Name of the torch dtype used for computation with the 4-bit base model
bnb_4bit_compute_dtype = "float16"

# Nested (double) quantization for 4-bit base models
use_nested_quant = False

################################################################################
# TrainingArguments parameters
################################################################################

# Where model predictions and checkpoints are written
output_dir = "./results"

# How many passes to make over the training data
num_train_epochs = 1

# Mixed-precision mode: fp16 on, bf16 off (set bf16 to True with an A100)
fp16, bf16 = True, False

# Per-GPU training batch size (override with TRAIN_BATCH_SIZE)
per_device_train_batch_size = int(os.environ.get("TRAIN_BATCH_SIZE", "1"))

# Per-GPU evaluation batch size (override with EVAL_BATCH_SIZE)
# NOTE: defined here but not passed to TrainingArguments further down
per_device_eval_batch_size = int(os.environ.get("EVAL_BATCH_SIZE", "2"))

# Update steps over which gradients are accumulated
# (override with GRADIENT_ACCUMULATION_STEPS)
gradient_accumulation_steps = int(os.environ.get("GRADIENT_ACCUMULATION_STEPS", "1"))

# Gradient checkpointing switch
# NOTE: defined here but not passed to TrainingArguments further down
gradient_checkpointing = True

# Gradient clipping threshold (maximum gradient norm)
max_grad_norm = 0.3

# Starting learning rate for the AdamW optimizer
learning_rate = 2e-4

# Weight decay for all layers except bias/LayerNorm weights
weight_decay = 0.001

# Optimizer implementation
optim = "paged_adamw_32bit"

# Shape of the learning-rate schedule
lr_scheduler_type = "cosine"

# Total number of training steps (overrides num_train_epochs)
max_steps = -1

# Fraction of steps spent on linear warmup (from 0 to learning rate)
warmup_ratio = 0.03

# Batch together sequences of the same length;
# saves memory and speeds up training considerably
group_by_length = True

# Checkpoint interval, in update steps
save_steps = 0

# Logging interval, in update steps (override with LOGGING_STEPS)
logging_steps = int(os.environ.get("LOGGING_STEPS", "50"))

################################################################################
# SFT parameters
################################################################################

# Maximum sequence length to use (override with MAX_SEQ_LENGTH)
max_seq_length = int(os.getenv("MAX_SEQ_LENGTH", "512"))

# Pack multiple short examples in the same input sequence to increase efficiency
packing = False

# Load the entire model on the current CUDA device (the '' key maps the whole
# model); this call requires a CUDA-capable GPU to be available
device_map = {'':torch.cuda.current_device()}

# Maximum number of training examples, read from DATASET_LIMIT.
# A positive value keeps that many shuffled examples; any value <= 0 (e.g. -1)
# disables the limit and trains on the full, unshuffled split.
limit = int(os.getenv("DATASET_LIMIT", "5000"))

dataset = load_dataset(dataset_name, split="train")
if limit > 0:
    # Clamp to the split size so a limit larger than the dataset does not
    # index past its end; the fixed seed keeps the subset reproducible.
    dataset = dataset.shuffle(seed=42).select(range(min(limit, len(dataset))))


def transform(data):
    """Render one dataset record as a single training prompt.

    Args:
        data: Mapping with the keys 'question', 'context' and 'answer'.

    Returns:
        A dict with one key, 'text', holding the question, context and answer
        joined into a three-line prompt.

    Raises:
        KeyError: If any of the three keys is missing from ``data``.
    """
    prompt = "Question: {question}\nContext: {context}\nAnswer: {answer}".format(
        question=data['question'],
        context=data['context'],
        answer=data['answer'],
    )
    return {'text': prompt}


# Add the 'text' prompt column produced by transform() to every record
transformed = dataset.map(transform)

# Resolve the compute dtype name (e.g. "float16") to the torch dtype object
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

# Quantization settings applied when the base model is loaded
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Check GPU compatibility with bfloat16.
# Informational only: a banner is printed when the device's major compute
# capability is >= 8, but the fp16/bf16 settings are not changed.
if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16")
        print("=" * 80)

# Load the base model with the quantization config, placed on the device
# selected by device_map
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map,
    torch_dtype=torch.float16,
)
# Turn off the generation cache on the model config for training
model.config.use_cache = False
model.config.pretraining_tp = 1

# Load the tokenizer that belongs to the base model
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Reuse the end-of-sequence token as the padding token
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training

# LoRA configuration: adapters are attached to the "q_proj" and "v_proj"
# modules only, with no bias training
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"]
)

# Set training parameters from the constants defined above.
# NOTE: per_device_eval_batch_size and gradient_checkpointing are defined
# earlier in this script but are not passed here.
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
)

# Supervised fine-tuning trainer: trains on the 'text' column of the
# transformed dataset using the LoRA config above
trainer = SFTTrainer(
    model=model,
    train_dataset=transformed,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=packing,
)

# Run the fine-tuning loop
trainer.train()

# Save the trained model to the local path named by `new_model`; it is loaded
# again from that path below
trainer.model.save_pretrained(new_model)

# Reload model in FP16 and merge it with LoRA weights
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map=device_map,
)
model = PeftModel.from_pretrained(base_model, new_model)
model = model.merge_and_unload()

# Publish the merged model and the tokenizer to the Hugging Face Hub under the
# name `new_model`.
# NOTE(review): `check_pr` is not a documented `push_to_hub` parameter (the
# documented one is `create_pr`) — confirm the intent.
# NOTE(review): no token is passed explicitly; presumably authentication relies
# on the HF_TOKEN environment variable set on the container — verify.
model.push_to_hub(new_model, check_pr=True)

tokenizer.push_to_hub(new_model, check_pr=True)
```

## Create a Container Manifest with Dockerfile

Use the following `Dockerfile` to create a container image.

```dockerfile
# CUDA 12.2 runtime base image on Ubuntu 22.04.
FROM nvidia/cuda:12.2.0-runtime-ubuntu22.04

# Install Python headers, pip, gcc and git, then remove the apt package lists.
RUN apt-get update && \
    apt-get -y --no-install-recommends install python3-dev gcc python3-pip git && \
    rm -rf /var/lib/apt/lists/*

# Install the Python libraries used for fine-tuning (versions are not pinned).
RUN pip3 install --no-cache-dir accelerate bitsandbytes datasets transformers peft trl torch

# Add the fine-tuning script at the image root.
COPY finetune.py /finetune.py

# Disable Python output buffering.
ENV PYTHONUNBUFFERED 1

# Run the fine-tuning script when the container starts.
# NOTE(review): finetune.py as shown in this notebook does not parse command-line
# arguments, so `--device cuda` appears to have no effect — confirm.
CMD python3 /finetune.py --device cuda
```

In [None]:
# Contents of the Dockerfile used to build the fine-tuning image.
# A raw string is required here: in a regular triple-quoted string Python treats
# a backslash followed by a newline as a line continuation and drops both
# characters, so the multi-line RUN instruction would be written to disk as one
# long line instead of with its `\` continuations.
DOCKERFILE = r"""
FROM nvidia/cuda:12.2.0-runtime-ubuntu22.04

RUN apt-get update && \
    apt-get -y --no-install-recommends install python3-dev gcc python3-pip git && \
    rm -rf /var/lib/apt/lists/*

RUN pip3 install --no-cache-dir accelerate bitsandbytes datasets transformers peft trl torch

COPY finetune.py /finetune.py

ENV PYTHONUNBUFFERED 1

CMD python3 /finetune.py --device cuda
"""

# Write the Dockerfile into the current working directory.
with open("Dockerfile", "w") as f:
    f.write(DOCKERFILE)

### Containerize the Code with Docker and Cloud Build

Using Cloud Build and the Dockerfile above, we build the image and push it to an Artifact Registry Docker repository.

In [None]:
# Create an Artifact Registry Docker repository named `gemma` in the `us`
# location of the project.
! gcloud artifacts repositories create gemma \
    --project={PROJECT_ID} \
    --repository-format=docker \
    --location=us \
    --description="Gemma Repo"

In [None]:
# Build and push the image using Cloud Build.
# The current directory (`.`) is the build context, so it must contain both the
# Dockerfile and the finetune.py script that the Dockerfile copies.
! gcloud builds submit \
    --tag us-docker.pkg.dev/{PROJECT_ID}/gemma/finetune-gemma-gpu:1.0.0 .

## Run Finetune Job on GKE Autopilot

Use the YAML to run Gemma Finetune on GKE

In [None]:
# Image built and pushed by the Cloud Build cell. Change this if the image was
# pushed under a different name or tag.
FINETUNE_IMAGE = f"us-docker.pkg.dev/{PROJECT_ID}/gemma/finetune-gemma-gpu:1.0.0"

# Manifest for the fine-tuning Job. The image, the GPU count and the model size
# are filled in from the notebook variables instead of being hard-coded; the
# environment variables are the ones finetune.py reads, and HF_TOKEN comes from
# the `hf-secret` Secret.
K8S_JOB_YAML = f"""
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

apiVersion: batch/v1
kind: Job
metadata:
  name: finetune-job
  namespace: default
spec:
  backoffLimit: 2
  template:
    metadata:
      annotations:
        kubectl.kubernetes.io/default-container: finetuner
    spec:
      terminationGracePeriodSeconds: 600
      containers:
      - name: finetuner
        image: {FINETUNE_IMAGE}
        resources:
          limits:
            nvidia.com/gpu: {GPU_COUNT}
        env:
        - name: MODEL_NAME
          value: "google/gemma-{MODEL_SIZE}"
        - name: NEW_MODEL
          value: "gemma-{MODEL_SIZE}-sql-kubecon-eu-2024"
        - name: LORA_R
          value: "8"
        - name: LORA_ALPHA
          value: "16"
        - name: TRAIN_BATCH_SIZE
          value: "1"
        - name: EVAL_BATCH_SIZE
          value: "2"
        - name: GRADIENT_ACCUMULATION_STEPS
          value: "2"
        - name: DATASET_LIMIT
          value: "1000"
        - name: MAX_SEQ_LENGTH
          value: "512"
        - name: LOGGING_STEPS
          value: "5"
        - name: HF_TOKEN
          valueFrom:
            secretKeyRef:
              name: hf-secret
              key: hf_api_token
        volumeMounts:
        - mountPath: /dev/shm
          name: dshm
      volumes:
      - name: dshm
        emptyDir:
          medium: Memory
      nodeSelector:
        cloud.google.com/gke-accelerator: nvidia-l4
      restartPolicy: OnFailure
"""

# Write the manifest to finetune.yaml in the current working directory.
with open("finetune.yaml", "w") as f:
    f.write(K8S_JOB_YAML)

In [None]:
# Submit the Job described in finetune.yaml to the cluster.
! kubectl apply --filename=finetune.yaml

#### Waiting for the container to create

Use the command below to check on the status of the container.

In [None]:
# Watch the pods that belong to finetune-job; the command keeps running until
# the cell is interrupted.
! kubectl get pods --selector=job-name=finetune-job --watch

### View the logs from the running Job

This will download the needed artifacts and run the finetuning code. The process takes close to 30 minutes.

In [None]:
# Stream the Job's logs as they are produced.
! kubectl logs --follow job/finetune-job

## Find the model on Huggingface

If the Job ran successfully you can now go find the model on your Huggingface profile.

## Clean up resources

In [None]:
# Remove the fine-tuning Job.
! kubectl delete job/finetune-job

In [None]:
# Remove the Secret that holds the Hugging Face token.
! kubectl delete secret hf-secret

In [None]:
# Delete the GKE cluster without prompting for confirmation (--quiet).
# NOTE: the `gemma` Artifact Registry repository created earlier is not removed
# by the cleanup cells shown here.
! gcloud container clusters delete {CLUSTER_NAME} \
  --region={REGION} \
  --quiet