# Fine-tune Gemma 2B Instruct (`gemma-2b-it`) in Google Colab


In [None]:
# Install the virtualenv package
# NOTE(review): each `!` line runs in its own shell process, so a virtual
# environment created here is presumably never used by the notebook kernel —
# confirm whether this cell is needed at all.
!pip install -q virtualenv

# Create a new virtual environment named `finetuninggemmabase` in the working directory
!virtualenv finetuninggemmabase

In [None]:
# Upgrade and install required libraries
# Python 3.10.12
!pip3 install -q -U bitsandbytes==0.42.0
!pip3 install -q -U peft==0.8.2
!pip3 install -q -U trl==0.7.10
!pip3 install -q -U accelerate==0.27.2
!pip3 install -q -U datasets==2.17.0
!pip3 install -q -U transformers==4.38.1
!pip3 install -q -U huggingface_hub
!pip3 install -q -U py7zr

In [3]:
# Activate the virtual environment
# NOTE(review): `!` executes this in a throwaway shell, so the activation
# presumably does not persist to the kernel or to later `!` cells — confirm
# and consider removing this cell together with the virtualenv creation.
!source finetuninggemmabase/bin/activate

In [4]:
import os
import torch
import gc
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer
import huggingface_hub
from google.colab import userdata

In [5]:
def clear_memory():
    """Collect garbage, then release cached GPU memory.

    The garbage collector runs first so that tensors kept alive only by
    unreachable reference cycles are freed before the CUDA caching allocator
    is asked to hand its unused blocks back; emptying the cache first would
    leave that memory cached.

    Returns:
        None
    """
    gc.collect()
    torch.cuda.empty_cache()

# Clear memory before starting
clear_memory()

In [6]:
# Read the Hugging Face access token from the Colab secret named "HF_TOKEN"
hf_token = userdata.get("HF_TOKEN")

# Set the WANDB_DISABLED switch in the process environment
# (presumably consumed by the trainer's Weights & Biases integration)
os.environ.update(WANDB_DISABLED="true")

In [7]:
# Hub id of the base model that is loaded for fine-tuning and, later, for merging
MODEL_ID = "google/gemma-2b-it"
# Hub repository the tokenizer and merged model are pushed to
REPO_NAME = "nguyenanhhuy248/gemma-2b-it-samsum"

In [None]:
# Quantization settings: 4-bit NF4 weights with bfloat16 as the compute dtype
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Tokenizer for the base model, configured to pad on the right
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=hf_token)
tokenizer.padding_side = "right"

# Base model, quantized on load and placed entirely on device 0
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    token=hf_token,
    device_map={"": 0},
    quantization_config=bnb_config,
)

In [None]:
def _tokenize_dialogues(batch):
    """Tokenize the `dialogue` column of a batch of SAMSum rows."""
    return tokenizer(batch["dialogue"])


# Load the SAMSum training split and attach tokenizer outputs for each dialogue
dataset = load_dataset("samsum", split="train", trust_remote_code=True).map(
    _tokenize_dialogues,
    batched=True,
)

In [15]:
# Display the dataset summary (feature names and row count)
dataset

Dataset({
    features: ['id', 'dialogue', 'summary', 'input_ids', 'attention_mask'],
    num_rows: 14732
})

In [16]:
# LoRA adapter settings: rank 8, applied to the listed projection modules,
# for a causal language-modeling task
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    target_modules=[
        "q_proj", "o_proj", "k_proj", "v_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
)

In [None]:
# Function to format examples for training
def formatting_func(example):
    """Build one "Dialogue/Summary" training prompt per example.

    The trainer calls this on *batches* (a dict mapping column name to a list
    of values), so every row of the batch must be formatted; formatting only
    element ``[0]`` would silently drop the rest of each batch.

    Args:
        example: Mapping with ``"dialogue"`` and ``"summary"`` entries. Each
            entry is either a list of strings (batched) or a single string
            (one un-batched example).

    Returns:
        list[str]: One formatted prompt per input example, in input order.
    """
    dialogues, summaries = example["dialogue"], example["summary"]
    # Accept a single un-batched example by promoting it to a batch of one.
    if isinstance(dialogues, str):
        dialogues, summaries = [dialogues], [summaries]
    return [
        f"Dialogue: {dialogue}\nSummary: {summary}"
        for dialogue, summary in zip(dialogues, summaries)
    ]

# Hyperparameters for the fine-tuning run: 100 optimizer steps, one sample per
# device step with 4-step gradient accumulation, logging after every step
training_args = TrainingArguments(
    output_dir="outputs",
    max_steps=100,
    warmup_steps=2,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    optim="paged_adamw_8bit",
    fp16=True,
    logging_steps=1,
)

# Wire the model, dataset, LoRA config and prompt formatter into the trainer
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    formatting_func=formatting_func,
    peft_config=lora_config,
    max_seq_length=2048,
)

In [None]:
# Train the model; run length and logging cadence come from the
# TrainingArguments passed to the trainer
trainer.train()

In [23]:
# Save the trained model into the local folder `gemma-2b-it-samsum`; this folder
# is loaded again later with PeftModel.from_pretrained for merging
trainer.model.save_pretrained("gemma-2b-it-samsum")

In [24]:
# Drop the references to the training model and trainer, then reclaim memory
del model, trainer
clear_memory()

In [None]:
# Reload the base model in FP16 and merge the LoRA weights into it
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map={"": 0},
    # Pass the token explicitly, as the first load of this model does, so the
    # reload does not depend on ambient credentials or a warm cache.
    token=hf_token,
)
# Attach the adapter saved in the local `gemma-2b-it-samsum` folder
model = PeftModel.from_pretrained(base_model, "gemma-2b-it-samsum")
# Fold the adapter into the base weights and drop the PEFT wrapper
model = model.merge_and_unload()

In [26]:
# Reload tokenizer to save it
# The token is passed explicitly, consistent with the first tokenizer load.
# `trust_remote_code=True` is dropped: the same tokenizer is loaded earlier in
# this notebook without it, so allowing repository code to run is unnecessary.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=hf_token)
# NOTE(review): the tokenizer used for training did not override pad_token;
# confirm that publishing pad_token == eos_token is intended.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [None]:
# Log in to the Hugging Face Hub, then push the tokenizer and the merged model to REPO_NAME
huggingface_hub.login(token=hf_token)
# NOTE(review): with use_temp_dir=False the files are presumably also written to a
# local folder named after the repo (`gemma-2b-it-samsum`, the folder that is
# zipped afterwards) — confirm before changing this flag.
tokenizer.push_to_hub(REPO_NAME, use_temp_dir=False)
model.push_to_hub(REPO_NAME, use_temp_dir=False)

In [None]:
# Zip up the model folder
import locale
locale.getpreferredencoding = lambda: "UTF-8"
!zip -r /content/gemma-2b-it-samsum.zip /content/gemma-2b-it-samsum

In [None]:
# Copy the zipped model folder to Google Drive
from google.colab import drive
# Mount Google Drive at /content/drive
drive.mount('/content/drive')
# Copy the archive into the MyDrive folder of the mounted drive
!cp /content/gemma-2b-it-samsum.zip /content/drive/MyDrive/gemma-2b-it-samsum.zip