# Supervised Fine-Tuning
Gemma-2-9b-it-tokenizer2 (Openai-style compatible for lm eval harness) with CoT

In [14]:
import os
import torch
import pandas as pd

from transformers import (
    AutoModelForCausalLM, 
    AutoTokenizer, 
    BitsAndBytesConfig, 
    TrainingArguments, 
    logging
)
from peft import LoraConfig, get_peft_model
from trl import SFTTrainer, setup_chat_format
import bitsandbytes as bnb

import sys
sys.path.append("/data/lucasjia/projects/assignment1/arc/SFT")
from sft import *

torch.cuda.set_device(9)
device = {"": torch.cuda.current_device()}
print(device)

{'': 9}


In [10]:
def load_model(model_path = "/data/lucasjia/models/gemma-2-9b-it-tokenizer2"):    
    
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
    )

    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        quantization_config=bnb_config,
        device_map="auto",
        torch_dtype=torch.bfloat16,
        trust_remote_code=True,
        attn_implementation="eager"
    )
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token

    modules = find_all_linear_names(model)

    peft_config = LoraConfig(
        r=16,
        lora_alpha=32,
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=modules
    )

    tokenizer.chat_template = None

    model, tokenizer = setup_chat_format(model, tokenizer)
    model = get_peft_model(model, peft_config)

    return model, tokenizer, peft_config

In [16]:
def get_train_args(output_dir="/data/lucasjia/models/gemma-2-9b-it-SFT-test"):
    training_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=1,          
        per_device_eval_batch_size=1,
        gradient_accumulation_steps=4,         
        optim="paged_adamw_32bit",
        num_train_epochs=1,                    
        eval_strategy="steps",
        save_strategy="steps",
        eval_steps=100,                          
        save_steps=100,
        logging_steps=10,                      
        warmup_steps=10,                        
        logging_strategy="steps",
        learning_rate=2e-4,
        fp16=False,
        bf16=True,
        group_by_length=True,
        load_best_model_at_end=False,          
        save_total_limit=1,    
        ddp_find_unused_parameters=False,
        remove_unused_columns=False,           
    )
    return training_args

In [11]:
print(torch.cuda.device_count())
print(torch.cuda.current_device())

output_dir = "/data/lucasjia/models/gemma-2-9b-it-SFT"

model, tokenizer, peft_config = load_model()
model.config.use_cache = False
model.train()  # Make sure model is in training mode

if hasattr(model, 'hf_device_map'):
    print("Device map:", model.hf_device_map)


10
9


Loading checkpoint shards: 100%|██████████| 8/8 [00:13<00:00,  1.73s/it]


Device map: {'': 9}


In [17]:
train_data, val_data = load_training_data(limit=8000)
training_args = get_train_args(output_dir)

# def formatting_func(example):
#     return tokenizer.apply_chat_template(example["messages"], tokenize=False)

trainer = SFTTrainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=val_data,
    peft_config=peft_config,
    args=training_args,
    # dataset_text_field="text", 
    # tokenizer=tokenizer,
    # packing=False,
    # formatting_func=formatting_func,
)

trainer.accelerator.device = torch.device("cuda", torch.cuda.current_device())


Converting train dataset to ChatML: 100%|██████████| 6400/6400 [00:00<00:00, 19379.13 examples/s]
Applying chat template to train dataset: 100%|██████████| 6400/6400 [00:00<00:00, 9663.99 examples/s]
Tokenizing train dataset: 100%|██████████| 6400/6400 [00:06<00:00, 979.18 examples/s] 
Truncating train dataset: 100%|██████████| 6400/6400 [00:00<00:00, 142745.33 examples/s]
Converting eval dataset to ChatML: 100%|██████████| 1600/1600 [00:00<00:00, 19887.58 examples/s]
Applying chat template to eval dataset: 100%|██████████| 1600/1600 [00:00<00:00, 9392.75 examples/s]
Tokenizing eval dataset: 100%|██████████| 1600/1600 [00:01<00:00, 981.13 examples/s] 
Truncating eval dataset: 100%|██████████| 1600/1600 [00:00<00:00, 287860.27 examples/s]
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used ins

AttributeError: can't set attribute 'device'

In [13]:
trainer.train()
# model.config.use_cache = True

trainer.model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

ValueError: You can't train a model that has been loaded in 8-bit or 4-bit precision on a different device than the one you're training on. Make sure you loaded the model on the correct device using for example `device_map={'':torch.cuda.current_device()}` or `device_map={'':torch.xpu.current_device()}`