In [2]:
from unsloth import FastModel
import torch
from datasets import load_dataset

from trl import SFTTrainer, SFTConfig

# Load the Gemma 3 4B instruction-tuned checkpoint together with its
# tokenizer/processor object.
model, tokenizer = FastModel.from_pretrained(
    model_name="unsloth/gemma-3-4b-it",
    max_seq_length=2048,
    # With 4-bit, 8-bit and full fine-tuning all switched off, the Unsloth
    # release used here logs "Switching to QLoRA" and loads in 4-bit anyway.
    # Request 4-bit explicitly so the code states what actually runs.
    load_in_4bit=True,
    load_in_8bit=False,
    full_finetuning=False,
    dtype=torch.bfloat16,
    # token="hf_...",  # optional access token; read it from the environment, never hard-code it
)

# Reuse the end-of-sequence token as the padding token.
# NOTE(review): later cells call `tokenizer.tokenizer(...)`, so this object
# looks like a processor wrapping the text tokenizer — confirm this assignment
# reaches the tokenizer that actually performs padding.
tokenizer.pad_token = tokenizer.eos_token

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
Unsloth: LoRA, QLoRA and full finetuning all not selected. Switching to QLoRA.
==((====))==  Unsloth 2025.3.14: Fast Gemma3 patching. Transformers: 4.50.0.dev0.
   \\   /|    NVIDIA GeForce RTX 4070 Ti SUPER. Num GPUs = 1. Max memory: 15.992 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [3]:
# Wrap the base model with LoRA adapters. Language, attention and MLP modules
# are adapted; vision layers are left out.
model = FastModel.get_peft_model(
    model,
    # Which parts of the network receive adapters.
    finetune_vision_layers=False,
    finetune_language_layers=True,
    finetune_attention_modules=True,
    finetune_mlp_modules=True,
    # Adapter hyperparameters.
    r=8,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    # Reproducibility and memory settings.
    random_state=3407,
    use_gradient_checkpointing=True,
)

In [4]:
# Load the training split of the Bespoke-Stratos-17k dataset.
dataset = load_dataset(path="HuggingFaceH4/Bespoke-Stratos-17k", split="train")

In [5]:
def format_restruction(dataset):
    """Flatten one raw example into plain `system`/`question`/`response` fields.

    Args:
        dataset: One example mapping with a `system` entry and a
            `conversations` sequence whose first two items each hold a
            `value` entry.

    Returns:
        dict: `system` copied through, the first conversation item's value as
        `question`, and the second item's value as `response`.
    """
    system_prompt = dataset['system']
    turns = dataset['conversations']
    # Index explicitly rather than unpacking so that any items beyond the
    # first two are simply ignored.
    user_text = turns[0]['value']
    assistant_text = turns[1]['value']
    return dict(
        system=system_prompt,
        question=user_text,
        response=assistant_text,
    )

In [6]:
# Flatten every example into system/question/response and drop the original columns.
restructured_dataset = dataset.map(
    function=format_restruction,
    remove_columns=dataset.column_names,
)

In [7]:
def filter_long_tokens(dataset):
    """Tell whether one example's combined token count fits the length budget.

    The `system`, `question` and `response` texts are each tokenized on their
    own with `tokenizer.tokenizer`, and the lengths of the resulting
    `input_ids` are added together.

    Args:
        dataset: One example mapping holding `system`, `question` and
            `response` texts.

    Returns:
        bool: True when the summed length is at most 2044 tokens.
    """
    token_budget = 2044
    total_tokens = 0
    for field in ("system", "question", "response"):
        encoded = tokenizer.tokenizer(dataset[field])
        total_tokens += len(encoded['input_ids'])
    return total_tokens <= token_budget

In [8]:
# Keep only the examples whose combined token count passes filter_long_tokens.
filtered_dataset = restructured_dataset.filter(function=filter_long_tokens)

In [9]:
# # List that will hold the token counts
# token_lengths = []

# # Compute the token count for every example in the dataset
# for idx, text in enumerate(filtered_dataset):
#     num_tokens = len(tokenizer.tokenizer(text['system'])['input_ids']) + len(tokenizer.tokenizer(text['question'])['input_ids']) + len(tokenizer.tokenizer(text['response'])['input_ids'])
#     token_lengths.append((num_tokens, idx))

# # Find the indices of the examples with the largest and smallest token counts
# max_token_info = max(token_lengths, key=lambda x: x[0])  # example with the most tokens
# min_token_info = min(token_lengths, key=lambda x: x[0])  # example with the fewest tokens

# # Print the examples with the largest and smallest token counts
# max_tokens, max_idx = max_token_info
# min_tokens, min_idx = min_token_info

# print(f"최대 토큰 수: {max_tokens}")
# print(f"최대 토큰 수를 가진 데이터: {filtered_dataset[max_idx]}, idx: {max_idx}")
# print(f"최소 토큰 수: {min_tokens}")
# print(f"최소 토큰 수를 가진 데이터: {filtered_dataset[min_idx]}, idx: {min_idx}")

In [10]:
def preprocess_function(dataset, max_length=2048):
    """Tokenize a batch into fixed-length causal-LM training features.

    Each example becomes ONE sequence: prompt (system + question) followed by
    the response and an end-of-sequence token. `labels` is aligned
    position-by-position with `input_ids`; prompt and padding positions are
    set to -100 so only response tokens contribute to the loss.

    The previous version tokenized prompts and responses separately and padded
    both to the maximum length, so `labels[i]` had no relation to
    `input_ids[i]` and padding tokens were used as training targets.

    Args:
        dataset: A batch mapping with parallel lists under `system`,
            `question` and `response`.
        max_length: Length every output sequence is truncated/padded to.

    Returns:
        dict: `input_ids`, `attention_mask` and `labels`, each a list with one
        `max_length`-long list of ints per example.

    NOTE(review): confirm the trainer's data collator keeps the `labels`
    column produced here rather than rebuilding labels from `input_ids`.
    """
    # `tokenizer` may be a processor that wraps the text tokenizer (another
    # cell calls `tokenizer.tokenizer`); fall back to the object itself.
    text_tokenizer = getattr(tokenizer, "tokenizer", tokenizer)

    eos_id = text_tokenizer.eos_token_id
    # Padding positions get attention_mask 0 and label -100, so the exact pad
    # id does not influence the loss; fall back to EOS (or 0) if unset.
    pad_id = text_tokenizer.pad_token_id
    if pad_id is None:
        pad_id = eos_id if eos_id is not None else 0

    # Prompt text: system prompt + question.
    prompts = [f"{system}\n\n{question}"
               for system, question in zip(dataset['system'], dataset['question'])]
    # Target text: the response.
    responses = [f"{response}" for response in dataset['response']]

    # Special tokens are added once, at the start of the prompt; the response
    # is tokenized without them because it continues the same sequence.
    prompt_ids_batch = text_tokenizer(prompts)["input_ids"]
    response_ids_batch = text_tokenizer(responses, add_special_tokens=False)["input_ids"]

    model_inputs = {"input_ids": [], "attention_mask": [], "labels": []}
    for prompt_ids, response_ids in zip(prompt_ids_batch, response_ids_batch):
        prompt_ids = list(prompt_ids)
        target_ids = list(response_ids)
        if eos_id is not None:
            # Terminate the target so the model is trained to stop.
            target_ids.append(eos_id)

        input_ids = (prompt_ids + target_ids)[:max_length]
        labels = ([-100] * len(prompt_ids) + target_ids)[:max_length]

        # Right-pad to the fixed length.
        n_pad = max_length - len(input_ids)
        model_inputs["input_ids"].append(input_ids + [pad_id] * n_pad)
        model_inputs["attention_mask"].append([1] * len(input_ids) + [0] * n_pad)
        model_inputs["labels"].append(labels + [-100] * n_pad)

    return model_inputs

In [11]:
# Tokenize in batches and keep only the columns produced by preprocess_function.
tokenized_dataset = filtered_dataset.map(
    function=preprocess_function,
    batched=True,
    remove_columns=filtered_dataset.column_names,
)

Map:   0%|          | 0/3641 [00:00<?, ? examples/s]

In [12]:
# Display the tokenized dataset summary (feature names and row count).
tokenized_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 3641
})

In [None]:
# Hyperparameters for the supervised fine-tuning run.
# NOTE(review): the recorded training log reports a per-device batch size of 4
# with 8 accumulation steps, which differs from the values below — the log
# appears to come from an earlier version of this cell.
training_args = SFTConfig(
    output_dir="outputs",
    seed=3407,
    report_to="wandb",
    # Mixed precision: bf16 when the GPU supports it, otherwise fp16.
    bf16=torch.cuda.is_bf16_supported(),
    fp16=not torch.cuda.is_bf16_supported(),
    # Batching.
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    # Schedule.
    num_train_epochs=1,
    # max_steps=50,  # test only
    learning_rate=2.0e-5,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    # Optimizer.
    optim="adamw_8bit",
    weight_decay=0.01,
    # Logging and checkpointing.
    logging_steps=10,
    save_steps=50,
    save_total_limit=5,
    # packing=True,  # disabled because of an Unsloth packing bug
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=tokenized_dataset,
    eval_dataset=None,
    args=training_args,
)

In [14]:
# Start training from scratch. To continue an interrupted run, pass
# `resume_from_checkpoint=True` or a checkpoint directory path instead.
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 3,641 | Num Epochs = 1 | Total steps = 113
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 8 x 1) = 32
 "-____-"     Trainable parameters = 14,901,248/4,000,000,000 (0.37% trained)
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mhiyo2044[0m ([33mhiyo2044-[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


It is strongly recommended to train Gemma3 models with the `eager` attention implementation instead of `sdpa`. Use `eager` with `AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`.


RuntimeError: CUDA driver error: out of memory