In [2]:
%pip install wandb datasets transformers trl torch peft

Collecting wandb
  Using cached wandb-0.19.11-py3-none-macosx_11_0_arm64.whl.metadata (10 kB)
Collecting datasets
  Using cached datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting trl
  Using cached trl-0.17.0-py3-none-any.whl.metadata (12 kB)
Collecting peft
  Using cached peft-0.15.2-py3-none-any.whl.metadata (13 kB)
Collecting docker-pycreds>=0.4.0 (from wandb)
  Using cached docker_pycreds-0.4.0-py2.py3-none-any.whl.metadata (1.8 kB)
Collecting sentry-sdk>=2.0.0 (from wandb)
  Downloading sentry_sdk-2.29.1-py2.py3-none-any.whl.metadata (10 kB)
Collecting setproctitle (from wandb)
  Downloading setproctitle-1.3.6-cp312-cp312-macosx_11_0_arm64.whl.metadata (10 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Using cached dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Using cached xxhash-3.5.0-cp312-cp312-macosx_11_0_arm64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Using cached multiprocess-0.70.16-py312-none-an

In [3]:
import wandb
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, DataCollatorForLanguageModeling
from trl import SFTTrainer, SFTConfig
import torch
from peft import LoraConfig, get_peft_model

In [4]:
# 모델과 토크나이저 로드
model_name = "facebook/opt-350m"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [5]:
# 데이터셋 로드
# 현재 train 데이터를 학습/테스트 세트로 분할
dataset = load_dataset("sahil2801/CodeAlpaca-20k", split="train")
dataset_split = dataset.train_test_split(test_size=0.1, seed=42)  # 10%를 테스트 세트로 분할

train_dataset = dataset_split["train"]
test_dataset = dataset_split["test"]

print("=== train 예시 ===")
print(train_dataset[0])
print("=== test 예시 ===")
print(test_dataset[0])

=== train 예시 ===
{'output': 'class ArraySort { \n  \n    void sort(int arr[]) { \n        int n = arr.length; \n  \n        // One by one move boundary of unsorted subarray \n        for (int i = 0; i < n-1; i++) { \n            \n            // Find the minimum element in unsorted array \n            int min_index = i; \n            for (int j = i+1; j < n; j++) \n                if (arr[j] < arr[min_index]) \n                    min_index = j; \n  \n            // Swap the found minimum element with the first element \n            int temp = arr[min_index]; \n            arr[min_index] = arr[i]; \n            arr[i] = temp; \n        } \n    } \n  \n    // Prints the array \n    void printArray(int arr[]) { \n        int n = arr.length; \n        for (int i=0; i<n; ++i) \n            System.out.print(arr[i] + " "); \n        System.out.println(); \n    } \n  \n    // Driver code to test above \n    public static void main(String args[]) \n    { \n        ArraySort ob = new ArraySort(

In [7]:
# 프롬프트 포매팅 함수 정의
def formatting_prompts_func(example):
    output_texts = []
    for i in range(len(example['instruction'])):
        # input이 비어있는 경우 처리
        input_text = example['input'][i].strip() if example['input'][i] else ""
        
        # 형식화된 프롬프트 생성
        text = f"[Instruction]\n{example['instruction'][i].strip()}\n\n"
        if input_text:
            text += f"[Input]\n{input_text}\n\n"
        text += f"[Output]\n{example['output'][i].strip()}"
        
        output_texts.append(text)
    return output_texts

# 데이터 콜레이터 설정
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

In [9]:
def train_with_lora(lora_r, model_name, train_dataset, test_dataset, formatting_func, data_collator):
    """
    주어진 LoRA rank로 모델을 학습하고 평가하는 함수
    
    Args:
        lora_r (int): LoRA rank 값
        model_name (str): 기본 모델 이름
        train_dataset: 학습 데이터셋
        test_dataset: 평가 데이터셋
        formatting_func: 프롬프트 포매팅 함수
        data_collator: 데이터 콜레이터
    """
    wandb.init(
        project='Hanghae99-8-basic',
        name=f'gpt-finetuning-with-lora-r{lora_r}',
        reinit=True
    )
    
    print(f"\n=== Training with LoRA rank {lora_r} ===")
    
    # 기본 모델 로드
    model = AutoModelForCausalLM.from_pretrained(model_name)
    
    # LoRA 설정
    lora_config = LoraConfig(
        r=lora_r,
        lora_alpha=32,
        target_modules=["q_proj", "v_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM"
    )
    
    # LoRA 적용
    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()
    
    # SFTTrainer 설정 및 학습
    trainer = SFTTrainer(
        model,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        args=SFTConfig(
            output_dir=f"/tmp/clm-instruction-tuning-lora-{lora_r}",
            max_seq_length=128,
            eval_strategy="epoch",
            save_strategy="epoch",
            logging_steps=100,
            num_train_epochs=3,
            learning_rate=2e-5,
            load_best_model_at_end=True,
            metric_for_best_model="eval_loss"
        ),
        formatting_func=formatting_func,
        data_collator=data_collator,
    )

    # 학습 시작
    train_result = trainer.train()
    metrics = train_result.metrics
    trainer.log_metrics("train", metrics)
    
    # 메모리 사용량 출력
    if torch.cuda.is_available():
        print('Max Alloc:', round(torch.cuda.max_memory_allocated(0)/1024**3, 1), 'GB')
    
    # 모델 저장
    trainer.save_model()
    
    # CUDA 캐시 정리
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        
    wandb.finish()

In [10]:
train_with_lora(
    lora_r=8,
    model_name=model_name,
    train_dataset=train_dataset,
    test_dataset=test_dataset,
    formatting_func=formatting_prompts_func,
    data_collator=data_collator
)

0,1
train/epoch,▁
train/global_step,▁
train/grad_norm,▁
train/learning_rate,▁
train/loss,▁
train/mean_token_accuracy,▁
train/num_tokens,▁

0,1
train/epoch,0.04439
train/global_step,100.0
train/grad_norm,2.09308
train/learning_rate,2e-05
train/loss,2.2318
train/mean_token_accuracy,0.57124
train/num_tokens,75132.0


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



=== Training with LoRA rank 8 ===
trainable params: 786,432 || all params: 331,982,848 || trainable%: 0.2369


Applying formatting function to train dataset:   0%|          | 0/18019 [00:00<?, ? examples/s]

Applying formatting function to eval dataset:   0%|          | 0/2003 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [8]:
# 여러 lora_r 값으로 실험
lora_r_values = [8, 128, 256]

for lora_r in lora_r_values:
    train_with_lora(
        lora_r=lora_r,
        model_name=model_name,
        train_dataset=train_dataset,
        test_dataset=test_dataset,
        formatting_func=formatting_prompts_func,
        data_collator=data_collator
    )

[34m[1mwandb[0m: Currently logged in as: [33mknospe1[0m ([33mknospe1-gaeun[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin



=== Training with LoRA rank 8 ===
trainable params: 786,432 || all params: 331,982,848 || trainable%: 0.2369


Applying formatting function to train dataset:   0%|          | 0/18019 [00:00<?, ? examples/s]

Applying formatting function to train dataset:   0%|          | 0/18019 [00:00<?, ? examples/s]

Converting train dataset to ChatML:   0%|          | 0/18019 [00:00<?, ? examples/s]

Adding EOS to train dataset:   0%|          | 0/18019 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/18019 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/18019 [00:00<?, ? examples/s]

Applying formatting function to eval dataset:   0%|          | 0/2003 [00:00<?, ? examples/s]

Applying formatting function to eval dataset:   0%|          | 0/2003 [00:00<?, ? examples/s]

Converting eval dataset to ChatML:   0%|          | 0/2003 [00:00<?, ? examples/s]

Adding EOS to eval dataset:   0%|          | 0/2003 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/2003 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/2003 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 