In [1]:
%pip install wandb datasets transformers trl torch peft

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting trl
  Downloading trl-0.17.0-py3-none-any.whl.metadata (12 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting n

In [2]:
import wandb
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, DataCollatorForLanguageModeling
from trl import SFTTrainer, SFTConfig
import torch
from peft import LoraConfig, get_peft_model

In [3]:
# Load the tokenizer and the base model from the same checkpoint
model_name = "facebook/opt-350m"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/644 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/663M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/662M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/685 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/441 [00:00<?, ?B/s]

In [4]:
# Load the train split of CodeAlpaca-20k and hold out 10% of it as a test set
dataset = load_dataset("sahil2801/CodeAlpaca-20k", split="train")
dataset_split = dataset.train_test_split(seed=42, test_size=0.1)
train_dataset, test_dataset = dataset_split["train"], dataset_split["test"]

# Show one record from each split as a sanity check
print("=== train 예시 ===", train_dataset[0], sep="\n")
print("=== test 예시 ===", test_dataset[0], sep="\n")

README.md:   0%|          | 0.00/147 [00:00<?, ?B/s]

code_alpaca_20k.json:   0%|          | 0.00/8.06M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/20022 [00:00<?, ? examples/s]

=== train 예시 ===
{'output': 'class ArraySort { \n  \n    void sort(int arr[]) { \n        int n = arr.length; \n  \n        // One by one move boundary of unsorted subarray \n        for (int i = 0; i < n-1; i++) { \n            \n            // Find the minimum element in unsorted array \n            int min_index = i; \n            for (int j = i+1; j < n; j++) \n                if (arr[j] < arr[min_index]) \n                    min_index = j; \n  \n            // Swap the found minimum element with the first element \n            int temp = arr[min_index]; \n            arr[min_index] = arr[i]; \n            arr[i] = temp; \n        } \n    } \n  \n    // Prints the array \n    void printArray(int arr[]) { \n        int n = arr.length; \n        for (int i=0; i<n; ++i) \n            System.out.print(arr[i] + " "); \n        System.out.println(); \n    } \n  \n    // Driver code to test above \n    public static void main(String args[]) \n    { \n        ArraySort ob = new ArraySort(

In [5]:
# Prompt formatting function
def formatting_prompts_func(example):
    """
    Render instruction-tuning records as prompt strings.

    Accepts either a batch (each of 'instruction', 'input', 'output' is a
    list of values) or a single record (each of them is a plain string).

    Layout of one rendered prompt:
        [Instruction]\\n<instruction>\\n\\n
        [Input]\\n<input>\\n\\n          (omitted when the input is empty/None)
        [Output]\\n<output>

    Args:
        example: Mapping with the keys 'instruction', 'input' and 'output'.

    Returns:
        A list of prompt strings for a batch, or one prompt string for a
        single record.
    """
    def _render(instruction, input_value, output):
        # Empty, whitespace-only, or None inputs produce no [Input] section
        input_text = input_value.strip() if input_value else ""

        text = f"[Instruction]\n{instruction.strip()}\n\n"
        if input_text:
            text += f"[Input]\n{input_text}\n\n"
        text += f"[Output]\n{output.strip()}"
        return text

    instructions = example['instruction']

    # A single record carries plain strings; indexing those by position would
    # walk individual characters (and fail on a shorter 'input'), so it is
    # rendered directly instead of going through the batch path.
    if isinstance(instructions, str):
        return _render(instructions, example['input'], example['output'])

    return [
        _render(instruction, input_value, output)
        for instruction, input_value, output in zip(
            instructions, example['input'], example['output']
        )
    ]

# Collator used for batching; mlm=False selects the non-masked (causal LM) mode
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [6]:
def train_with_lora(lora_r, model_name, train_dataset, test_dataset, formatting_func, data_collator,
                    lora_alpha=32):
    """
    Fine-tune a causal LM with a LoRA adapter of the given rank and log the run to W&B.

    A fresh copy of the base model is loaded on every call, wrapped with a LoRA
    adapter on the q_proj / v_proj modules, trained with SFTTrainer, and saved
    to /tmp/clm-instruction-tuning-lora-<lora_r>.

    Args:
        lora_r (int): LoRA rank.
        model_name (str): Base model name passed to AutoModelForCausalLM.from_pretrained.
        train_dataset: Training dataset handed to SFTTrainer.
        test_dataset: Evaluation dataset handed to SFTTrainer.
        formatting_func: Prompt formatting function handed to SFTTrainer.
        data_collator: Data collator handed to SFTTrainer.
        lora_alpha (int): LoRA alpha passed to LoraConfig. Defaults to 32.

    Returns:
        None.

    Raises:
        Whatever model loading or training raises; the W&B run is still
        closed (with a non-zero exit code) and GPU cache cleanup still runs.
    """
    import gc

    wandb.init(
        project='Hanghae99-8-basic',
        name=f'gpt-finetuning-with-lora-r{lora_r}',
        reinit=True
    )

    # Bound up front so the cleanup in `finally` works even if loading fails
    model = trainer = None
    exit_code = 1  # flipped to 0 only after everything below succeeded
    try:
        print(f"\n=== Training with LoRA rank {lora_r} ===")

        # Start the peak-memory counter from zero so "Max Alloc" below reflects
        # this run only, not earlier runs executed in the same kernel.
        if torch.cuda.is_available():
            torch.cuda.reset_peak_memory_stats(0)

        # Load the base model
        model = AutoModelForCausalLM.from_pretrained(model_name)

        # LoRA configuration
        lora_config = LoraConfig(
            r=lora_r,
            lora_alpha=lora_alpha,
            target_modules=["q_proj", "v_proj"],
            lora_dropout=0.05,
            bias="none",
            task_type="CAUSAL_LM"
        )

        # Apply LoRA
        model = get_peft_model(model, lora_config)
        model.print_trainable_parameters()

        # Configure SFTTrainer
        trainer = SFTTrainer(
            model,
            train_dataset=train_dataset,
            eval_dataset=test_dataset,
            args=SFTConfig(
                output_dir=f"/tmp/clm-instruction-tuning-lora-{lora_r}",
                max_seq_length=128,
                eval_strategy="epoch",
                save_strategy="epoch",
                logging_steps=100,
                num_train_epochs=3,
                learning_rate=2e-5,
                load_best_model_at_end=True,
                metric_for_best_model="eval_loss"
            ),
            formatting_func=formatting_func,
            data_collator=data_collator,
        )

        # Train and report the training metrics
        train_result = trainer.train()
        metrics = train_result.metrics
        trainer.log_metrics("train", metrics)

        # Report peak GPU memory allocated on device 0 since the reset above
        if torch.cuda.is_available():
            print('Max Alloc:', round(torch.cuda.max_memory_allocated(0)/1024**3, 1), 'GB')

        # Save the model
        trainer.save_model()
        exit_code = 0
    finally:
        # Drop this function's references before emptying the cache; memory
        # still held by live objects cannot be returned by empty_cache().
        model = trainer = None
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        # Always close the W&B run so a failed run does not stay open
        wandb.finish(exit_code=exit_code)

In [7]:
# Run the experiment with LoRA rank 8
train_with_lora(
    lora_r=8, model_name=model_name,
    train_dataset=train_dataset, test_dataset=test_dataset,
    formatting_func=formatting_prompts_func, data_collator=data_collator,
)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mknospe1[0m ([33mknospe1-gaeun[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin



=== Training with LoRA rank 8 ===
trainable params: 786,432 || all params: 331,982,848 || trainable%: 0.2369


Applying formatting function to train dataset:   0%|          | 0/18019 [00:00<?, ? examples/s]

Applying formatting function to train dataset:   0%|          | 0/18019 [00:00<?, ? examples/s]

Converting train dataset to ChatML:   0%|          | 0/18019 [00:00<?, ? examples/s]

Adding EOS to train dataset:   0%|          | 0/18019 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/18019 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/18019 [00:00<?, ? examples/s]

Applying formatting function to eval dataset:   0%|          | 0/2003 [00:00<?, ? examples/s]

Applying formatting function to eval dataset:   0%|          | 0/2003 [00:00<?, ? examples/s]

Converting eval dataset to ChatML:   0%|          | 0/2003 [00:00<?, ? examples/s]

Adding EOS to eval dataset:   0%|          | 0/2003 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/2003 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/2003 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss
1,1.6631,1.622599
2,1.5981,1.575313
3,1.6126,1.561759


***** train metrics *****
  total_flos               = 11700389GF
  train_loss               =     1.6888
  train_runtime            = 0:50:25.52
  train_samples_per_second =     17.867
  train_steps_per_second   =      2.234
Max Alloc: 3.1 GB


0,1
eval/loss,█▃▁
eval/mean_token_accuracy,▁▆█
eval/num_tokens,▁▅█
eval/runtime,█▁▂
eval/samples_per_second,▁█▇
eval/steps_per_second,▁█▇
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▆▆▆▆▆▆▇▇▇▇█████
train/global_step,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▇▇▇▇▇███
train/grad_norm,▁▂▁▂▃▃▃▄▄▃▄▄▃▃▅▅▄▄▅▄▄▆▇▅▅▅▆█▆▆▅▅█▆▅▆▅▇▅▅
train/learning_rate,████▇▇▇▇▇▇▆▆▆▆▆▅▅▅▄▄▄▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▁▁▁▁

0,1
eval/loss,1.56176
eval/mean_token_accuracy,0.66691
eval/num_tokens,4976886.0
eval/runtime,50.9211
eval/samples_per_second,39.335
eval/steps_per_second,4.929
total_flos,1.2563197176840192e+16
train/epoch,3.0
train/global_step,6759.0
train/grad_norm,3.68428


In [None]:
# Run the experiment with LoRA rank 128
train_with_lora(
    lora_r=128, model_name=model_name,
    train_dataset=train_dataset, test_dataset=test_dataset,
    formatting_func=formatting_prompts_func, data_collator=data_collator,
)

In [None]:
# Run the experiment with LoRA rank 256
train_with_lora(
    lora_r=256, model_name=model_name,
    train_dataset=train_dataset, test_dataset=test_dataset,
    formatting_func=formatting_prompts_func, data_collator=data_collator,
)