In [1]:
!pip install trl accelerate datasets

Collecting trl
  Downloading trl-0.18.1-py3-none-any.whl.metadata (11 kB)
Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)

In [1]:
import torch
import transformers
import pandas as pd
import numpy as np
from datasets import load_dataset
from torch.utils.data import Dataset
from torch.optim import Adam
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, AutoModel
from transformers import Trainer, TrainingArguments
from copy import deepcopy
import copy
import logging
from dataclasses import dataclass
import datasets
import gc

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from trl import DPOTrainer, DPOConfig

In [3]:
# Load the two pre-built preference datasets saved on Google Drive
# (assumes Drive is already mounted at /content/drive — TODO confirm).
# dpo_dataset1 is used as the training split and dpo_dataset2 as the evaluation split
# by the first DPOTrainer below.
dpo_dataset1 = datasets.load_from_disk('/content/drive/MyDrive/rm_train_dataset2')
dpo_dataset2 = datasets.load_from_disk('/content/drive/MyDrive/rm_val_dataset2')

In [4]:
# Tokenizer is fetched from the Hub checkpoint, while the model weights come from Drive.
# NOTE(review): presumably this is the base checkpoint the Drive model was fine-tuned from —
# confirm the vocabularies match.
tokenizer = AutoTokenizer.from_pretrained("skt/ko-gpt-trinity-1.2B-v0.5")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [5]:
# Local checkpoint that serves as the starting policy for DPO.
# NOTE(review): the generation cell labels this path "sft", so presumably it is the merged SFT model — confirm.
model_name="/content/drive/MyDrive/merged_model"

# No dtype or device argument is passed, so the library defaults apply.
dpo_model = AutoModelForCausalLM.from_pretrained(model_name)

In [6]:
# Reference policy for DPO: an independent snapshot of the policy taken before any adapter
# is attached, put in inference mode and with gradients disabled on every parameter.
ref_model = copy.deepcopy(dpo_model).eval().requires_grad_(False)

In [7]:
from peft import LoraConfig, TaskType, get_peft_model

# LoRA adapter settings for the causal-LM policy. Only modules whose names match
# `target_modules` receive low-rank adapters.
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=["c_attn", "c_proj"],
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the policy with the adapter and report how many parameters remain trainable.
# Note that this rebinds `dpo_model`, so re-running the cell wraps the model again.
dpo_model = get_peft_model(dpo_model, lora_config)
dpo_model.print_trainable_parameters()

trainable params: 8,110,080 || all params: 1,170,666,240 || trainable%: 0.6928




In [12]:
# Show the training split summary (feature names and row count).
dpo_dataset1

Dataset({
    features: ['prompt', 'chosen', 'rejected'],
    num_rows: 13395
})

In [8]:
# Hyper-parameters for the first DPO run, grouped by concern.
config = DPOConfig(
    # --- run mode / reproducibility ---
    do_train=True,
    do_eval=True,
    seed=42,
    # --- DPO objective ---
    beta=0.1,  # controls the strength of the KL term
    max_length=1024,
    # --- optimisation ---
    learning_rate=5e-6,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=8,  # 4 * 8 = 32 examples per optimiser step per device
    bf16=True,
    # --- evaluation / checkpoint / logging cadence (all every 100 steps) ---
    eval_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,
    logging_steps=100,
    # --- outputs ---
    output_dir="/content/drive/MyDrive/DPO",
    overwrite_output_dir=True,
    report_to="tensorboard",
    logging_dir="/content/drive/MyDrive/DPO/logs",
)

In [9]:
# Build the trainer for the first DPO run: the LoRA-wrapped policy is trained against the
# frozen reference copy, using the train/validation preference datasets loaded from Drive.
trainer = DPOTrainer(
    model=dpo_model,
    ref_model=ref_model,
    args=config,
    train_dataset=dpo_dataset1,
    eval_dataset=dpo_dataset2,  # optional
    processing_class=tokenizer
)

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [10]:
# Run the first DPO training job; the metrics table and TrainOutput are recorded below.
trainer.train()

Step,Training Loss,Validation Loss,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/chosen,Logps/rejected,Logits/chosen,Logits/rejected
100,0.6379,0.557867,-0.026978,-0.369698,0.885389,0.34272,-72.073982,-43.672161,-3.83971,-3.833713
200,0.4559,0.367927,-0.383929,-1.87117,0.876676,1.487241,-75.643486,-58.686882,-3.812772,-3.800094
300,0.3361,0.335859,-0.62917,-2.711612,0.879357,2.082441,-78.095901,-67.091293,-3.791063,-3.776935
400,0.329,0.317881,-0.828216,-3.123915,0.883378,2.295699,-80.086365,-71.214333,-3.785363,-3.770882
500,0.2937,0.304174,-1.059431,-3.602906,0.89008,2.543475,-82.398514,-76.00425,-3.781545,-3.766575
600,0.284,0.293065,-1.304298,-4.116154,0.892091,2.811856,-84.847183,-81.136711,-3.777839,-3.761647
700,0.2854,0.284571,-1.480434,-4.411752,0.895442,2.931318,-86.608536,-84.092697,-3.769323,-3.751848
800,0.2684,0.278176,-1.635026,-4.728628,0.899464,3.093602,-88.154457,-87.261467,-3.75877,-3.739254
900,0.2524,0.273039,-1.687129,-4.854206,0.899464,3.167076,-88.675499,-88.517227,-3.753438,-3.733922
1000,0.2698,0.269525,-1.773489,-5.004761,0.898794,3.231272,-89.539085,-90.022789,-3.748755,-3.729238


TrainOutput(global_step=1257, training_loss=0.32190150349692115, metrics={'train_runtime': 2190.9736, 'train_samples_per_second': 18.341, 'train_steps_per_second': 0.574, 'total_flos': 0.0, 'train_loss': 0.32190150349692115, 'epoch': 3.0})

In [11]:
# Merge the trained adapter into the base model and rebind `dpo_model` to the result.
# NOTE(review): after this line `dpo_model` is no longer the PEFT wrapper, so re-running this
# cell or reusing `dpo_model` for further LoRA training presumably fails — confirm.
dpo_model = dpo_model.merge_and_unload()

# Persist the merged weights of the first run to Drive.
dpo_model.save_pretrained("/content/drive/MyDrive/DPO/DPO_result")

rm dataset 변경

In [8]:
import json
from datasets import Dataset

RM_LABEL_PATH = "/content/drive/MyDrive/RMlabel.json"
PROMPT_TEMPLATE = "### Instruction(명령어):\n{prompt}\n\n### Response(응답):"
# (chosen_rank, rejected_rank) combinations that are turned into preference pairs.
RANK_PAIRS = ((1, 4), (2, 5))


def _rank_to_content(item):
    """Return ``{ranking: contents}`` for every ``answer*`` entry of one record.

    Only dict-valued fields whose key starts with ``"answer"`` are considered, and an
    entry is kept only when its ``ranking`` is an int and its ``contents`` is non-empty.
    If two answers share a ranking, the later one wins.
    """
    ranked = {}
    for key, val in item.items():
        if not (key.startswith("answer") and isinstance(val, dict)):
            continue
        rank = val.get("ranking")
        content = val.get("contents")
        if isinstance(rank, int) and content:
            ranked[rank] = content
    return ranked


def build_dpo_pairs(records, template=PROMPT_TEMPLATE, rank_pairs=RANK_PAIRS):
    """Convert ranked-answer records into DPO preference pairs.

    Args:
        records: iterable of dicts with a ``question`` field and ``answer*`` fields.
        template: format string with a ``{prompt}`` placeholder applied to the question.
        rank_pairs: iterable of ``(chosen_rank, rejected_rank)`` tuples; a pair is
            emitted only when both rankings are present in the record.

    Returns:
        A list of ``{"prompt", "chosen", "rejected"}`` dicts, in record order and,
        within a record, in ``rank_pairs`` order.
    """
    pairs = []
    for item in records:
        prompt = template.format(prompt=item.get("question", ""))
        ranked = _rank_to_content(item)
        for chosen_rank, rejected_rank in rank_pairs:
            if chosen_rank in ranked and rejected_rank in ranked:
                pairs.append({
                    "prompt": prompt,
                    "chosen": ranked[chosen_rank],
                    "rejected": ranked[rejected_rank],
                })
    return pairs


# utf-8-sig strips a leading BOM if the label file has one.
with open(RM_LABEL_PATH, "r", encoding="utf-8-sig") as f:
    data = json.load(f)

dpo_pairs = build_dpo_pairs(data.get("data_info", []))
if not dpo_pairs:
    # Fail here with a clear message instead of deep inside the split/trainer code.
    raise ValueError(f"No preference pairs could be built from {RM_LABEL_PATH}")

dpo_dataset = Dataset.from_list(dpo_pairs)

# 90/10 train/eval split with a fixed seed so the split is reproducible.
dataset_split = dpo_dataset.train_test_split(test_size=0.1, seed=42)

train_dataset = dataset_split["train"]
eval_dataset = dataset_split["test"]

In [9]:
# Hyper-parameters for the second DPO run (pairs built from RMlabel.json), grouped by concern.
config = DPOConfig(
    # --- run mode / reproducibility ---
    do_train=True,
    do_eval=True,
    seed=42,
    # --- DPO objective ---
    beta=0.1,  # controls the strength of the KL term
    max_length=1024,
    # --- optimisation ---
    learning_rate=2e-6,
    num_train_epochs=5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=16,  # 2 * 16 = 32 examples per optimiser step per device
    bf16=True,
    # --- evaluation / checkpoint / logging cadence (all every 500 steps) ---
    eval_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    logging_steps=500,
    # --- outputs ---
    output_dir="/content/drive/MyDrive/DPO3",
    overwrite_output_dir=True,
    report_to="tensorboard",
    logging_dir="/content/drive/MyDrive/DPO3/logs",
)

In [10]:
# Start the second experiment from a clean policy.
# When the notebook is run top to bottom, `dpo_model` has by now been trained in the first
# run and rebound to the merged, adapter-free model. Reusing it would stack this run on top
# of the previous one instead of starting from the checkpoint `ref_model` was copied from.
# Drop the previous trainer first so its model and optimizer state can be released.
trainer = None
dpo_model = get_peft_model(
    AutoModelForCausalLM.from_pretrained(model_name),
    lora_config,
)
gc.collect()
torch.cuda.empty_cache()

# Trainer for the second run: fresh LoRA policy vs. the frozen reference copy, on the
# train/eval split built from RMlabel.json.
trainer = DPOTrainer(
    model=dpo_model,
    ref_model=ref_model,
    args=config,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,  # optional
    processing_class=tokenizer
)

Extracting prompt in train dataset:   0%|          | 0/47534 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/47534 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/47534 [00:00<?, ? examples/s]

Extracting prompt in eval dataset:   0%|          | 0/5282 [00:00<?, ? examples/s]

Applying chat template to eval dataset:   0%|          | 0/5282 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/5282 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [10]:
# Training attempt for the second experiment. The recorded output below ends in
# KeyboardInterrupt, i.e. this run was stopped manually before it finished.
trainer.train()

Step,Training Loss,Validation Loss,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/chosen,Logps/rejected,Logits/chosen,Logits/rejected
500,0.6765,0.668049,0.338999,0.238805,0.58103,0.100194,-475.013397,-461.434631,-3.958441,-3.959242
1000,0.6654,0.660763,0.568919,0.436519,0.591443,0.1324,-472.714172,-459.45755,-3.967578,-3.967971
1500,0.6506,0.657058,0.748188,0.585598,0.599016,0.162589,-470.921478,-457.966675,-3.960371,-3.960599
2000,0.6101,0.65572,0.597992,0.410676,0.599205,0.187316,-472.423431,-459.715912,-3.980295,-3.98027
2500,0.6082,0.654673,0.665093,0.466179,0.596176,0.198914,-471.752472,-459.160889,-3.965773,-3.965657
3000,0.5993,0.65517,0.785671,0.588765,0.599584,0.196905,-470.546661,-457.935059,-3.973156,-3.972755
3500,0.5559,0.662547,0.715215,0.500204,0.588224,0.215011,-471.25119,-458.820679,-3.977344,-3.976879
4000,0.5569,0.658941,0.621072,0.388319,0.589739,0.232754,-472.192627,-459.939514,-3.965432,-3.96494
4500,0.5482,0.660987,0.655523,0.411106,0.585195,0.244417,-471.848175,-459.71167,-3.961282,-3.960797
5000,0.5079,0.670899,0.5411,0.291809,0.585006,0.24929,-472.992371,-460.904572,-3.948291,-3.947986


KeyboardInterrupt: 

학습이 전혀 되지 않고 있음. 과적합이 보이며, margin은 증가하지만 chosen reward도 감소하고 rejected reward가 그보다 더 많이 감소해서 margin이 증가하는 것뿐임.
chosen을 생성하는 능력도 떨어지고 rejected를 생성하는 능력이 더 떨어져서, 덜 나쁜 문장을 생성하고 있음.

In [11]:
# Training run for the second experiment that ran to completion (epoch 5.0 in the recorded TrainOutput).
# NOTE(review): if executed right after the interrupted run above without rebuilding the trainer,
# this presumably continues from the already-updated adapter weights — confirm the intended order.
trainer.train()

Step,Training Loss,Validation Loss,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/chosen,Logps/rejected,Logits/chosen,Logits/rejected
500,0.6888,0.681841,0.24295,0.215435,0.584438,0.027515,-475.975433,-448.765045,-3.970645,-3.973126
1000,0.6731,0.669778,0.533606,0.466273,0.604506,0.067333,-473.068878,-446.256622,-3.969343,-3.971817
1500,0.6625,0.663734,0.630106,0.535572,0.615865,0.094534,-472.103851,-445.563721,-3.969526,-3.972066
2000,0.6509,0.660527,0.695564,0.579762,0.610943,0.115802,-471.449249,-445.121796,-3.968972,-3.971576
2500,0.6494,0.65829,0.738042,0.606764,0.613025,0.131278,-471.024506,-444.851807,-3.967916,-3.970494
3000,0.6445,0.657132,0.78341,0.642897,0.616055,0.140512,-470.570862,-444.490448,-3.968258,-3.970796
3500,0.6396,0.656696,0.825239,0.67704,0.614729,0.148199,-470.152557,-444.149048,-3.968682,-3.971194
4000,0.6369,0.655989,0.825626,0.670167,0.613972,0.155459,-470.148712,-444.217712,-3.968488,-3.970987
4500,0.6365,0.654571,0.840573,0.679085,0.613593,0.161487,-469.999207,-444.12854,-3.969132,-3.971571
5000,0.6314,0.654431,0.854662,0.69007,0.617569,0.164592,-469.858307,-444.018707,-3.969935,-3.972344


TrainOutput(global_step=7430, training_loss=0.6438399714063026, metrics={'train_runtime': 25737.0245, 'train_samples_per_second': 9.235, 'train_steps_per_second': 0.289, 'total_flos': 0.0, 'train_loss': 0.6438399714063026, 'epoch': 5.0})

In [12]:
# Merge the second run's adapter into the base model; `dpo_model` is rebound to the merged model.
dpo_model = dpo_model.merge_and_unload()

# Persist the merged weights of the second experiment to Drive.
dpo_model.save_pretrained("/content/drive/MyDrive/DPO3/DPO_result")

generation

In [None]:
# Decoding settings passed unchanged to every generator in the comparison below.
beam_generation_args = {
    # search strategy: sampling combined with 5 beams
    "num_beams": 5,
    "early_stopping": True,
    "do_sample": True,
    "top_k": 40,
    "temperature": 0.7,
    # repetition control
    "repetition_penalty": 2.0,
    "no_repeat_ngram_size": 4,
    # stopping criteria
    # NOTE(review): the EOS id is hard-coded; presumably it equals tokenizer.eos_token_id — confirm.
    "eos_token_id": 1,
    "max_new_tokens": 256,
}

In [None]:
import random

# Text-generation pipelines for the checkpoints being compared; all share the same tokenizer.
#generator1 = pipeline('text-generation', model='/content/drive/MyDrive/merged_model', tokenizer=tokenizer) #sft
generator2 = pipeline('text-generation', model='/content/drive/MyDrive/PPO/PPO_result', tokenizer=tokenizer) #ppo
generator3 = pipeline('text-generation', model="/content/drive/MyDrive/DPO/DPO_result", tokenizer=tokenizer) #dpo
# The second DPO run in this notebook saves its merged model under DPO3/DPO_result;
# nothing in this notebook writes to DPO2, so load the directory that was actually produced.
generator4 = pipeline('text-generation', model="/content/drive/MyDrive/DPO3/DPO_result", tokenizer=tokenizer) #new dpo

In [None]:
torch.cuda.empty_cache()

PROMPT_DICT = {
    "prompt_input": (
        "### Instruction(명령어):\n{prompt}\n\n### Response(응답):"
    )
}

# Text that precedes the question in the template; used to detect already-formatted prompts.
_PROMPT_PREFIX = PROMPT_DICT['prompt_input'].split('{prompt}')[0]


def _as_model_prompt(text):
    """Wrap ``text`` in the instruction template unless it already carries it.

    Prompts sampled from ``eval_dataset`` were formatted with the same template when the
    preference pairs were built, so formatting them again would nest one template in another.
    """
    if text.startswith(_PROMPT_PREFIX):
        return text
    return PROMPT_DICT['prompt_input'].format_map({'prompt': text})


def _print_generations(generator, prompts):
    """Generate one completion per prompt and print each, preceded by a blank line."""
    for result in generator(prompts, **beam_generation_args):
        print()
        print(result[0]['generated_text'])


# Two prompts from each evaluation set; list() keeps random.sample working on any column type.
list_prompt = random.sample(list(dpo_dataset2['prompt']), 2)
list_prompt2 = random.sample(list(eval_dataset['prompt']), 2)

list_prompt.extend(list_prompt2)

list_prompt = [_as_model_prompt(tmp) for tmp in list_prompt]

# Same prompts through each model (PPO, first DPO, second DPO), separated by the original banner.
for idx, generator in enumerate((generator2, generator3, generator4)):
    if idx:
        print('\n', 'which one is better?')
    _print_generations(generator, list_prompt)

분석 중.

Relation between beta, logps/rejected, accuracy, capacity, and the loss landscape