# Training Routine

For trl, the version has to be at least 0.12. Else, the `processing_class` parameter for the `DPOTrainer` will not be available.



In [1]:
!pip install datasets
!pip install trl
!pip install -U transformers
!pip install accelerate
!pip install peft
!pip install -U bitsandbytes
!pip install flash-attn

Collecting datasets
  Downloading datasets-3.4.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.4.1-py3-none-any.whl (487 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m487.4/487.4 kB[0m [31m23.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.w

In [1]:
import os

from datasets import load_dataset, DatasetDict
from trl import DPOConfig, DPOTrainer
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training

import torch
from google.colab import userdata, drive

In [2]:
# Limit reserved but unallocated memory
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

os.environ['TOKENIZERS_PARALLELISM'] = 'false'

# Empty memory
torch.cuda.empty_cache()

## Load Data

In [3]:
# This is only needed if the data gets loaded from google drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Load dataset from files
dataset = load_dataset( 'parquet',
    data_files={
        'train':    '/content/drive/MyDrive/practical_course2/data/agent_train.parquet',
        'test':     '/content/drive/MyDrive/practical_course2/data/agent_test.parquet'
    }
)

# Create train-test split
train_split = dataset['train'].train_test_split(test_size=0.2)
dataset['train'] = train_split['train']
dataset['eval'] = train_split['test']

In [5]:
# Just to check whether everything works
small_dataset = DatasetDict({
    'train': dataset['train'].select(range(100)),
    'test': dataset['test'].select(range(20)),
    'eval': dataset['eval'].select(range(20))
})

In [19]:
small_dataset['train'][0]['prompt']

[{'content': 'A persona description is a string describing a set of characteristics an LLM adopts when generating responses. Given the following task, generate a persona description that will answer it as good as possible:\n Consider a thoroughly mixed vessel where a salt is dis-solved inwater. The volume of the fresh water initially in the tank is 100lbm.The inlet conditions are,ṁ_w= 150 lb/hr. and ṁ_s= 30 lb/hr. The resulting solution leaves at a rate of 120 lb/hr. If the flow in and out remain constant, compute the outletconcentration after one hour.\n(A) 0.86\n(B) 0.76\n(C) 0.46\n(D) 0.16\n(E) 0.06\n(F) 0.26\n(G) 0.96\n(H) 0.56\n(I) 0.36\n(J) 0.66',
  'role': 'user'}]

In [8]:
# Need to run cell below first to initialize tokenizer
# import numpy as np

# for col in ['prompt', 'chosen', 'rejected']:
#     prompts = dataset["train"][col]

#     lengths = [
#         len(tokenizer.apply_chat_template(p, tokenize=True))
#         for p in prompts
#     ]

#     print(f"Max {col} length:", max(lengths))
#     print(f"Average {col} length:", sum(lengths) / len(lengths))
#     print(f"Median {col} length:", np.median(lengths))
#     print(f"95% of {col} are shorter than:", np.percentile(lengths, 95))

Max prompt length: 7921
Average prompt length: 279.74698918844945
Median prompt length: 198.0
95% of prompt are shorter than: 542.0
Max chosen length: 544
Average chosen length: 47.5803852470234
Median chosen length: 35.0
95% of chosen are shorter than: 109.0
Max rejected length: 525
Average rejected length: 51.767004242507184
Median rejected length: 36.0
95% of rejected are shorter than: 115.0


Max prompt length: 7921 \\
Average prompt length: 279.74698918844945 \\
Median prompt length: 198.0 \\
95% of prompt are shorter than: 542.0 \\
Max chosen length: 544 \\
Average chosen length: 47.5803852470234 \\
Median chosen length: 35.0 \\
95% of chosen are shorter than: 109.0 \\
Max rejected length: 525 \\
Average rejected length: 51.767004242507184 \\
Median rejected length: 36.0 \\
95% of rejected are shorter than: 115.0 \\

## Train

In [6]:
# https://huggingface.co/docs/peft/en/developer_guides/quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bmb_4bit_quant_storage=torch.bfloat16,
)

# Load base model
model_name = 'Qwen/Qwen2-0.5B-Instruct'
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    # load_in_4bit=True,
    torch_dtype=torch.bfloat16,
    device_map='auto',
    trust_remote_code=True,
    token=userdata.get("HF_TOKEN"),
    attn_implementation="flash_attention_2"
)
model.config.use_cache = False
model = prepare_model_for_kbit_training(model)

# https://huggingface.co/docs/trl/main/en/dpo_trainer#using-option-3---load-the-adapter-twice
# Load adapter
# model = PeftModel.from_pretrained(
#     model,
#     './peft/' + model_name,
#     is_trainable=True,
#     adapter_name='train'
# )

# model.load_adapter('./peft/' + model_name, adapter_name='reference')

# Since we are using flash attention, padding on the right side might have unexpected
# imapacts during training
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
    padding_side='left'
)
tokenizer.padding_side = 'left'
tokenizer.pad_token = tokenizer.eos_token

In [7]:
# LoRA configuration
lora_config = LoraConfig(
    lora_alpha=32,
    lora_dropout=0.05,
    r=8,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"],
)
model = get_peft_model(model, lora_config)

In [8]:
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Trainable params: {trainable_params}")

Trainable params: 540672


In [9]:
# Most of these values are taken from Exercise 3 of the DL4NLP course
BATCH_SIZE = 2
training_args = DPOConfig(
    output_dir='./logs/' + model_name,
    logging_dir='./logs/' + model_name,
    per_device_train_batch_size=2 * BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    learning_rate=2e-5,
    weight_decay=1e-3,
    num_train_epochs=3,
    optim='adamw_bnb_8bit',
    logging_strategy='steps',
    logging_steps=len(dataset['train'])//BATCH_SIZE,
    save_strategy='epoch',
    eval_strategy='epoch',
    save_total_limit=1,
    seed=42,
    data_seed=42,
    fp16=True,
    dataloader_num_workers=2,
    load_best_model_at_end=True,
    report_to='tensorboard',
    max_prompt_length=768,          # Empirically chosen (see above)
    max_length=1024,                # Empirically chosen (see above)
    label_names=[]                  # Surpress warning no label_names provided
)

In [10]:
trainer = DPOTrainer(
    model=model,
    ref_model=None,
    args=training_args,
    processing_class=tokenizer,
    train_dataset=small_dataset['train'],
    eval_dataset=small_dataset['eval'],
)

Extracting prompt in train dataset:   0%|          | 0/100 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/100 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/100 [00:00<?, ? examples/s]

Extracting prompt in eval dataset:   0%|          | 0/20 [00:00<?, ? examples/s]

Applying chat template to eval dataset:   0%|          | 0/20 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/20 [00:00<?, ? examples/s]

In [11]:
trainer.train()

The input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in torch.float16.


Epoch,Training Loss,Validation Loss,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/chosen,Logps/rejected,Logits/chosen,Logits/rejected
1,No log,0.697045,-0.046583,-0.039911,0.45,-0.006672,-137.448517,-128.211258,-2.601793,-2.622021
2,No log,0.695032,-0.047323,-0.04518,0.5,-0.002143,-137.455933,-128.263947,-2.604558,-2.623073




TrainOutput(global_step=18, training_loss=0.621158070034451, metrics={'train_runtime': 45.7208, 'train_samples_per_second': 6.562, 'train_steps_per_second': 0.394, 'total_flos': 0.0, 'train_loss': 0.621158070034451, 'epoch': 2.64})

- Test 100/20/20 samples, Qwen 0.5B, BATCH_SIZE = 2: 25.5 GB
- Test 100/20/20 samples, Qwen 0.5B, BATCH_SIZE = 4: 34.6 GB
- Test 100/20/20 samples, Qwen 0.5B, BATCH_SIZE = 32: OOM

Limit max_prompt_length to 768 (from 1024) and max_length to 1024 (from 2048):

- Test 100/20/20 samples, Qwen 0.5B, BATCH_SIZE = 4: 18.4 GB

## Inference

In [None]:
save_path = './peft/' + model_name
trainer.model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    return_dict=True,
    torch_dtype=torch.bfloat16
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

new_model = PeftModel.from_pretrained(base_model, save_path)
new_model = new_model.merge_and_unload()

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


In [None]:
pipe = pipeline(
    'text-generation',
    model=model,
    tokenizer=tokenizer,
)

Device set to use cuda:0
The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['AriaTextForCausalLM', 'BambaForCausalLM', 'BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'Cohere2ForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'DiffLlamaForCausalLM', 'ElectraForCausalLM', 'Emu3ForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FalconMambaForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'GitForCausalLM', 'GlmForCausalLM', 'GotOcr2ForConditionalGeneration', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'GraniteForCausa

In [None]:
prompt = tokenizer.apply_chat_template(small_dataset['train'][100]['prompt'], tokenize=False, add_generation_prompt=True)
print(prompt)

<|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
A persona description is a string describing a set of characteristics an LLM adopts when generating responses. Given the following task, generate a persona description that will answer it as good as possible:
 How would a typical person answer each of the following questions about causation?


Q: Eugene and Tina were a young married couple who lived in the country. Both were partially paralyzed and confined to wheelchairs. They had met four years before when Tina was a counsellor with the Canadian Paraplegic Association, had fallen in love, and were married one year later. On this particular evening, Eugene had phoned to request a cab to take them downtown. When the taxi driver arrived, Eugene and Tina were waiting by the street. On seeing that they were both in wheelchairs, the taxi driver refused their fare because he thought it would be too crowded in the taxi with both of them and the wheelchairs. So the tax

In [None]:
output = pipe(prompt, do_sample=True, temperature=0.7, top_p=0.9, num_return_sequences=1, max_new_tokens=400)

In [None]:
output[0]['generated_text']

"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nA persona description is a string describing a set of characteristics an LLM adopts when generating responses. Given the following task, generate a persona description that will answer it as good as possible:\n How would a typical person answer each of the following questions about causation?\n\n\nQ: Eugene and Tina were a young married couple who lived in the country. Both were partially paralyzed and confined to wheelchairs. They had met four years before when Tina was a counsellor with the Canadian Paraplegic Association, had fallen in love, and were married one year later. On this particular evening, Eugene had phoned to request a cab to take them downtown. When the taxi driver arrived, Eugene and Tina were waiting by the street. On seeing that they were both in wheelchairs, the taxi driver refused their fare because he thought it would be too crowded in the taxi with both of them and the wheelchairs. So