# Using PEFT & bitsandbytes to finetune a LoRa checkpoint




In [1]:
# !pip install -q bitsandbytes datasets accelerate loralib
# !pip install -q git+https://github.com/huggingface/transformers.git@main git+https://github.com/huggingface/peft.git

In [2]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

### Setup the model

In [3]:
import os
# os.environ["CUDA_VISIBLE_DEVICES"]="1"
import torch
import torch.nn as nn
import bitsandbytes as bnb
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "model_weights/13B",
    #load_in_8bit=False,
    device_map='auto',
)

tokenizer = AutoTokenizer.from_pretrained("model_weights/13B")

tokenizer.pad_token_id = (
    0  # unk. we want this to be different from the eos token
)
tokenizer.padding_side = "left"


Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
CUDA SETUP: CUDA runtime path found: /home/dgxuser/anaconda3/lib/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 7.0
CUDA SETUP: Detected CUDA version 117
CUDA SETUP: Loading binary /home/dgxuser/Desktop/fine_tune_vicuna/venv_vicuna/lib/python3.8/site-packages/bitsandbytes/libbitsandbytes_cuda117_nocublaslt.so...


  warn(msg)


Loading checkpoint shards:   0%|          | 0/41 [00:00<?, ?it/s]

### Freezing the original weights


In [4]:
for param in model.parameters():
# freeze the model - train adapters later
  param.requires_grad = False
  if param.ndim == 1:
    # cast the small parameters (e.g. layernorm) to fp32 for stability
    param.data = param.data.to(torch.float32)

# reduce number of stored activations
model.gradient_checkpointing_enable()
model.enable_input_require_grads()

class CastOutputToFloat(nn.Sequential):
  def forward(self, x): return super().forward(x).to(torch.float32)
model.lm_head = CastOutputToFloat(model.lm_head)

### Setting up the LoRa Adapters

In [5]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [6]:
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=16, #attention heads
    lora_alpha=8, #alpha scaling
    # target_modules=["q_proj", "v_proj"], #if you know the
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM" # set this for CLM or Seq2Seq
)

model = get_peft_model(model, config)
print_trainable_parameters(model)

trainable params: 13107200 || all params: 13028971520 || trainable%: 0.10060041945659269


## Data

In [7]:
import transformers
from datasets import load_dataset

# data = load_dataset("json", data_files="data/UltraLLaMA.json")
# data = load_dataset("json", data_files="data/ar2558_no_keywords.json")
data = load_dataset("json", data_files="data/ar2558_whole.json")

Found cached dataset json (/home/dgxuser/.cache/huggingface/datasets/json/default-744e722f423f8092/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)


  0%|          | 0/1 [00:00<?, ?it/s]

In [8]:
data['train']

Dataset({
    features: ['question', 'answer'],
    num_rows: 751
})

In [9]:
data['train'][0]

{'question': 'What is the purpose of Army Regulation 25-58?',
 'answer': 'Army Regulation 25-58 is a regulation that outlines the information management and records management policies and procedures for the Department of the Army.'}

In [10]:
def merge_columns(example):
    example["prediction"] = example["question"] + " ->: " + example["answer"]
    return example

data['train'] = data['train'].map(merge_columns)

Loading cached processed dataset at /home/dgxuser/.cache/huggingface/datasets/json/default-744e722f423f8092/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-b66c1c2d81f0d02e.arrow


In [11]:
data = data.map(lambda samples: tokenizer(samples['question']), batched=True)

Map:   0%|          | 0/751 [00:00<?, ? examples/s]

In [12]:
encoded_dataset_train = data['train']

### Training

In [13]:
trainer = transformers.Trainer(
    model=model,
    train_dataset=encoded_dataset_train,
    args=transformers.TrainingArguments(
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,
        warmup_steps=10,
#        max_steps = 499,
        num_train_epochs=10,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=1,
        output_dir='outputs'
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[34m[1mwandb[0m: Currently logged in as: [33mhengdi11[0m. Use [1m`wandb login --relogin`[0m to force relogin


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
1,4.2608
2,4.1892
3,3.8747
4,4.3246
5,3.9198
6,3.5095
7,3.624
8,4.3812
9,3.622
10,4.0071


TrainOutput(global_step=470, training_loss=1.2578477785942401, metrics={'train_runtime': 1173.1546, 'train_samples_per_second': 6.402, 'train_steps_per_second': 0.401, 'total_flos': 1.587374544700416e+16, 'train_loss': 1.2578477785942401, 'epoch': 10.0})

In [14]:
# model.save_pretrained("adaptors/train_13B_janna_v6_v2modify")