# Lora

## I. Presentation

The idea of lora is to modify the weight matrices using 2 smaller matrices. And the smaller matrices were represented by a layer parallel to the original matrix.

transformers > 4.34
peft > 0.5
accelerate > 0.22

## II. Example

In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # or "0,1" for multiple GPUs
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [2]:
# prepare task, we use the same example as butfit

import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForSeq2Seq, TrainingArguments, Trainer

ckp_data = "yahma/alpaca-cleaned"
ckp = "bigscience/bloomz-1b1"

# load dataset
data = load_dataset(ckp_data, split="train[:1000]")

# load tokenizer
tokenizer = AutoTokenizer.from_pretrained(ckp)

# process data
def process(sample):

    MAX_LEN = 256

    human = tokenizer("Human: " + "\n".join([sample["instruction"], sample["input"]]).strip() + "\n\nAssistant: ")
    ml = tokenizer(sample["output"] + tokenizer.eos_token)

    input_ids = human["input_ids"] + ml["input_ids"]
    attention_mask = human["attention_mask"] + ml["attention_mask"]
    labels = [-100] * len(human["input_ids"]) + ml["input_ids"]

    if len(input_ids) > MAX_LEN:

        input_ids = input_ids[:MAX_LEN]
        attention_mask = attention_mask[:MAX_LEN]
        labels = labels[:MAX_LEN]

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }

# tokenize dataset
tokenized_data = data.map(process, remove_columns=data.column_names)

# load model
model = AutoModelForCausalLM.from_pretrained(ckp, low_cpu_mem_usage=True)

# send to device
if torch.cuda.is_available():
    model = model.to("cuda:0")

2024-06-25 16:26:48.564700: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-25 16:26:48.564764: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-25 16:26:48.567038: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-06-25 16:26:48.579223: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# compute model size

params = sum(param.numel() for param in model.parameters())
print("model size: ", params/1e9, "GB")
print("total required memory: ", round(params/1e9 * (4 + 4 + 12), 2), "GB")

model size:  1.065314304 GB
total required memory:  21.31 GB


## III. lora

In [4]:
from peft import LoraConfig, TaskType, get_peft_model

config = LoraConfig(task_type=TaskType.CAUSAL_LM)
config

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type=<TaskType.CAUSAL_LM: 'CAUSAL_LM'>, inference_mode=False, r=8, target_modules=None, lora_alpha=8, lora_dropout=0.0, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, use_dora=False, layer_replication=None)

In [5]:
peft_model = get_peft_model(model, config)
peft_model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): BloomForCausalLM(
      (transformer): BloomModel(
        (word_embeddings): Embedding(250880, 1536)
        (word_embeddings_layernorm): LayerNorm((1536,), eps=1e-05, elementwise_affine=True)
        (h): ModuleList(
          (0-23): 24 x BloomBlock(
            (input_layernorm): LayerNorm((1536,), eps=1e-05, elementwise_affine=True)
            (self_attention): BloomAttention(
              (query_key_value): lora.Linear(
                (base_layer): Linear(in_features=1536, out_features=4608, bias=True)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=1536, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=4608, bias=False)
                )
                (lora_embedding_A): Paramet

In [6]:
peft_model.print_trainable_parameters()

trainable params: 1,179,648 || all params: 1,066,493,952 || trainable%: 0.1106


In [7]:
# define training arguments
args = TrainingArguments(
    output_dir="../tmp/checkpoint",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    logging_steps=50,
    num_train_epochs=3
)

# define trainer
trainer = Trainer(
    model=peft_model,
    args=args,
    train_dataset=tokenized_data,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True)
)

# train
trainer.train()

Step,Training Loss
50,2.5689
100,2.1946
150,2.0208
200,2.1283
250,1.9262
300,1.9684
350,1.964


TrainOutput(global_step=375, training_loss=2.0956065470377605, metrics={'train_runtime': 347.2099, 'train_samples_per_second': 8.64, 'train_steps_per_second': 1.08, 'total_flos': 1672953534259200.0, 'train_loss': 2.0956065470377605, 'epoch': 3.0})

In [12]:
def generate(_model, _tokenizer, instruction, input=None):

    prompt = "human: {}\n{}".format(instruction, input).strip() + "\n\nAssistant: "
    inputs = _tokenizer(prompt, return_tensors="pt")
    input_ids = inputs["input_ids"].to(_model.device)

    generation_output = _model.generate(
        input_ids=input_ids,
        output_scores=True,
        max_new_tokens=256
    )
    for seq in generation_output:
        output = tokenizer.decode(seq, skip_special_tokens=True)
        print(output)

generate(peft_model, tokenizer, "List five steps for comparing two products.")

human: List five steps for comparing two products.
None

Assistant: 1. Identify the main differences between the two products. 2. Determine the main advantages and disadvantages of each product. 3. Determine the main advantages and disadvantages of each product. 4. Determine the main advantages and disadvantages of each product. 5. Determine the main advantages and disadvantages of each product.


## IV. Config Parameters

In [8]:
# to show all trainables layers in the model

for name, param in peft_model.named_parameters():
    print(name)

base_model.model.transformer.word_embeddings.weight
base_model.model.transformer.word_embeddings_layernorm.weight
base_model.model.transformer.word_embeddings_layernorm.bias
base_model.model.transformer.h.0.input_layernorm.weight
base_model.model.transformer.h.0.input_layernorm.bias
base_model.model.transformer.h.0.self_attention.query_key_value.base_layer.weight
base_model.model.transformer.h.0.self_attention.query_key_value.base_layer.bias
base_model.model.transformer.h.0.self_attention.query_key_value.lora_A.default.weight
base_model.model.transformer.h.0.self_attention.query_key_value.lora_B.default.weight
base_model.model.transformer.h.0.self_attention.dense.weight
base_model.model.transformer.h.0.self_attention.dense.bias
base_model.model.transformer.h.0.post_attention_layernorm.weight
base_model.model.transformer.h.0.post_attention_layernorm.bias
base_model.model.transformer.h.0.mlp.dense_h_to_4h.weight
base_model.model.transformer.h.0.mlp.dense_h_to_4h.bias
base_model.model.tra

In [10]:
## Layers to add lora

# use parameter "target_modules" to add lora weights to the layers

config = LoraConfig(task_type=TaskType.CAUSAL_LM, target_modules=["query_key_value", "dense_4h_to_h"])
peft_model_var = get_peft_model(model, config)
print(peft_model_var.print_trainable_parameters())
peft_model_var

trainable params: 2,654,208 || all params: 1,067,968,512 || trainable%: 0.2485
None


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): BloomForCausalLM(
      (transformer): BloomModel(
        (word_embeddings): Embedding(250880, 1536)
        (word_embeddings_layernorm): LayerNorm((1536,), eps=1e-05, elementwise_affine=True)
        (h): ModuleList(
          (0-23): 24 x BloomBlock(
            (input_layernorm): LayerNorm((1536,), eps=1e-05, elementwise_affine=True)
            (self_attention): BloomAttention(
              (query_key_value): lora.Linear(
                (base_layer): Linear(in_features=1536, out_features=4608, bias=True)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=1536, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=4608, bias=False)
                )
                (lora_embedding_A): Paramet

In [5]:
## add other trainable layers besides lora

# here we enable the training for word_embeddings layers
# It can be seen that the number of trainable parameters increased

config = LoraConfig(task_type=TaskType.CAUSAL_LM, 
                    target_modules=["query_key_value", "dense_4h_to_h"], 
                    modules_to_save=["word_embeddings"])

peft_model_var = get_peft_model(model, config)
print(peft_model_var.print_trainable_parameters())

trainable params: 388,005,888 || all params: 1,453,320,192 || trainable%: 26.6979
None


## V. combine weights

Here we would like to integrate lora weights to the base model.

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

from peft import PeftModel

In [None]:
# load trained models

ckp = "bigscience/bloomz-1b1"

model = AutoModelForCausalLM.from_pretrained(ckp)

tokenizer = AutoTokenizer.from_pretrained(ckp)


In [None]:
# load lora weights

peft_ckp = "./checkpoint/checkpoint-100" # changed to the wanted checkpoint path
peft_model = PeftModel.from_pretrained(model, model_id=peft_ckp)

In [9]:
# integrate weights
# the merged model becomes "BloomForCausalLM" instead of "PeftModelForCausalLM"

merged_model = peft_model.merge_and_unload()
merged_model

BloomForCausalLM(
  (transformer): BloomModel(
    (word_embeddings): Embedding(250880, 1536)
    (word_embeddings_layernorm): LayerNorm((1536,), eps=1e-05, elementwise_affine=True)
    (h): ModuleList(
      (0-23): 24 x BloomBlock(
        (input_layernorm): LayerNorm((1536,), eps=1e-05, elementwise_affine=True)
        (self_attention): BloomAttention(
          (query_key_value): Linear(in_features=1536, out_features=4608, bias=True)
          (dense): Linear(in_features=1536, out_features=1536, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (post_attention_layernorm): LayerNorm((1536,), eps=1e-05, elementwise_affine=True)
        (mlp): BloomMLP(
          (dense_h_to_4h): Linear(in_features=1536, out_features=6144, bias=True)
          (gelu_impl): BloomGelu()
          (dense_4h_to_h): Linear(in_features=6144, out_features=1536, bias=True)
        )
      )
    )
    (ln_f): LayerNorm((1536,), eps=1e-05, elementwise_affine=True)
  )
  (

In [None]:
# save merged model

merged_model.save_pretraind("./checkpoint/lora/merged_model")