In [1]:
import os

import torch
import transformers
from peft import PeftModel


Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
CUDA SETUP: CUDA runtime path found: /home/lxe/miniconda3/envs/llama-lora/lib/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 8.6
CUDA SETUP: Detected CUDA version 113
CUDA SETUP: Loading binary /home/lxe/miniconda3/envs/llama-lora/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda113.so...


In [2]:
# Hub identifier of the checkpoint the LoRA adapter is applied to.
BASE_MODEL = 'cerebras/Cerebras-GPT-2.7B'

# Tokenizer matching the base checkpoint.
tokenizer = transformers.AutoTokenizer.from_pretrained(BASE_MODEL)

# Loading options for the base model: 8-bit loading off, fp16 tensors,
# and the whole model (the "" root entry) mapped to the CPU.
base_model_kwargs = dict(
    load_in_8bit=False,
    torch_dtype=torch.float16,
    device_map={"": "cpu"},
)
base_model = transformers.AutoModelForCausalLM.from_pretrained(
    BASE_MODEL, **base_model_kwargs
)


In [3]:
# base_model.sd

In [4]:
# Keep a live handle to the first block's attention projection weight, plus a
# detached copy of its current values, so a later cell can tell whether the
# weight was modified.
first_attn = base_model.transformer.h[0].attn
first_weight = first_attn.c_attn.weight
first_weight_old = first_weight.clone()

# Attach the LoRA adapter stored under "lora-cerebras-gpt2.7b-alpaca" to the
# base model, using the same CPU / fp16 placement as the base model load.
adapter_kwargs = dict(
    device_map={"": "cpu"},
    torch_dtype=torch.float16,
)
lora_model = PeftModel.from_pretrained(
    base_model, "lora-cerebras-gpt2.7b-alpaca", **adapter_kwargs
)

In [5]:
# Merge the LoRA adapter into the base weights and export the result as a
# plain Hugging Face checkpoint under ./hf_ckpt.

# Key prefix that the PEFT wrapper puts in front of the wrapped model's
# parameter names; it is stripped again before saving.
PEFT_KEY_PREFIX = "base_model.model."

# The same layer-0 attention weight, reached through the PEFT wrapper.
# Nothing below reads it; it is kept so the name stays available.
lora_weight = lora_model.base_model.transformer.h[0].attn.c_attn.weight

# Sanity check: attaching the adapter alone must not have changed the weight.
assert torch.allclose(first_weight_old, first_weight), (
    "base weight differs from its snapshot before the merge was requested"
)

# merge weights
# NOTE(review): only `attn.c_attn` is flagged in each block; this assumes the
# adapter targets no other modules -- confirm against the adapter config,
# because every key containing "lora" is dropped from the export below.
for layer in lora_model.base_model.transformer.h:
    layer.attn.c_attn.merge_weights = True

# Leaving training mode is expected to fold the adapter into the flagged
# layers; the assertion right below verifies that it actually happened.
lora_model.train(False)

# did we do anything?
assert not torch.allclose(first_weight_old, first_weight), (
    "LoRA merge had no effect on the base weight; the installed peft version "
    "may not merge on train(False) when merge_weights is set"
)

lora_model_sd = lora_model.state_dict()
# Drop the adapter-only tensors and strip the wrapper prefix. The prefix is
# removed only when it is at the start of a key, never from the middle.
deloreanized_sd = {
    (k[len(PEFT_KEY_PREFIX):] if k.startswith(PEFT_KEY_PREFIX) else k): v
    for k, v in lora_model_sd.items()
    if "lora" not in k
}

base_model.save_pretrained("./hf_ckpt", state_dict=deloreanized_sd, max_shard_size="12GB")