### Load Pretrained Model
Load a pretrained Mamba model that is compatible with the Transformers library.

In [None]:
from modeling_mamba import MambaForCausalLM
from transformers import AutoTokenizer

# Load the base checkpoint and its matching tokenizer.
model = MambaForCausalLM.from_pretrained('Q-bert/Mamba-130M')
tokenizer = AutoTokenizer.from_pretrained('Q-bert/Mamba-130M')

# Encode a short prompt as a PyTorch tensor of token ids.
text = "Hi"
input_ids = tokenizer.encode(text, return_tensors="pt")

# Beam search with 5 beams, max_length=20, and no repeated 2-grams.
output = model.generate(
    input_ids,
    max_length=20,
    num_beams=5,
    no_repeat_ngram_size=2,
)

# Decode the first returned sequence and show it.
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)

  from .autonotebook import tqdm as notebook_tqdm
  return self.fget.__get__(instance, owner)()
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Hi, I'm looking for a new job. I've been working at a company for about a


In [None]:
# Dump (qualified name, class) for every submodule of the loaded model.
module_types = [(name, type(module)) for name, module in model.named_modules()]
print(module_types)

[('', <class 'modeling_mamba.MambaForCausalLM'>), ('model', <class 'modeling_mamba.MambaModel'>), ('model.embedding', <class 'torch.nn.modules.sparse.Embedding'>), ('model.layers', <class 'torch.nn.modules.container.ModuleList'>), ('model.layers.0', <class 'modeling_mamba.MambaBlock'>), ('model.layers.0.in_proj', <class 'torch.nn.modules.linear.Linear'>), ('model.layers.0.conv1d', <class 'torch.nn.modules.conv.Conv1d'>), ('model.layers.0.x_proj', <class 'torch.nn.modules.linear.Linear'>), ('model.layers.0.dt_proj', <class 'torch.nn.modules.linear.Linear'>), ('model.layers.0.out_proj', <class 'torch.nn.modules.linear.Linear'>), ('model.layers.0.norm', <class 'modeling_mamba.MambaRMSNorm'>), ('model.layers.1', <class 'modeling_mamba.MambaBlock'>), ('model.layers.1.in_proj', <class 'torch.nn.modules.linear.Linear'>), ('model.layers.1.conv1d', <class 'torch.nn.modules.conv.Conv1d'>), ('model.layers.1.x_proj', <class 'torch.nn.modules.linear.Linear'>), ('model.layers.1.dt_proj', <class 'tor

In [None]:
# Show which concrete class the loaded model is an instance of.
model_cls = type(model)
print(model_cls)

def print_trainable_parameters(model):
    """
    Print how many of the model's parameters are trainable.

    Walks ``model.named_parameters()`` and sums ``numel()`` over all
    parameters and over those with ``requires_grad`` set, then prints both
    totals and the trainable percentage on one line.

    Args:
        model: Any object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs.

    Returns:
        None. The summary is written to stdout only, so wrapping a call in
        ``print(...)`` would additionally print ``None``.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count
    # A model without parameters would otherwise raise ZeroDivisionError.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct}"
    )
    
# print_trainable_parameters() prints its own summary and returns None, so
# emit the label separately instead of passing the call's result to print().
print('plain')
print_trainable_parameters(model)

<class 'modeling_mamba.MambaForCausalLM'>
trainable params: 129135360 || all params: 129135360 || trainable%: 100.0
plain None


### Add LoRA adapters
1. Identify a particular layer in the Mamba model and add a LoRA layer there.
2. For now, only a single layer is adapted, just to verify that the code works.


In [None]:
# adapter-1
from peft import LoraConfig, TaskType, get_peft_model

# LoRA is attached to a single projection: x_proj of layer index 3.
target_modules = ["model.layers.3.x_proj"]

# Fresh copy of the base model for this adapter.
modelA = MambaForCausalLM.from_pretrained('Q-bert/Mamba-130M')

config = LoraConfig(task_type="CAUSAL_LM", target_modules=target_modules)

m1 = get_peft_model(modelA, config)
m1.print_trainable_parameters()

# Write the adapter to a local directory.
m1.save_pretrained("./wts/ada-1")

  warn("The installed version of bitsandbytes was compiled without GPU support. "
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


'NoneType' object has no attribute 'cadam32bit_grad_fp32'


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


trainable params: 12,928 || all params: 129,148,288 || trainable%: 0.010010198509174199


In [None]:
# adapter-2
from peft import LoraConfig, TaskType, get_peft_model

# LoRA is attached to a single projection: x_proj of layer index 2.
target_modules = ["model.layers.2.x_proj"]

# Fresh copy of the base model for this adapter.
modelB = MambaForCausalLM.from_pretrained('Q-bert/Mamba-130M')

config = LoraConfig(task_type="CAUSAL_LM", target_modules=target_modules)

m2 = get_peft_model(modelB, config)
m2.print_trainable_parameters()

# Write the adapter to a local directory.
m2.save_pretrained("./wts/ada-2")

trainable params: 12,928 || all params: 129,148,288 || trainable%: 0.010010198509174199


### Push them to Hub
Push the adapters to the Hugging Face Hub.

In [None]:
# The organization is given as the namespace part of repo_id; the separate
# `organization=` keyword is deprecated in push_to_hub.
m1.push_to_hub(repo_id="mlsquare/exp-lora-ada-1")
m2.push_to_hub(repo_id="mlsquare/exp-lora-ada-2")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
adapter_model.safetensors:   0%|                                                                                                                                  | 0.00/52.0k [00:00<?, ?B/s]TOKENIZERS_PARALLELISM=(true | false)
adapter_model.safetensors: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 52.0k/52.0k [00:01<00:00, 36.2kB/s]
adapter_model.safetensors: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 52.0k/52.0k [00:01<00:00, 34.7kB/s]


CommitInfo(commit_url='https://huggingface.co/mlsquare/exp-lora-ada-2/commit/799ebd76788d9da7a2d720cb4f5eda481eb336fe', commit_message='Upload model', commit_description='', oid='799ebd76788d9da7a2d720cb4f5eda481eb336fe', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# Upload each adapter to its own test repository; the repo id is the first
# positional argument of push_to_hub.
m1.push_to_hub("mlsquare/test-1")
m2.push_to_hub("mlsquare/test-2")

adapter_model.safetensors: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 52.0k/52.0k [00:01<00:00, 37.1kB/s]
adapter_model.safetensors: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 52.0k/52.0k [00:01<00:00, 38.6kB/s]


CommitInfo(commit_url='https://huggingface.co/mlsquare/test-2/commit/7511028150d4612a7a8290b596e83ca91aa55c0a', commit_message='Upload model', commit_description='', oid='7511028150d4612a7a8290b596e83ca91aa55c0a', pr_url=None, pr_revision=None, pr_num=None)

### Load adapters from the hub

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "Q-bert/Mamba-130M"
peft_model_id_1 = "mlsquare/exp-lora-ada-1"
peft_model_id_2 = "mlsquare/exp-lora-ada-2"

# print_trainable_parameters() prints its own summary and returns None, so
# each label is printed on its own line instead of wrapping the call in print().
model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True)
print('base mamba')
print_trainable_parameters(model)

# Load the first adapter under the name "ada-1".
model.load_adapter(peft_model_id_1, "ada-1")
print('with 1st adapter')
print_trainable_parameters(model)

# Load the second adapter under the name "ada-2".
model.load_adapter(peft_model_id_2, "ada-2")
print('with 2nd adapter')
print_trainable_parameters(model)

trainable params: 129135360 || all params: 129135360 || trainable%: 100.0
base mamba None


adapter_config.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 591/591 [00:00<00:00, 72.3kB/s]
adapter_model.safetensors: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 52.0k/52.0k [00:00<00:00, 678kB/s]


trainable params: 0 || all params: 129148288 || trainable%: 0.0
with 1st adapter None


adapter_config.json: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 591/591 [00:00<00:00, 218kB/s]
adapter_model.safetensors: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 52.0k/52.0k [00:00<00:00, 637kB/s]

trainable params: 0 || all params: 129161216 || trainable%: 0.0
with 2nd adapter None





### Merge the adapter into the Model
Merge the adapters back into the model, so that the merged model has exactly the same architecture
as the base model, with only the weights modified.

In [None]:
# https://github.com/huggingface/peft
from peft import PeftMixedModel


# Activate both previously loaded adapters at once.
model.set_adapter(["ada-1", "ada-2"])

# The model is no longer the plain base model here, so label it accordingly.
# print_trainable_parameters() returns None, hence the separate label line.
print('both adapters active')
print_trainable_parameters(model)

# input_ids and tokenizer come from the first code cell of the notebook.
output = model.generate(input_ids, max_length=20, num_beams=5, no_repeat_ngram_size=2)
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)

trainable params: 25856 || all params: 129161216 || trainable%: 0.02001839313745699
base mamba None
Hi, I'm looking for a new job. I've been working at a company for about a


In [None]:
#from peft import PeftMixedModel, PeftModel
#model_id = "Q-bert/Mamba-130M"
#base_model = AutoModelForCausalLM.from_pretrained(model_id,trust_remote_code=True)
#peft_model = PeftModel.from_pretrained(base_model, "saddlepoint/exp-lora-ada-1","ada-A")
#peft_model.merge_and_unload()
#print('base mamba',print_trainable_parameters(peft_model))
#peft_model.load_adapter("saddlepoint/exp-lora-ada-2", "ada-2")

In [None]:
# Load each published adapter repo through the model class.
# NOTE(review): these repo ids were pushed above as LoRA adapters, not full
# checkpoints — presumably from_pretrained resolves the base model via the
# library's PEFT integration; confirm.
a1 = MambaForCausalLM.from_pretrained('mlsquare/exp-lora-ada-1')
a2 = MambaForCausalLM.from_pretrained('mlsquare/exp-lora-ada-2')

In [None]:
# Write both loaded models to local directories, in the same order as before.
for adapter_model, out_dir in ((a1, "./mbins/tmp/ada-1"), (a2, "./mbins/tmp/ada-2")):
    adapter_model.save_pretrained(out_dir)



In [None]:
from peft import PeftMixedModel


# print_trainable_parameters() prints its own summary and returns None, so
# each label is printed on its own line, and each label names the stage it
# actually reports on.
base_model = MambaForCausalLM.from_pretrained('Q-bert/Mamba-130M')
print('base mamba')
print_trainable_parameters(base_model)

# Wrap the base model with the first locally saved adapter.
peft_model = PeftMixedModel.from_pretrained(base_model, "./mbins/tmp/ada-1", "adapter1")
print('with adapter1')
print_trainable_parameters(base_model)

# Add the second adapter and activate both together.
peft_model.load_adapter("./mbins/tmp/ada-2", "adapter2")
peft_model.set_adapter(["adapter1", "adapter2"])
print('with adapter1 + adapter2')
print_trainable_parameters(base_model)

# input_ids and tokenizer come from the first code cell of the notebook.
output = peft_model.generate(input_ids, max_length=20, num_beams=5, no_repeat_ngram_size=2)
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)

trainable params: 129135360 || all params: 129135360 || trainable%: 100.0
base mamba None
trainable params: 12928 || all params: 129148288 || trainable%: 0.010010198509174199
base mamba None
trainable params: 25856 || all params: 129161216 || trainable%: 0.02001839313745699
base mamba None
Hi, I'm looking for a new job. I've been working at a company for about a


In [None]:
# merge_and_unload() returns the base model with the adapter weights folded
# in; keep that handle and save it directly instead of reaching through
# peft_model.base_model.
merged_model = peft_model.merge_and_unload()

# print_trainable_parameters() returns None, so the label gets its own line.
print('merged mamba')
print_trainable_parameters(merged_model)

merged_model.save_pretrained("./mbins/fed-hf/")

trainable params: 0 || all params: 129135360 || trainable%: 0.0
merged mamba None


In [None]:
# Same prompt and decoding settings as the earlier cells, run after the merge.
output = peft_model.generate(
    input_ids,
    max_length=20,
    num_beams=5,
    no_repeat_ngram_size=2,
)
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)

Hi, I'm looking for a new job. I've been working at a company for about a
