### Load Pretrained Model
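Load a Mamba model that is compatible with the Hugging Face Transformers library. Note that the code below builds a tiny, randomly initialised model from a `MambaConfig` for experimentation; only the tokenizer comes from a pretrained checkpoint.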

In [None]:
from modeling_mamba import MambaForCausalLM
from configuration_mamba import MambaConfig
from transformers import AutoTokenizer

# Toy-sized hyper-parameters so that every tensor in the network is small
# enough to print and inspect by eye.
config = MambaConfig(
    vocab_size=10,
    d_model=6,
    d_state=4,
    d_conv=4,
    expand=2,
    n_layer=1,
    conv_bias=True,
    bias=False,
)
model = MambaForCausalLM(config)
# Echo the configuration actually held by the model; the recorded output also
# lists derived fields (d_inner, dt_rank) and shows vocab_size as 16.
print(model.config)

  from .autonotebook import tqdm as notebook_tqdm


MambaConfig {
  "bias": false,
  "conv_bias": true,
  "d_conv": 4,
  "d_inner": 12,
  "d_model": 6,
  "d_state": 4,
  "dt_rank": 1,
  "expand": 2,
  "initializer_range": 0.02,
  "model_type": "mamba",
  "n_layer": 1,
  "pad_vocab_size_multiple": 8,
  "transformers_version": "4.37.1",
  "vocab_size": 16
}



In [None]:
tokenizer = AutoTokenizer.from_pretrained('Q-bert/Mamba-130M')
text = "Hi"
input_ids = tokenizer.encode(text, return_tensors="pt")
# Overwrite the prompt's token ids with 0 - presumably because ids produced by
# the real tokenizer would not fit the toy model's tiny vocabulary (TODO confirm).
input_ids[0] = 0
output = model.generate(
    input_ids,
    max_length=20,
    num_beams=5,
    no_repeat_ngram_size=2,
)
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


&&(((!!%%++%!..!


In [None]:
# Raw token ids returned by generate(); the text printed in the previous cell
# is output[0] decoded through the tokenizer.
print(output)

tensor([[ 0,  0,  7,  7,  9,  9,  1,  1,  9,  2,  2,  6,  6, 12, 12,  6,  2, 15,
         15,  2]])


In [None]:
# One (qualified name, class) pair per sub-module of the model.
print([(name, type(module)) for name, module in model.named_modules()])

[('', <class 'modeling_mamba.MambaForCausalLM'>), ('model', <class 'modeling_mamba.MambaModel'>), ('model.embedding', <class 'torch.nn.modules.sparse.Embedding'>), ('model.layers', <class 'torch.nn.modules.container.ModuleList'>), ('model.layers.0', <class 'modeling_mamba.MambaBlock'>), ('model.layers.0.in_proj', <class 'torch.nn.modules.linear.Linear'>), ('model.layers.0.conv1d', <class 'torch.nn.modules.conv.Conv1d'>), ('model.layers.0.x_proj', <class 'torch.nn.modules.linear.Linear'>), ('model.layers.0.dt_proj', <class 'torch.nn.modules.linear.Linear'>), ('model.layers.0.out_proj', <class 'torch.nn.modules.linear.Linear'>), ('model.layers.0.norm', <class 'modeling_mamba.MambaRMSNorm'>), ('model.norm_f', <class 'modeling_mamba.MambaRMSNorm'>), ('lm_head', <class 'torch.nn.modules.linear.Linear'>)]


In [None]:
# Show which concrete class `model` is at this point in the notebook.
print(type(model))

def print_trainable_parameters(model):
    """
    Print how many of the model's parameters are trainable.

    Walks ``model.named_parameters()``, summing ``numel()`` over every
    parameter and, separately, over those whose ``requires_grad`` is true,
    then prints both totals and the trainable share as a percentage.

    A model without any parameters reports ``trainable%: 0.0`` instead of
    raising ``ZeroDivisionError``.

    Args:
        model: any object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs.

    Returns:
        None. The summary is written to stdout only, so do not pass the
        result of this call to ``print``.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count
    # Guard the ratio so an empty model does not divide by zero.
    percent = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {percent}"
    )
    
# print_trainable_parameters() writes its own summary and returns None, so the
# label is emitted first (same line) instead of printing the None return value.
print('plain', end=' ')
print_trainable_parameters(model)

<class 'modeling_mamba.MambaForCausalLM'>
trainable params: 576 || all params: 576 || trainable%: 100.0
plain None


In [None]:
# Enumerate the state_dict entry names; these are the keys used to look up
# individual tensors in later cells.
plist = model.state_dict().keys()
for p in plist:
    print(p)

model.embedding.weight
model.layers.0.A_log
model.layers.0.D
model.layers.0.in_proj.weight
model.layers.0.conv1d.weight
model.layers.0.conv1d.bias
model.layers.0.x_proj.weight
model.layers.0.dt_proj.weight
model.layers.0.dt_proj.bias
model.layers.0.out_proj.weight
model.layers.0.norm.weight
model.norm_f.weight
lm_head.weight


In [None]:
# Zero every tensor in the model's state_dict, then inspect one of them before and after
import torch
def zero_init(model, keys=None):
    """
    Overwrite tensors in ``model``'s state_dict with zeros, in place.

    Args:
        model: a module exposing ``state_dict()`` and ``load_state_dict()``.
        keys: which state_dict entries to zero. ``None`` (the default) zeroes
            every entry, which is the original behaviour. A single name or an
            iterable of names restricts the operation to those entries; all
            other tensors are loaded back unchanged.

    Returns:
        The same ``model`` object, after the zeroed values were loaded.

    Raises:
        KeyError: if ``keys`` names an entry that is not in the state_dict.
    """
    current = model.state_dict()
    if keys is None:
        selected = set(current)
    else:
        # A bare string is one entry name, not an iterable of characters.
        selected = {keys} if isinstance(keys, str) else set(keys)
        unknown = selected - set(current)
        if unknown:
            raise KeyError(f"unknown state_dict entries: {sorted(unknown)}")
    # Build a fresh mapping rather than aliasing and mutating the dict that
    # is being iterated.
    zeroed = {
        name: torch.zeros_like(tensor) if name in selected else tensor
        for name, tensor in current.items()
    }
    model.load_state_dict(zeroed)
    return model

    
# Entry to inspect before and after zeroing.
s = 'model.layers.0.in_proj.weight'
print('before',model.state_dict()[s])
# zero_init() loads the zeros into the model and returns it, so from here on
# `model` holds the zeroed network until the name is rebound.
model = zero_init(model) 
plist = model.state_dict().keys()
for p in plist:
    print(p)
print('after',model.state_dict()[s])

before tensor([[-6.9360e-03, -2.6066e-02, -1.5970e-02,  4.0102e-02, -5.7095e-03,
          1.0152e-02],
        [ 3.8860e-03, -2.1722e-02,  3.3430e-02,  2.1430e-02,  7.7757e-03,
         -6.2078e-03],
        [ 1.3417e-02, -6.0377e-03,  3.0089e-02,  8.0725e-03,  1.3436e-02,
         -5.6337e-04],
        [-4.7518e-04,  1.8928e-02,  1.5352e-02, -2.8361e-02, -3.0808e-03,
         -9.0114e-03],
        [ 1.4535e-02,  1.7960e-02,  2.4319e-03, -1.2897e-02, -2.4247e-02,
          3.8146e-03],
        [-2.8388e-02,  2.2076e-02, -1.9500e-02, -5.2615e-02, -9.7469e-03,
         -6.6771e-03],
        [-1.0930e-02,  5.9960e-03,  4.2091e-03, -1.9360e-02, -1.3724e-02,
         -1.9674e-02],
        [ 4.8153e-03, -8.1420e-03, -6.5606e-03,  2.0654e-02,  4.5150e-03,
          3.9812e-02],
        [-1.0122e-02, -3.5603e-03,  1.5604e-03,  1.6430e-02,  9.4992e-03,
          2.0851e-02],
        [-5.0829e-02, -1.2619e-02, -1.9031e-02, -2.3794e-02, -3.6052e-02,
          1.8947e-02],
        [-7.3191e-03,  

### Add LoRA adapters
1. Identify a particular layer in the Mamba model and add a LoRA layer there.
2. At this time, only one layer is targeted, to verify that the code works.


In [None]:
from peft import LoraConfig, TaskType


# Attach LoRA to a single projection, addressed by its fully qualified module name.
target_modules = ["model.layers.0.x_proj"]

# NOTE: this rebinds `config`, which held the MambaConfig up to this point.
config = LoraConfig(target_modules=target_modules, task_type="CAUSAL_LM")

In [None]:
from peft import get_peft_model

# Wrap the model with the adapter described by `config` (the LoraConfig) and
# report how many parameters are trainable after wrapping.
model = get_peft_model(model, config)
model.print_trainable_parameters()

'NoneType' object has no attribute 'cadam32bit_grad_fp32'


  warn("The installed version of bitsandbytes was compiled without GPU support. "
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


trainable params: 168 || all params: 744 || trainable%: 22.580645161290324


In [None]:
# Write the PEFT-wrapped model to ./wts; the merge section reloads the adapter
# from that directory.
model.save_pretrained("wts")



### Merge the adapter into the Model
Merge the adapter back into the model, so that the merged model has exactly the same architecture
as the base model, except with the weights modified.

In [None]:
from peft import PeftConfig, PeftModel
adapter_path = "./wts/"
adapter_config = PeftConfig.from_pretrained(adapter_path)

# `config` was rebound to the LoraConfig in the adapter section, so the Mamba
# configuration is rebuilt here with the same toy hyper-parameters as the
# first cell.
config = MambaConfig(vocab_size=10,
        d_state=4,
        d_model=6,
        d_conv=4,
        expand=2,
        conv_bias=True,
        bias=False,
        n_layer=1)

# `model` is intentionally NOT re-created here. It still refers to the
# LoRA-wrapped network built earlier, which the parameter report further down
# labels 'lora mamba'. Rebinding it to a fresh MambaForCausalLM made that
# report describe a plain model without any adapter.
base_model = MambaForCausalLM(config)
#base_model = zero_init(base_model)

# Load the saved adapter on top of the freshly constructed base network.
adapted_model = PeftModel.from_pretrained(base_model, adapter_path)

In [None]:
# Merge the adapter into the wrapped model; per the section notes the result
# has the base architecture with modified weights.
m = adapted_model.merge_and_unload()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:

s = "model.layers.0.x_proj.weight"
# NOTE(review): this runs after merge_and_unload(). If merging modifies
# base_model in place, the 'before LoRA' tensor already includes the adapter;
# confirm against the PEFT documentation.
print('before LoRA',base_model.state_dict()[s])

plist = m.state_dict().keys()
for p in plist:
    print(p)
print('after LoRA',m.state_dict()[s])

before LoRA tensor([[ 1.4849e-02, -1.2009e-02, -3.2689e-02, -2.5011e-02, -2.9925e-02,
         -6.8935e-03, -4.2453e-03,  3.2285e-03,  2.1370e-02,  3.2274e-02,
          2.5561e-02, -2.6172e-02],
        [ 2.5374e-02, -2.7696e-02,  7.7958e-03, -4.7651e-03,  1.6143e-02,
         -4.9373e-03,  6.4753e-02, -1.0391e-02,  4.0120e-02, -6.6447e-03,
         -3.2088e-02, -9.7386e-03],
        [-4.5022e-03,  1.0816e-02,  1.9747e-02,  2.6482e-03,  6.8737e-03,
         -1.4123e-02, -4.2332e-03, -2.2343e-02,  3.7523e-03, -6.2664e-03,
          1.6541e-02, -1.3695e-02],
        [-2.3554e-03,  4.3192e-02, -4.4122e-02,  9.9059e-03, -3.0129e-02,
         -5.0738e-03,  1.1388e-02, -3.4150e-02,  2.1487e-03, -6.1900e-03,
         -1.0104e-02, -1.1733e-03],
        [-8.2968e-03, -9.4233e-03, -1.0369e-02, -2.0514e-02, -1.0381e-02,
          6.2013e-03, -1.6298e-02,  4.8816e-03,  9.1194e-03, -1.4476e-02,
         -1.1100e-02, -2.4957e-02],
        [ 1.0809e-02, -2.5367e-02, -1.1991e-02, -2.2620e-03, -1.0676

In [None]:
text = "Hi"
input_ids = tokenizer.encode(text, return_tensors="pt")
# Same prompt handling as for the plain model: every prompt id is overwritten with 0.
input_ids[0] = 0

# Beam-search generation with the merged model.
output = m.generate(
    input_ids,
    max_length=20,
    num_beams=5,
    no_repeat_ngram_size=2,
)
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)

)),,$$##((**""&&'&


In [None]:
# print_trainable_parameters() prints its summary itself and returns None, so
# each label is written first (on the same line) rather than printing the
# None return value after the summary.
for label, network in (('base mamba', base_model), ('lora mamba', model), ('merged mamba', m)):
    print(label, end=' ')
    print_trainable_parameters(network)

trainable params: 0 || all params: 576 || trainable%: 0.0
base mamba None
trainable params: 576 || all params: 576 || trainable%: 100.0
lora mamba None
trainable params: 0 || all params: 576 || trainable%: 0.0
merged mamba None


In [None]:
# Persist the merged model. The former `from_pt=True` argument was dropped:
# it is a loading-time (`from_pretrained`) option and has no role when saving.
m.save_pretrained("./mbins")

In [None]:
import torch
# Serialises the whole module object rather than only its state_dict, so
# loading the file back requires the model's class to be importable.
torch.save(m, "./mbins/merged_mamba.pt")

In [None]:
# Saved the same way for comparison.
# NOTE(review): base_model was handed to PeftModel and merged above; check
# whether it still differs from `m` at this point.
torch.save(base_model, "./mbins/base_mamba.pt")

In [None]:
# Entry names as seen through the PEFT wrapper; in the recorded output each
# one carries a 'base_model.model.' prefix.
adapted_model.state_dict().keys()

odict_keys(['base_model.model.model.embedding.weight', 'base_model.model.model.layers.0.A_log', 'base_model.model.model.layers.0.D', 'base_model.model.model.layers.0.in_proj.weight', 'base_model.model.model.layers.0.conv1d.weight', 'base_model.model.model.layers.0.conv1d.bias', 'base_model.model.model.layers.0.x_proj.weight', 'base_model.model.model.layers.0.dt_proj.weight', 'base_model.model.model.layers.0.dt_proj.bias', 'base_model.model.model.layers.0.out_proj.weight', 'base_model.model.model.layers.0.norm.weight', 'base_model.model.model.norm_f.weight', 'base_model.model.lm_head.weight'])

In [None]:
# Display the adapter configuration that was read back from ./wts/.
adapter_config

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path='', revision=None, task_type='CAUSAL_LM', inference_mode=True, r=8, target_modules={'model.layers.0.x_proj'}, lora_alpha=8, lora_dropout=0.0, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={})

In [None]:
# Print every (name, parameter) pair in full; the output is long even for
# this toy model.
for p in adapted_model.named_parameters():
    print(p)

('base_model.model.model.embedding.weight', Parameter containing:
tensor([[-1.5483e-02,  3.0647e-02,  3.3757e-03,  6.0129e-03,  5.4505e-03,
         -3.5295e-02],
        [ 1.5370e-02, -2.8087e-02, -6.7823e-03, -5.6044e-03, -1.5007e-04,
         -4.3077e-03],
        [ 1.4575e-02,  3.4753e-03,  1.3032e-02,  5.0821e-03, -1.3286e-02,
          7.1539e-03],
        [ 6.0257e-02, -9.3745e-03, -2.1507e-02,  1.9956e-02,  1.6579e-02,
          3.0674e-03],
        [-3.3371e-02, -9.4733e-03,  2.1177e-02, -1.1452e-02, -1.9490e-02,
          2.5770e-02],
        [-9.2815e-03,  3.1123e-02,  1.9363e-02, -5.0284e-03, -3.2319e-02,
         -2.0659e-03],
        [ 3.6841e-03, -1.2734e-02, -3.5888e-02,  8.8476e-03,  1.2994e-02,
         -1.3616e-02],
        [ 2.8832e-02, -5.4185e-03, -4.8067e-03,  4.3071e-03,  1.8272e-02,
         -4.6660e-02],
        [ 1.9783e-02, -2.1448e-02,  5.3043e-03, -1.1504e-02, -5.2285e-04,
         -2.7710e-02],
        [-3.3491e-04, -4.6580e-03,  2.9298e-02, -7.3879e-03, 