## (1) Load model

In [None]:
from model import Mamba, ModelArgs
from transformers import AutoTokenizer

# Checkpoint to load. One of:
#     'state-spaces/mamba-2.8b-slimpj'
#     'state-spaces/mamba-2.8b'
#     'state-spaces/mamba-1.4b'
#     'state-spaces/mamba-790m'
#     'state-spaces/mamba-370m'
#     'state-spaces/mamba-130m'
pretrained_model_name = 'state-spaces/mamba-370m'

# Build the model from the selected checkpoint name.
model = Mamba.from_pretrained(pretrained_model_name)
# The tokenizer is loaded from the 'EleutherAI/gpt-neox-20b' repo, independently
# of which Mamba checkpoint was selected above.
tokenizer = AutoTokenizer.from_pretrained('EleutherAI/gpt-neox-20b')

  from .autonotebook import tqdm as notebook_tqdm
  return self.fget.__get__(instance, owner)()
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
def print_trainable_parameters(model):
    """
    Print how many of the model's parameters are trainable.

    Walks ``model.named_parameters()``, totals ``numel()`` over every
    parameter and over those with ``requires_grad`` set, then prints both
    counts together with the trainable percentage.

    Args:
        model: Any object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs.

    Returns:
        None. The summary is written to stdout, so call this function
        directly instead of wrapping it in ``print``.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count
    # A model without parameters would otherwise raise ZeroDivisionError.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct}"
    )
    
# print_trainable_parameters writes its own summary and returns None, so
# wrapping it in print() emitted a stray "plain None". Print the label first,
# then call the helper directly.
print('plain')
print_trainable_parameters(model)

trainable params: 371516416 || all params: 371516416 || trainable%: 100.0
plain None


## (2) Generate Text

In [None]:
import torch
import torch.nn.functional as F


def generate(model,
             tokenizer,
             prompt: str,
             n_tokens_to_gen: int = 50,
             sample: bool = True,
             top_k: int = 40):
    """
    Autoregressively extend ``prompt`` and return the decoded text.

    The model is switched to eval mode, the prompt is tokenized, and one token
    is appended per step from the softmax distribution over the logits at the
    last position.

    Args:
        model: Callable mapping a ``(batch, seq)`` tensor of token ids to
            logits whose last dimension is the vocabulary.
        tokenizer: Tokenizer callable as ``tokenizer(prompt, return_tensors='pt')``
            and providing ``decode``.
        prompt: Text to continue.
        n_tokens_to_gen: Number of tokens to append to the prompt.
        sample: If True, draw the next token with ``torch.multinomial``;
            otherwise take the arg-max.
        top_k: Keep only the ``top_k`` most probable tokens before choosing.
            ``None`` disables the filter. Values larger than the vocabulary
            are clamped to the vocabulary size.

    Returns:
        The decoded string for the first (only) sequence: prompt plus the
        generated continuation.
    """
    model.eval()

    input_ids = tokenizer(prompt, return_tensors='pt').input_ids

    # Put the prompt on the same device as the model weights so generation
    # also works when the model has been moved off the CPU.
    first_param = next(iter(model.parameters()), None)
    if first_param is not None:
        input_ids = input_ids.to(first_param.device)

    with torch.no_grad():
        for _ in range(n_tokens_to_gen):
            next_token_logits = model(input_ids)[:, -1]
            probs = F.softmax(next_token_logits, dim=-1)

            if top_k is not None:
                # torch.topk raises if k exceeds the size of the last dim.
                k = min(top_k, probs.shape[-1])
                top_values, _ = torch.topk(probs, k=k)
                # Zero everything below the k-th largest probability
                # (ties with the k-th value are kept), then renormalize.
                kth_value = top_values[:, -1, None]
                probs = probs.masked_fill(probs < kth_value, 0.0)
                probs = probs / probs.sum(dim=1, keepdim=True)

            if sample:
                next_indices = torch.multinomial(probs, num_samples=1)
            else:
                next_indices = torch.argmax(probs, dim=-1)[:, None]

            input_ids = torch.cat([input_ids, next_indices], dim=1)

    # Only a single prompt is tokenized, so decode the first row.
    return tokenizer.decode(input_ids[0].tolist())

In [None]:
# Show every submodule's dotted name next to its class. Names of this form
# (e.g. 'layers.3.mixer.x_proj') are what the LoRA `target_modules` list uses.
print(list((name, type(module)) for name, module in model.named_modules()))

[('', <class 'model.Mamba'>), ('embedding', <class 'torch.nn.modules.sparse.Embedding'>), ('layers', <class 'torch.nn.modules.container.ModuleList'>), ('layers.0', <class 'model.ResidualBlock'>), ('layers.0.mixer', <class 'model.MambaBlock'>), ('layers.0.mixer.in_proj', <class 'torch.nn.modules.linear.Linear'>), ('layers.0.mixer.conv1d', <class 'torch.nn.modules.conv.Conv1d'>), ('layers.0.mixer.x_proj', <class 'torch.nn.modules.linear.Linear'>), ('layers.0.mixer.dt_proj', <class 'torch.nn.modules.linear.Linear'>), ('layers.0.mixer.out_proj', <class 'torch.nn.modules.linear.Linear'>), ('layers.0.norm', <class 'model.RMSNorm'>), ('layers.1', <class 'model.ResidualBlock'>), ('layers.1.mixer', <class 'model.MambaBlock'>), ('layers.1.mixer.in_proj', <class 'torch.nn.modules.linear.Linear'>), ('layers.1.mixer.conv1d', <class 'torch.nn.modules.conv.Conv1d'>), ('layers.1.mixer.x_proj', <class 'torch.nn.modules.linear.Linear'>), ('layers.1.mixer.dt_proj', <class 'torch.nn.modules.linear.Linear'

In [None]:
from peft import LoraConfig, TaskType


# Dotted module names (as reported by model.named_modules()) that should
# receive a LoRA adapter.
target_modules = ["layers.3.mixer.x_proj"]

# LoRA configuration for a causal language-modelling task, restricted to the
# modules listed above; every other LoraConfig option keeps its default.
config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=target_modules,
)

In [None]:
from peft import inject_adapter_in_model

# Insert LoRA adapter layers into the modules selected by `config`.
# NOTE(review): PEFT's low-level injection presumably modifies `model` in
# place and returns it, so `lora_model` and `model` would be the same object
# from here on — confirm before relying on `model` as the "plain" baseline.
lora_model = inject_adapter_in_model(config, model)


  warn("The installed version of bitsandbytes was compiled without GPU support. "


'NoneType' object has no attribute 'cadam32bit_grad_fp32'


In [None]:
# print_trainable_parameters writes its own summary and returns None, so
# wrapping it in print() emitted a stray "plain None". The label is also
# corrected: this reports on the LoRA-injected model, not the plain one.
print('lora')
print_trainable_parameters(lora_model)

trainable params: 17152 || all params: 371533568 || trainable%: 0.004616541135793146
plain None


In [None]:
# generate() samples from the token distribution by default, so fix the RNG
# seed to make the printed completion reproducible across re-runs.
torch.manual_seed(0)
print(generate(lora_model, tokenizer, 'Mamba is the'))

2024-01-31 03:34:29.185079: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Mamba is the only non-Chinese state with a state police force, with the equivalent of a national police force in the UK.

Mamba has also made some very significant political progress in this regard:

Mamba has enacted laws mandating the creation


In [None]:
# Check which class the adapter-injected model has. The recorded output shows
# `model.Mamba`, i.e. the original model class rather than a PEFT wrapper class.
print(type(lora_model))

<class 'model.Mamba'>


In [None]:
from peft import get_peft_model_state_dict

# Extract the adapter's state dict from the injected model. In the recorded
# output it holds the `lora_A.weight` and `lora_B.weight` tensors of
# `layers.3.mixer.x_proj`; the visible rows of `lora_B` are all zeros.
peft_state_dict = get_peft_model_state_dict(lora_model)
# NOTE(review): this prints the raw tensors; consider printing only names and
# shapes if the output becomes too large to read.
print(peft_state_dict)

{'layers.3.mixer.x_proj.lora_A.weight': tensor([[-0.0032,  0.0019,  0.0140,  ..., -0.0199,  0.0010,  0.0003],
        [ 0.0139, -0.0139, -0.0160,  ..., -0.0196,  0.0200,  0.0203],
        [ 0.0144, -0.0075,  0.0207,  ..., -0.0170, -0.0111, -0.0078],
        ...,
        [ 0.0062,  0.0019,  0.0108,  ...,  0.0043, -0.0008, -0.0218],
        [-0.0195,  0.0128, -0.0065,  ..., -0.0089,  0.0001,  0.0003],
        [ 0.0138,  0.0171, -0.0142,  ...,  0.0109,  0.0140, -0.0115]]), 'layers.3.mixer.x_proj.lora_B.weight': tensor([[0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0.,

In [None]:
# Local imports: neither name is imported by any earlier cell of this notebook.
from peft import PeftModel
from transformers import AutoModelForCausalLM

# Output locations and base checkpoint. These names were used here without
# being defined in any visible cell, so they are given explicit values.
# NOTE(review): adjust the directories as needed. `base_model` is assumed to be
# the checkpoint selected at the top of the notebook — confirm that
# AutoModelForCausalLM can load that repo id.
lora_adapter = "lora_adapter"
merged_model_dir = "merged_model"
base_model = pretrained_model_name

# Fall back to the CPU when no GPU is present instead of hard-coding "cuda"
# (the original also used typographic quotes here, which is a SyntaxError).
device = "cuda" if torch.cuda.is_available() else "cpu"

# NOTE(review): `lora_model` was printed as a plain `model.Mamba` earlier in
# this notebook; confirm that class provides `save_pretrained` with these
# keyword arguments. If it does not, the adapter weights returned by
# `get_peft_model_state_dict(lora_model)` are what needs to be persisted.
lora_model.save_pretrained(lora_adapter, save_adapter=True, save_config=True)

# Reload the base model and attach the saved adapter to it.
model_to_merge = PeftModel.from_pretrained(
    AutoModelForCausalLM.from_pretrained(base_model).to(device),
    lora_adapter,
)

# Fold the LoRA weights into the base weights and drop the adapter layers.
merged_model = model_to_merge.merge_and_unload()
# Save to a directory path; the original passed the model object itself as
# the destination.
merged_model.save_pretrained(merged_model_dir)