In [15]:
from lm_eval import simple_evaluate
from lm_eval.utils import make_table

In [None]:
import torch
from mamba_ssm.models.mixer_seq_simple import MambaLMHeadModel
from transformers import AutoTokenizer

# 1. Define model and tokenizer IDs
model_id = "state-spaces/mamba2-1.3b"
tokenizer_id = "EleutherAI/gpt-neox-20b" # Standard tokenizer for Mamba models

# 2. Load the Tokenizer
tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
# Mamba does not use a BOS token, but EOS is usually needed
tokenizer.eos_token = "<|endoftext|>"
tokenizer.pad_token = tokenizer.eos_token

# 3. Load the Model using the official mamba-ssm library
# Note: 'device="cuda"' is usually required as Mamba kernels are GPU-only
print(f"Loading {model_id}...")
model = MambaLMHeadModel.from_pretrained(
    model_id, 
    device="cuda", 
    dtype=torch.bfloat16
)

# 4. Run Inference
text = "The theory of state space models is"
input_ids = tokenizer(text, return_tensors="pt").input_ids.to("cuda")

# Generate text (MambaLMHeadModel has a simple generate method)
out = model.generate(
    input_ids=input_ids, 
    max_length=50, 
    temperature=0.9, 
    top_p=0.95, 
    eos_token_id=tokenizer.eos_token_id
)

decoded = tokenizer.decode(out[0])
print("\nOutput:", decoded)

  from .autonotebook import tqdm as notebook_tqdm


Loading state-spaces/mamba2-1.3b...

Output: The theory of state space models is a powerful tool for the analysis of complex systems. The theory is based on the assumption that the system under study is a linear time-invariant system with a state space representation. The state space representation is a set of


In [4]:
import torch
from mamba_ssm.models.mixer_seq_simple import MambaLMHeadModel
from transformers import AutoTokenizer

# 1. Define model and tokenizer IDs
model_id = "state-spaces/mamba2-1.3b"
tokenizer_id = "EleutherAI/gpt-neox-20b" # Standard tokenizer for Mamba models

# 2. Load the Tokenizer
tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
# Mamba does not use a BOS token, but EOS is usually needed
tokenizer.eos_token = "<|endoftext|>"
tokenizer.pad_token = tokenizer.eos_token

# 3. Load the Model using the official mamba-ssm library
# Note: 'device="cuda"' is usually required as Mamba kernels are GPU-only
print(f"Loading {model_id}...")
model = MambaLMHeadModel.from_pretrained(
    model_id, 
    device="cuda", 
    dtype=torch.bfloat16
)

# 4. Run Inference
text = "The theory of state space models is"
input_ids = tokenizer(text, return_tensors="pt").input_ids.to("cuda")

# Generate text (MambaLMHeadModel has a simple generate method)
out = model.generate(
    input_ids=input_ids, 
    max_length=input_ids.shape[1] + 50,  
    temperature=1.0,      
    eos_token_id=tokenizer.eos_token_id
)

decoded = tokenizer.decode(out[0])
print("\nOutput:", decoded)

Loading state-spaces/mamba2-1.3b...

Output: The theory of state space models is a powerful tool for the analysis of complex systems. The theory is based on the assumption that the system under study is a linear time-invariant system with a state space representation. The state space representation is a set of state variables and a set of state


In [5]:
# Quick generation test
text = "The capital of France is"
input_ids = tokenizer(text, return_tensors="pt").input_ids.to("cuda")

print(f"Input: {text}")
out = model.generate(
    input_ids=input_ids, 
    max_length=30, 
    temperature=0.7, 
    top_p=0.9
)
print(f"Output: {tokenizer.decode(out[0])}")

Input: The capital of France is
Output: The capital of France is a city of contrasts. It is a city of the past, a city of the present, and a city of the future


In [6]:
# Simple NIAH test
needle = "The secret code is 7392."
haystack = "The weather is nice today. " * 50
prompt = f"{haystack} {needle} {haystack} What is the secret code? The code is"

input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to("cuda")
print(f"üìè Input length: {input_ids.shape[1]} tokens")

out = model.generate(input_ids=input_ids, max_length=input_ids.shape[1]+10, temperature=0.1)
response = tokenizer.decode(out[0][input_ids.shape[1]:])
print(f"üéØ Response: {response}")

üìè Input length: 618 tokens
üéØ Response:  7392. The weather is nice today. The


In [8]:
# Patch mamba-ssm model for HFLM compatibility
model.device = next(model.parameters()).device

class MambaConfig:
    vocab_size = 50277
    hidden_size = 2048
    num_hidden_layers = 48
    tie_embeddings = True

model.config = MambaConfig()
model.tie_weights = lambda: None

# Now wrap with the custom class
print("üîå Wrapping Mamba-2 with custom HF-compatible wrapper...")
lm_obj = MambaHFLM(
    pretrained=model,
    tokenizer=tokenizer,
    batch_size=1,
    max_length=131072,
    trust_remote_code=True
)

`pretrained` model kwarg is not of type `str`. Many other model arguments may be ignored. Please do not launch via accelerate or use `parallelize=True` if passing an existing model this way.
Passed an already-initialized model through `pretrained`, assuming single-process call to evaluate() or custom distributed integration


üîå Wrapping Mamba-2 with custom HF-compatible wrapper...


In [9]:
# Custom wrapper to make mamba-ssm compatible with lm-eval
from lm_eval.models.huggingface import HFLM

class MambaHFLM(HFLM):
    """Custom wrapper for mamba-ssm models"""
    
    def _model_generate(self, context, max_length, stop, **generation_kwargs):
        # Remove kwargs that mamba-ssm doesn't support
        generation_kwargs.pop('stopping_criteria', None)
        generation_kwargs.pop('pad_token_id', None)
        generation_kwargs.pop('use_cache', None)
        
        # Mamba uses max_length differently - it's total length, not new tokens
        with torch.no_grad():
            return self.model.generate(
                input_ids=context,
                max_length=max_length,
                temperature=generation_kwargs.get('temperature', 1.0),
                top_p=generation_kwargs.get('top_p', 1.0),
                eos_token_id=self.tokenizer.eos_token_id,
            )

# Now wrap with the custom class
print("üîå Wrapping Mamba-2 with custom HF-compatible wrapper...")
lm_obj = MambaHFLM(
    pretrained=model,
    tokenizer=tokenizer,
    batch_size=1,
    max_length=131072,
    trust_remote_code=True
)
print(f"‚úÖ Harness max_length: {lm_obj.max_length}")

`pretrained` model kwarg is not of type `str`. Many other model arguments may be ignored. Please do not launch via accelerate or use `parallelize=True` if passing an existing model this way.
Passed an already-initialized model through `pretrained`, assuming single-process call to evaluate() or custom distributed integration


üîå Wrapping Mamba-2 with custom HF-compatible wrapper...
‚úÖ Harness max_length: 131072


In [None]:
TASK = "niah_single_2"
LENGTHS = [1024, 2048, 4096, 8192]

results = simple_evaluate(
    model=lm_obj,
    tasks=[TASK],
    device="cuda",
    num_fewshot=0,
    metadata={"max_seq_lengths": LENGTHS, "tokenizer": tokenizer_id}
)
print(make_table(results))

niah_single_2: Custom kwargs can be passed to `--metadata` in console (as json string) or to the TaskManager.
For example --metadata='{"max_seq_lengths":[4096, 8192]}'. For details see task Readme.
Generating synthetic samples: essay | 1024: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 500/500 [00:00<00:00, 913.94it/s]
Generating synthetic samples: essay | 2048: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 500/500 [00:02<00:00, 229.54it/s]
Generating synthetic samples: essay | 4096: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 500/500 [00:03<00:00, 147.73it/s]
Generating synthetic samples: essay | 8192: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 500/500 [00:06<00:00, 78.96it/s]
Overwriting default num_fews

In [None]:
TASK = "niah_single_1"
LENGTHS = [1024, 2048, 4096, 8192]

results = simple_evaluate(
    model=lm_obj,
    tasks=[TASK],
    device="cuda",
    num_fewshot=0,
    metadata={"max_seq_lengths": LENGTHS, "tokenizer": tokenizer_id}
)
print(make_table(results))

In [None]:
TASK = "niah_single_3"
LENGTHS = [1024, 2048, 4096, 8192]

results = simple_evaluate(
    model=lm_obj,
    tasks=[TASK],
    device="cuda",
    num_fewshot=0,
    metadata={"max_seq_lengths": LENGTHS, "tokenizer": tokenizer_id}
)
print(make_table(results))

Collecting wonderwords
  Downloading wonderwords-3.0.1-py3-none-any.whl.metadata (11 kB)
Downloading wonderwords-3.0.1-py3-none-any.whl (51 kB)
Installing collected packages: wonderwords
Successfully installed wonderwords-3.0.1


[nltk_data] Downloading package punkt to /home/louis/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True