In [None]:
python -m venv mamba_env
source mamba_env/bin/activate
pip install packaging wheel torch ipykernel datasets
pip install accelerate -U
pip install .
python -m ipykernel install --user --name=mamba_env

In [13]:
import torch
from transformers import AutoTokenizer, GPTNeoXForCausalLM, GPTNeoXModel
from mamba_ssm.models.mixer_seq_simple import MambaLMHeadModel
import sys
torch.set_default_device("cuda")


# Take in the model you want to train
model_name = "state-spaces/mamba-130m"

# Choose a tokenizer
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")
tokenizer.eos_token = "<|endoftext|>"
tokenizer.pad_token = tokenizer.eos_token

tokenizer_pythia = AutoTokenizer.from_pretrained("EleutherAI/pythia-160m-deduped")
tokenizer_pythia.eos_token = "<|endoftext|>"
tokenizer_pythia.pad_token = tokenizer_pythia.eos_token

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [20]:
from transformers import GPTNeoXForCausalLM, AutoTokenizer, GPTNeoXModel

pythia = GPTNeoXForCausalLM.from_pretrained(
  "EleutherAI/pythia-410m-deduped",
  #output_hidden_states=True,
  #revision="step3000",
  #cache_dir="./pythia-70m-deduped/step3000",
).to(torch.device('cuda:0'))


In [7]:
mamba = MambaLMHeadModel.from_pretrained(
    model_name, 
    device="cuda", 
    dtype=torch.float16)

In [21]:
# Take the user input from the command line
user_message = "Give me three steps to improve my diet, and include some evidence"#input("\n> ")

# Create a prompt
n_shot_prompting = [
    {
        "question": "What is the capital of France?",
        "answer": "Paris"
    },
    {
        "question": "Who invented the segway?",
        "answer": "Dean Kamen"
    },
    {
        "question": "What is the fastest animal?",
        "answer": "Cheetah"
    }
]

prompt = f"You are a Trivia QA bot.\nAnswer the following question succinctly and accurately."
prompt = f"{prompt}\n\n" + "\n\n".join([f"Q: {p['question']}\nA: {p['answer']}" for p in n_shot_prompting])
prompt = f"{prompt}\n\nQ: {user_message}\nA:"

# Debug print to make sure our prompt looks good
print(prompt)

# Encode the text to token IDs
input_ids = torch.LongTensor([tokenizer.encode(prompt)]).cuda()


You are a Trivia QA bot.
Answer the following question succinctly and accurately.

Q: What is the capital of France?
A: Paris

Q: Who invented the segway?
A: Dean Kamen

Q: What is the fastest animal?
A: Cheetah

Q: Give me three steps to improve my diet, and include some evidence
A:


In [61]:
# Generate an output sequence of tokens given the input
# "out" will contain the raw token ids as integers
out = model.generate(
    input_ids=input_ids,
    max_length=256,
    eos_token_id=tokenizer.eos_token_id
)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


In [22]:
pythia_out = pythia(
    input_ids=input_ids,
    output_hidden_states=True
)

mamba_out = mamba(
    input_ids=input_ids,
    output_hidden_states=True
)

In [56]:
teacher_loss = (pythia_out.logits.softmax(dim=2)[:,:,:50280] - mamba_out.logits.softmax(dim=2)).norm(dim=2).mean()
teacher_loss

tensor(0.1681, device='cuda:0', grad_fn=<MeanBackward0>)

In [28]:
import torch.nn.functional as F
import math
mu = 0
std = math.sqrt(1.0/mamba_out.hidden_states[0].shape[-1])
size = (1, pythia_out.hidden_states[0].shape[-1], mamba_out.hidden_states[0].shape[-1])
W = torch.normal(0, std, size).to(torch.device('cuda:0'))


F.cosine_similarity(pythia_out.hidden_states[0]@ W,  mamba_out.hidden_states[0], dim=2)


tensor([[ 0.0186,  0.0419, -0.0037,  0.0281,  0.0461, -0.0590,  0.0489, -0.0530,
         -0.0304,  0.0350,  0.0191, -0.0156, -0.0284,  0.0294,  0.0189, -0.0458,
         -0.0320,  0.0206,  0.0187,  0.0046,  0.0350,  0.0191,  0.0191, -0.0379,
          0.0194,  0.0517,  0.0057, -0.0284, -0.0318,  0.0318, -0.0130, -0.0533,
          0.0191, -0.0530,  0.0194,  0.0325,  0.0191,  0.0191, -0.0379,  0.0194,
          0.0415,  0.0252, -0.0284, -0.0212,  0.0129, -0.0533,  0.0191, -0.0530,
          0.0194, -0.0304,  0.0034, -0.0035,  0.0191,  0.0191, -0.0379,  0.0194,
          0.0517,  0.0057, -0.0284, -0.0399,  0.0144, -0.0533,  0.0191, -0.0530,
          0.0194, -0.0056,  0.0004,  0.0316,  0.0191,  0.0191, -0.0379,  0.0194,
         -0.0012, -0.0983,  0.0201, -0.0334, -0.0376, -0.0298,  0.0402,  0.0212,
          0.0004,  0.0187,  0.0376,  0.0017, -0.0423,  0.0191, -0.0530,  0.0194]],
       device='cuda:0', grad_fn=<SumBackward1>)

In [58]:
for k,X in enumerate(pythia_out.hidden_states):
    print(k, X.shape)
for k,X in enumerate(mamba_out.hidden_states):
    print(k, X.shape)

0 torch.Size([1, 88, 768])
1 torch.Size([1, 88, 768])
2 torch.Size([1, 88, 768])
3 torch.Size([1, 88, 768])
4 torch.Size([1, 88, 768])
5 torch.Size([1, 88, 768])
6 torch.Size([1, 88, 768])
7 torch.Size([1, 88, 768])
8 torch.Size([1, 88, 768])
9 torch.Size([1, 88, 768])
10 torch.Size([1, 88, 768])
11 torch.Size([1, 88, 768])
12 torch.Size([1, 88, 768])
0 torch.Size([1, 88, 768])
1 torch.Size([1, 88, 768])
2 torch.Size([1, 88, 768])
3 torch.Size([1, 88, 768])
4 torch.Size([1, 88, 768])
5 torch.Size([1, 88, 768])
6 torch.Size([1, 88, 768])
7 torch.Size([1, 88, 768])
8 torch.Size([1, 88, 768])
9 torch.Size([1, 88, 768])
10 torch.Size([1, 88, 768])
11 torch.Size([1, 88, 768])
12 torch.Size([1, 88, 768])
13 torch.Size([1, 88, 768])
14 torch.Size([1, 88, 768])
15 torch.Size([1, 88, 768])
16 torch.Size([1, 88, 768])
17 torch.Size([1, 88, 768])
18 torch.Size([1, 88, 768])
19 torch.Size([1, 88, 768])
20 torch.Size([1, 88, 768])
21 torch.Size([1, 88, 768])
22 torch.Size([1, 88, 768])
23 torch.Siz

In [58]:
# you must use the tokenizer to decode them back into strings
decoded = tokenizer.batch_decode(out)[0]
print("="*80)
print(decoded)
# out returns the whole sequence plus the original
cleaned = decoded.replace(prompt, "")

# the model will just keep generating, so only grab the first one
# cleaned = cleaned.split("\n\n")[0]
print(cleaned)

You are a Trivia QA bot.
Answer the following question succinctly and accurately.

Q: What is the capital of France?
A: Paris

Q: Who invented the segway?
A: Dean Kamen

Q: What is the fastest animal?
A: Cheetah

Q: Give me three steps to improve my diet, and include some evidence
A: I'm not sure.

Q: What is the most popular game in the world?
A: Minecraft

Q: What is the most popular game in the world?
A: Minecraft

Q: What is the most popular game in the world?
A: Minecraft

Q: What is the most popular game in the world?
A: Minecraft

Q: What is the most popular game in the world?
A: Minecraft

Q: What is the most popular game in the world?
A: Minecraft

Q: What is the most popular game in the world?
A: Minecraft

Q: What is the most popular game in the world?
A: Minecraft

Q: What is the most popular game in
 I'm not sure.

Q: What is the most popular game in the world?
A: Minecraft

Q: What is the most popular game in the world?
A: Minecraft

Q: What is the most popular game in th

In [1]:
from training.train_mamba_with_pythia import *

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def run(args):
        
    model = MambaLMHeadModel.from_pretrained(args.model, dtype=torch.bfloat16, device="cuda")

    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer)
    tokenizer.eos_token = "<|endoftext|>"
    tokenizer.pad_token = tokenizer.eos_token

    data_module = SFTDataModule(
        tokenizer=tokenizer,
        data_path=args.data_path,
    )

    trainer = MambaTrainer(
        model=model,
        train_dataset=data_module.dataset,
        tokenizer=tokenizer,
        args=TrainingArguments(
            learning_rate=args.learning_rate,
            num_train_epochs=args.num_epochs,
            per_device_train_batch_size=args.batch_size,
            gradient_accumulation_steps=args.gradient_accumulation_steps,
            optim=args.optim,
            output_dir=args.output,
            save_total_limit=2,
            logging_steps=50,
            save_steps=500,
        ),
        data_collator=data_module.data_collator,
    )

    trainer.train()
    trainer.save_model(args.output)

In [3]:
parser = argparse.ArgumentParser()
parser.add_argument("--model", type=str, default="state-spaces/mamba-130m")
parser.add_argument("--output", type=str, default="output")
parser.add_argument("--tokenizer", type=str, default="EleutherAI/gpt-neox-20b")
parser.add_argument("--learning_rate", type=float, default=5e-4)
parser.add_argument("--batch_size", type=int, default=8)
parser.add_argument("--gradient_accumulation_steps", type=int, default=1)
parser.add_argument("--optim", type=str, default="adamw_torch")
parser.add_argument("--data_path", type=str, default="squad")
parser.add_argument("--num_epochs", type=int, default=10)
args = parser.parse_args('')

In [4]:
run(args)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Got 0 examples, preprocess...
Tokenizing dataset...


100%|██████████| 87599/87599 [01:39<00:00, 876.14it/s] 


Step,Training Loss
50,4.1338


OutOfMemoryError: CUDA out of memory. Tried to allocate 220.00 MiB. GPU 0 has a total capacty of 21.99 GiB of which 197.00 MiB is free. Including non-PyTorch memory, this process has 21.78 GiB memory in use. Of the allocated memory 20.03 GiB is allocated by PyTorch, and 1.47 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [5]:
8

8