In [1]:
import os
import logging
import sys
from contextlib import contextmanager
# Select the GPU for this kernel through environment variables: devices are
# ordered by PCI bus id and only the device with index 5 is made visible.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "5",
})

In [2]:
@contextmanager
def suppress_output():
    """Temporarily point ``sys.stdout`` and ``sys.stderr`` at ``os.devnull``.

    Only the Python-level stream objects are swapped. The previous streams
    are put back when the ``with`` body finishes, including when it raises;
    the exception itself is not swallowed.
    """
    saved_streams = (sys.stdout, sys.stderr)
    with open(os.devnull, 'w') as sink:
        sys.stdout = sys.stderr = sink
        try:
            yield
        finally:
            # Restore before the sink is closed by the enclosing `with`.
            sys.stdout, sys.stderr = saved_streams

In [3]:
# Root logging: INFO and above, written to training.log. The file handler is
# the only handler passed here, so no console handler is configured by this call.
logging.basicConfig(
    handlers=[logging.FileHandler("training.log")],
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
# Module-level logger named after the current module.
logger = logging.getLogger(__name__)

In [4]:
import torch
import argparse
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from peft import LoraConfig, get_peft_model
from transformers import AutoTokenizer, TrainingArguments, AutoModelForCausalLM
from mamba_trainer.data import DataModule
from mamba_trainer.data import LongRangeDataset
from mamba_trainer.trainer import MambaTrainer, GradientCallback
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm
2024-08-06 19:03:23.054723: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-06 19:03:23.073575: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-06 19:03:23.079394: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-08-06 19:03:23.094536: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [5]:
# Load the pretrained Mamba checkpoint and its matching tokenizer.
model = AutoModelForCausalLM.from_pretrained("state-spaces/mamba-130m-hf")
tokenizer = AutoTokenizer.from_pretrained("state-spaces/mamba-130m-hf")
# Use the end-of-text token for padding as well.
tokenizer.eos_token = "<|endoftext|>"
tokenizer.pad_token = tokenizer.eos_token

# Rank-16 LoRA adapters on the embedding and the listed projection modules;
# bias terms are left untouched.
lora_config = LoraConfig(
    r=16,
    target_modules=["x_proj", "embeddings", "in_proj", "out_proj"],
    task_type="CAUSAL_LM",
    bias="none"
)

model = get_peft_model(model, lora_config)

# Build the optimizer only AFTER the model has been wrapped. The adapter
# weights are created by get_peft_model, and the base weights are frozen by
# it, so an optimizer constructed from the un-wrapped model would hold only
# frozen tensors and no adapter weight would ever be updated.
optimizer = torch.optim.Adam(
    [param for param in model.parameters() if param.requires_grad],
    lr=3e-4,
)

In [6]:
# Use the CUDA device when one is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
# Left as the cell's last expression so the moved model's repr is displayed.
model.to(device)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): MambaForCausalLM(
      (backbone): MambaModel(
        (embeddings): lora.Embedding(
          (base_layer): Embedding(50280, 768)
          (lora_dropout): ModuleDict(
            (default): Identity()
          )
          (lora_A): ModuleDict()
          (lora_B): ModuleDict()
          (lora_embedding_A): ParameterDict(  (default): Parameter containing: [torch.cuda.FloatTensor of size 16x50280 (cuda:0)])
          (lora_embedding_B): ParameterDict(  (default): Parameter containing: [torch.cuda.FloatTensor of size 768x16 (cuda:0)])
          (lora_magnitude_vector): ModuleDict()
        )
        (layers): ModuleList(
          (0-23): 24 x MambaBlock(
            (norm): MambaRMSNorm(768, eps=1e-05)
            (mixer): MambaMixer(
              (conv1d): Conv1d(1536, 1536, kernel_size=(4,), stride=(1,), padding=(3,), groups=1536)
              (act): SiLU()
              (in_proj): lora.Linear(
                (base_la

In [7]:
def print_trainable_parameters(model):
    """Print the trainable and total parameter counts of ``model``.

    Args:
        model: Object exposing ``named_parameters()`` that yields
            ``(name, param)`` pairs, where each ``param`` provides
            ``numel()`` and a ``requires_grad`` flag.

    Prints a single line with the number of trainable parameters, the total
    number of parameters, and the trainable share as a percentage. The
    percentage is reported as 0.00 when the model has no parameters at all.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count
    # A model without parameters would otherwise divide by zero here.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params:,} || all params: {all_param:,} || trainable%: {trainable_pct:.2f}"
    )

# Report how many of the current model's parameters will receive gradients.
print_trainable_parameters(model)



trainable params: 3,796,608 || all params: 132,931,968 || trainable%: 2.86


In [8]:
training_args = TrainingArguments(
    # Output locations for checkpoints and logs.
    output_dir="model",
    logging_dir="logs",
    # One epoch per train() call, batches of four, no gradient accumulation.
    num_train_epochs=1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    # NOTE(review): an optimizer is passed to the trainer explicitly in a
    # later cell, so this learning rate is presumably not the one in effect
    # -- confirm.
    learning_rate=5e-5,
    # Evaluate per epoch; step interval of 1 for logging and saving.
    # NOTE(review): eval_steps looks unused with an epoch-based strategy -- confirm.
    evaluation_strategy="epoch",
    eval_steps=1,
    logging_steps=1,
    save_steps=1,
    # No experiment-tracker reporting and no built-in progress bars.
    report_to="none",
    disable_tqdm=True,
)



In [9]:
from torch.utils.data import Subset
import numpy as np
# Experiment knobs: number of independent training rounds, number of examples
# drawn for each round, and the seed of the subset sampler.
NUM_ROUNDS = 1000
SAMPLES_PER_ROUND = 4
SAMPLER_SEED = 0

grad_callback = GradientCallback()
data_module = DataModule(data_path="./data/basic_20-70/train.tsv", tokenizer=tokenizer)
dataset = data_module.dataset
val_data_module = DataModule(data_path="./data/basic_20-70/val.tsv", tokenizer=tokenizer)
val_dataset = val_data_module.dataset

# Draw subsets from a dedicated, seeded generator instead of re-seeding the
# global NumPy RNG every round. A private generator is not affected by code
# that resets the global seeds (the trainer presumably does so on
# construction -- confirm), and the fixed seed makes the sequence of subsets
# reproducible across runs.
sampler_rng = np.random.default_rng(SAMPLER_SEED)
# Sampling without replacement fails when more items are requested than
# exist, so never ask for more rows than the dataset holds.
draw_size = min(SAMPLES_PER_ROUND, len(dataset))

for i in tqdm(range(NUM_ROUNDS), disable=False):
    ids = sampler_rng.choice(len(dataset), size=draw_size, replace=False)
    subset = Subset(dataset, ids.tolist())

    # Silence Python-level stdout/stderr while the trainer is built and run,
    # so that only the outer progress bar is shown.
    with suppress_output():
        # A new trainer is built per round around the same model and
        # optimizer objects, with this round's subset as training data.
        trainer = MambaTrainer(
            model=model,
            args=training_args,
            train_dataset=subset,
            tokenizer=tokenizer,
            eval_dataset=val_dataset,
            optimizers=(optimizer, None),
            data_collator=data_module.data_collator,
            callbacks=[grad_callback]
        )

        trainer.train()

./data/basic_20-70/train.tsv


I0000 00:00:1722960210.611583 1170864 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1722960210.616777 1170864 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1722960210.618894 1170864 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1722960210.626883 1170864 cuda_executor.cc:1015] successful NUMA node read from SysFS ha

./data/basic_20-70/val.tsv


2024-08-06 19:03:32.678017: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
  6%|▋         | 64/1000 [13:17<3:14:23, 12.46s/it]


KeyboardInterrupt: 