In [1]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
from datasets import load_dataset, Dataset

dataset = load_dataset("ai4bharat/samanantar", "te")
tokenizer = AutoTokenizer.from_pretrained("google/byt5-small")



In [2]:
dataset

DatasetDict({
    train: Dataset({
        features: ['idx', 'src', 'tgt'],
        num_rows: 4946035
    })
})

In [3]:
from collections import defaultdict
from tqdm import tqdm
from datasets import Dataset


def filter_streaming_dataset(dataset):
    filtered_dict = defaultdict(list)
    total = 0
    for sample in tqdm(iter(dataset)):
        total += 1
#         if total < 6:
#             print(sample)
        if total< 1000:
            filtered_dict[str("tgt")].append(sample["tgt"])
        else:
            break

#         if total == 6:
#             break
    print(f"{len(filtered_dict.keys())/total:.2%} of data after filtering.")
#     print(filtered_dict)
    return Dataset.from_dict(filtered_dict)
#     return filtered_dict

In [4]:
from datasets import load_dataset
filtered_ds = filter_streaming_dataset(dataset["train"])

999it [00:00, 24598.80it/s]

0.10% of data after filtering.





In [6]:
# Determine the number of samples for train and validation sets
from datasets import Dataset, DatasetDict
train_size = int(len(filtered_ds) * 0.8)  # 80% for training
valid_size = len(filtered_ds) - train_size  # Remaining for validation

# Split the dataset into train and validation sets
ds_train = filtered_ds.select(list(range(train_size)))
ds_valid = filtered_ds.select(list(range(train_size, train_size + valid_size)))

# Create DatasetDict with train and validation sets
raw_datasets = DatasetDict({"train": ds_train, "valid": ds_valid})

In [7]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['tgt'],
        num_rows: 799
    })
    valid: Dataset({
        features: ['tgt'],
        num_rows: 200
    })
})

In [55]:
def tokenize(element):
    outputs = tokenizer(
        element["tgt"],
        truncation=True,
        max_length=128,
        return_overflowing_tokens=True,
        return_length=True,
    )
    input_batch = []
    for length, input_ids in zip(outputs["length"], outputs["input_ids"]):
        if length ==128:
            input_batch.append(input_ids)
    print(len(outputs["input_ids"]))
    return {"input_ids": input_batch}


tokenized_datasets = raw_datasets.map(
    tokenize, batched=True, remove_columns=raw_datasets["train"].column_names
)
tokenized_datasets

Map:   0%|          | 0/799 [00:00<?, ? examples/s]

799


Map:   0%|          | 0/200 [00:00<?, ? examples/s]

200


DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 371
    })
    valid: Dataset({
        features: ['input_ids'],
        num_rows: 88
    })
})

In [12]:
cd src

/home/yashc/elephant/mamba-hf/src


In [63]:
# from modeling_mamba import MambaForCausalLM
# from transformers import AutoTokenizer
# model = MambaForCausalLM.from_pretrained('Q-bert/Mamba-130M')
# model.init_weights()
from configuration_mamba import MambaConfig
import torch
import torch.nn as nn
import torch.nn.functional as F
from modeling_mamba import MambaModel, MambaForCausalLM
# Assuming the MambaConfig class is defined as provided
# And assuming MambaModel and MambaForCausalLM classes are defined as per your initial code

# Step 1: Create a MambaConfig instance with desired configurations
config = MambaConfig(
    vocab_size=512,  # Adjust vocab size as needed
    d_model=256,  # Model dimension
    d_conv=4,  # Convolution kernel size
    expand=2,  # Expansion factor for inner dimension calculation
    conv_bias=True,  # Whether to use bias in convolution layers
    bias=False,  # Whether to use bias in other layers
    n_layer=4,  # Number of layers in the model
    dt_rank="auto",  # Automatically determine dt_rank or set a specific integer value
    pad_vocab_size_multiple=8,  # Ensure vocab size is a multiple of this value
    initializer_range=0.02  # Initializer range for model parameters
)


mamba_causal_lm = MambaForCausalLM(config)



In [64]:
from transformers import Trainer

class MambaTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
#         print(inputs, "HERE")
#         print(inputs)
        input_ids = inputs.pop("input_ids")
#         print("INPUT IDs", input_ids)
#         print(input_ids)
#         print(input_ids)
        lm_logits = model(input_ids)[0]
#         print(lm_logits)
        labels = input_ids.to(lm_logits.device)
        shift_logits = lm_logits[:, :-1, :].contiguous()
        labels = labels[:, 1:].contiguous()
#         print("LABELS: ", labels)
#         tokenizer.decode(lm_logits)
        loss_fct = torch.nn.CrossEntropyLoss()
        lm_loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), labels.view(-1))
#         print(lm_loss)
        return lm_loss

In [65]:
from transformers import Trainer, TrainingArguments
torch.cuda.empty_cache()
args = TrainingArguments(
    output_dir="codeparrot-ds",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=1,
    evaluation_strategy="steps",
    num_train_epochs=4,
    weight_decay=0.1,
    warmup_steps=1_000,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    save_steps=5_000,
    fp16=True,
)

trainer = MambaTrainer(
    model=mamba_causal_lm,
    tokenizer=tokenizer,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["valid"],
)

In [66]:
trainer.train()

Step,Training Loss,Validation Loss


TrainOutput(global_step=188, training_loss=4.27571495543135, metrics={'train_runtime': 71.678, 'train_samples_per_second': 20.704, 'train_steps_per_second': 2.623, 'total_flos': 1997140131840.0, 'train_loss': 4.27571495543135, 'epoch': 4.0})