In [6]:
from datasets import load_dataset, get_dataset_split_names


In [7]:
def load_huggingface_dataset(dataset_name, *args, **kwargs):
    """Load a dataset through ``datasets.load_dataset``.

    Args:
        dataset_name: First positional argument handed to ``load_dataset``
            (the dataset identifier).
        *args: Additional positional arguments, forwarded unchanged. They
            used to be accepted but silently discarded.
        **kwargs: Keyword arguments forwarded unchanged (e.g. ``split``).

    Returns:
        Whatever ``load_dataset`` returns for the given arguments.
    """
    return load_dataset(dataset_name, *args, **kwargs)

In [8]:
# Load the "train" split of the mwitiderrick/swahili dataset via the wrapper defined above.
dataset = load_huggingface_dataset("mwitiderrick/swahili",split="train")


In [9]:
def generate_dataset_splits(dataset, holdout_fraction=0.1, seed=42):
    """Split a dataset into train, test and validation subsets.

    ``holdout_fraction`` of the rows is carved off first; that held-out part
    is then split in half, one half becoming the test subset and the other
    the validation subset. Both splits shuffle with the same ``seed``.

    Args:
        dataset: Object exposing
            ``train_test_split(test_size=..., shuffle=..., seed=...)`` that
            returns a mapping with ``"train"`` and ``"test"`` entries.
        holdout_fraction: Share of rows reserved for test + validation.
            Defaults to 0.1, the value that used to be hard-coded.
        seed: Shuffle seed used for both splits. Defaults to 42.

    Returns:
        Tuple ``(train_dataset, test_dataset, val_dataset)``.
    """
    first_split = dataset.train_test_split(
        test_size=holdout_fraction, shuffle=True, seed=seed
    )
    # Halve the held-out rows: "test" stays test, "train" becomes validation.
    holdout_split = first_split["test"].train_test_split(
        test_size=0.5, shuffle=True, seed=seed
    )
    return first_split["train"], holdout_split["test"], holdout_split["train"]

In [10]:
# Derive the train / test / validation subsets from the loaded dataset.
train_dataset, test_dataset, val_dataset = generate_dataset_splits(dataset)

In [11]:
import re
def remove_non_text_symbols(text):
    """Return ``text`` with every run of non-ASCII characters removed."""
    # Anything outside the 7-bit range 0x00-0x7F is treated as a non-text symbol.
    non_ascii_run = re.compile(r'[^\x00-\x7F]+')
    return non_ascii_run.sub('', text)

In [12]:
def clean_dataset(dataset):
    """Strip non-ASCII symbols from the ``"text"`` column and drop empty rows.

    Args:
        dataset: Object exposing ``map`` and ``filter`` that take a per-example
            callable; each example is indexed with the ``"text"`` key.

    Returns:
        The result of mapping the cleaner over ``dataset`` and then filtering
        out examples whose text has length zero.
    """
    def _strip_symbols(example):
        return {"text": remove_non_text_symbols(example["text"])}

    def _has_text(example):
        return len(example["text"]) > 0

    stripped = dataset.map(_strip_symbols)
    return stripped.filter(_has_text)

In [13]:
# Only the test and validation subsets are cleaned; the train call is left
# disabled below, so train_dataset still holds the uncleaned text.
# train_dataset = clean_dataset(train_dataset)
test_dataset = clean_dataset(test_dataset)
val_dataset = clean_dataset(val_dataset)

In [14]:
from transformers import PreTrainedTokenizerFast
from tokenizers import Tokenizer

In [15]:
def tokenize(tokenizer, dataset, max_length=256):
    """Tokenize the ``"text"`` field of every example in ``dataset``.

    Args:
        tokenizer: Callable invoked as
            ``tokenizer(text, padding=True, max_length=max_length)`` for each
            example; its return value is what ``dataset.map`` receives.
        dataset: Object exposing ``map`` that takes a per-example callable.
        max_length: Length passed to the tokenizer. Defaults to 256, the
            value that used to be hard-coded.

    Returns:
        The result of ``dataset.map`` over the tokenizing callable.
    """
    def _encode(example):
        return tokenizer(example["text"], padding=True, max_length=max_length)

    return dataset.map(_encode)


In [16]:
class KiswahiliSilabiTokenizer(PreTrainedTokenizerFast):
    """Greedy longest-match syllable tokenizer built on a ``tokenizers.Tokenizer``.

    Text is split on single spaces and each word is segmented into the
    longest entries found in the vocabulary. Every sequence is wrapped in
    ``sos_token`` / ``eos_token`` and word boundaries are marked with
    ``space_token`` (unless a literal ``" "`` is itself a vocabulary entry).

    NOTE(review): the start token is stored as ``sos_token``; nothing in this
    class sets ``bos_token``, so callers reading ``bos_token_id`` should
    confirm it is populated.
    """

    def __init__(self, tokenizer, unk_token="[UNK]", sos_token="[SOS]", eos_token="[EOS]", space_token="[SPACE]", pad_token="[PAD]", **kwargs):
        """Wrap ``tokenizer`` and register the special tokens.

        Args:
            tokenizer: Tokenizer object passed to the base class as
                ``tokenizer_object``; its ``get_vocab()`` seeds the lookup
                table used for syllable matching.
            unk_token: Token emitted for characters that match no vocabulary entry.
            sos_token: Token prepended to every sequence.
            eos_token: Token appended to every sequence.
            space_token: Token emitted for a space that is not in the vocabulary.
            pad_token: Token used to right-pad sequences.
            **kwargs: Forwarded to ``PreTrainedTokenizerFast.__init__``.
        """
        super().__init__(tokenizer_object=tokenizer, **kwargs)
        self._vocab = tokenizer.get_vocab()
        self.unk_token = unk_token
        self.sos_token = sos_token
        self.eos_token = eos_token
        self.space_token = space_token
        self.pad_token = pad_token

        # Make sure every special token has an entry in the matching table.
        # Only the dict held in ``self._vocab`` is touched here; ids returned
        # by ``convert_tokens_to_ids`` still come from the base class.
        for special in (self.sos_token, self.eos_token, self.unk_token, self.space_token, self.pad_token):
            if special not in self._vocab:
                self._vocab[special] = len(self._vocab)

    def __call__(self, text, **kwargs):
        """Tokenize one string and return ``{"input_ids": [...]}``.

        ``kwargs`` are forwarded to :meth:`tokenize` (``padding``,
        ``max_length``, ``handle_whitespace``); other keys are ignored there.
        """
        ids = self.convert_tokens_to_ids(self.tokenize(text, **kwargs))
        return {"input_ids": ids}

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
        """Build the tokenizer from ``<pretrained_model_name_or_path>/tokenizer.json``."""
        tokenizer = Tokenizer.from_file(f"{pretrained_model_name_or_path}/tokenizer.json")
        return cls(tokenizer, **kwargs)

    def _encode_with_byte_fallback(self, text):
        """Segment ``text`` into the longest vocabulary entries, left to right.

        Despite the name no byte-level encoding happens: a character that
        starts no vocabulary entry becomes ``space_token`` when it is a space
        and ``unk_token`` otherwise.

        Returns:
            List of token strings.
        """
        tokens = []
        i = 0
        while i < len(text):
            matched = False
            # Try the longest candidate starting at position i first.
            for j in range(len(text), i, -1):
                syllable_candidate = text[i:j]
                if syllable_candidate in self._vocab:
                    tokens.append(syllable_candidate)
                    i = j
                    matched = True
                    break
            if not matched:
                # One placeholder token per unmatched character. ``append`` is
                # required: ``extend`` would splice the placeholder's
                # individual characters ("[", "U", ...) into the list.
                if text[i] == " ":
                    tokens.append(self.space_token)
                else:
                    tokens.append(self.unk_token)
                i += 1
        return tokens

    def tokenize(self, text, **kwargs):
        """Convert ``text`` into a list of token strings.

        Keyword Args:
            handle_whitespace: When true (default) a space token is emitted
                after every word, including the last one.
            padding: When truthy the sequence is cut and right-padded to
                ``max_length``.
            max_length: Target length; required when ``padding`` is truthy.

        Returns:
            ``[sos_token, ..., eos_token]``, optionally truncated/padded.

        Raises:
            ValueError: If ``padding`` is truthy and ``max_length`` is missing.
        """
        handle_whitespace = kwargs.get("handle_whitespace", True)
        tokens = [self.sos_token]  # Start of sentence token
        for word in text.split(" "):
            tokens.extend(self._encode_with_byte_fallback(word))
            if handle_whitespace:
                # NOTE(review): this also runs after the final word, so a
                # space token precedes eos_token — confirm that is intended.
                tokens.extend(self._encode_with_byte_fallback(" "))
        tokens.append(self.eos_token)  # End of sentence token

        padding = kwargs.get("padding", False)
        if padding:
            max_length = kwargs.get("max_length", None)
            if max_length is None:
                raise ValueError("max_length must be specified if padding is True")
            # NOTE(review): truncation can cut off eos_token on long inputs.
            tokens = tokens[:max_length]
            tokens.extend([self.pad_token] * (max_length - len(tokens)))
        return tokens

    def tokens_to_sentence(self, tokens):
        """Join token strings back into a sentence.

        Start, end and padding markers are removed and ``space_token`` is
        turned back into a literal space. Unknown-token markers are kept.
        """
        sentence = "".join(tokens)
        for marker in (self.eos_token, self.sos_token, self.pad_token):
            sentence = sentence.replace(marker, "")
        return sentence.replace(self.space_token, " ")

In [17]:
# Build the tokenizer from ./silabi_tokenizer/tokenizer.json (path is relative to the working directory).
silabi_tokenizer = KiswahiliSilabiTokenizer.from_pretrained("./silabi_tokenizer")

In [18]:
# Only the test subset is tokenized in this cell; the train call stays
# disabled and the validation subset is tokenized in the next cell.
# train_tokenized_dataset = tokenize(silabi_tokenizer, train_dataset)
test_tokenized_dataset = tokenize(silabi_tokenizer, test_dataset)
# val_tokenized_dataset = tokenize(silabi_tokenizer, val_dataset)

In [86]:
# Tokenize the validation subset; later cells refer to val_tokenized_dataset.
val_tokenized_dataset = tokenize(silabi_tokenizer, val_dataset)

Map:   0%|          | 0/576513 [00:00<?, ? examples/s]

In [19]:
import torch
from torch import nn
import numpy as np

In [20]:
from transformers.models.mamba.modeling_mamba import MambaBlock as Mamba
from linear_attention_transformer import LinearAttentionTransformer
from transformers.models.mamba import MambaConfig as MambaCfg


In [21]:
# Transformations

# Token ids of the first test example, used as a sample input for the shape checks below.
test_value = test_tokenized_dataset[0]['input_ids']

In [22]:
# Turn the sample ids into an int64 tensor and add a leading batch axis.
np_array = np.array(test_value)
input_tensor = torch.from_numpy(np_array).to(dtype=torch.long)
input_tensor = input_tensor.unsqueeze(0)  # Shape becomes (1, sequence_length)


In [23]:
# Stand-alone Mamba block from transformers, configured with the tokenizer's vocabulary size.
# NOTE(review): num_heads is handed to the transformers MambaConfig as-is — confirm that config actually uses it.
mamba_config = MambaCfg(vocab_size=silabi_tokenizer.vocab_size,hidden_size=512,num_heads=16)
# hidden_size = dimension
mamba_block = Mamba(
    config=mamba_config, layer_idx=0
)

The fast path is not available because one of `(selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)` is None. Falling back to the sequential implementation of Mamba, as use_mambapy is set to False. To install follow https://github.com/state-spaces/mamba/#installation and https://github.com/Dao-AILab/causal-conv1d. For the mamba.py backend, follow https://github.com/alxndrTL/mamba.py.


In [24]:
# Embed the sample ids so they can be fed to the stand-alone Mamba block.
mamba_embedding_dim = 512  # embedding dimension must match the Mamba hidden_size (512 above)
vocab_size = silabi_tokenizer.vocab_size
mamba_embedding_layer = torch.nn.Embedding(num_embeddings=silabi_tokenizer.vocab_size, embedding_dim=mamba_embedding_dim)
mamba_embedded_tensor = mamba_embedding_layer(input_tensor)

In [25]:
# Inspect the embedded shape: (batch, sequence_length, mamba_embedding_dim).
mamba_embedded_tensor.shape

torch.Size([1, 256, 512])

In [26]:
# Smoke test: run the embedded sample through the stand-alone Mamba block.
mamba_block(mamba_embedded_tensor)

tensor([[[-0.6630,  1.7532,  1.1520,  ..., -0.4268, -0.5768,  0.8503],
         [ 0.2792, -0.2170,  0.4621,  ..., -0.9457, -0.3212,  0.6602],
         [ 0.7157,  1.0502,  0.3434,  ...,  0.5062,  0.1723, -1.7375],
         ...,
         [-0.4800, -0.0839, -0.0720,  ...,  1.1306,  0.5015, -0.5833],
         [-0.4800, -0.0839, -0.0720,  ...,  1.1306,  0.5015, -0.5833],
         [-0.4800, -0.0839, -0.0720,  ...,  1.1306,  0.5015, -0.5833]]],
       grad_fn=<AddBackward0>)

In [27]:
# (batch, sequence_length {256}) -> [embedding] -> (batch, sequence_length {256}, dimension {256}) -> [linformer] -> (batch, sequence_length {256}, dimension {256})

In [28]:
# Stand-alone linear-attention transformer used for the shape check in the following cells.
linformer = LinearAttentionTransformer(
    dim = 256,
    heads = 8,
    depth = 1,
    max_seq_len = 256,
    n_local_attn_heads = 4
)

In [29]:
# Embed the sample ids at the width expected by the linear-attention transformer.
embedding_dim = 256  # embedding dimension must match linformer dim
vocab_size = silabi_tokenizer.vocab_size
embedding_layer = torch.nn.Embedding(num_embeddings=silabi_tokenizer.vocab_size, embedding_dim=embedding_dim)
embedded_tensor = embedding_layer(input_tensor)  # Shape: (batch_size, sequence_length, embedding_dim)


In [30]:
# Inspect the embedded shape: (batch, sequence_length, embedding_dim).
embedded_tensor.shape

torch.Size([1, 256, 256])

In [31]:
# Smoke test: run the embedded sample through the stand-alone linear-attention transformer.
linformer(embedded_tensor)

tensor([[[-1.4879,  2.1113,  1.7677,  ...,  0.9438, -2.0722,  0.2778],
         [ 0.2190, -0.1404, -2.9204,  ...,  0.4524, -0.3791,  1.1368],
         [ 0.0556,  0.0164, -0.3579,  ..., -0.1721, -0.0367,  1.7199],
         ...,
         [-0.7167,  1.1159, -1.6330,  ...,  0.3845, -1.8408, -2.7420],
         [-0.7167,  1.1159, -1.6330,  ...,  0.3845, -1.8408, -2.7420],
         [-0.7167,  1.1159, -1.6330,  ...,  0.3845, -1.8408, -2.7420]]],
       grad_fn=<AddBackward0>)

In [32]:
from dataclasses import dataclass

@dataclass
class LinformerConfig:
    """Settings bundle for a ``LinearAttentionTransformer``.

    Each field carries the name of the keyword argument it is passed as
    once ``to_dict()`` is unpacked into the constructor.
    """
    dim: int
    heads: int
    depth: int
    max_seq_len: int
    n_local_attn_heads: int

    def to_dict(self):
        """Return the settings as a plain ``dict`` keyed by field name."""
        ordered_names = ("dim", "heads", "max_seq_len", "n_local_attn_heads", "depth")
        return {name: getattr(self, name) for name in ordered_names}
@dataclass
class MambaConfig:
    """Settings bundle used to build the transformers Mamba configuration."""
    hidden_size: int  # doubles as the model dimension, see ``dim``
    num_heads: int

    @property
    def dim(self):
        """Alias for ``hidden_size``."""
        return self.hidden_size

    def mamba2_config(self, vocab_size):
        """Create the transformers config object for the given vocabulary size.

        NOTE(review): ``num_heads`` is forwarded as a keyword argument —
        confirm the target config class actually consumes it.
        """
        settings = {
            "vocab_size": vocab_size,
            "hidden_size": self.hidden_size,
            "num_heads": self.num_heads,
        }
        return MambaCfg(**settings)

        


In [33]:
class MambaBlock(nn.Module):
    """Mamba block wrapped with RMS normalisation and a residual connection.

    A single ``RMSNorm`` instance is applied both before the Mamba block and
    after the residual sum, so the two normalisations share their weights.
    """

    def __init__(self, vocab_size, config: MambaConfig, layer_idx=0):
        """Create the normalisation layer and the inner Mamba block.

        Args:
            vocab_size: Stored on the module and passed to
                ``config.mamba2_config`` when building the inner config.
            config: Provides ``dim`` and ``mamba2_config``.
            layer_idx: Forwarded to the inner Mamba block.
        """
        super().__init__()
        self.vocab_size = vocab_size
        self.normalization = nn.RMSNorm(config.dim)
        inner_config = config.mamba2_config(self.vocab_size)
        self.mamba2_block = Mamba(config=inner_config, layer_idx=layer_idx)

    def forward(self, x):
        """Return ``norm(norm(x) + mamba(norm(x)))``."""
        normed = self.normalization(x)
        # The residual is taken from the normalised input, not from raw ``x``.
        residual_sum = normed + self.mamba2_block(normed)
        return self.normalization(residual_sum)
        

In [34]:
class LinformerBlock(nn.Module):
    """Linear-attention transformer wrapped with RMS normalisation and a residual.

    A single ``RMSNorm`` instance is applied before the transformer and again
    after the residual sum, so both normalisations share their weights.

    NOTE(review): the transformer is built only from ``config.to_dict()``;
    if this block is used for next-token prediction, confirm that the
    attention is causal under those settings.
    """

    def __init__(self, vocab_size, config: LinformerConfig):
        """Create the normalisation layer and the attention stack.

        Args:
            vocab_size: Accepted for signature symmetry; not used here.
            config: Provides ``dim`` and ``to_dict()``.
        """
        super().__init__()
        self.normalization = nn.RMSNorm(config.dim)
        attention_kwargs = config.to_dict()
        self.linformer = LinearAttentionTransformer(**attention_kwargs)

    def forward(self, x):
        """Return ``norm(attention(norm(x)) + norm(x))``."""
        normed = self.normalization(x)
        attended = self.linformer(normed)
        return self.normalization(attended + normed)

In [35]:
class LimbaBlock(nn.Module):
    """One hybrid layer: Mamba block, then linear-attention block.

    Linear projections convert between the Mamba width and the
    linear-attention width, so input and output both have size
    ``mamba_config.dim`` on the last axis.
    """

    def __init__(self, linformer_config, mamba_config, layer_idx=0, vocab_size=1000, dropout=0.1):
        """Build both sub-blocks, the two projections and the dropout layer.

        Args:
            linformer_config: Provides ``dim`` and is passed to ``LinformerBlock``.
            mamba_config: Provides ``dim`` and is passed to ``MambaBlock``.
            layer_idx: Forwarded to ``MambaBlock``.
            vocab_size: Forwarded to both sub-blocks.
            dropout: Probability for the shared ``nn.Dropout``.
        """
        super().__init__()

        self.mamba_block = MambaBlock(vocab_size, mamba_config, layer_idx=layer_idx)
        self.linformer_block = LinformerBlock(vocab_size, linformer_config)

        attention_width = linformer_config.dim
        mamba_width = mamba_config.dim
        self.linformer_mamba_reshape = nn.Linear(attention_width, mamba_width)
        self.mamba_linformer_reshape = nn.Linear(mamba_width, attention_width)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply Mamba -> project -> dropout -> attention -> dropout -> project.

        ``x`` is fed straight to the Mamba block, so its last axis must have
        size ``mamba_config.dim``.
        """
        hidden = self.mamba_linformer_reshape(self.mamba_block(x))
        hidden = self.dropout(hidden)
        hidden = self.dropout(self.linformer_block(hidden))
        return self.linformer_mamba_reshape(hidden)
        
        

        

In [36]:
# Settings for the two halves of each Limba block: a 256-wide
# linear-attention stack and a 512-wide Mamba block.
linformer_config = LinformerConfig(dim = 256,
    heads = 8,
    depth = 1,
    max_seq_len = 256,
    n_local_attn_heads = 4)
mamba_config = MambaConfig(
    hidden_size = 512,
    num_heads = 16
)


In [37]:

class Limba(nn.Module):
    """Token-level model: embedding, a stack of ``LimbaBlock`` layers, vocabulary head.

    Token ids are embedded at width ``mamba_config.dim``, passed through
    ``num_layers`` hybrid blocks, layer-normalised and projected to
    ``vocab_size`` logits per position.
    """

    def __init__(self, linformer_config, mamba_config, vocab_size, num_layers=6, dropout=0.1):
        """Build the embedding, the block stack and the output head.

        Args:
            linformer_config: Passed to every ``LimbaBlock``.
            mamba_config: Passed to every ``LimbaBlock``; its ``dim`` sets the
                embedding and output-head width.
            vocab_size: Number of embedding rows and output logits.
            num_layers: Number of ``LimbaBlock`` layers.
            dropout: Dropout probability, forwarded to every block.
        """
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=mamba_config.dim)
        # Keyword arguments are required here: LimbaBlock's third positional
        # parameter is ``layer_idx``, so passing ``vocab_size`` positionally
        # would set the layer index and leave the vocabulary at its default.
        self.layers = nn.ModuleList(
            [
                LimbaBlock(
                    linformer_config,
                    mamba_config,
                    layer_idx=layer_idx,
                    vocab_size=vocab_size,
                    dropout=dropout,
                )
                for layer_idx in range(num_layers)
            ]
        )

        self.layer_norm = nn.LayerNorm(mamba_config.dim)
        # NOTE(review): this dropout layer is not applied in forward();
        # kept so the module's attributes stay as they were.
        self.dropout = nn.Dropout(dropout)

        self.output_layer = nn.Linear(mamba_config.dim, vocab_size)

    def forward(self, x):
        """Map token ids ``(batch, seq_len)`` to logits ``(batch, seq_len, vocab_size)``."""
        x = self.embedding(x)

        for layer in self.layers:
            x = layer(x)

        x = self.layer_norm(x)
        return self.output_layer(x)

        

        

        

In [38]:
# Instantiate the hybrid model with the tokenizer's vocabulary size (default number of layers).
model = Limba(linformer_config,mamba_config, silabi_tokenizer.vocab_size)

In [39]:
# Smoke test: forward the single sample and print the resulting logits tensor.
print(model(input_tensor))

tensor([[[ 0.0928, -0.1131,  0.2505,  ...,  0.6304, -0.7602,  0.4108],
         [-0.1265,  0.2714, -0.9697,  ...,  0.3207,  0.4266,  0.1870],
         [-0.2593, -0.7594,  0.5382,  ..., -0.1908,  0.2530, -0.6654],
         ...,
         [-0.5266, -0.4568, -0.0689,  ...,  1.3795, -0.2306,  0.8054],
         [-0.5478, -0.1174,  0.5729,  ...,  1.2178, -0.1083,  0.1065],
         [ 0.1767, -0.7181, -0.2846,  ...,  0.5680, -0.7983,  0.9244]]],
       grad_fn=<ViewBackward0>)


In [40]:
from torchinfo import summary
# Layer-by-layer summary for a (1, 256) batch of int64 token ids; the device is hard-coded to "mps".
summary(model, input_size=(1, 256),dtypes=[torch.long],device="mps")

Layer (type:depth-idx)                                                      Output Shape              Param #
Limba                                                                       [1, 256, 662]             --
├─Embedding: 1-1                                                            [1, 256, 512]             338,944
├─ModuleList: 1-2                                                           --                        --
│    └─LimbaBlock: 2-1                                                      [1, 256, 512]             --
│    │    └─MambaBlock: 3-1                                                 [1, 256, 512]             1,695,744
│    │    └─Linear: 3-2                                                     [1, 256, 256]             131,328
│    │    └─Dropout: 3-3                                                    [1, 256, 256]             --
│    │    └─LinformerBlock: 3-4                                             [1, 256, 256]             789,248
│    │    └─Dropout: 3-5    

In [52]:
# Inspect the token ids of one tokenized test example, including its trailing padding.
test_tokenized_dataset[1000]['input_ids']

[5,
 269,
 482,
 395,
 366,
 7,
 511,
 7,
 395,
 379,
 430,
 7,
 511,
 7,
 77,
 307,
 388,
 10,
 7,
 500,
 488,
 487,
 500,
 357,
 379,
 379,
 475,
 10,
 7,
 381,
 511,
 7,
 175,
 426,
 7,
 522,
 457,
 52,
 7,
 6,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0

In [54]:
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence

class TokenizedDataset(Dataset):
    """Torch ``Dataset`` view that yields only the ``input_ids`` of each example."""

    def __init__(self, tokenized_dataset, pad_token_id=None):
        """Wrap ``tokenized_dataset`` and switch it to torch output.

        Args:
            tokenized_dataset: Object exposing ``set_format``, ``__len__`` and
                integer indexing that returns a mapping with ``"input_ids"``.
                Its format is changed in place by this constructor.
            pad_token_id: Padding id stored on the instance. When omitted it
                is read from the notebook-level ``silabi_tokenizer``, which
                is the value that was always used before.
        """
        self.tokenized_dataset = tokenized_dataset
        self.tokenized_dataset.set_format(type="torch", columns=["input_ids"])
        if pad_token_id is None:
            pad_token_id = silabi_tokenizer.pad_token_id
        self.pad_token_id = pad_token_id

    def __len__(self):
        """Number of examples in the wrapped dataset."""
        return len(self.tokenized_dataset)

    def __getitem__(self, idx):
        """Return the ``input_ids`` entry of example ``idx``."""
        return self.tokenized_dataset[idx]['input_ids']


In [56]:
# Only the tokenized test subset is wrapped here; the train wrapper stays
# disabled and the validation wrapper is created in the next cell.
#train_tokenized = TokenizedDataset(train_tokenized_dataset)
test_tokenized = TokenizedDataset(test_tokenized_dataset)
#val_tokenized = TokenizedDataset(val_tokenized_dataset)

In [84]:
# Wrap the tokenized validation subset. val_tokenized_dataset must already
# exist (it is created in an earlier cell); otherwise this raises NameError.
val_tokenized = TokenizedDataset(val_tokenized_dataset)

NameError: name 'val_tokenized_dataset' is not defined

In [58]:
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader
from tqdm import tqdm  # Optional, for progress bar

def train_model(model, train_loader, num_epochs=10, learning_rate=1e-4, device='cuda', pad_token_id=0):
    """Train ``model`` as a next-token predictor with Adam and cross-entropy.

    Args:
        model: Module mapping a ``(batch, seq_len)`` tensor of token ids to
            ``(batch, seq_len, vocab_size)`` logits.
        train_loader: Iterable yielding ``(batch, seq_len)`` integer tensors
            of token ids.
        num_epochs: Number of passes over ``train_loader``.
        learning_rate: Learning rate handed to Adam.
        device: Device the model and every batch are moved to.
        pad_token_id: Target id excluded from the loss. Defaults to 0, the
            value that used to be hard-coded; pass ``None`` to score every
            position.

    Returns:
        List holding the mean batch loss of each epoch.

    Raises:
        ValueError: If ``train_loader`` yields no batches in an epoch.
    """
    model.to(device)

    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # -100 is CrossEntropyLoss's own default "ignore nothing real" index.
    ignore_index = -100 if pad_token_id is None else pad_token_id
    criterion = torch.nn.CrossEntropyLoss(ignore_index=ignore_index)

    epoch_losses = []
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        num_batches = 0

        for input_ids in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}"):
            input_ids = input_ids.to(device)

            optimizer.zero_grad()
            logits = model(input_ids)

            # Position t predicts token t + 1. The last position has no
            # target, so it is dropped from the logits instead of padding
            # the targets with an ignored id (same loss either way).
            shifted_logits = logits[:, :-1, :]
            targets = input_ids[:, 1:]

            # reshape (not view) so non-contiguous slices are handled.
            loss = criterion(
                shifted_logits.reshape(-1, shifted_logits.size(-1)),
                targets.reshape(-1),
            )

            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            num_batches += 1

        if num_batches == 0:
            raise ValueError("train_loader yielded no batches")

        avg_loss = running_loss / num_batches
        epoch_losses.append(avg_loss)
        print(f"Epoch {epoch+1}/{num_epochs} - Loss: {avg_loss:.4f}")

    return epoch_losses



In [44]:
# torch device object for the "mps" backend.
mps_device = torch.device("mps")


In [45]:
# Display the device object created above.
mps_device

device(type='mps')

In [60]:
# Batch the wrapped test subset in groups of 8 (no shuffle argument is passed).
test_loader = DataLoader(test_tokenized,batch_size=8)

In [None]:
# Example usage:
# NOTE: the loader passed here is built from the test subset, so this run fits the model on test data.
train_model(model, test_loader,num_epochs=3, device="mps")

## GPT-2 Evaluation

In [64]:
def tokenize_function(examples):
    """Tokenize ``examples["text"]`` with the notebook-level ``silabi_tokenizer``.

    The tokenizer is asked to truncate and pad to a fixed length of 256.
    """
    tokenizer_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 256,
    }
    return silabi_tokenizer(examples["text"], **tokenizer_options)

In [66]:
# Tokenize the test subset with tokenize_function (one call per example).
tokenized_dataset = test_dataset.map(tokenize_function)

In [88]:
# Tokenize the validation subset the same way.
validation_tokenized_dataset = val_dataset.map(tokenize_function)

In [90]:
# Return only the "input_ids" column, as torch tensors, when the validation set is indexed.
validation_tokenized_dataset.set_format(type="torch", columns=["input_ids"])


In [70]:
# Return only the "input_ids" column, as torch tensors, when the test set is indexed.
tokenized_dataset.set_format(type="torch", columns=["input_ids"])


In [72]:
from transformers import DataCollatorForLanguageModeling

# Collator for causal language modelling (masked-LM mode is switched off).
data_collator = DataCollatorForLanguageModeling(
    tokenizer=silabi_tokenizer,
    mlm=False,  # GPT-2 is not trained with masked language modeling
)


In [106]:
from transformers import GPT2Model,GPT2Config, GPT2LMHeadModel

In [107]:
# GPT-2 baseline configuration sized to the syllable tokenizer's vocabulary.
# NOTE(review): the tokenizer class stores its start token as `sos_token`;
# confirm that `bos_token_id` is actually populated here rather than None.
config = GPT2Config(
    vocab_size=silabi_tokenizer.vocab_size,
    n_positions=1024,
    n_ctx=1024,
    n_embd=768,
    n_layer=12,
    n_head=12,
    bos_token_id=silabi_tokenizer.bos_token_id,
    eos_token_id=silabi_tokenizer.eos_token_id,
)


In [108]:
# Build the GPT-2 language model from the config above (no checkpoint is loaded here).
gpt2_model = GPT2LMHeadModel(config)

In [127]:
# Summarise the GPT-2 model, using the shape of one wrapped test example as
# the input size; the device is hard-coded to "mps".
# gpt2_model(test_tokenized[0])
summary(gpt2_model, test_tokenized[0].shape,dtypes=[torch.long],device="mps")

Layer (type:depth-idx)                             Output Shape              Param #
GPT2LMHeadModel                                    [1, 12, 256, 64]          --
├─GPT2Model: 1-1                                   [1, 12, 256, 64]          --
│    └─Embedding: 2-1                              [1, 256, 768]             508,416
│    └─Embedding: 2-2                              [1, 256, 768]             786,432
│    └─Dropout: 2-3                                [1, 256, 768]             --
│    └─ModuleList: 2-4                             --                        --
│    │    └─GPT2Block: 3-1                         [1, 256, 768]             7,087,872
│    │    └─GPT2Block: 3-2                         [1, 256, 768]             7,087,872
│    │    └─GPT2Block: 3-3                         [1, 256, 768]             7,087,872
│    │    └─GPT2Block: 3-4                         [1, 256, 768]             7,087,872
│    │    └─GPT2Block: 3-5                         [1, 256, 768]             

In [119]:
from transformers import Trainer, TrainingArguments

# Training setup for the GPT-2 baseline: evaluate and checkpoint once per
# epoch, keep at most two checkpoints, log every 100 steps.
training_args = TrainingArguments(
    output_dir="./gpt2_model",
    # `eval_strategy` is the current name of the former `evaluation_strategy`
    # argument; it is available in every release that accepts the
    # `processing_class` argument used below.
    eval_strategy="epoch",
    learning_rate=5e-4,
    weight_decay=0.01,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    save_strategy="epoch",
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=100,
)

# NOTE: the training data is the wrapped *test* subset, and evaluation uses
# the tokenized validation subset.
trainer = Trainer(
    model=gpt2_model,
    args=training_args,
    train_dataset=test_tokenized,
    eval_dataset=val_tokenized_dataset,
    processing_class=silabi_tokenizer,
    data_collator=data_collator,
)


In [121]:
# Start training; the saved output shows this run was interrupted from the keyboard.
trainer.train()

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 