In [3]:
!pip install datasets



In [4]:
from datasets import load_dataset, get_dataset_split_names


In [5]:
def load_huggingface_dataset(dataset_name,*args,**kwargs):
    """Load a dataset by delegating to ``load_dataset``.

    Args:
        dataset_name: Identifier passed as the first argument to ``load_dataset``.
        *args: Extra positional arguments, forwarded unchanged.
        **kwargs: Keyword arguments, forwarded unchanged
            (the notebook passes ``split`` and ``keep_in_memory``).

    Returns:
        Whatever ``load_dataset`` returns for the given arguments.
    """
    # Forward *args too: they used to be accepted but silently discarded.
    return load_dataset(dataset_name, *args, **kwargs)

In [6]:
# Load only the "train" split of "mwitiderrick/swahili", asking for it to be kept in memory.
dataset = load_huggingface_dataset("mwitiderrick/swahili",split="train",keep_in_memory=True)


In [7]:
def generate_dataset_splits(dataset):
  """Partition ``dataset`` into train, test and validation subsets.

  A first shuffled split holds out 10% of the rows. That held-out part is
  then split in half (again shuffled) into the test and validation subsets.
  Both splits use seed 42.

  Args:
    dataset: Object exposing ``train_test_split(test_size, shuffle, seed)``
      that returns a mapping with "train" and "test" entries.

  Returns:
    A ``(train, test, val)`` tuple.
  """
  split_options = {"shuffle": True, "seed": 42}

  # 90% train / 10% held out.
  outer = dataset.train_test_split(test_size=0.1, **split_options)
  # Halve the held-out part: its "test" side becomes test, its "train" side becomes val.
  inner = outer["test"].train_test_split(test_size=0.5, **split_options)

  return outer["train"], inner["test"], inner["train"]

In [8]:
# Unpack in the order generate_dataset_splits returns them: train, test, val.
train_dataset, test_dataset, val_dataset = generate_dataset_splits(dataset)

In [9]:
import re
def remove_non_text_symbols(text):
  """Return ``text`` with every run of non-ASCII characters deleted.

  Characters outside the code-point range 0x00-0x7F are dropped; ASCII
  characters (letters, digits, whitespace, punctuation) are kept as they are.
  """
  non_ascii_run = re.compile(r'[^\x00-\x7F]+')
  return non_ascii_run.sub('', text)

In [10]:
def clean_dataset(dataset):
  """Clean the "text" field of every example and drop examples left empty.

  Each example's "text" is passed through ``remove_non_text_symbols``;
  afterwards, examples whose "text" has length zero are filtered out.

  Args:
    dataset: Object exposing ``map`` and ``filter`` over examples that have a
      "text" entry.

  Returns:
    The mapped and filtered dataset.
  """
  def _strip_text(example):
    return {"text": remove_non_text_symbols(example["text"])}

  def _has_text(example):
    return len(example["text"]) > 0

  return dataset.map(_strip_text).filter(_has_text)

In [11]:
# Run the same cleaning pass over each split, in train / test / val order.
train_dataset, test_dataset, val_dataset = [
    clean_dataset(split) for split in (train_dataset, test_dataset, val_dataset)
]

Map:   0%|          | 0/11394725 [00:00<?, ? examples/s]

Filter:   0%|          | 0/11394725 [00:00<?, ? examples/s]

Map:   0%|          | 0/633041 [00:00<?, ? examples/s]

Filter:   0%|          | 0/633041 [00:00<?, ? examples/s]

Map:   0%|          | 0/633040 [00:00<?, ? examples/s]

Filter:   0%|          | 0/633040 [00:00<?, ? examples/s]

In [12]:
from transformers import PreTrainedTokenizerFast
from tokenizers import Tokenizer

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [13]:
def tokenize(tokenizer, dataset, max_length=256):
  """Tokenize the "text" field of every example in ``dataset``.

  Args:
    tokenizer: Callable invoked as
      ``tokenizer(text, padding=True, max_length=max_length)``.
    dataset: Object exposing ``map`` over examples that have a "text" entry.
    max_length: Length passed to the tokenizer. Defaults to 256, the value
      that used to be hard-coded here.

  Returns:
    The result of ``dataset.map`` with the tokenizer applied to each example.
  """
  def _encode(example):
    return tokenizer(example["text"], padding=True, max_length=max_length)

  return dataset.map(_encode)


In [14]:
class KiswahiliSilabiTokenizer(PreTrainedTokenizerFast):
    """Syllable-level tokenizer for Kiswahili backed by a ``tokenizers.Tokenizer``.

    Text is split on single spaces, each word is segmented greedily into the
    longest pieces found in the vocabulary, and the result is wrapped in
    start/end-of-sentence markers (optionally padded to a fixed length).
    """

    def __init__(self, tokenizer,unk_token="[UNK]",sos_token="[SOS]",eos_token="[EOS]",space_token="[SPACE]",pad_token="[PAD]", **kwargs):
        """Wrap ``tokenizer`` and register the special tokens.

        Args:
            tokenizer: Backing tokenizer object; its ``get_vocab()`` seeds the
                local vocabulary used for greedy matching.
            unk_token: Marker emitted for a character no vocabulary entry covers.
            sos_token: Marker placed at the start of every token sequence.
            eos_token: Marker placed at the end of every token sequence.
            space_token: Marker standing in for a space between words.
            pad_token: Marker used to pad sequences up to ``max_length``.
            **kwargs: Forwarded to ``PreTrainedTokenizerFast.__init__``.
        """
        super().__init__(tokenizer_object=tokenizer, **kwargs)
        self._vocab = tokenizer.get_vocab()
        self.unk_token = unk_token
        self.sos_token = sos_token
        self.eos_token = eos_token
        self.space_token = space_token
        self.pad_token = pad_token
        # Nothing else in this class sets ``bos_token``, so alias the start
        # marker to it unless the caller supplied one explicitly.
        # NOTE(review): ``bos_token_id`` only resolves if the marker exists in
        # the backing tokenizer's vocabulary - confirm against tokenizer.json.
        if "bos_token" not in kwargs:
            self.bos_token = sos_token

        # Add special tokens to the local vocab if they are not already present.
        # The order matters: each new entry gets the next free id.
        for special in (
            self.sos_token,
            self.eos_token,
            self.unk_token,
            self.space_token,
            self.pad_token,
        ):
            if special not in self._vocab:
                self._vocab[special] = len(self._vocab)

    def __call__(self, text,**kwargs):
        """Tokenize ``text`` and return ``{"input_ids": [...]}``.

        ``kwargs`` are passed straight to :meth:`tokenize`. Only ``input_ids``
        is produced; no other keys are returned.
        """
        ids = self.convert_tokens_to_ids(self.tokenize(text,**kwargs))

        return {"input_ids": ids}

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
        """Build the tokenizer from ``<pretrained_model_name_or_path>/tokenizer.json``.

        ``kwargs`` are forwarded to the class constructor.
        """
        tokenizer = Tokenizer.from_file(f"{pretrained_model_name_or_path}/tokenizer.json")
        return cls(tokenizer, **kwargs)

    def _encode_with_byte_fallback(self, text):
        """Greedily segment ``text`` into the longest vocabulary entries.

        At each position the longest substring present in the local vocabulary
        is consumed. When nothing matches, a space becomes ``space_token`` and
        any other character becomes one ``unk_token``. Despite the method name,
        no byte-level encoding takes place.

        Returns:
            list[str]: The token strings, in order.
        """
        tokens = []
        i = 0
        while i < len(text):
            matched = False
            # Try to match the longest syllable first.
            for j in range(len(text), i, -1):
                syllable_candidate = text[i:j]
                if syllable_candidate in self._vocab:
                    tokens.append(syllable_candidate)
                    i = j
                    matched = True
                    break
            if not matched:
                if text[i] == " ":
                    tokens.append(self.space_token)
                else:
                    # append, not extend: extend() iterated over the marker
                    # string and emitted "[", "U", "N", "K", "]" separately.
                    tokens.append(self.unk_token)
                i += 1
        return tokens

    def tokenize(self, text,**kwargs):
        """Convert ``text`` into a list of token strings.

        Keyword Args:
            handle_whitespace (bool): When true (default), a space token is
                emitted after every word.
            padding (bool): When true, truncate/pad to ``max_length``.
            max_length (int): Required when ``padding`` is true.

        Returns:
            list[str]: ``[sos, ...word tokens..., eos]``, padded if requested.

        Raises:
            ValueError: If ``padding`` is true but ``max_length`` is missing.
        """
        handle_whitespace = kwargs.get("handle_whitespace", True)
        tokens = [self.sos_token]  # Start of sentence token
        for word in text.split(" "):
            tokens.extend(self._encode_with_byte_fallback(word))
            if handle_whitespace:
                # NOTE(review): this also emits a space token after the LAST
                # word, so decoding yields a trailing space. Left unchanged
                # because data may already be tokenized this way.
                tokens.extend(self._encode_with_byte_fallback(" "))
        tokens.append(self.eos_token)  # End of sentence token

        padding = kwargs.get("padding", False)
        if padding:
            max_length = kwargs.get("max_length", None)
            if max_length is None:
                raise ValueError("max_length must be specified if padding is True")
            # Truncation only happens on this padding path, and it can cut off
            # the end-of-sentence marker when the sequence is too long.
            tokens = tokens[:max_length]
            tokens.extend([self.pad_token] * (max_length - len(tokens)))
        return tokens

    def tokens_to_sentence(self,tokens):
        """Join token strings back into a sentence.

        Literal spaces inside tokens are removed; start, end and padding
        markers are dropped; each space marker becomes a real space.
        """
        # Build a cleaned copy: the earlier loop only rebound its loop
        # variable, so the spaces were never actually removed.
        pieces = [token.replace(" ", "") for token in tokens]
        sentence = "".join(pieces)
        # Padding markers are stripped too, so padded sequences decode cleanly.
        for marker in (self.eos_token, self.sos_token, self.pad_token):
            if marker:
                sentence = sentence.replace(marker, "")
        return sentence.replace(self.space_token, " ")

In [15]:
# Reads "silabi_tokenizer/tokenizer.json" (path is relative to the working directory).
silabi_tokenizer = KiswahiliSilabiTokenizer.from_pretrained("silabi_tokenizer")

Channels:
 - conda-forge
 - defaults
Platform: osx-arm64
Collecting package metadata (repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /opt/anaconda3

  added / updated specs:
    - ipywidgets


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    ca-certificates-2024.12.14 |       hf0a4a13_0         153 KB  conda-forge
    certifi-2024.12.14         |     pyhd8ed1ab_0         158 KB  conda-forge
    conda-24.11.2              |  py312h81bd7bf_0         1.1 MB  conda-forge
    ipywidgets-8.1.5           |     pyhd8ed1ab_1         111 KB  conda-forge
    jupyterlab_widgets-3.0.13  |     pyhd8ed1ab_1         182 KB  conda-forge
    libexpat-2.6.3             |       hf9b8971_0          62 KB  conda-forge
    libsqlite-3.46.0           |       hfb93653_0         811 KB  conda-forge
    libzlib-1.2.13             |       hfb2fe0b_6          46 KB  conda-

In [None]:
# Tokenize every cleaned split with the same tokenizer, in train / test / val order.
train_tokenized_dataset, test_tokenized_dataset, val_tokenized_dataset = [
    tokenize(silabi_tokenizer, split)
    for split in (train_dataset, test_dataset, val_dataset)
]

In [None]:
# Persist each tokenized split to its own directory (relative to the working directory).
train_tokenized_dataset.save_to_disk("train_tokenized_dataset")
test_tokenized_dataset.save_to_disk("test_tokenized_dataset")
val_tokenized_dataset.save_to_disk("val_tokenized_dataset")

In [27]:
from datasets import DatasetDict

# Bundle the three tokenized splits under the keys "train", "test" and "val".
combined_tokenized_datasets = DatasetDict(
    {
        "train": train_tokenized_dataset,
        "test": test_tokenized_dataset,
        "val": val_tokenized_dataset
    }
)


In [29]:
# Upload all splits to the Hub repo "swa_syllabic"; presumably this needs an authenticated Hub session - TODO confirm.
combined_tokenized_datasets.push_to_hub("swa_syllabic")

Uploading the dataset shards:   0%|          | 0/25 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/416 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/416 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/416 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/416 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/416 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/416 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/416 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/416 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/416 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/416 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/416 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/416 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/416 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/416 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/416 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/416 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/416 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/416 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/416 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/416 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/416 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/416 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/416 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/416 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/416 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/2 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/289 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/289 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/2 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/289 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/289 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/nguthiru/swa_syllabic/commit/fcf9dfc30d571dcc8c9f495a9fa755581d949925', commit_message='Upload dataset', commit_description='', oid='fcf9dfc30d571dcc8c9f495a9fa755581d949925', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/nguthiru/swa_syllabic', endpoint='https://huggingface.co', repo_type='dataset', repo_id='nguthiru/swa_syllabic'), pr_revision=None, pr_num=None)

In [None]:
# Request torch-typed output restricted to the "input_ids" column for each split.
# These calls change the existing dataset objects rather than creating new ones.
train_tokenized_dataset.set_format(type="torch", columns=["input_ids"])
val_tokenized_dataset.set_format(type="torch", columns=["input_ids"])
test_tokenized_dataset.set_format(type="torch", columns=["input_ids"])


In [None]:
from transformers import DataCollatorForLanguageModeling

# mlm=False: GPT-2 is not trained with masked language modeling.
data_collator = DataCollatorForLanguageModeling(tokenizer=silabi_tokenizer, mlm=False)

In [None]:
from transformers import GPT2Model,GPT2Config, GPT2LMHeadModel

In [None]:
config = GPT2Config(
    # len(tokenizer) includes added tokens, whereas tokenizer.vocab_size does
    # not; sizing the embedding from the latter risks out-of-range token ids.
    vocab_size=len(silabi_tokenizer),
    n_positions=1024,
    n_ctx=1024,
    n_embd=768,
    n_layer=12,
    n_head=12,
    bos_token_id=silabi_tokenizer.bos_token_id,
    eos_token_id=silabi_tokenizer.eos_token_id,
    # Sequences are padded by the tokenizer, so record the padding id as well.
    pad_token_id=silabi_tokenizer.pad_token_id,
)


In [None]:
# Build the LM-head model from the config above; no checkpoint is loaded in this cell.
gpt2_model = GPT2LMHeadModel(config)

In [None]:
!pip install torchinfo

In [None]:
from torchinfo import summary
import torch

In [None]:
# Summarise on the GPU only when CUDA is available; otherwise fall back to CPU
# instead of failing on machines without a CUDA device.
summary(
    gpt2_model,
    test_tokenized_dataset[0]['input_ids'].shape,
    dtypes=[torch.long],
    device="cuda" if torch.cuda.is_available() else "cpu",
)

In [None]:
from transformers import Trainer, TrainingArguments

# Training configuration: evaluate and checkpoint once per epoch, keeping at
# most two checkpoints under output_dir.
training_args = TrainingArguments(
    output_dir="drive/MyDrive/SwaLLM/GPT2",
    # `eval_strategy` is the current name of the deprecated `evaluation_strategy`;
    # it exists in every transformers release that accepts `processing_class`.
    eval_strategy="epoch",
    learning_rate=5e-4,
    weight_decay=0.01,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    save_strategy="epoch",
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=100,
)

trainer = Trainer(
    model=gpt2_model,
    args=training_args,
    # Train on the TRAIN split. The test split was passed here before, which
    # both trained on held-out data and ignored the training data entirely.
    train_dataset=train_tokenized_dataset,
    eval_dataset=val_tokenized_dataset,
    processing_class=silabi_tokenizer,
    data_collator=data_collator,
)


In [None]:
# Start training with the Trainer configured above.
trainer.train()