In [None]:
! pip install transformers[torch] datasets tokenizers

In [1]:
import os
import re
from typing import List

import tokenizers
from datasets import load_dataset
from tokenizers import Tokenizer
from tokenizers import normalizers
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer

In [2]:
# Load the "train" split of the oza75/bambara-texts dataset; it is the corpus
# the tokenizer is trained on further down.
bam_ds = load_dataset("oza75/bambara-texts", split="train")
# Display the dataset summary (feature names and row count).
bam_ds

Dataset({
    features: ['text', 'source_dataset'],
    num_rows: 353926
})

In [4]:
class VoiceBambaraTextPreprocessor:
    """Normalise raw Bambara text before it is handed to the tokenizer.

    The pipeline lower-cases the text and then spells out every standalone
    number in Bambara words (see `preprocess`).
    """

    # Standalone runs of digits. The word boundaries mean digits glued to a
    # letter or underscore (e.g. "a1") are left untouched.
    _NUMBER_PATTERN = re.compile(r'\b\d+\b')

    # Bambara number words, indexed by digit. Index 0 is an empty placeholder
    # because a zero digit contributes no word of its own.
    _UNITS = ("", "kɛlɛn", "fila", "saba", "naani", "duuru", "wɔrɔ", "wòlonwula", "sɛɛgin", "kɔnɔntɔn")
    _TENS = ("", "tan", "mugan", "bisaba", "binaani", "biduuru", "biwɔrɔ", "biwòlonfila", "bisɛɛgin", "bikɔnɔntɔn")
    _HUNDRED = "kɛmɛ"
    _THOUSAND = "waga"
    _MILLION = "milyɔn"

    def preprocess_batch(self, texts: List[str]) -> List[str]:
        """Apply `preprocess` to every text and return the results in order.

        Args:
        texts (List[str]): The raw texts.

        Returns:
        List[str]: One preprocessed string per input text.
        """
        return [self.preprocess(text) for text in texts]

    def preprocess(self, text: str) -> str:
        """Lower-case `text`, then replace its numbers with Bambara words.

        Args:
        text (str): The raw text.

        Returns:
        str: The lower-cased text with numbers expanded.
        """
        text = text.lower()
        text = self.expand_number(text)

        return text

    def expand_number(self, text):
        """
        Normalize Bambara text for TTS by replacing numerical figures with their word equivalents.

        Only standalone digit runs are replaced. Each run is parsed with
        `int`, so leading zeros are dropped, and a run equal to zero is
        replaced by an empty string.

        Args:
        text (str): The text to be normalized.

        Returns:
        str: The normalized Bambara text.
        """

        # Callback for `sub`: convert the matched digits to Bambara words.
        def replace_number_with_text(match):
            return self.number_to_bambara(int(match.group()))

        return self._NUMBER_PATTERN.sub(replace_number_with_text, text)

    def number_to_bambara(self, n):
        """
        Convert a number into its textual representation in Bambara using recursion.

        The magnitude word comes first ("kɛmɛ" for hundreds, "waga" for
        thousands, "milyɔn" for millions), followed by its count, and any
        remainder is appended after " ni ". A count of one is omitted for
        hundreds only (100 is just "kɛmɛ").

        Args:
        n (int): The number to be converted. Must be non-negative.

        Returns:
        str: The number expressed in Bambara text. Zero yields an empty string.

        Raises:
        ValueError: If `n` is negative.

        Examples:
        >>> VoiceBambaraTextPreprocessor().number_to_bambara(123)
        'kɛmɛ ni mugan ni saba'
        """
        # Without this guard a small negative value would index the word
        # tables from the end and silently return the wrong word.
        if n < 0:
            raise ValueError(f"number_to_bambara expects a non-negative integer, got {n!r}")

        # Zero has no entry in the word tables; it expands to an empty string.
        if n == 0:
            return ""

        if n < 10:
            return self._UNITS[n]

        if n < 100:
            tens_digit, remainder = divmod(n, 10)
            words = self._TENS[tens_digit]
        elif n < 1000:
            hundreds_count, remainder = divmod(n, 100)
            words = self._HUNDRED
            # 100-199 is just the hundred word; from 200 up the count follows it.
            if hundreds_count >= 2:
                words += " " + self.number_to_bambara(hundreds_count)
        elif n < 1_000_000:
            thousands_count, remainder = divmod(n, 1000)
            words = self._THOUSAND + " " + self.number_to_bambara(thousands_count)
        else:
            millions_count, remainder = divmod(n, 1_000_000)
            words = self._MILLION + " " + self.number_to_bambara(millions_count)

        # Lower-order part, joined with the connector "ni".
        if remainder > 0:
            words += " ni " + self.number_to_bambara(remainder)

        return words

In [5]:
# Preprocessor applied to every text before it reaches the tokenizer.
text_preprocessor = VoiceBambaraTextPreprocessor()

# BPE model with "[UNK]" as the unknown token; input is pre-split by the
# Whitespace pre-tokenizer.
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()

# Trainer settings: target vocabulary size and the special tokens, listed in
# their original order.
trainer = BpeTrainer(
    special_tokens=[
        "[STOP]",
        "[UNK]",
        "[SPACE]",
        "[START]",
        "[bm]",
        "[CLS]",
        "[SEP]",
        "[PAD]",
        "[MASK]",
    ],
    vocab_size=6681,
)

In [7]:
def batch_iterator(batch_size=1000):
    """Yield the corpus as successive lists of preprocessed texts.

    Walks `bam_ds` in consecutive slices of `batch_size` rows, takes the
    "text" column of each slice and runs it through
    `text_preprocessor.preprocess_batch`.

    Args:
        batch_size (int): Number of dataset rows per yielded batch.

    Yields:
        The preprocessed texts of one slice; the last batch may be shorter.
    """
    batch_starts = range(0, len(bam_ds), batch_size)
    for start in batch_starts:
        stop = start + batch_size
        rows = bam_ds[start:stop]
        yield text_preprocessor.preprocess_batch(rows["text"])

In [8]:
# Train the BPE tokenizer on the preprocessed corpus, streamed batch by batch.
# `length` is given the dataset's row count.
tokenizer.train_from_iterator(batch_iterator(), trainer=trainer, length=len(bam_ds))



In [9]:
# Make sure the target directory exists so this cell also works on a fresh
# checkout (Restart & Run All), then write the trained tokenizer to disk.
os.makedirs("./saved", exist_ok=True)
tokenizer.save("./saved/vocab.json")

In [10]:
# Sanity check: encode the first ten preprocessed rows of the corpus.
# The dataset is sliced before the column is selected, so only ten rows are
# read (same access pattern as `batch_iterator`) rather than the whole "text"
# column.
outputs = tokenizer.encode_batch(text_preprocessor.preprocess_batch(bam_ds[:10]["text"]))
outputs

[Encoding(num_tokens=9, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=2, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=16, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=7, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=9, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=16, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=8, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=11, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=7, attributes=[i

In [14]:
# Show the tokens produced for the sixth encoded sample (index 5).
outputs[5].tokens

['bi',
 ',',
 'surɔ',
 'fana',
 'dun',
 'nen',
 'kɔ',
 ',',
 'n',
 'bɛ',
 'na',
 'an',
 'ka',
 'baro',
 'kɛ',
 '.']

In [148]:
# Encode a single sentence written with tone diacritics, after the same
# preprocessing used for training. Note that `outputs` is rebound here from a
# list of encodings to a single encoding.
outputs = tokenizer.encode(text_preprocessor.preprocess("Ɔ̀Ɔ̀ wɔ́, ní dɔ́ bólokòra, nùmukɛ b'à fɔ́ kó, dɔ́ ka nà"))
# Display the resulting tokens.
outputs.tokens

['ɔ̀',
 'ɔ̀',
 'wɔ́',
 ',',
 'n',
 'í',
 'dɔ́',
 'b',
 'ó',
 'lo',
 'k',
 'ò',
 'ra',
 ',',
 'n',
 'ù',
 'mu',
 'kɛ',
 'b',
 "'",
 'à',
 'fɔ́',
 'k',
 'ó',
 ',',
 'dɔ́',
 'ka',
 'n',
 'à']

In [149]:
# Show the vocabulary ids of the encoding produced in the previous cell.
# NOTE(review): the recorded ids below (e.g. 14017) exceed the vocab_size=6681
# configured for the trainer, so this saved output presumably comes from a
# different tokenizer run -- re-run the notebook top to bottom to refresh it.
outputs.ids

[14017,
 14017,
 6414,
 19,
 59,
 101,
 4058,
 47,
 106,
 387,
 56,
 105,
 369,
 19,
 59,
 111,
 451,
 368,
 47,
 14,
 89,
 3224,
 56,
 106,
 19,
 4058,
 361,
 59,
 89]