In [None]:
! pip install transformers[torch] datasets tokenizers

In [17]:
from datasets import load_dataset
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer
from tokenizers import normalizers
from typing import List
import re
import tokenizers
import json
import os

In [2]:
bam_ds = load_dataset("oza75/bambara-texts", split="train")
bam_ds

Dataset({
    features: ['text', 'source_dataset'],
    num_rows: 353926
})

In [4]:
class VoiceBambaraTextPreprocessor:

    def preprocess_batch(self, texts: List[str]) -> List[str]:
        return [self.preprocess(text) for text in texts]

    def preprocess(self, text: str) -> str:
        text = text.lower()
        text = self.expand_number(text)

        return text

    def expand_number(self, text):
        """
        Normalize Bambara text for TTS by replacing numerical figures with their word equivalents.

        Args:
        text (str): The text to be normalized.
    
        Returns:
        str: The normalized Bambara text.
        """

        # A regex pattern to match all numbers
        number_pattern = re.compile(r'\b\d+\b')

        # Function to replace each number with its Bambara text
        def replace_number_with_text(match):
            number = int(match.group())
            return self.number_to_bambara(number)

        # Replace each number in the text with its Bambara word equivalent
        normalized_text = number_pattern.sub(replace_number_with_text, text)

        return normalized_text

    def number_to_bambara(self, n):

        """
        Convert a number into its textual representation in Bambara using recursion.
        Args:
        n (int): The number to be converted.
        Returns:
        str: The number expressed in Bambara text.
        Examples:
        >>> number_to_bambara(123)
        'kɛmɛ ni mugan ni saba'
        Notes:
        This function assumes that 'n' is a non-negative integer.
        """

        # Bambara numbering rules
        units = ["", "kɛlɛn", "fila", "saba", "naani", "duuru", "wɔrɔ", "wòlonwula", "sɛɛgin", "kɔnɔntɔn"]
        tens = ["", "tan", "mugan", "bisaba", "binaani", "biduuru", "biwɔrɔ", "biwòlonfila", "bisɛɛgin", "bikɔnɔntɔn"]
        hundreds = ["", "kɛmɛ"]
        thousands = ["", "waga"]
        millions = ["", "milyɔn"]

        # Handle zero explicitly
        if n == 0:
            return ""  # bambara does not support zero

        if n < 10:
            return units[n]
        elif n < 100:
            return tens[n // 10] + (" ni " + self.number_to_bambara(n % 10) if n % 10 > 0 else "")
        elif n < 1000:
            return hundreds[1] + (" " + self.number_to_bambara(n // 100) if n >= 200 else "") + (" ni " + self.number_to_bambara(n % 100) if n % 100 > 0 else "")
        elif n < 1_000_000:
            return thousands[1] + " " + self.number_to_bambara(n // 1000) + (
                " ni " + self.number_to_bambara(n % 1000) if n % 1000 > 0 else "")
        else:
            return millions[1] + " " + self.number_to_bambara(n // 1_000_000) + (
                " ni " + self.number_to_bambara(n % 1_000_000) if n % 1_000_000 > 0 else "")

In [20]:
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()

trainer = BpeTrainer(
    vocab_size=2000,
    special_tokens=["[STOP]", "[UNK]", "[SPACE]", "[START]", "[bm]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
)

text_preprocessor = VoiceBambaraTextPreprocessor()

In [21]:
def batch_iterator(batch_size=1000):
    for i in range(0, len(bam_ds), batch_size):
        yield text_preprocessor.preprocess_batch(bam_ds[i: i + batch_size]["text"])

In [22]:
tokenizer.train_from_iterator(batch_iterator(), trainer=trainer, length=len(bam_ds))



In [23]:
tokenizer.save("./saved/bam_vocab.json")

In [24]:
outputs = tokenizer.encode_batch(text_preprocessor.preprocess_batch(bam_ds['text'][:10]))
outputs

[Encoding(num_tokens=10, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=4, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=16, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=7, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=10, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=18, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=10, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=15, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=7, attributes

In [25]:
outputs[5].tokens

['bi',
 ',',
 's',
 'ur',
 'ɔ',
 'fana',
 'dun',
 'nen',
 'kɔ',
 ',',
 'n',
 'bɛ',
 'na',
 'an',
 'ka',
 'baro',
 'kɛ',
 '.']

In [26]:
outputs = tokenizer.encode(text_preprocessor.preprocess("Ɔ̀Ɔ̀ wɔ́, ní dɔ́ bólokòra, nùmukɛ b'à fɔ́ kó, dɔ́ ka nà"))
outputs.tokens

['ɔ',
 '̀',
 'ɔ',
 '̀',
 'wɔ',
 '́',
 ',',
 'n',
 'í',
 'dɔ',
 '́',
 'b',
 'ó',
 'lo',
 'k',
 'ò',
 'ra',
 ',',
 'n',
 'ù',
 'mu',
 'kɛ',
 'b',
 "'",
 'à',
 'fɔ',
 '́',
 'k',
 'ó',
 ',',
 'dɔ',
 '́',
 'ka',
 'n',
 'à']

In [28]:
def integrate_vocabs(main_vocab_path, bam_vocab_path, output_dir):
    # Load the main vocabulary
    with open(main_vocab_path, 'r', encoding='utf-8') as f:
        main_vocab = json.load(f)
    main_tokens = set(main_vocab['model']['vocab'].keys())
    next_id = max(main_vocab['model']['vocab'].values()) + 1

    # Load the Bambara vocabulary
    with open(bam_vocab_path, 'r', encoding='utf-8') as f:
        bam_vocab = json.load(f)
    bam_tokens = set(bam_vocab['model']['vocab'].keys())
    
    # Add tokens from bam_vocab to main_vocab if they don't exist
    for token in bam_tokens:
        if token not in main_tokens:
            main_vocab['model']['vocab'][token] = next_id
            next_id += 1
    
    # Now for the merges
    main_merges = set(main_vocab['model']['merges'])
    bam_merges = set(bam_vocab['model']['merges'])

    # Add merges from bam_vocab to main_vocab if they don't exist
    for merge in bam_merges:
        if merge not in main_merges:
            main_vocab['model']['merges'].append(merge)

    # Save the updated vocabulary
    output_vocab_path = os.path.join(output_dir, 'combined_vocab.json')
    with open(output_vocab_path, 'w', encoding='utf-8') as f:
        json.dump(main_vocab, f, ensure_ascii=False, indent=2)

    print(f"Updated vocabulary saved to {output_vocab_path}")
    return output_vocab_path

In [29]:
# Specify the paths to your main and Bambara vocab files
main_vocab_path = './saved/xtts_default_vocab.json'
bam_vocab_path = './saved/bam_vocab.json'
output_dir = './saved'

# Integrate the Bambara vocab into the main vocab and save the updated vocab
updated_vocab_path = integrate_vocabs(main_vocab_path, bam_vocab_path, output_dir)

Updated vocabulary saved to ./saved/combined_vocab.json


In [30]:
combined_tokenizer = Tokenizer.from_file("./saved/combined_vocab.json")

In [35]:
combined_tokenizer.encode("Nin Avrili kalo daminɛ na Farafinna tilebiyanfan jamana dɔw la futɛni barika bonyan fo ka dama tɛmɛ.").ids

[5773,
 41,
 5760,
 2839,
 1127,
 7738,
 6047,
 6888,
 467,
 5765,
 59,
 2778,
 691,
 14,
 6878,
 15,
 1969,
 43,
 456,
 27,
 941,
 1270,
 7312,
 494,
 1153,
 7289,
 832,
 512,
 650,
 14,
 165,
 2351,
 43,
 182,
 571,
 7496,
 7798,
 9]