# Notebook for preprocessing Malay dataset

### Initializing phonemizer and tokenizer

In [1]:
import yaml

config_path = "Configs/config.yml"
# Read the YAML config inside a context manager so the file handle is closed
# as soon as parsing finishes instead of being left open for the kernel's lifetime.
with open(config_path) as config_file:
    config = yaml.safe_load(config_file)

In [2]:
from phonemize import phonemize

`openai-whisper` is not available, native whisper processor is not available, will use huggingface processor instead.


In [3]:
import phonemizer
# Shared eSpeak phonemizer backend for language code 'ms'; punctuation is
# preserved and stress marks are included in its output.
global_phonemizer = phonemizer.backend.EspeakBackend(
    language='ms',
    preserve_punctuation=True,
    with_stress=True,
)

In [4]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(config['dataset_params']['tokenizer'])

### Process dataset

In [5]:
from datasets import load_dataset
dataset = load_dataset("wikimedia/wikipedia", "20231101.ms")['train']

In [17]:
!rm -rf wiki_phoneme

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [7]:
root_directory = "./wiki_phoneme"

In [16]:
import os
num_shards = 1000

def process_shard(i):
    """Phonemize one shard of the raw dataset and save it to disk.

    Args:
        i: Index of the shard to process, passed as ``index`` to
            ``dataset.shard`` together with the global ``num_shards``.

    Returns:
        None. The processed shard is written to ``<root_directory>/shard_<i>``.
        If that directory already exists the shard is skipped, which lets the
        job be re-run to resume where it stopped.
    """
    directory = os.path.join(root_directory, "shard_" + str(i))
    if os.path.exists(directory):
        # NOTE(review): a directory left behind by an interrupted save is also
        # treated as "done" here; delete such directories before re-running.
        print("Shard %d already exists!" % i)
        return
    print('Processing shard %d ...' % i)
    shard = dataset.shard(num_shards=num_shards, index=i)
    # Replace the raw 'text' column with whatever phonemize() returns per row.
    processed_dataset = shard.map(
        lambda t: phonemize(t['text'], global_phonemizer, tokenizer),
        remove_columns=['text'],
    )
    # exist_ok=True replaces the second existence check, which was redundant
    # after the early return above.
    os.makedirs(directory, exist_ok=True)
    processed_dataset.save_to_disk(directory)

In [19]:
from pebble import ProcessPool
from concurrent.futures import TimeoutError

In [27]:
max_workers = 20

# Fan the shards out over worker processes. The iterator returned by
# future.result() has to be consumed: an exception raised inside a worker only
# surfaces when its result is fetched, so discarding the future hides failed
# shards and silently produces an incomplete dataset.
failed_shards = []
with ProcessPool(max_workers=max_workers) as pool:
    future = pool.map(process_shard, range(num_shards))
    results = future.result()
    shard_index = 0  # results come back in submission order
    while True:
        try:
            next(results)
        except StopIteration:
            break
        except Exception as error:
            failed_shards.append(shard_index)
            print('Shard %d failed: %r' % (shard_index, error))
        shard_index += 1

if failed_shards:
    print('%d shard(s) failed: %s' % (len(failed_shards), failed_shards))

### Collect all shards to form the processed dataset

In [28]:
from datasets import load_from_disk, concatenate_datasets

# Every sub-directory of root_directory is a candidate shard. os.listdir()
# returns entries in arbitrary order, so sort them to make the row order of the
# concatenated dataset reproducible between runs.
output = sorted(
    entry for entry in os.listdir(root_directory)
    if os.path.isdir(os.path.join(root_directory, entry))
)
datasets = []
skipped = []
for o in output:
    directory = os.path.join(root_directory, o)
    try:
        shard = load_from_disk(directory)
    except Exception as error:
        # Still best-effort, but report what was dropped instead of hiding it;
        # unlike a bare `except:`, this lets KeyboardInterrupt through.
        skipped.append(o)
        print('Skipping %s: %r' % (directory, error))
        continue
    datasets.append(shard)

if skipped:
    print('%d of %d shard directories could not be loaded' % (len(skipped), len(output)))

In [22]:
# Merge all loaded shards into one dataset and persist it to the folder named
# by the 'data_folder' config entry.
data_folder = config['data_folder']
dataset = concatenate_datasets(datasets)
dataset.save_to_disk(data_folder)
print('Dataset saved to %s' % data_folder)

Saving the dataset (0/3 shards):   0%|          | 0/366783 [00:00<?, ? examples/s]

Dataset saved to wikipedia_20220301.en.processed


In [23]:
!du -hs wikipedia_20220301.en.processed

1.1G	wikipedia_20220301.en.processed


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [36]:
news_dataset = load_from_disk('news')

In [37]:
dataset = concatenate_datasets([dataset, news_dataset])
dataset

Dataset({
    features: ['id', 'url', 'title', 'input_ids', 'phonemes'],
    num_rows: 1571960
})

### Remove unnecessary tokens from the pre-trained tokenizer
The pre-trained tokenizer contains many tokens that never occur in our dataset, so we need to remove them. We also want to predict words in lowercase, because letter case matters little for TTS. Pruning the tokenizer is much faster than training a new tokenizer from scratch.

In [38]:
from simple_loader import FilePathDataset, build_dataloader

file_data = FilePathDataset(dataset)
loader = build_dataloader(file_data, num_workers=10, batch_size=1)

In [39]:
special_token = config['dataset_params']['word_separator']

In [40]:
# Map every token id in the tokenizer vocabulary to a record holding the
# token's text ('word') and the id itself ('token').
token_maps = {
    token_id: {'word': word, 'token': token_id}
    for word, token_id in tokenizer.vocab.items()
}

In [42]:
import pickle
# Pickle the token mapper to the path configured under dataset_params.token_maps.
token_maps_path = config['dataset_params']['token_maps']
with open(token_maps_path, mode='wb') as handle:
    pickle.dump(token_maps, handle)
print('Token mapper saved to %s' % token_maps_path)

Token mapper saved to token_maps.pkl


In [43]:
# Reload the mapper from the configured path (the same config entry the mapper
# is written to) instead of a hard-coded file name, so the round-trip check
# keeps working when the config value changes.
# NOTE: pickle.load executes arbitrary code from the file; only use it on
# files produced by this notebook.
with open(config['dataset_params']['token_maps'], 'rb') as fopen:
    token_maps = pickle.load(fopen)

In [44]:
len(token_maps)

300000

### Test the dataset with dataloader


In [57]:
from dataloader import build_dataloader

train_loader = build_dataloader(dataset, batch_size=10, num_workers=0, dataset_config=config['dataset_params'])

177


In [58]:
phonemes = dataset[1]['phonemes']
input_ids = dataset[1]['input_ids']

In [59]:
# Pull the first batch directly from the loader. The previous
# `next(enumerate(...))` form bound the batch index to `_`, which shadows
# IPython's last-output variable for the rest of the session.
words, labels, phonemes, input_lengths, masked_indices = next(iter(train_loader))

In [60]:
words.shape

torch.Size([10, 238])

In [64]:
words[0]

tensor([ 3778,  3778,  3778,  3778,  3778,  3778,     2,   479,   479,   479,
          479,   479,   479,   479,   479,     2,    31,    31,    31,    31,
           31,    31,    31,    31,    31,    31,     2,    32,    32,    32,
           32,    32,    32,    32,     2, 10606, 10606, 10606, 10606, 10606,
        10606, 10606,     2,    11,    11,    11,     2,   151,   151,   151,
          151,   151,   151,   151,   151,   151,   151,     2,    45,    45,
            2,    18,    18,    18,    18,    18,    18,     2,    73,    73,
           73,    73,    73,    73,    73,    73,     2,   169,   169,   169,
          169,   169,   169,   169,   169,   169,   169,   169,     2,  6361,
         6361,  6361,  6361,  6361,  6361,     2,     5,     2,   245,   245,
          245,   245,   245,   245,   245,   245,   245,   245,   245,     2,
         3185,  3185,  3185,  3185,  3185,  3185,  3185,  3185,     2,   113,
          113,   113,   113,   113,   113,   113,   113,     2, 

In [65]:
tokenizer.decode(words[0])

'gurun gurun gurun gurun gurun gurun [SEP] panjang panjang panjang panjang panjang panjang panjang panjang [SEP] merupakan merupakan merupakan merupakan merupakan merupakan merupakan merupakan merupakan merupakan [SEP] sebuah sebuah sebuah sebuah sebuah sebuah sebuah [SEP] nagari nagari nagari nagari nagari nagari nagari [SEP] yang yang yang [SEP] termasuk termasuk termasuk termasuk termasuk termasuk termasuk termasuk termasuk termasuk [SEP] ke ke [SEP] dalam dalam dalam dalam dalam dalam [SEP] wilayah wilayah wilayah wilayah wilayah wilayah wilayah wilayah [SEP] kecamatan kecamatan kecamatan kecamatan kecamatan kecamatan kecamatan kecamatan kecamatan kecamatan kecamatan [SEP] bayang bayang bayang bayang bayang bayang [SEP] , [SEP] kabupaten kabupaten kabupaten kabupaten kabupaten kabupaten kabupaten kabupaten kabupaten kabupaten kabupaten [SEP] pesisir pesisir pesisir pesisir pesisir pesisir pesisir pesisir [SEP] selatan selatan selatan selatan selatan selatan selatan selatan [SEP] , 

In [66]:
phonemes

tensor([[ 92, 156,  63,  ...,  43,  56,  16],
        [ 61,  83,  55,  ...,  16,   0,   0],
        [ 61,  83,  55,  ...,   0,   0,   0],
        ...,
        [ 53,  83,  46,  ...,   0,   0,   0],
        [ 92,  43,  44,  ...,   0,   0,   0],
        [ 56,  83, 156,  ...,   0,   0,   0]])

In [77]:
from text_utils import dicts

# Invert the `dicts` mapping so its values can be looked up back to their keys.
rev_dicts = {index: symbol for symbol, index in dicts.items()}

In [80]:
# Decode the second sequence of the batch back into a string via rev_dicts.
''.join(rev_dicts[int(symbol_id)] for symbol_id in phonemes[1])

'səməntˈarə ˈitu , bəlˈiaʊ MMM MMMMMM ˈahli parlˈimən MMMMMMM pˈantaɪ tˈurot mənˌasihˈatkan kaˈaə MMMMMM MMMMMMMMM , stʃˈam , ˈuntoʔ məlapˈɔrkan kədʒadˈian jaŋ mənˈimpə mərˈɛkə dan tˈidaʔ mˈalu ˈuntoʔ bərhˈuboŋ miˈaaə pˈihaʔ bərkuˈasə . $$'

In [84]:
dataset.save_to_disk(config['data_folder'])

Saving the dataset (0/4 shards):   0%|          | 0/1571960 [00:00<?, ? examples/s]

In [82]:
dataset.push_to_hub('mesolitica/PL-BERT-MS')

Uploading the dataset shards:   0%|          | 0/4 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/393 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/393 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/393 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/393 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/mesolitica/PL-BERT-MS/commit/41caa94b49727e8f10248a683508cc8d4b8ef34f', commit_message='Upload dataset', commit_description='', oid='41caa94b49727e8f10248a683508cc8d4b8ef34f', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/mesolitica/PL-BERT-MS', endpoint='https://huggingface.co', repo_type='dataset', repo_id='mesolitica/PL-BERT-MS'), pr_revision=None, pr_num=None)

In [1]:
from datasets import load_dataset

dataset = load_dataset("mesolitica/PL-BERT-MS", split="train")

README.md:   0%|          | 0.00/802 [00:00<?, ?B/s]

train-00000-of-00004.parquet:   0%|          | 0.00/322M [00:00<?, ?B/s]

train-00001-of-00004.parquet:   0%|          | 0.00/39.5M [00:00<?, ?B/s]

train-00002-of-00004.parquet:   0%|          | 0.00/54.9M [00:00<?, ?B/s]

train-00003-of-00004.parquet:   0%|          | 0.00/57.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1571960 [00:00<?, ? examples/s]

In [3]:
dataset

Dataset({
    features: ['id', 'url', 'title', 'input_ids', 'phonemes'],
    num_rows: 1571960
})

In [4]:
from datasets import load_from_disk

dataset2 = load_from_disk('wikipedia_20220301.en.processed')
dataset2

Dataset({
    features: ['id', 'url', 'title', 'input_ids', 'phonemes'],
    num_rows: 1571960
})

In [5]:
dataset[0]

{'id': '17218',
 'url': 'https://ms.wikipedia.org/wiki/Pelabuhan%20Kerteh',
 'title': 'Pelabuhan Kerteh',
 'input_ids': [1244,
  10745,
  441,
  7,
  10745,
  5,
  367,
  76,
  2507,
  1275,
  3036,
  4475,
  9,
  1154,
  647,
  11,
  10713,
  7,
  48,
  530,
  367,
  9,
  11,
  150356,
  8040,
  7,
  40,
  3841,
  10745,
  6,
  1244,
  7,
  43],
 'phonemes': ['pəlabˈuhan',
  'kˈɛɾrteh',
  'dibˈinə',
  'di',
  'kˈɛɾrteh',
  ',',
  'trəŋɡˈanu',
  'bˈaɡi',
  'mənˈampoŋ',
  'kəpəɾrlˈuan',
  'pəŋhantˈaran',
  'pətrolˈɛom',
  'dan',
  'ɡˈas',
  'ˈasli',
  'jaŋ',
  'diɡˈali',
  'di',
  'lˈuar',
  'pˈantaɪ',
  'trəŋɡˈanu',
  'dan',
  'jaŋ',
  'ditˈɛlah',
  'diprˈoses',
  'di',
  'kawˈasan',
  'darˈatan',
  'kˈɛɾrteh',
  '.',
  'pəlabˈuhan',
  'di',
  'məlˈesiə']}

In [6]:
dataset2[0]

{'id': '17218',
 'url': 'https://ms.wikipedia.org/wiki/Pelabuhan%20Kerteh',
 'title': 'Pelabuhan Kerteh',
 'input_ids': [1244,
  10745,
  441,
  7,
  10745,
  5,
  367,
  76,
  2507,
  1275,
  3036,
  4475,
  9,
  1154,
  647,
  11,
  10713,
  7,
  48,
  530,
  367,
  9,
  11,
  150356,
  8040,
  7,
  40,
  3841,
  10745,
  6,
  1244,
  7,
  43],
 'phonemes': ['pəlabˈuhan',
  'kˈɛɾrteh',
  'dibˈinə',
  'di',
  'kˈɛɾrteh',
  ',',
  'trəŋɡˈanu',
  'bˈaɡi',
  'mənˈampoŋ',
  'kəpəɾrlˈuan',
  'pəŋhantˈaran',
  'pətrolˈɛom',
  'dan',
  'ɡˈas',
  'ˈasli',
  'jaŋ',
  'diɡˈali',
  'di',
  'lˈuar',
  'pˈantaɪ',
  'trəŋɡˈanu',
  'dan',
  'jaŋ',
  'ditˈɛlah',
  'diprˈoses',
  'di',
  'kawˈasan',
  'darˈatan',
  'kˈɛɾrteh',
  '.',
  'pəlabˈuhan',
  'di',
  'məlˈesiə']}