# Prepare Malaya-Speech Dataset

We want our model to be able to understand Bahasa and local English slang (Manglish).

### Download data

Run the commands below to download all the data:

```bash
wget https://f000.backblazeb2.com/file/malay-dataset/speech/semisupervised-manglish.tar.gz
tar -xf semisupervised-manglish.tar.gz

wget https://cdn.commonvoice.mozilla.org/cv-corpus-5.1-2020-06-22/id.tar.gz
tar -zxf id.tar.gz

wget https://f000.backblazeb2.com/file/malay-dataset/speech/semisupervised-malay.tar.gz
tar -xf semisupervised-malay.tar.gz

wget https://f000.backblazeb2.com/file/malay-dataset/streaming.zip -O wikipedia-asr.zip
unzip wikipedia-asr.zip

wget https://f000.backblazeb2.com/file/malay-dataset/speech/iium/iium.json
mkdir iium
wget https://f000.backblazeb2.com/file/malay-dataset/speech/iium/streaming.zip -O iium-asr.zip
unzip iium-asr.zip -d iium
```

Total length of the samples:

1. Malay, ~93 hours, semisupervised.
2. Manglish, ~107 hours, semisupervised.
3. Wikipedia malay, ~3.4 hours, supervised.
4. IIUM confession malay, ~2.4 hours, supervised.
5. Indonesian, ~4 hours, supervised.

In [1]:
# !pip3 install malaya-speech -U --no-deps

### Read data

In [27]:
import pandas as pd
import malaya_speech
import malaya_speech.train as train
from glob import glob
import json
import os

In [3]:
# Load the validated Indonesian Common Voice clips and keep only the rows whose
# transcript is longer than 5 characters and contains at least one space.
cv_validated = pd.read_csv('cv-corpus-5.1-2020-06-22/id/validated.tsv', sep = '\t')
long_enough = cv_validated['sentence'].str.len() > 5
has_space = cv_validated['sentence'].str.count(' ') > 0
df = cv_validated[long_enough & has_space]
print(df.shape)

(7490, 10)


In [4]:
# Pair every clip path with its transcript. Iterating over the two columns
# directly avoids a positional `.iloc` lookup for every single row.
id_commonvoice = [
    (f"cv-corpus-5.1-2020-06-22/id/clips/{clip}", sentence)
    for clip, sentence in zip(df['path'], df['sentence'])
]

len(id_commonvoice)

7490

In [5]:
from glob import glob

# Collect every wav file of the two semi-supervised corpora.
malay = glob('semisupervised-malay/output-wav/*.wav')
manglish = glob('semisupervised-manglish/output-wav/*.wav')
# Display how many files were found for each corpus.
len(malay), len(manglish)

(57895, 65277)

In [36]:
import unicodedata
import re

vocabs = [" ", "a", "e", "n", "i", "t", "o", "u", "s", "k", "r", "l", "h", "d", "m", "g", "y", "b", "p", "w", "c", "f", "j", "v", "'", "-", "z", "0", "1", "x", "2", "q", "*", "5", "3", "4", "6", "9", "8", "7", "%", "$", "\"", "/", "&", ":", "+"]
# Set view of `vocabs` so membership tests do not scan the list per character.
_vocab_lookup = frozenset(vocabs)

def preprocessing_text(string):
    """
    Normalise a transcript so that it only contains characters from `vocabs`.

    Steps:
    1. lower-case and NFC-normalise the text,
    2. turn every run of whitespace (newlines, tabs, ...) into a single space,
       so words separated by such characters are not glued together when the
       characters outside `vocabs` are dropped,
    3. drop every character that is not in `vocabs`,
    4. collapse repeated spaces and strip leading / trailing spaces.

    Parameters
    ----------
    string: str
        raw transcript.

    Returns
    -------
    str
        cleaned transcript, possibly empty.
    """
    string = unicodedata.normalize('NFC', string.lower())
    # Newlines and tabs are not part of `vocabs`; map them to a plain space
    # before filtering, otherwise 'a\nb' would become 'ab'.
    string = re.sub(r'\s+', ' ', string)
    string = ''.join([c for c in string if c in _vocab_lookup])
    return re.sub(r'[ ]+', ' ', string).strip()

In [7]:
from tqdm import tqdm

malays = []
for wav_path in tqdm(malay):
    # The transcript of `output-wav/<name>.wav` is read from
    # `output-text/<name>.wav.txt`.
    text_path = wav_path.replace('output-wav', 'output-text') + '.txt'
    try:
        # Decode explicitly as UTF-8 so the result does not depend on the
        # platform's default encoding (assumes the transcripts are UTF-8).
        with open(text_path, encoding = 'utf-8') as fopen:
            text = fopen.read()
    except (OSError, UnicodeDecodeError) as e:
        # Best effort: report the unreadable transcript and keep going.
        print(f'{text_path}: {e}')
        continue
    malays.append((wav_path, text))

100%|██████████| 57895/57895 [00:01<00:00, 35213.66it/s]


In [8]:
manglishs = []
for wav_path in tqdm(manglish):
    # The transcript of `output-wav/<name>.wav` is read from
    # `output-text/<name>.wav.txt`.
    text_path = wav_path.replace('output-wav', 'output-text') + '.txt'
    try:
        # Decode explicitly as UTF-8 so the result does not depend on the
        # platform's default encoding (assumes the transcripts are UTF-8).
        with open(text_path, encoding = 'utf-8') as fopen:
            text = fopen.read()
    except (OSError, UnicodeDecodeError) as e:
        # Best effort: report the unreadable transcript and keep going.
        print(f'{text_path}: {e}')
        continue
    manglishs.append((wav_path, text))

100%|██████████| 65277/65277 [00:01<00:00, 34909.82it/s]


In [22]:
# JSON is UTF-8 by specification, so decode it explicitly.
with open('iium.json', encoding = 'utf-8') as fopen:
    iium = json.load(fopen)

iiums = []
skipped_iium = []
wavs = glob('iium/streaming/*.wav')
for i in tqdm(wavs):
    try:
        # The file name (without extension) is used as the index of the
        # transcript inside `iium`.
        index = int(os.path.splitext(os.path.basename(i))[0])
        iiums.append((i, iium[index]))
    except (ValueError, LookupError):
        # Non-numeric file name or no transcript for this index: remember the
        # file instead of silently dropping it.
        skipped_iium.append(i)

if skipped_iium:
    print(f'skipped {len(skipped_iium)} iium files without a matching transcript')

len(iiums)

100%|██████████| 2251/2251 [00:00<00:00, 606032.37it/s]


1803

In [30]:
wikipedia = []
# Match on the full `.wav` extension, not just on names ending with `wav`.
wavs = glob('streaming/*.wav')
for i in tqdm(wavs):
    # The transcript is the file name itself; strip only the trailing
    # extension so a '.wav' inside the name is left untouched.
    text = os.path.splitext(os.path.basename(i))[0]
    wikipedia.append((i, text))
    
len(wikipedia)

100%|██████████| 2887/2887 [00:00<00:00, 376159.66it/s]


2887

In [31]:
# Concatenate the (path, transcript) pairs of every corpus and split them into
# two parallel tuples: one with the audio paths, one with the transcripts.
audios, texts = zip(*(id_commonvoice + malays + manglishs + iiums + wikipedia))

In [32]:
from pydub import AudioSegment
import numpy as np

def mp3_to_wav(file, sr = 16000):
    """
    Read an audio file with pydub, set its frame rate to `sr` and its channel
    count to 1, and return `(samples, sr)`, where `samples` is the sample array
    passed through `malaya_speech.astype.int_to_float`.
    """
    segment = AudioSegment.from_file(file).set_frame_rate(sr).set_channels(1)
    pcm = np.array(segment.get_array_of_samples())
    return malaya_speech.astype.int_to_float(pcm), sr

In [37]:
# Run every raw transcript through `preprocessing_text`, keeping the order.
cleaned_texts = list(map(preprocessing_text, texts))

In [38]:
# Store the cleaned transcripts twice: as a JSON list and as plain text with
# one transcript per line.
with open('malaya-speech-transcript.json', 'w') as json_file:
    json.dump(cleaned_texts, json_file)

with open('malaya-speech-transcript.txt', 'w') as text_file:
    text_file.write('\n'.join(cleaned_texts))

In [39]:
# Generate the vocabulary from the cleaned transcripts and display it.
unique_chars = malaya_speech.char.generate_vocab(cleaned_texts)
unique_chars

['<PAD>',
 '<EOS>',
 ' ',
 'a',
 'e',
 'n',
 'i',
 't',
 'u',
 's',
 'o',
 'k',
 'r',
 'l',
 'h',
 'd',
 'm',
 'g',
 'y',
 'b',
 'p',
 'w',
 'c',
 'f',
 'j',
 'v',
 "'",
 '-',
 'z',
 '0',
 '1',
 'x',
 '2',
 'q',
 '*',
 '5',
 '3',
 '4',
 '6',
 '9',
 '8',
 '7',
 '%',
 '"',
 '$',
 '/',
 '&',
 ':',
 '+']

In [41]:
# Write the vocabulary list to disk as JSON.
with open('malaya-speech-sst-vocab.json', 'w') as fopen:
    json.dump(unique_chars, fopen)

### Change into TFRecord

This step is not necessary; we recommend using a yield iterator to train the model. However, we can also save our data as TFRecord to speed up the data pipeline. To do that, we need to create a yield iterator.

In [42]:
def generator(maxlen = 16):
    """
    Yield one training example per usable (audio, transcript) pair.

    Parameters
    ----------
    maxlen: int, optional (default=16)
        maximum audio duration, computed as `len(wav_data) / sr`; longer
        audios are skipped.

    Yields
    ------
    dict
        with keys 'waveforms', 'waveform_lens', 'targets' and 'raw_transcript'.

    Pairs whose cleaned transcript has fewer than 5 characters are skipped as
    well. Files that fail to load are reported and skipped.
    """
    for i in tqdm(range(len(audios))):
        path, transcript = audios[i], cleaned_texts[i]
        try:
            # Cheap check first: no need to decode audio that will be dropped.
            if len(transcript) < 5:
                print(f'skipped {path}')
                continue

            # Decide on the file extension, not on '.mp3' appearing anywhere
            # in the path.
            if path.lower().endswith('.mp3'):
                wav_data, sr = mp3_to_wav(path)
            else:
                wav_data, sr = malaya_speech.load(path)

            if (len(wav_data) / sr) > maxlen:
                print(f'skipped {path}')
                continue

            yield {
                'waveforms': wav_data.tolist(),
                'waveform_lens': [len(wav_data)],
                'targets': malaya_speech.char.encode(transcript, add_eos = False,
                                                     lookup = unique_chars),
                'raw_transcript': [transcript],
            }
        except Exception as e:
            # Best effort: name the failing file and continue with the next one.
            print(f'failed {path}: {e}')

# NOTE: this rebinds the name `generator` from the function to the generator
# object it returns; re-run the whole cell to get a fresh iterator.
generator = generator()

In [43]:
import os
import tensorflow as tf

DATA_DIR = os.path.expanduser('bahasa-asr/data')
# Remove the files left over from a previous run without shelling out to `rm`:
# this does not depend on a Unix shell and stays silent when the directory does
# not exist yet. Like `rm dir/*`, sub-directories and dot-files are left alone.
for stale_path in glob(os.path.join(DATA_DIR, '*')):
    if os.path.islink(stale_path) or not os.path.isdir(stale_path):
        os.remove(stale_path)
tf.gfile.MakeDirs(DATA_DIR)

#### Define shards

Like we defined below,

```python
shards = [{'split': 'train', 'shards': 999}, {'split': 'dev', 'shards': 1}]
```

In [44]:
# One entry per split: 999 shards for 'train' and a single shard for 'dev'.
shards = [{'split': 'train', 'shards': 999}, {'split': 'dev', 'shards': 1}]

#### Save to TFRecord

Just pass the yield iterator to `malaya_speech.train.prepare_dataset`:

```python
def prepare_dataset(
    generator,
    data_dir: str,
    shards: List[Dict],
    prefix: str = 'dataset',
    shuffle: bool = True,
    already_shuffled: bool = False,
):
```

In [47]:
# Feed the generator object, the output directory and the shard layout to
# `prepare_dataset`; 'bahasa-asr' is passed as the `prefix` argument.
train.prepare_dataset(generator, DATA_DIR, shards, prefix = 'bahasa-asr')