# Prepare Malaya-Speech Dataset

We want our model to be able to understand Bahasa and local English slang (Manglish).

### Download data

Run the commands below to download all the data:

```bash
wget https://cdn.commonvoice.mozilla.org/cv-corpus-5.1-2020-06-22/id.tar.gz
tar -zxf id.tar.gz

wget https://f000.backblazeb2.com/file/malay-dataset/speech/semisupervised-malay.tar.gz
tar -xf semisupervised-malay.tar.gz

wget https://f000.backblazeb2.com/file/malay-dataset/speech/semisupervised-malay-part2.tar.gz
tar -xf semisupervised-malay-part2.tar.gz

wget https://f000.backblazeb2.com/file/malay-dataset/speech/semisupervised-malay-part3.tar.gz
tar -xf semisupervised-malay-part3.tar.gz

wget https://f000.backblazeb2.com/file/malay-dataset/streaming.zip -O wikipedia-asr.zip
unzip wikipedia-asr.zip
```

Total duration of the samples:

1. Malay, ~222 hours, semisupervised.
2. Wikipedia malay, ~3.4 hours, supervised.
3. Indonesian, ~4 hours, supervised.

In [2]:
# !pip3 install malaya-speech -U --no-deps

### Read data

In [3]:
import pandas as pd
import malaya_speech
import malaya_speech.train as train
from glob import glob
import json
import os






The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



In [4]:
# Load the validated Indonesian Common Voice metadata, then keep only rows
# whose sentence is longer than 5 characters and contains at least one space.
validated = pd.read_csv('cv-corpus-5.1-2020-06-22/id/validated.tsv', sep = '\t')
sentence_text = validated['sentence'].str
keep_row = (sentence_text.len() > 5) & (sentence_text.count(' ') > 0)
df = validated[keep_row]
print(df.shape)

(7490, 10)


In [5]:
# Pair each clip's audio path with its sentence, in DataFrame row order.
id_commonvoice = [
    (f"cv-corpus-5.1-2020-06-22/id/clips/{clip_name}", sentence)
    for clip_name, sentence in zip(df['path'], df['sentence'])
]

len(id_commonvoice)

7490

In [9]:
from glob import glob

# Gather every semi-supervised Malay wav file; the pattern order below fixes
# the order of the resulting list.
# NOTE(review): the download steps extract semisupervised-malay parts 2 and 3,
# while two of these patterns read ../youtube/malay* -- confirm the layout.
malay = []
for wav_pattern in (
    'semisupervised-malay/output-wav/*.wav',
    '../youtube/malay/output-wav/*.wav',
    '../youtube/malay2/output-wav/*.wav',
):
    malay.extend(glob(wav_pattern))
len(malay)

136520

In [10]:
import unicodedata
import re

# Characters allowed to survive transcript cleaning; anything else is dropped.
vocabs = [
    " ", "a", "e", "n", "i", "t", "o", "u", "s", "k", "r", "l", "h", "d", "m", "g",
    "y", "b", "p", "w", "c", "f", "j", "v", "'", "-", "z", "0", "1", "x", "2", "q",
    "*", "5", "3", "4", "6", "9", "8", "7", "%", "$", "\"", "/", "&", ":", "+",
]

def preprocessing_text(string):
    """
    Clean a transcript down to the characters listed in `vocabs`.

    The text is lowercased, NFC-normalized, stripped of every character not
    in `vocabs`, and runs of spaces are collapsed into one. Leading and
    trailing whitespace is removed from the result.
    """
    normalized = unicodedata.normalize('NFC', string.lower())
    kept = ''.join(filter(vocabs.__contains__, normalized))
    collapsed = re.sub(r'[ ]+', ' ', kept)
    return collapsed.strip()

In [11]:
from tqdm import tqdm

# Attach a transcript to every Malay wav. The transcript lives next to the
# audio under 'output-text' with '.txt' appended to the wav file name.
# Files whose transcript cannot be read are reported and left out.
malays = []
for wav_path in tqdm(malay):
    transcript_path = wav_path.replace('output-wav','output-text') + '.txt'
    try:
        with open(transcript_path) as fopen:
            transcript = fopen.read()
    except Exception as e:
        print(e)
    else:
        malays.append((wav_path, transcript))

100%|██████████| 136520/136520 [00:11<00:00, 11678.38it/s]


In [14]:
# For the Wikipedia set the transcript is the wav file name itself, with
# every '.wav' occurrence removed.
wavs = glob('streaming/*wav')
wikipedia = [
    (wav_path, os.path.basename(wav_path).replace('.wav', ''))
    for wav_path in tqdm(wavs)
]

len(wikipedia)

100%|██████████| 2887/2887 [00:00<00:00, 364672.66it/s]


2887

In [15]:
# Concatenate the three corpora, then split the (path, transcript) pairs into
# two parallel tuples.
paired_samples = id_commonvoice + malays + wikipedia
audios, texts = zip(*paired_samples)

In [16]:
from pydub import AudioSegment
import numpy as np

def mp3_to_wav(file, sr = 16000):
    """
    Decode an audio file with pydub and return `(samples, sr)`.

    The audio is resampled to `sr` and reduced to a single channel, and its
    integer samples are passed through `malaya_speech.astype.int_to_float`
    (presumably scaling them to floats -- confirm against malaya_speech).
    """
    segment = AudioSegment.from_file(file)
    resampled = segment.set_frame_rate(sr)
    mono = resampled.set_channels(1)
    int_samples = np.array(mono.get_array_of_samples())
    return malaya_speech.astype.int_to_float(int_samples), sr

In [17]:
# Clean every transcript, keeping the same order as `texts`.
cleaned_texts = list(map(preprocessing_text, texts))

In [18]:
# Save the cleaned transcripts twice: as a JSON list and as plain text with
# one transcript per line.
with open('malaya-speech-transcript.json', 'w') as json_out:
    json.dump(cleaned_texts, json_out)

with open('malaya-speech-transcript.txt', 'w') as text_out:
    text_out.write('\n'.join(cleaned_texts))

In [19]:
# Build the character vocabulary from the cleaned transcripts. The bare name
# on the last line displays it as the cell output.
unique_chars = malaya_speech.char.generate_vocab(cleaned_texts)
unique_chars

['<PAD>',
 '<EOS>',
 'a',
 ' ',
 'n',
 'i',
 'e',
 'k',
 't',
 'u',
 'd',
 'm',
 'l',
 's',
 'r',
 'g',
 'b',
 'p',
 'h',
 'o',
 'y',
 'j',
 'c',
 'w',
 '-',
 'f',
 'v',
 '0',
 'z',
 '1',
 '2',
 '3',
 '5',
 'q',
 '4',
 'x',
 '6',
 '9',
 '7',
 "'",
 '8',
 '*',
 '"',
 '$',
 '&',
 ':']

In [20]:
# Persist the character vocabulary as a JSON list.
with open('malaya-speech-sst-vocab.json', 'w') as vocab_out:
    json.dump(unique_chars, vocab_out)

### Change into TFRecord

This step is not necessary; we recommend using a yield iterator (generator) to train the model. However, we can also save our data as TFRecord to speed up the data pipeline. To do that, we need to create a generator.

In [21]:
def generator(maxlen = 18):
    """
    Yield one training example per usable entry of `audios` / `cleaned_texts`.

    Parameters
    ----------
    maxlen: float, optional (default=18)
        Upper bound compared against `len(wav_data) / sr`, i.e. the clip
        duration in seconds when `sr` is the sample rate. Longer clips are
        skipped.

    Yields
    ------
    dict with keys:
        'waveforms': the audio samples as a list,
        'waveform_lens': one-element list holding the number of samples,
        'targets': the cleaned transcript encoded with `unique_chars` (no EOS),
        'raw_transcript': one-element list holding the cleaned transcript.

    Entries whose cleaned transcript is shorter than 5 characters are skipped.
    Any exception raised while handling an entry is printed and that entry
    is dropped.
    """
    for i in tqdm(range(len(audios))):
        try:
            # Check the transcript first: it is cheap, and it avoids decoding
            # audio for an entry that would be thrown away anyway.
            if len(cleaned_texts[i]) < 5:
                print(f'skipped {audios[i]}')
                continue

            # Paths containing '.mp3' are decoded through pydub; everything
            # else goes through malaya_speech.load.
            if '.mp3' in audios[i]:
                wav_data, sr = mp3_to_wav(audios[i])
            else:
                wav_data, sr = malaya_speech.load(audios[i])

            if (len(wav_data) / sr) > maxlen:
                print(f'skipped {audios[i]}')
                continue

            yield {
                'waveforms': wav_data.tolist(),
                'waveform_lens': [len(wav_data)],
                'targets': malaya_speech.char.encode(cleaned_texts[i], add_eos = False,
                                                     lookup = unique_chars),
                'raw_transcript': [cleaned_texts[i]],
            }
        except Exception as e:
            print(e)

# Rebinds the name to the generator object that later cells consume. Re-running
# this line on its own fails, because the function is no longer reachable.
generator = generator()

In [22]:
import os
import tensorflow as tf

# Output directory for the TFRecord shards.
DATA_DIR = os.path.expanduser('bahasa-asr/data')

# Remove any files left over from a previous run, then make sure the
# directory exists. The exit status of `rm` is ignored.
os.system('rm bahasa-asr/data/*')
tf.gfile.MakeDirs(DATA_DIR)

#### Define shards

Define the number of shards for each split, for example:

```python
shards = [{'split': 'train', 'shards': 999}, {'split': 'dev', 'shards': 1}]
```

In [23]:
# Number of output shards per dataset split.
shards = [
    {'split': 'train', 'shards': 999},
    {'split': 'dev', 'shards': 1},
]

#### Save to TFRecord

Just pass the generator to `malaya_speech.train.prepare_dataset`:

```python
def prepare_dataset(
    generator,
    data_dir: str,
    shards: List[Dict],
    prefix: str = 'dataset',
    shuffle: bool = True,
    already_shuffled: bool = False,
):
```

In [25]:
# Consume the generator and write the examples under DATA_DIR, using the
# per-split shard counts in `shards` and 'bahasa-asr' as the prefix.
train.prepare_dataset(generator, DATA_DIR, shards, prefix = 'bahasa-asr')