# Prepare Malaya-Speech Test Dataset

Prepare the test set and convert it into TFRecord files.

### Download data

Run the commands below to download all the data:

```bash
wget https://f000.backblazeb2.com/file/malaya-speech-model/data/audio-iium.zip
wget https://f000.backblazeb2.com/file/malaya-speech-model/collections/shuffled-iium.json
unzip audio-iium.zip -d iium

wget https://f000.backblazeb2.com/file/malaya-speech-model/data/audio-wattpad.zip
wget https://f000.backblazeb2.com/file/malaya-speech-model/collections/transcript-wattpad.json
unzip audio-wattpad.zip -d wattpad

wget https://f000.backblazeb2.com/file/malaya-speech-model/data/testset-audiobook.tar.gz
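# NOTE: text-audiobook.tar.gz is not fetched by any command in this snippet; download it before extracting.
tar -zxf text-audiobook.tar.gz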
tar -xf testset-audiobook.tar.gz
```

Total duration of the samples:

1. Malay audiobook, ~30 mins, strong semisupervised.
2. iium, ~6 mins, supervised.
3. wattpad, ~10 mins, supervised.

In [21]:
import pandas as pd
import malaya_speech
import malaya_speech.train as train
from glob import glob
import json
import os
from tqdm import tqdm

In [17]:
import unicodedata
import re

# Characters kept in a transcript; anything else is dropped by `preprocessing_text`.
vocabs = [
    " ", "a", "e", "n", "i", "t", "o", "u", "s", "k", "r", "l", "h", "d", "m", "g",
    "y", "b", "p", "w", "c", "f", "j", "v", "'", "-", "z", "0", "1", "x", "2", "q",
    "*", "5", "3", "4", "6", "9", "8", "7", "%", "$", "\"", "/", "&", ":", "+",
]


def preprocessing_text(string):
    """
    Lowercase, NFC-normalize and filter a transcript down to the characters in `vocabs`.

    Runs of spaces are collapsed into a single space and the result is stripped
    of leading/trailing whitespace.
    """
    normalized = unicodedata.normalize('NFC', string.lower())
    # Keep only characters that appear in the allowed vocabulary.
    kept = ''.join(filter(vocabs.__contains__, normalized))
    collapsed = re.sub(r'[ ]+', ' ', kept)
    return collapsed.strip()

In [23]:
# Pair every wattpad recording with its transcript.
# Each file is named `<index>.wav`, where `<index>` is the integer position of
# its transcript inside `transcript-wattpad.json`.
# glob returns paths in arbitrary order, so sort them to keep the sample order
# reproducible across runs and machines.
wavs = sorted(glob('wattpad/audio-wattpad/*wav'))

with open('transcript-wattpad.json') as fopen:
    transcript = json.load(fopen)

wattpad = []
for wav_path in tqdm(wavs):
    # os.path handles the platform's path separator and strips only the
    # extension, unlike manual '/' splitting plus str.replace.
    stem = os.path.splitext(os.path.basename(wav_path))[0]
    wattpad.append((wav_path, transcript[int(stem)]))

100%|██████████| 146/146 [00:00<00:00, 289262.34it/s]


In [25]:
# Pair every iium recording with its transcript.
# Each file is named `<index>.wav`, where `<index>` is the integer position of
# its transcript inside `shuffled-iium.json`.
# glob returns paths in arbitrary order, so sort them to keep the sample order
# reproducible across runs and machines.
wavs = sorted(glob('iium/audio-iium/*wav'))

with open('shuffled-iium.json') as fopen:
    transcript = json.load(fopen)

iium = []
for wav_path in tqdm(wavs):
    # os.path handles the platform's path separator and strips only the
    # extension, unlike manual '/' splitting plus str.replace.
    stem = os.path.splitext(os.path.basename(wav_path))[0]
    iium.append((wav_path, transcript[int(stem)]))

100%|██████████| 97/97 [00:00<00:00, 299372.69it/s]


In [26]:
# Pair every audiobook test clip with its transcript.
# glob returns paths in arbitrary order, so sort them to keep the sample order
# reproducible across runs and machines.
wavs = sorted(glob('test-set/*wav'))

audiobook = []
for wav_path in tqdm(wavs):
    # The clip name uses '<>' as a separator; drop the first component and
    # rebuild the remaining ones as a '/'-separated relative path.
    relative = '/'.join(wav_path.split('<>')[1:])
    # Keep everything before the first '.wav', then point at the text folder
    # instead of the audio folder.
    relative = relative.split('.wav')[0].replace('output-wav', 'output-text')
    with open(f'text-audiobook/{relative}.wav.txt') as fopen:
        audiobook.append((wav_path, fopen.read()))

100%|██████████| 300/300 [00:00<00:00, 27473.61it/s]


In [28]:
# Concatenate the three corpora, then split the (path, transcript) pairs into
# two parallel tuples.
audios, texts = zip(*(wattpad + iium + audiobook))

In [29]:
# Normalize every transcript with `preprocessing_text`, keeping the order of `texts`.
cleaned_texts = list(map(preprocessing_text, texts))

In [34]:
# Load the character lookup that is later passed to `malaya_speech.char.encode`.
with open('malaya-speech-sst-vocab.json') as vocab_file:
    unique_chars = json.load(vocab_file)

In [35]:
def generator(maxlen = 18, min_length_text = 0):
    """
    Yield one example dict per usable (audio, transcript) pair.

    Walks `audios` and `cleaned_texts` in parallel, loads each audio file with
    `malaya_speech.load` and encodes its transcript with
    `malaya_speech.char.encode`, using `unique_chars` as the lookup.

    Parameters
    ----------
    maxlen: int, optional (default=18)
        Upper bound on `len(wav_data) / sr`; clips above it are skipped.
        Presumably seconds, assuming `sr` is the sample rate -- TODO confirm.
    min_length_text: int, optional (default=0)
        Minimum number of characters in the cleaned transcript; shorter
        transcripts are skipped.

    Yields
    ------
    dict
        Keys 'waveforms', 'waveform_lens', 'targets' and 'raw_transcript'.
    """
    for audio_path, cleaned in tqdm(zip(audios, cleaned_texts), total = len(audios)):
        try:
            wav_data, sr = malaya_speech.load(audio_path)

            if (len(wav_data) / sr) > maxlen:
                print(f'skipped audio too long {audio_path}')
                continue

            if len(cleaned) < min_length_text:
                print(f'skipped text too short {audio_path}')
                continue

            example = {
                'waveforms': wav_data.tolist(),
                'waveform_lens': [len(wav_data)],
                'targets': malaya_speech.char.encode(cleaned, add_eos = False,
                                                     lookup = unique_chars),
                'raw_transcript': [cleaned],
            }
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole export,
            # but report which file failed so it can be investigated.
            print(f'failed to process {audio_path}: {e}')
            continue

        # Yield outside the try block so errors raised at the yield point by the
        # consumer are not swallowed by the handler above.
        yield example

# Rebind the name to the generator object, which is what gets handed to
# `train.prepare_dataset`. The object is single-use: re-run this cell before
# consuming it a second time.
generator = generator()

In [36]:
import os
import tensorflow as tf

# Directory that receives the generated dataset files.
DATA_DIR = os.path.expanduser('bahasa-asr-test/data')
tf.gfile.MakeDirs(DATA_DIR)

# Remove files left over from a previous run. Done in Python rather than by
# shelling out to `rm`, so it works before the directory has any content,
# does not depend on a POSIX shell, and reuses DATA_DIR instead of repeating
# the path literal. Sub-directories are left untouched, as with plain `rm`.
for stale_path in glob(os.path.join(DATA_DIR, '*')):
    if os.path.isfile(stale_path) or os.path.islink(stale_path):
        os.remove(stale_path)

In [37]:
# Split layout handed to `train.prepare_dataset`: a single 'dev' entry with
# its 'shards' value set to 100.
shards = [dict(split = 'dev', shards = 100)]

In [38]:
# Consume the example generator; presumably this writes the sharded dataset
# files under DATA_DIR with the given file prefix -- confirm against
# `malaya_speech.train.prepare_dataset`.
train.prepare_dataset(
    generator,
    DATA_DIR,
    shards,
    prefix = 'bahasa-asr',
)

  0%|          | 0/543 [00:00<?, ?it/s]

INFO:tensorflow:Generating case 0.


100%|██████████| 543/543 [00:12<00:00, 41.90it/s]

INFO:tensorflow:Generated 543 Examples
INFO:tensorflow:Shuffling data...





INFO:tensorflow:Data shuffled.
