In [1]:
import pandas as pd
from glob import glob
from tqdm import tqdm
import json
import string

In [2]:
# Character inventory: the 26 lowercase ASCII letters, then the ten digits,
# then a single space. Used by the text-normalisation step below to decide
# which characters are kept.
vocabs = [*string.ascii_lowercase, *string.digits, ' ']

In [3]:
# List every .wav that sits in an `output-wav` directory directly under a
# `part*` directory; the trailing expression displays how many were found.
malay_pattern = 'part*/output-wav/*.wav'
malay = glob(malay_pattern)
len(malay)

744630

In [4]:
# Add the wavs found under part*/semisupervised/output-wav to `malay`.
# Paths that are already in the list are skipped, so running this cell more
# than once does not append the same files again (a plain `extend` would
# duplicate them on every re-run).
already_listed = set(malay)
malay.extend(
    path
    for path in glob('part*/semisupervised/output-wav/*.wav')
    if path not in already_listed
)
len(malay)

1100521

In [5]:
# One glob per recording folder, unpacked into three lists; the trailing
# expression displays the three list sizes as a tuple.
# NOTE(review): none of these lists is referenced again in the cells visible
# here (they are not part of `audios`) - confirm that is intended.
khalil, mas, husein = map(
    glob,
    (
        'tolong-sebut/*.wav',
        'sebut-perkataan-woman/*.wav',
        'sebut-perkataan-man/*.wav',
    ),
)
tuple(map(len, (khalil, mas, husein)))

(565, 200, 698)

In [6]:
# Pair every wav in `malay` with its transcript. The transcript path is the
# wav path with 'output-wav' replaced by 'output-text' and '.txt' appended.
malays = []
for i in tqdm(malay):
    p = i.replace('output-wav', 'output-text')
    try:
        with open(f'{p}.txt') as fopen:
            text = fopen.read()
    except (OSError, UnicodeError) as e:
        # Best effort: a transcript that is missing or cannot be read/decoded
        # is reported and skipped. Only these I/O failures are caught, so a
        # genuine programming error is no longer hidden by a blanket
        # `except Exception`.
        print(e)
        continue
    # Transcripts shorter than 3 characters are dropped.
    if len(text) < 3:
        continue
    malays.append((i, text))

100%|██████████| 1100521/1100521 [14:43<00:00, 1246.17it/s]


In [8]:
import os

# Build (wav path, transcript) pairs for the `streaming` folder. The
# transcript is the file name itself with '.wav' removed.
wikipedia = []
wavs = glob('streaming/*wav')

# The last 5% of the listing is left out of this set (presumably held out for
# evaluation - confirm). The end index is computed explicitly because
# `wavs[:-k]` silently becomes `wavs[:0]`, i.e. an empty list, when k is 0
# (fewer than 20 files).
holdout = int(len(wavs) * 0.05)
for i in tqdm(wavs[:len(wavs) - holdout]):
    text = os.path.split(i)[1].replace('.wav', '')
    wikipedia.append((i, text))

len(wikipedia)

100%|██████████| 2743/2743 [00:00<00:00, 411671.23it/s]


2743

In [9]:
# Build (wav path, transcript) pairs for the news recordings. Each wav is
# named after an integer, which is used to index into the object loaded from
# transcript-news.json.
news = []
wavs = glob('news/audio/*wav')

with open('transcript-news.json') as fopen:
    transcript_news = json.load(fopen)

# The last 5% of the listing is left out of this set (presumably held out for
# evaluation - confirm). The end index is computed explicitly because
# `wavs[:-k]` silently becomes `wavs[:0]`, i.e. an empty list, when k is 0
# (fewer than 20 files).
holdout = int(len(wavs) * 0.05)
for i in tqdm(wavs[:len(wavs) - holdout]):
    index = i.split('/')[-1].replace('.wav','')
    text = transcript_news[int(index)]
    news.append((i, text))

100%|██████████| 2044/2044 [00:00<00:00, 69113.84it/s]


In [11]:
# Build (wav path, transcript) pairs for the `combined` folder.
# The transcript location is derived from the wav path: the path is split on
# '<>', the first field is discarded and the rest are re-joined with '/';
# everything from the first '.wav' onwards is cut off, and 'output-wav' is
# swapped for 'output-text'. The result is looked up under text-audiobook/.
audiobook = []
wavs = glob('combined/*wav')
for wav_path in tqdm(wavs):
    fields = wav_path.split('<>')[1:]
    t = '/'.join(fields).split('.wav')[0].replace('output-wav', 'output-text')
    with open(f'text-audiobook/{t}.wav.txt') as fopen:
        audiobook.append((wav_path, fopen.read()))

100%|██████████| 64341/64341 [00:01<00:00, 35056.61it/s]


In [12]:
# Build (wav path, transcript) pairs from haqkiem/metadata.csv, a headerless
# '|'-separated file: column 0 is the wav file stem, column 1 is the text.
df = pd.read_csv('haqkiem/metadata.csv', header = None, sep = '|')
txts = df.values.tolist()
haqkiem = []

# The last 5% of the rows is left out of this set (presumably held out for
# evaluation - confirm). The end index is computed explicitly because
# `txts[:-k]` silently becomes `txts[:0]`, i.e. an empty list, when k is 0
# (fewer than 20 rows).
holdout = int(len(txts) * 0.05)
for row in tqdm(txts[:len(txts) - holdout]):
    # Keep only the part of the text before the first '.,,' marker.
    text = row[1].split('.,,')[0]
    r = f'haqkiem/{row[0]}.wav'
    haqkiem.append((r, text))

100%|██████████| 4080/4080 [00:00<00:00, 660775.36it/s]


In [19]:
# Recursively list every .wav below `clean/` (any depth, via '**'); the
# trailing expression displays how many were found.
sani = glob(
    'clean/**/*.wav',
    recursive=True,
)
len(sani)

329515

In [20]:
# Pair every wav in `sani` with the text file that shares its name: the last
# four characters of the wav path are replaced by '.txt'. Wavs whose
# transcript is empty are left out.
sanis = []
for wav_path in tqdm(sani):
    transcript_path = wav_path[:-4] + '.txt'
    with open(transcript_path) as fopen:
        text = fopen.read()
    if not text:
        continue
    sanis.append((wav_path, text))

len(sanis)

100%|██████████| 329515/329515 [03:56<00:00, 1390.41it/s]


329359

In [21]:
# Concatenate the per-source (path, transcript) lists and transpose them into
# two parallel tuples: `audios` holds the paths, `texts` the raw transcripts.
audios, texts = zip(
    *(malays + wikipedia + news + audiobook + haqkiem + sanis)
)

In [22]:
import unicodedata
import re
import itertools

def preprocessing_text(string):
    """Normalise a transcript to the character set in `vocabs`.

    Steps, in order:
      1. lowercase the input and apply Unicode NFC normalisation;
      2. replace every character that is not in the module-level `vocabs`
         with a space;
      3. collapse each run of spaces into one space and strip both ends;
      4. shorten every run of one repeated character to at most two.

    Returns the normalised string.
    """
    lowered = unicodedata.normalize('NFC', string.lower())
    kept = [ch if ch in vocabs else ' ' for ch in lowered]
    collapsed = re.sub(r'[ ]+', ' ', ''.join(kept)).strip()
    # groupby yields one iterator per run of equal characters; only the first
    # two items of each run are taken.
    return ''.join(
        ''.join(itertools.islice(run, 2))
        for _, run in itertools.groupby(collapsed)
    )

In [23]:
# Normalise every transcript, keeping the same order as `texts` (and therefore
# as `audios`); tqdm shows progress.
processed_text = list(map(preprocessing_text, tqdm(texts)))

100%|██████████| 1502637/1502637 [01:22<00:00, 18247.45it/s]


In [24]:
# Write the manifest: 'X' holds the audio paths and 'Y' the normalised
# transcripts, index-aligned.
payload = {'X': audios, 'Y': processed_text}
with open('bahasa-asr.json', 'w') as fopen:
    json.dump(payload, fopen)