In [1]:
# Make sure to run prepare-malay-stt-train.ipynb first.

In [2]:
# !wget https://f000.backblazeb2.com/file/malay-dataset/speech/imda/part3-same-splitted.tar
# !tar -xf part3-same-splitted.tar

In [3]:
# !wget https://f000.backblazeb2.com/file/malay-dataset/speech/imda/TTS.zip
# !unzip TTS.zip

In [4]:
# !wget https://f000.backblazeb2.com/file/malay-dataset/speech/imda/part1.tar
# !tar -xf part1.tar
# !wget https://f000.backblazeb2.com/file/malay-dataset/speech/imda/SCRIPT.zip
# !unzip -o SCRIPT.zip -d WAVE-text
# !rm SCRIPT.zip

In [5]:
import os

# Blank out CUDA_VISIBLE_DEVICES before the speech libraries further down are imported.
# NOTE(review): presumably this forces CPU-only execution for this notebook — confirm.
os.environ['CUDA_VISIBLE_DEVICES'] = ''

In [6]:
import pandas as pd
from glob import glob
from tqdm import tqdm
from unidecode import unidecode
import random
import json

In [7]:
# Root folder that the script/audio paths built below are joined onto.
# NOTE(review): hard-coded absolute path for one machine; adjust before running elsewhere.
base_directory = '/home/husein/speech-bahasa'

In [8]:
# Read the FEMALE_01 script file; each usable row is "<utterance id>\t<transcript>".
with open(f'{base_directory}/CHANNEL0/SCRIPT/FEMALE_01.txt') as fopen:
    texts = fopen.read().split('\n')

# Start the (wav path, transcript) list. Rows that do not split into exactly
# two tab-separated fields (e.g. a trailing empty line) are dropped.
singlishs = [
    (f'{base_directory}/CHANNEL0/WAVE/FEMALE_01/{fields[0]}.wav', fields[1])
    for fields in (row.split('\t') for row in texts)
    if len(fields) == 2
]

len(singlishs)

6033

In [9]:
# Script files unpacked from SCRIPT.zip into WAVE-text/ (see the download cell above).
wave_texts = glob('WAVE-text/*.TXT')

for f in tqdm(wave_texts):
    # The file stem encodes where the audio lives: the last character is used
    # as the SESSION number and the characters between the first and the last
    # as the SPEAKER id. basename() is used instead of split('/')[1] so the
    # parsing does not depend on the directory depth or path separator.
    speaker = os.path.basename(f).replace('.TXT', '')
    channel = speaker[-1]
    speaker = speaker[1:-1]

    with open(f) as fopen:
        # Drop empty lines, then keep every other remaining line.
        # NOTE(review): presumably each utterance spans two lines and only the
        # first one is wanted — confirm against the corpus format.
        texts = list(filter(None, fopen.read().split('\n')))[::2]

    for text in texts:
        splitted = text.split('\t')
        # NOTE(review): unidecode presumably strips non-ASCII residue (e.g. a
        # BOM) from the utterance id — confirm.
        wav = unidecode(splitted[0])
        path = f'{base_directory}/WAVE/SPEAKER{speaker}/SESSION{channel}/{wav}.WAV'

        # A line without a tab has no transcript field; check the length first
        # so such a line is reported below instead of raising IndexError.
        has_transcript = len(splitted) > 1 and len(splitted[1]) > 0
        if has_transcript and os.path.exists(path):
            singlishs.append((path, splitted[1]))
        else:
            # Report rows that are skipped (missing audio or missing transcript).
            print(splitted, path)

100%|██████████| 2034/2034 [00:04<00:00, 453.61it/s]


In [10]:
# Collect every wav path under part3-splitted/wav; the matching transcripts
# are looked up from these paths in the following cell.
singlish = glob(f'{base_directory}/part3-splitted/wav/*.wav')
len(singlish)

1261328

In [11]:
def replace_paralinguistic(string, replaces = ['(ppb)', '(ppc)', '(ppl)', '(ppo)', '<UNK>', '<MANDARIN>']):
    """
    Strip annotation markers from a transcript.

    Every substring listed in `replaces` is replaced by a space, the result is
    split on whitespace, and any token that starts with '<' or '[' or ends
    with '>' or ']' is discarded. The surviving tokens are returned joined by
    single spaces.
    """
    for marker in replaces:
        string = string.replace(marker, ' ')

    kept = []
    for token in string.split():
        # A token is dropped as soon as either end looks like a tag bracket.
        if token[0] in ('<', '[') or token[-1] in ('>', ']'):
            continue
        kept.append(token)
    return ' '.join(kept)

# Pair every part3 wav with its transcript and add it to `singlishs`.
for wav_path in tqdm(singlish):
    try:
        # The transcript sits at the same path with '/wav' swapped for '/text'
        # and '.txt' appended after the original file name.
        # NOTE(review): str.replace swaps every '/wav' occurrence in the path,
        # not only the directory component.
        transcript_path = wav_path.replace('/wav','/text') + '.txt'
        with open(transcript_path) as fopen:
            raw_text = fopen.read()

        # Skip near-empty transcripts and transcripts that are one big <...> tag.
        is_too_short = len(raw_text) < 2
        if is_too_short or (raw_text[0] == '<' and raw_text[-1] == '>'):
            continue

        # NOTE(review): the cleaned text can end up empty and is still appended.
        singlishs.append((wav_path, replace_paralinguistic(raw_text)))
    except Exception as err:
        # Best effort: report the failure (e.g. missing transcript) and move on.
        print(err)

100%|██████████| 1261328/1261328 [00:37<00:00, 33288.44it/s]


In [12]:
from sklearn.model_selection import train_test_split

In [13]:
# Hold out 2000 (path, text) pairs for testing. random_state is fixed so the
# same split is produced after a kernel restart; without it every run writes a
# different train/test partition to singlish-mixed.json.
singlishs_train, singlishs_test = train_test_split(singlishs, test_size = 2000, random_state = 42)

In [14]:
# Persist the split. The (path, text) tuples are serialised as two-element
# JSON arrays, so they come back as lists when the file is loaded again.
with open('singlish-mixed.json', 'w') as fopen:
    json.dump({'train': singlishs_train, 'test': singlishs_test}, fopen)

In [15]:
# import IPython.display as ipd

# ipd.Audio(audiobook[0][0])

In [37]:
# Transpose the training pairs into two parallel tuples: wav paths and raw transcripts.
audios, texts = zip(*singlishs_train)

In [38]:
# Sanity check: there must be exactly one transcript per audio path.
len(audios), len(texts)

(1464595, 1464595)

In [18]:
import wordsegment
# Initialise wordsegment once here, before wordsegment.segment() is called
# inside preprocessing_text.
wordsegment.load()

In [19]:
import unicodedata
import re
import itertools

# Characters allowed in a normalised transcript: the space, a-z and 0-9.
# preprocessing_text replaces every character outside this list with a space.
vocabs = [" ", "a", "e", "n", "i", "t", "o", "u", "s", "k", "r", "l", "h", "d", "m", "g", "y", "b", "p", "w", "c", "f", "j", "v", "z", "0", "1", "x", "2", "q", "5", "3", "4", "6", "9", "8", "7"]

def preprocessing_text(string):
    """
    Normalise one transcript into the character set listed in `vocabs`.

    Steps, in order: NFC-normalise, run wordsegment.segment and re-join the
    resulting words with single spaces, lowercase, delete apostrophes, turn
    every character that is not in `vocabs` into a space, squeeze runs of
    spaces and trim the ends, then shorten any run of one repeated character
    to at most two characters.
    """
    normalized = unicodedata.normalize('NFC', string)
    segmented = ' '.join(wordsegment.segment(normalized)).lower()
    without_apostrophes = segmented.replace('\'', '')
    in_vocab = ''.join(ch if ch in vocabs else ' ' for ch in without_apostrophes)
    squeezed = re.sub(r'[ ]+', ' ', in_vocab).strip()

    # groupby yields one group per run of identical consecutive characters;
    # only the first two characters of each run are kept.
    pieces = []
    for _, run in itertools.groupby(squeezed):
        pieces.append(''.join(run)[:2])
    return ''.join(pieces)

In [20]:
import mp

In [21]:
def loop(strings):
    """
    Normalise a batch of indexed transcripts.

    `strings` is a two-item sequence: a list of (index, raw_text) pairs and a
    second value that is ignored here. Returns a list of
    (index, preprocessing_text(raw_text)) pairs in the same order as the
    input, showing a tqdm progress bar while it works.
    """
    batch, _ = strings
    return [(item[0], preprocessing_text(item[1])) for item in tqdm(batch)]

In [22]:
# Tag each raw transcript with its position so the processed results can be
# put back in the original order later.
no_texts = list(enumerate(texts))

In [23]:
# Smoke-test the worker on the first 10 items; loop() ignores the second tuple element.
loop((no_texts[:10], 0))

100%|██████████| 10/10 [00:00<00:00, 43.99it/s]


[(0, 'happiest moment'),
 (1, 'yeah'),
 (2, 'this conversation so'),
 (3, 'we will still be alive'),
 (4, 'you talk to her is called emotional cheating'),
 (5,
  'in the past when only urine tests were used they had to be done once or twice a week'),
 (6,
  'country so far is one is in singapore at marina bay sands like usual another one is'),
 (7, 'i think the public nowadays is getting more and more education'),
 (8, 'is damn funny you know like or like when when dentist'),
 (9, 'ya')]

In [None]:
# Run loop() over all indexed transcripts with the project-local `mp` helper.
# NOTE(review): presumably `mp.multiprocessing` splits `no_texts` across 20
# worker processes and hands each worker a (chunk, worker_index) tuple, which
# is the shape loop() unpacks — confirm against mp.py. `cores = 20` is
# hard-coded for the original machine.
processed_text = mp.multiprocessing(no_texts, loop, cores = 20)

 73%|███████▎  | 53629/73229 [45:02<20:49, 15.69it/s]   

In [25]:
# Expect one processed result per entry in no_texts.
len(processed_text)

1464595

In [28]:
# Order the (index, text) results by their original index so they line up
# with `audios` again.
# NOTE(review): presumably needed because worker results are not guaranteed to
# come back in input order — confirm against mp.py.
processed_text_sorted = sorted(processed_text, key=lambda x: x[0])

In [29]:
# Preview: the indices should now run 0, 1, 2, ...
processed_text_sorted[:10]

[(0, 'happiest moment'),
 (1, 'yeah'),
 (2, 'this conversation so'),
 (3, 'we will still be alive'),
 (4, 'you talk to her is called emotional cheating'),
 (5,
  'in the past when only urine tests were used they had to be done once or twice a week'),
 (6,
  'country so far is one is in singapore at marina bay sands like usual another one is'),
 (7, 'i think the public nowadays is getting more and more education'),
 (8, 'is damn funny you know like or like when when dentist'),
 (9, 'ya')]

In [30]:
# Drop the index and keep only the normalised transcript, in sorted order.
text_only = [cleaned for _, cleaned in processed_text_sorted]
text_only[:10]

['happiest moment',
 'yeah',
 'this conversation so',
 'we will still be alive',
 'you talk to her is called emotional cheating',
 'in the past when only urine tests were used they had to be done once or twice a week',
 'country so far is one is in singapore at marina bay sands like usual another one is',
 'i think the public nowadays is getting more and more education',
 'is damn funny you know like or like when when dentist',
 'ya']

In [31]:
import malaya_speech

# Build a subword tokenizer from the Singlish-only transcripts and save it to disk.
# NOTE(review): max_subword_length = 3 presumably caps the length of a learned
# subword — confirm against the malaya_speech.subword documentation.
tokenizer = malaya_speech.subword.generate_tokenizer(text_only, max_subword_length = 3)
malaya_speech.subword.save(tokenizer, 'transducer-singlish.subword')

In [39]:
# Load the Malay training set.
# NOTE(review): presumably written by prepare-malay-stt-train.ipynb, with 'X'
# holding audio paths and 'Y' the matching transcripts — confirm.
with open('bahasa-asr-train.json') as fopen:
    bahasa = json.load(fopen)
    
bahasa['X'][:1]

['/home/husein/speech-bahasa/cv-corpus-5.1-2020-06-22/id/clips/common_voice_id_20425643.mp3']

In [40]:
# Sanity check before merging: Singlish audio paths and normalised transcripts
# must have the same length.
len(audios), len(text_only)

(1464595, 1464595)

In [41]:
# Prepend the Malay ("bahasa") training set to the Singlish training set.
# Both lists are rebuilt from their sources (`singlishs_train`, `text_only`)
# rather than from the previous value of `audios`, so re-running this cell
# cannot prepend bahasa['X'] a second time and silently misalign X and Y.
audios = bahasa['X'] + [pair[0] for pair in singlishs_train]
processed_text = bahasa['Y'] + list(text_only)

# Every audio path must have exactly one transcript at the same index.
assert len(audios) == len(processed_text), (len(audios), len(processed_text))

In [42]:
# Write the combined dataset as two parallel lists: 'X' (audio paths) and
# 'Y' (normalised transcripts), matched by index.
with open('mixed-asr-train.json', 'w') as fopen:
    json.dump({'X': audios, 'Y':processed_text}, fopen)

In [43]:
# Build a second tokenizer from the combined Malay + Singlish transcripts.
# This rebinds `tokenizer`; the Singlish-only one was already saved to
# 'transducer-singlish.subword'.
tokenizer = malaya_speech.subword.generate_tokenizer(processed_text, max_subword_length = 3)

In [44]:
# Save the combined-corpus tokenizer under its own file name.
malaya_speech.subword.save(tokenizer, 'transducer-mixed.subword')

In [45]:
# Reload the tokenizer from disk and encode a sample phrase; the ids are
# decoded again in the next cell as a round-trip check.
# NOTE(review): add_blank = True presumably prepends the blank id (the leading
# 0 in the output) — confirm against malaya_speech.subword.encode.
tokenizer = malaya_speech.subword.load('transducer-mixed.subword')
malaya_speech.subword.encode(tokenizer, 'i hate', add_blank = True)

[0, 2, 389, 862]

In [46]:
# Decode the ids from the previous cell; the original phrase should come back.
malaya_speech.subword.decode(tokenizer, [0, 2, 389, 862])

'i hate'

In [None]:
# from pydub import AudioSegment
# import numpy as np

# sr = 16000

# def mp3_to_wav(file, sr = sr):
#     audio = AudioSegment.from_file(file)
#     audio = audio.set_frame_rate(sr).set_channels(1)
#     sample = np.array(audio.get_array_of_samples())
#     return malaya_speech.astype.int_to_float(sample), sr

# def generator(maxlen = 18, min_length_text = 2):
#     for i in tqdm(range(len(audios))):
#         try:
#             if audios[i].endswith('.mp3'):
#                 wav_data, _ = mp3_to_wav(audios[i])
#             else:
#                 wav_data, _ = malaya_speech.load(audios[i])
                
#             if (len(wav_data) / sr) > maxlen:
#                 # print(f'skipped audio too long {audios[i]}')
#                 continue
                
#             if len(processed_text[i]) < min_length_text:
#                 print(f'skipped text too short {audios[i]}')
#                 continue    

#             yield {
#                 'waveforms': wav_data.tolist(),
#                 'waveform_lens': [len(wav_data)],
#                 'targets': malaya_speech.subword.encode(tokenizer, processed_text[i], add_blank = False),
#             }
#         except Exception as e:
#             print(e)
            
# generator = generator()

In [None]:
# import os
# import tensorflow as tf

# os.system('rm bahasa-asr/data/*')
# DATA_DIR = os.path.expanduser('bahasa-asr/data')
# tf.gfile.MakeDirs(DATA_DIR)

In [None]:
# shards = [{'split': 'train', 'shards': 1000}]

In [None]:
# import malaya_speech.train as train

# train.prepare_dataset(generator, DATA_DIR, shards, prefix = 'bahasa-asr')