In [1]:
import os

# Must run before TensorFlow is imported: an empty device list hides every
# CUDA device from the process, so the notebook runs on CPU only.
os.environ.update({'CUDA_VISIBLE_DEVICES': ''})

In [2]:
import numpy as np
import malaya_speech.train as train
import malaya_speech.config
import malaya_speech
import tensorflow as tf
from glob import glob






The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



In [3]:
import yaml

directory = '/home/husein/speech-bahasa'

# `yaml.safe_load` only constructs plain Python objects (dict/list/str/number),
# so a tampered config file cannot execute code.  A bare `yaml.load(stream)`
# without a Loader is deprecated in PyYAML 5.x and raises TypeError in 6.x.
with open(os.path.join(directory, 'config.yaml')) as fopen:
    config = yaml.safe_load(fopen)

# Display the parsed audio/feature configuration.
config

  


{'sampling_rate': 22050,
 'fft_size': 1024,
 'hop_size': 256,
 'win_length': None,
 'window': 'hann',
 'num_mels': 80,
 'fmin': 80,
 'fmax': 7600,
 'global_gain_scale': 1.0,
 'trim_silence': True,
 'trim_threshold_in_db': 20,
 'trim_frame_size': 2048,
 'trim_hop_size': 512}

In [4]:
import re

# Building blocks of the symbol table: three multi-character control tokens
# followed by single-character symbols.
_pad, _start, _eos = 'pad', 'start', 'eos'
_special = '-'
_punctuation = "!'(),.:;? "
_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'

# The position of a symbol in this list is the id looked up with `.index()`,
# so the concatenation order must not change.
MALAYA_SPEECH_SYMBOLS = [_pad, _start, _eos, *_special, *_punctuation, *_letters]

In [5]:
def tts_encode(string: str, add_eos: bool = True):
    """
    Map every character of `string` to its position in MALAYA_SPEECH_SYMBOLS.

    Characters that are not present in the symbol table are dropped silently.
    When `add_eos` is true, the id of the 'eos' token is appended at the end.

    Returns
    -------
    list of int
    """
    ids = []
    for char in string:
        if char in MALAYA_SPEECH_SYMBOLS:
            ids.append(MALAYA_SPEECH_SYMBOLS.index(char))
    if add_eos:
        ids.append(MALAYA_SPEECH_SYMBOLS.index('eos'))
    return ids

In [6]:
from unidecode import unidecode
import malaya

# Shared malaya text normalizer used by `cleaning` below.
# NOTE(review): the date/time/money flags presumably switch off normalization
# of those entity types -- confirm against the malaya documentation.
normalizer = malaya.normalize.normalizer(date = False, time = False, money = False)

def put_spacing_num(string):
    """
    Separate runs of ASCII letters from whatever surrounds them.

    Every `[A-Za-z]+` run is wrapped in single spaces (so 'RM10' becomes
    'RM 10'), then repeated spaces are squeezed to one and the ends trimmed.
    """
    def _pad_word(match):
        # Surround the matched letter run with one space on each side.
        return ' ' + match.group(0) + ' '

    spaced = re.sub('[A-Za-z]+', _pad_word, string)
    squeezed = re.sub(r'[ ]+', ' ', spaced)
    return squeezed.strip()

def convert_to_ascii(string):
    """Return `string` passed through `unidecode` (ASCII transliteration)."""
    return unidecode(string)

def collapse_whitespace(string):
    """
    Replace every run of whitespace in `string` with a single space.

    The pattern is written inline because the module-level `_whitespace_re`
    the original referenced is never defined in this notebook, so any call
    raised NameError.  Leading/trailing whitespace is collapsed to one space,
    not removed.
    """
    return re.sub(r'\s+', ' ', string)

def cleaning(string, normalize = True, add_eos = False):
    """
    Clean a raw transcript and encode it into TTS symbol ids.

    Steps, in order:
      1. transliterate to ASCII and replace '&' with ' dan ';
      2. squeeze repeated spaces and trim the ends;
      3. drop one trailing '-' or ',' and make sure the text ends with '.';
      4. optionally run the malaya `normalizer`;
      5. put spaces around letter runs, keep only characters found in
         MALAYA_SPEECH_SYMBOLS, squeeze spaces again and lowercase.

    Parameters
    ----------
    string : str
        Raw transcript.
    normalize : bool
        Run `normalizer.normalize` on the text when true.
    add_eos : bool
        Forwarded to `tts_encode`.

    Returns
    -------
    (str, list of int)
        The cleaned text and its encoded ids.  An empty or whitespace-only
        input is treated as the single sentence '.' rather than raising.
    """
    string = convert_to_ascii(string)
    string = string.replace('&', ' dan ')
    string = re.sub(r'[ ]+', ' ', string).strip()
    # `endswith` is safe on an empty string, whereas `string[-1]` raised
    # IndexError for empty input or for a lone '-' / ',' after stripping.
    if string.endswith(('-', ',')):
        string = string[:-1]
    if not string.endswith('.'):
        string = string + '.'
    if normalize:
        normalized = normalizer.normalize(string,
                                          check_english = False,
                                          normalize_entity = False,
                                          normalize_text = False,
                                          normalize_url = True,
                                          normalize_email = True,
                                          normalize_year = True)
        # The normalizer returns a mapping; the cleaned text is under 'normalize'.
        string = normalized['normalize']
    string = put_spacing_num(string)
    # Filter before lowercasing: the symbol table holds both letter cases.
    string = ''.join([c for c in string if c in MALAYA_SPEECH_SYMBOLS])
    string = re.sub(r'[ ]+', ' ', string).strip()
    string = string.lower()
    return string, tts_encode(string, add_eos = add_eos)

In [7]:
import pandas as pd

# The haqkiem metadata file is pipe-separated and has no header row; every
# row becomes a plain Python list in `txts`.
metadata_path = os.path.join(directory, 'haqkiem/metadata.csv')
df = pd.read_csv(metadata_path, sep = '|', header = None)
txts = df.values.tolist()

In [8]:
# Take the first metadata row as a sample: column 0 is the file id and
# column 1 the transcript.  Everything from the first '.,,' marker onward is
# cut off and a ' .' terminator is appended.
first_row = txts[0]
transcript = first_row[1].split('.,,')[0]
text = f'{transcript} .'
f = first_row[0]

In [9]:
# Path of the wav file that belongs to the sample row picked above.
r = os.path.join(directory, 'haqkiem', f'{f}.wav')
# Load the waveform, passing the configured sampling rate as `sr`.
# NOTE(review): the discarded second return value is presumably the sample
# rate -- confirm against malaya_speech.load.
audio, _ = malaya_speech.load(r, sr = config['sampling_rate'])

In [10]:
# string, cleaning(text)

In [11]:
# Mel features of the sample waveform; the plotting cell reshapes this to
# (-1, 80) before drawing it.
universal_mel = malaya_speech.featurization.universal_mel(audio)

In [12]:
import matplotlib.pyplot as plt

# Two stacked panels: mel features on top, raw waveform underneath.
nrows = 2
fig, ax = plt.subplots(nrows = nrows, ncols = 1)
fig.set_size_inches(10, nrows * 3)

# Top panel: features reshaped to (frames, 80) and rotated 90 degrees.
mel_outputs_ = np.reshape(universal_mel, (-1, 80))
im = ax[0].imshow(np.rot90(mel_outputs_), interpolation='none', aspect='auto')
fig.colorbar(im, ax=ax[0], orientation='horizontal', shrink=0.65)

# Bottom panel: waveform samples.
ax[1].plot(audio)
plt.show()

<Figure size 1000x600 with 3 Axes>

In [13]:
# Collect (wav path, transcript) pairs for every haqkiem metadata row, using
# the same transcript clean-up as the single-row sample: cut at '.,,' and
# append ' .'.
dataset = []
for row in txts:
    f = row[0]
    text = row[1].split('.,,')[0]
    text = f'{text} .'
    r = os.path.join(directory, 'haqkiem', f'{f}.wav')
    dataset.append((r, text))
dataset[:10]

[('/home/husein/speech-bahasa/haqkiem/LJ001-000001.wav',
  'Sultan Johor Sultan Ibrahim Iskandar selamat tiba di Lapangan Terbang Antarabangsa Senai malam tadi .'),
 ('/home/husein/speech-bahasa/haqkiem/LJ002-000001.wav',
  'Menerusi entri terbaharu dalam laman Facebook rasminya Sultan Ibrahim tiba pada jam lapan sepuluh malam tadi .'),
 ('/home/husein/speech-bahasa/haqkiem/LJ003-000001.wav',
  'Kepulangan Sultan Ibrahim disambut oleh Tunku Bendahara Johor Tunku Abdul Majid Idris Iskandar .'),
 ('/home/husein/speech-bahasa/haqkiem/LJ004-000001.wav',
  'Sultan Ibrahim berlepas ke luar negara pada minggu lalu atas lawatan peribadi .'),
 ('/home/husein/speech-bahasa/haqkiem/LJ005-000001.wav',
  'Kepulangan Sultan Ibrahim dijangka dapat menyelesaikan polemik jawatan Menteri Besar .'),
 ('/home/husein/speech-bahasa/haqkiem/LJ006-000001.wav',
  'Tun Dr Mahathir Mohamad memaklumkan Datuk Osman Sapian sudah meletakkan jawatan berkuat kuasa Isnin lalu .'),
 ('/home/husein/speech-bahasa/haqkiem/

In [14]:
import json

with open(os.path.join(directory, 'true-case-female.json')) as fopen:
    txts = json.load(fopen)

# Each entry `t` is indexed as: t[0] = path of the text file, t[2] = the
# transcript that is stored in the dataset.
for t in txts:
    f = t[0]
    f = f.replace('../youtube/malay/', '').replace('../youtube/malay2/', '')
    # Skip transcripts containing 'RM'/'rm' followed by digits.  The check
    # must look at this entry's own transcript; the original tested the
    # stale global `text` left over from the haqkiem loop, which made the
    # filter constant for every entry.
    # NOTE(review): assumes t[2] (the value appended below) is the text the
    # filter was meant for -- confirm against the JSON schema.
    if re.match('^.*(RM|rm)[0-9]+.*$', t[2]):
        continue
    # Derive the wav path from the text path.
    r = f.replace('output-text', 'output-wav').replace('.txt', '')
    r = os.path.join(directory, r)
    dataset.append((r, t[2]))

In [15]:
# Peek at the most recently appended (wav path, transcript) pair.
dataset[-1]

('/home/husein/speech-bahasa/salina-short/output-wav-salina/dua-puluh-tujuh-28.mp3-735.wav',
 'Sehingga menampakkan bentuk badannya yang menggiurkan.')

In [16]:
with open(os.path.join(directory, 'true-case-male.json')) as fopen:
    txts = json.load(fopen)

# Each entry `t` is indexed as: t[0] = path of the text file, t[2] = the
# transcript that is stored in the dataset.
for t in txts:
    f = t[0]
    f = f.replace('../youtube/malay/', '').replace('../youtube/malay2/', '')
    # Skip transcripts containing 'RM'/'rm' followed by digits.  The check
    # must look at this entry's own transcript; the original tested the
    # stale global `text` left over from an earlier cell, which made the
    # filter constant for every entry.
    # NOTE(review): assumes t[2] (the value appended below) is the text the
    # filter was meant for -- confirm against the JSON schema.
    if re.match('^.*(RM|rm)[0-9]+.*$', t[2]):
        continue
    # Derive the wav path from the text path.
    r = f.replace('output-text', 'output-wav').replace('.txt', '')
    r = os.path.join(directory, r)
    dataset.append((r, t[2]))

In [17]:
# Peek at the most recently appended (wav path, transcript) pair.
dataset[-1]

('/home/husein/speech-bahasa/dari-pasentran-ke-istana-short/output-wav-dari-pasentran-ke-istana/islam-politik-dan-pilihanraya-10.mp3-1579.wav',
 'Dapat dirasakan')

In [18]:
with open(os.path.join(directory, 'transcript-news.json')) as fopen:
    transcribe = json.load(fopen)

audios = glob(os.path.join(directory, 'audio/*.wav'))

# Each wav is named '<index>.wav'; that integer selects its transcript.
txts = []
for f in audios:
    stem = f.split('/')[-1].replace('.wav', '')
    t = transcribe[int(stem)]
    txts.append(t)

news = [(wav_path, transcript) for wav_path, transcript in zip(audios, txts)]

In [21]:
audios = glob(os.path.join(directory, 'audio-iium/*.wav'))
with open(os.path.join(directory, 'shuffled-iium.json')) as fopen:
    transcribe = json.load(fopen)

# Each wav is named '<index>.wav'; that integer selects its transcript.
txts = []
for f in audios:
    stem = f.split('/')[-1].replace('.wav', '')
    t = transcribe[int(stem)]
    txts.append(t)

iium = [(wav_path, transcript) for wav_path, transcript in zip(audios, txts)]

In [24]:
audios = glob(os.path.join(directory, 'audio-wattpad/*.wav'))
with open(os.path.join(directory, 'transcript-wattpad.json')) as fopen:
    transcribe = json.load(fopen)

# Each wav is named '<index>.wav'; that integer selects its transcript.
txts = []
for f in audios:
    stem = f.split('/')[-1].replace('.wav', '')
    t = transcribe[int(stem)]
    txts.append(t)

wattpad = [(wav_path, transcript) for wav_path, transcript in zip(audios, txts)]

In [26]:
# Append the three extra corpora to `dataset` in place, in this order.
# Re-running this cell appends them again.
dataset.extend([*news, *iium, *wattpad])

In [27]:
# Total number of (wav path, transcript) pairs collected so far.
len(dataset)

84437

In [28]:
# Show the first ten (wav path, transcript) pairs as a sanity check.
dataset[:10]

[('/home/husein/speech-bahasa/haqkiem/LJ001-000001.wav',
  'Sultan Johor Sultan Ibrahim Iskandar selamat tiba di Lapangan Terbang Antarabangsa Senai malam tadi .'),
 ('/home/husein/speech-bahasa/haqkiem/LJ002-000001.wav',
  'Menerusi entri terbaharu dalam laman Facebook rasminya Sultan Ibrahim tiba pada jam lapan sepuluh malam tadi .'),
 ('/home/husein/speech-bahasa/haqkiem/LJ003-000001.wav',
  'Kepulangan Sultan Ibrahim disambut oleh Tunku Bendahara Johor Tunku Abdul Majid Idris Iskandar .'),
 ('/home/husein/speech-bahasa/haqkiem/LJ004-000001.wav',
  'Sultan Ibrahim berlepas ke luar negara pada minggu lalu atas lawatan peribadi .'),
 ('/home/husein/speech-bahasa/haqkiem/LJ005-000001.wav',
  'Kepulangan Sultan Ibrahim dijangka dapat menyelesaikan polemik jawatan Menteri Besar .'),
 ('/home/husein/speech-bahasa/haqkiem/LJ006-000001.wav',
  'Tun Dr Mahathir Mohamad memaklumkan Datuk Osman Sapian sudah meletakkan jawatan berkuat kuasa Isnin lalu .'),
 ('/home/husein/speech-bahasa/haqkiem/

In [30]:
# Write the full list of (wav path, transcript) pairs to a JSON file in the
# current working directory; tuples are serialized as JSON arrays.
with open('force-alignment-malay-tts-dataset.json', 'w') as fopen:
    json.dump(dataset, fopen)