In [3]:
# !wget https://f000.backblazeb2.com/file/malaya-speech-model/data/Haqkiem_8hour_audio_text.zip
# !mkdir haqkiem
# !unzip Haqkiem_8hour_audio_text.zip -d haqkiem

In [4]:
import parselmouth
import librosa
import pyworld as pw
from sklearn.preprocessing import StandardScaler
import numpy as np
import os
import malaya_speech
from malaya_speech import Pipeline

In [5]:
import yaml

# Read the audio/feature settings from config.yaml; the bare name at the end
# makes the notebook echo the parsed mapping.
with open('config.yaml') as config_file:
    config = yaml.safe_load(config_file)

config

{'sampling_rate': 22050,
 'fft_size': 1024,
 'hop_size': 256,
 'win_length': None,
 'window': 'hann',
 'num_mels': 80,
 'fmin': 80,
 'fmax': 7600,
 'global_gain_scale': 1.0,
 'trim_silence': True,
 'trim_threshold_in_db': 60,
 'trim_frame_size': 2048,
 'trim_hop_size': 512}

In [6]:
import numpy as np

# https://github.com/TensorSpeech/TensorFlowTTS/blob/master/tensorflow_tts/utils/outliers.py
def is_outlier(x, p25, p75):
    """Tell whether ``x`` lies on or beyond the 1.5 * IQR fences.

    ``p25`` and ``p75`` are the lower and upper quartile values. The fences
    sit 1.5 interquartile ranges outside them, and a value exactly on a
    fence counts as an outlier.
    """
    whisker = 1.5 * (p75 - p25)
    return x <= p25 - whisker or x >= p75 + whisker


def remove_outlier(x, p_bottom: int = 25, p_top: int = 75):
    """Overwrite the outliers of ``x`` in place and return ``x``.

    Parameters
    ----------
    x : np.ndarray
        Values to clean. The array is modified in place.
    p_bottom, p_top : int
        Percentile ranks used as the lower and upper quartile.

    Returns
    -------
    np.ndarray
        The same array object. Every element on or outside the
        1.5 * (upper - lower) fences is replaced by the maximum of the
        array taken after those elements were set to 0.0, so the fill
        value is never below zero.

    Empty input is returned unchanged, because ``np.percentile`` and
    ``np.max`` are not defined for it.
    """
    if np.size(x) == 0:
        return x

    lower_q = np.percentile(x, p_bottom)
    upper_q = np.percentile(x, p_top)
    whisker = 1.5 * (upper_q - lower_q)

    # Same inclusive bounds as is_outlier(), evaluated for all elements at once.
    outlier_mask = (x <= lower_q - whisker) | (x >= upper_q + whisker)

    # Zero the outliers first so they cannot be picked by np.max below.
    x[outlier_mask] = 0.0
    x[outlier_mask] = np.max(x)
    return x

In [7]:
import re

# Multi-character control tokens; they take the first three positions.
_pad = 'pad'
_start = 'start'
_eos = 'eos'
# Single-character symbols, each one becoming its own list entry.
_special = '-'
_punctuation = "!'(),.:;? "
_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'

# Symbol inventory: tts_encode() uses a symbol's position here as its id.
MALAYA_SPEECH_SYMBOLS = [
    _pad, _start, _eos,
    *_special,
    *_punctuation,
    *_letters,
]

In [8]:
def tts_encode(string: str, add_eos: bool = True):
    """Translate ``string`` into a list of symbol ids.

    Each character is replaced by its position in MALAYA_SPEECH_SYMBOLS;
    characters that are not in that list are dropped. When ``add_eos`` is
    true, the position of the 'eos' token is appended at the end.
    """
    ids = []
    for char in string:
        if char in MALAYA_SPEECH_SYMBOLS:
            ids.append(MALAYA_SPEECH_SYMBOLS.index(char))
    if add_eos:
        ids.append(MALAYA_SPEECH_SYMBOLS.index('eos'))
    return ids

In [9]:
from unidecode import unidecode
import malaya

# Shared malaya normalizer used by cleaning(); built once with the date, time
# and money options switched off.
normalizer = malaya.normalize.normalizer(
    date=False,
    time=False,
    money=False,
)

def put_spacing_num(string):
    """Separate runs of ASCII letters from whatever surrounds them.

    Every run of letters is wrapped in single spaces (so 'abc123' becomes
    'abc 123'), repeated spaces are then collapsed to one, and leading and
    trailing whitespace is stripped.
    """
    padded = re.sub('[A-Za-z]+', r' \g<0> ', string)
    collapsed = re.sub(r'[ ]+', ' ', padded)
    return collapsed.strip()

def convert_to_ascii(string):
    """Return ``string`` after passing it through ``unidecode``."""
    transliterated = unidecode(string)
    return transliterated

# collapse_whitespace() needs this pattern. It was referenced but not defined
# anywhere in the visible notebook, so calling the helper raised NameError.
_whitespace_re = re.compile(r'\s+')


def collapse_whitespace(string):
    """Replace every run of whitespace in ``string`` with a single space.

    Leading and trailing whitespace is collapsed to one space as well; it
    is not stripped.
    """
    return re.sub(_whitespace_re, ' ', string)

def cleaning(string, normalize = True, add_eos = False):
    """Clean a transcript and encode it as symbol ids.

    Steps, in order:
      1. transliterate with ``convert_to_ascii`` and replace '&' by ' dan ';
      2. collapse repeated spaces and strip the ends;
      3. drop one trailing '-' or ',' and make sure the text ends with '.';
      4. optionally run the shared ``normalizer`` and keep its 'normalize' field;
      5. put spaces around letter runs, drop characters that are not in
         MALAYA_SPEECH_SYMBOLS, collapse spaces again and lowercase.

    Parameters
    ----------
    string : str
        Raw transcript.
    normalize : bool
        Run step 4 when true.
    add_eos : bool
        Forwarded to ``tts_encode``.

    Returns
    -------
    tuple
        ``(cleaned_text, ids)`` where ``ids`` is
        ``tts_encode(cleaned_text, add_eos = add_eos)``.
    """
    string = convert_to_ascii(string)
    string = string.replace('&', ' dan ')
    string = re.sub(r'[ ]+', ' ', string).strip()
    # endswith() rather than string[-1]: an empty transcript, or one that
    # becomes empty after dropping its last character, must not raise IndexError.
    if string.endswith(('-', ',')):
        string = string[:-1]
    if not string.endswith('.'):
        string = string + '.'
    if normalize:
        normalized = normalizer.normalize(string,
                                          check_english = False,
                                          normalize_entity = False,
                                          normalize_text = False,
                                          normalize_url = True,
                                          normalize_email = True,
                                          normalize_year = True)
        string = normalized['normalize']
    string = put_spacing_num(string)
    string = ''.join([c for c in string if c in MALAYA_SPEECH_SYMBOLS])
    string = re.sub(r'[ ]+', ' ', string).strip()
    # Lowercasing happens after filtering, so the ids are those of the lowercase text.
    string = string.lower()
    return string, tts_encode(string, add_eos = add_eos)

  self.tok = re.compile(r'({})'.format('|'.join(pipeline)))
  self.tok = re.compile(r'({})'.format('|'.join(pipeline)))


In [10]:
import pandas as pd

# Pipe-separated metadata with no header row: column 0 holds the clip id and
# column 1 the transcript.
df = pd.read_csv('haqkiem/metadata.csv', sep='|', header=None)
df.head()

Unnamed: 0,0,1
0,LJ001-000001,Sultan Johor Sultan Ibrahim Iskandar selamat t...
1,LJ002-000001,Menerusi entri terbaharu dalam laman Facebook ...
2,LJ003-000001,Kepulangan Sultan Ibrahim disambut oleh Tunku ...
3,LJ004-000001,Sultan Ibrahim berlepas ke luar negara pada mi...
4,LJ005-000001,Kepulangan Sultan Ibrahim dijangka dapat menye...


In [11]:
# Number of rows and columns in the metadata table.
df.shape

(4294, 2)

In [12]:
# Plain Python list of [clip id, transcript] rows, so slices of it can be
# handed to process(); the last line shows the first row.
txts = df.values.tolist()
txts[0]

['LJ001-000001',
 'Sultan Johor Sultan Ibrahim Iskandar selamat tiba di Lapangan Terbang Antarabangsa Senai malam tadi.,,,,,,,,']

In [17]:
# Voice-activity detector shared by process(), which calls it once per 30 ms
# frame of the 16 kHz copy of each clip.
vad = malaya_speech.vad.webrtc()

def process(txts, silent_trail = 500, maxlen = 25):
    """Load, filter and silence-trim the clips for one batch of metadata rows.

    Parameters
    ----------
    txts : sequence
        One-element container (the notebook passes a 1-tuple) whose first
        item is the list of ``[clip_id, transcript]`` rows to process.
    silent_trail : int
        Number of samples, at ``config['sampling_rate']``, kept at each
        edge of a non-speech segment.
    maxlen : float
        Clips longer than this many seconds are skipped.

    Returns
    -------
    list
        ``[[audios, text_ids]]``: float32 waveforms and the matching
        ``cleaning()`` results, in input order, without the skipped rows.

    Relies on the notebook globals ``config``, ``vad``, ``cleaning`` and
    ``tqdm`` being defined before it is called.
    """
    rows = txts[0]
    sample_rate = config['sampling_rate']
    vad_sample_rate = 16000
    frame_ms = 30
    audios, text_ids = [], []

    for row in tqdm(rows):
        clip_id, raw_text = row[0], row[1]

        # Transcripts carry trailing CSV residue such as '.,,,,,,,,'; keep
        # only what precedes it.
        text = raw_text.split('.,,')[0]
        # Rows containing amounts like 'RM 1,234.50' or 'RM 12.50' are dropped.
        if re.search(r'(RM \d+,\d+\.\d+|RM \d+\.\d+)', text):
            continue

        text = cleaning(f'{text} .')
        audio, _ = malaya_speech.load(f'haqkiem/{clip_id}.wav', sr = sample_rate)

        if (len(audio) / sample_rate) > maxlen:
            print('skipped, audio too long')
            continue

        if config['trim_silence']:
            # The detector is fed an integer 16 kHz copy; its decision for
            # frame i is attached to frame i of the original-rate audio.
            vad_audio = malaya_speech.resample(audio, sample_rate, vad_sample_rate)
            vad_audio = malaya_speech.astype.float_to_int(vad_audio)
            frames = list(malaya_speech.generator.frames(audio, frame_ms, sample_rate))
            vad_frames = list(malaya_speech.generator.frames(vad_audio, frame_ms, vad_sample_rate, append_ending_trail = False))
            labelled = [(frames[no], vad(frame)) for no, frame in enumerate(vad_frames)]
            grouped = malaya_speech.group.group_frames(labelled)
            # NOTE(review): 0.1 is presumably a minimum group length in
            # seconds; confirm against the malaya_speech documentation.
            grouped = malaya_speech.group.group_frames_threshold(grouped, 0.1)

            last = len(grouped) - 1
            segments = []
            for no, group in enumerate(grouped):
                samples = group[0].array
                if not group[1]:
                    if no == 0:
                        # Leading non-speech keeps only its final 200 samples
                        # (hard-coded, independent of silent_trail).
                        samples = samples[-200:]
                    elif no == last:
                        samples = samples[:silent_trail]
                    elif len(samples) > 2 * silent_trail:
                        # Mid-clip pause: keep silent_trail samples at each edge.
                        # The explicit start index avoids samples[-0:], which
                        # would return the whole segment when silent_trail is 0.
                        samples = np.concatenate([
                            samples[:silent_trail],
                            samples[len(samples) - silent_trail:],
                        ])
                    # Otherwise the pause is already short enough; taking head
                    # and tail would overlap and duplicate samples, so keep it whole.
                segments.append(samples)

            # np.concatenate rejects an empty list; leave the audio untrimmed then.
            if segments:
                audio = np.concatenate(segments)

        # Extend the clip by fft_size samples, repeating the last sample value.
        audio = np.pad(audio, (0, config["fft_size"]), mode="edge")
        audios.append(audio.astype(np.float32))
        text_ids.append(text)

    return [[audios, text_ids]]

In [15]:
import matplotlib.pyplot as plt
import IPython.display as ipd
from tqdm import tqdm

In [18]:
# Smoke-test process() on ten rows. It reads its rows from element 0 of the
# argument, hence the one-element tuple; [0] unwraps the [audios, text_ids] pair.
i = 135
r = process((txts[i: i + 10],))[0]

100%|███████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 70.48it/s]


In [19]:
# Index of the processed clip to listen to in the preview cell.
k = 0

In [20]:
# Play the k-th trimmed clip at the rate it was loaded with, rather than a
# hard-coded copy of that number.
ipd.Audio(r[0][k], rate = config['sampling_rate'])

In [21]:
import mp

audios, text_ids = [], []

# Feed the rows to mp.multiprocessing in slices of 1000 (slicing already stops
# at the end of the list) and merge each returned [audios, text_ids] pair.
for start in range(0, len(txts), 1000):
    batch = txts[start: start + 1000]
    for partial in mp.multiprocessing(batch, process, cores = 5, returned = True):
        audios.extend(partial[0])
        text_ids.extend(partial[1])

100%|█████████████████████████████████████████████████████████████████████████████████████████| 200/200 [00:02<00:00, 70.62it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████| 200/200 [00:02<00:00, 67.35it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████| 200/200 [00:03<00:00, 63.87it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████| 200/200 [00:03<00:00, 59.53it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████| 200/200 [00:04<00:00, 44.84it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████| 200/200 [00:04<00:00, 48.23it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████| 200/200 [00:04<00:00, 47.90it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████| 2

In [23]:
from glob import glob
import soundfile as sf
import numpy as np
import os

In [24]:
# Output folder for the processed clips, anchored at the current working directory.
directory = os.path.join(os.getcwd(), 'haqkiem-audio')
directory

'/home/ubuntu/speech-bahasa/haqkiem-audio'

In [25]:
# Create the output folder from Python: re-running the cell is then a no-op
# instead of an error, and paths containing spaces are handled.
os.makedirs(directory, exist_ok = True)

In [26]:
# Metadata rows versus processed clips. process() skips rows matching its
# 'RM' amount pattern and clips longer than maxlen seconds, so the second
# number can be smaller.
len(df), len(audios)

(4294, 4289)

In [28]:
# Each entry is the (cleaned text, ids) pair returned by cleaning(); show the
# cleaned text of the first clip.
text_ids[0][0]

'sultan johor sultan ibrahim iskandar selamat tiba di lapangan terbang antarabangsa senai malam tadi .'

In [31]:
from tqdm import tqdm

# Clips and transcripts are appended in lockstep by process(); fail loudly if
# they ever get out of step instead of pairing audio with the wrong text.
assert len(audios) == len(text_ids), 'audios and text_ids must have the same length'

haqkiem = []
for i, (audio, text) in enumerate(tqdm(zip(audios, text_ids), total = len(audios))):
    left = os.path.join(directory, f'{i}.wav')
    # Write with the rate the audio was loaded at, not a hard-coded 22050.
    sf.write(left, audio, config['sampling_rate'])

    # text is the (cleaned text, ids) pair from cleaning(); keep the text only.
    haqkiem.append((left, text[0]))

100%|██████████████████████████████████████████████████████████████████████████████████████| 4289/4289 [00:07<00:00, 548.85it/s]


In [33]:
import json

# Save the (wav path, cleaned text) pairs; JSON stores each tuple as a
# two-item array.
with open('haqkiem-vits.json', 'w') as manifest_file:
    json.dump(haqkiem, manifest_file)