In [1]:
import os

os.environ['CUDA_VISIBLE_DEVICES'] = '0'
os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'

In [2]:
import malaya_speech
import random
import numpy as np
from sklearn.utils import shuffle
from sklearn.utils.random import sample_without_replacement

`pyaudio` is not available, `malaya_speech.streaming.pyaudio` is not able to use.


In [3]:
model = malaya_speech.force_alignment.transducer.transformer(model = 'conformer-transducer')
singlish_model = malaya_speech.force_alignment.transducer.transformer(model = 'conformer-transducer-singlish')

2023-03-12 13:03:16.009404: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-12 13:03:16.064381: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-03-12 13:03:16.067117: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-03-12 13:03:16.067938: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zer

In [5]:
import os
import json
from glob import glob
import unicodedata
import re
import itertools
import malaya
from tqdm import tqdm
from malaya.text.normalization import cardinal

tokenizer = malaya.tokenizer.Tokenizer(hypen = False, parliament = False, time = False, time_pukul = False,
                                      temperature = False, distance = False, volume = False, duration = False,
                                      weight = False, date = False, money = False)

vocabs = [" ", "a", "e", "n", "i", "t", "o", "u", "s", "k", "r", "l", "h", "d", "m", "g", "y", "b", "p", "w", "c", "f", "j", "v", "z", "0", "1", "x", "2", "q", "5", "3", "4", "6", "9", "8", "7"]

def preprocessing_text(string):
    
    tokenized = tokenizer.tokenize(string)
    string = ' '.join(tokenized)
    string = unicodedata.normalize('NFC', string.lower())
    string = ''.join([c if c in vocabs else ' ' for c in string])
    string = re.sub(r'[ ]+', ' ', string).strip()
    string = (
        ''.join(''.join(s)[:2] for _, s in itertools.groupby(string))
    )
    return string

In [6]:
with open('/home/husein/malaya-speech/singlish-stt-train.json') as fopen:
    sg = json.load(fopen)

In [7]:
base_directory = '/home/husein/ssd2'

files = glob('/home/husein/malaya/postprocessing*.json')
text = []
for f in files:
    with open(f) as fopen:
        text.extend(json.load(fopen))
        
azure = []
for t in tqdm(text):
    index = os.path.split(t[0])[1].replace('.json', '')
    wav = t[0].replace('-text', '-wav').replace('.json', '.wav')
    wav = os.path.join(base_directory, wav)
    if os.path.exists(wav):
        azure.append((wav, preprocessing_text(t[1])))

100%|████████████████████████████████| 319344/319344 [00:19<00:00, 16379.67it/s]


In [8]:
azure[:10]

[('/home/husein/ssd2/yasmin-news-edge-tts-wav/138.wav',
  'ketika perang aceh meletus pada tahun seribu lapan ratus tujuh puluh tiga teuku ibrahim lamnga aktif berjuang di garisan depan'),
 ('/home/husein/ssd2/yasmin-news-edge-tts-wav/32958.wav',
  'kementeri kesihatannya dalam kenyataan berkata mangsa saya amat naseer dua puluh dua terbunuh dalam serangan di utara gaza'),
 ('/home/husein/ssd2/yasmin-news-edge-tts-wav/10582.wav',
  'kedudukan dan statistik penyokong macedonian di belgrade arena'),
 ('/home/husein/ssd2/yasmin-news-edge-tts-wav/456.wav',
  'sebagai ahli perniagaan beliau memiliki aset aset perniagaan termasuk siti nurhaliza productions media massa simplysiti kosmetik dan creacion fesyen'),
 ('/home/husein/ssd2/yasmin-news-edge-tts-wav/18717.wav',
  'antara pokok utama ialah getah acacia buluh kelapa paya bakau dan kayu jati'),
 ('/home/husein/ssd2/yasmin-news-edge-tts-wav/30899.wav',
  'menurut politisi partai golkar tersebut beberapa sektor masih menjadi fokus pemerinta

In [9]:
with open('/home/husein/speech-bahasa/gtts-text.json') as fopen:
    texts = json.load(fopen)

In [10]:
wavenet = []
speakers = ['A', 'B', 'C', 'D']

for no, t in enumerate(texts):
    for s in speakers:
        wav = f'ms-MY-Wavenet-{s}/{no}.mp3'
        wav = os.path.join(base_directory, wav)
        if os.path.exists(wav):
            wavenet.append((wav, preprocessing_text(t[1])))
        
len(wavenet)

124276

In [11]:
import pandas as pd

df = pd.read_csv('/home/husein/ssd2/haqkiem/metadata.csv', header = None, sep = '|')
txts = df.values.tolist()

haqkiem = []
for f in txts:
    text = f[1]
    f = f[0]
    wav = f'haqkiem/{f}.wav'
    wav = os.path.join(base_directory, wav)
    
    text = text.split('.,,')[0]
    if len(re.findall(r'(RM \d+,\d+\.\d+|RM \d+\.\d+)', text)):
        continue

    text = preprocessing_text(text)
    if os.path.exists(wav):
        haqkiem.append((wav, text))
        
len(haqkiem)

4289

In [12]:
import random

clean = azure + wavenet + haqkiem
random.shuffle(clean)

In [13]:
from datasets import Audio

sr = 16000
minimum = int(0.3 * sr)
audio = Audio(sampling_rate=sr)

In [14]:
def groupby(alignment, length, min_threshold = 0.3):
    r = []
    g = []
    for no, row in enumerate(alignment):
        
        if no > 0 and len(r) and alignment[no]['start'] - alignment[no-1]['end'] >= min_threshold:
            g.append(r)
            r = []
        
        r.append(row)

    return g

In [15]:
directory = 'augmentation-switchboard-v4'
!mkdir {directory}

mkdir: cannot create directory ‘augmentation-switchboard-v4’: File exists


In [16]:
import torchaudio
import torch

In [18]:
asr = malaya_speech.stt.transducer.pt_transformer(model = 'mesolitica/conformer-medium-mixed')
_ = asr.cuda()
_ = asr.eval()

In [20]:
def calculate_wer(actual, hyp):
    """
    Calculate WER using `python-Levenshtein`.
    """
    import Levenshtein as Lev

    b = set(actual.split() + hyp.split())
    word2char = dict(zip(b, range(len(b))))

    w1 = [chr(word2char[w]) for w in actual.split()]
    w2 = [chr(word2char[w]) for w in hyp.split()]

    return Lev.distance(''.join(w1), ''.join(w2)) / len(actual.split())

In [None]:
data = {}
for O in tqdm(range(2000000)):
    try:
        i_ms = sample_without_replacement(len(clean), 3)
        i_sg = sample_without_replacement(len(sg['X']), 3)

        groups = []
        for i in i_ms:
            x = clean[i][0]
            v = clean[i][1]
            y = audio.decode_example(audio.encode_example(x))['array']
                
            result = model.predict(y, v)

            if ' '.join([r['text'] for r in result['words_alignment']]) != v:
                continue

            grouped = groupby(result['words_alignment'], len(y) / sr)
            # print(grouped)
            for g in grouped:
                y_ = y[int(g[0]['start'] * sr): int(g[-1]['end'] * sr) + minimum]
                if len(y_):
                    v_ = [g_['text'] for g_ in g]
                    groups.append((y_, v_))

        for i in i_sg:
            x = sg['X'][i]
            v = sg['Y'][i]
            y = audio.decode_example(audio.encode_example(x))['array']
            asr_ = asr.forward([y])[0]
            
            if asr_[0][1][0] < 0.1 and calculate_wer(v, asr_[0][0]) > 0.1:
                # print(f'skip sg {i}', v, asr_[0][0], asr_[0][1][0])
                continue
                
            result = singlish_model.predict(y, v)

            if ' '.join([r['text'] for r in result['words_alignment']]) != v:
                continue

            grouped = groupby(result['words_alignment'], len(y) / sr)
            for g in grouped:
                y_ = y[int(g[0]['start'] * sr): int(g[-1]['end'] * sr) + minimum]
                if len(y_):
                    v_ = [g_['text'] for g_ in g]
                    groups.append((y_, v_))

        groups = shuffle(groups)
        l = 0
        combine_y, combine_v = [], []
        index = 0
        while l < 15 and index < len(groups):
            l_ = len(groups[index][0]) / sr
            if l_ < 1.0:
                index += 1
                continue
            l += l_
            combine_y.append(groups[index][0] / np.abs(groups[index][0]).max())
            combine_v.extend(groups[index][1])
            index += 1

        if len(combine_v):
            audio_path = f'{directory}/{O}.mp3'
            torchaudio.save(audio_path, 
                            torch.tensor(np.concatenate(combine_y).astype('float32')).unsqueeze(0), 
                            sr, format='mp3')
            data[O] = ' '.join(combine_v)
    except Exception as e:
        print(e)

  0%|                                | 1497/2000000 [46:42<895:55:08,  1.61s/it]

Calculated padded input size per channel: (0). Kernel size: (1). Kernel size can't be greater than actual input size


  0%|                              | 2320/2000000 [1:12:18<797:11:39,  1.44s/it]

Calculated padded input size per channel: (0). Kernel size: (1). Kernel size can't be greater than actual input size


  1%|▏                            | 13053/2000000 [6:48:54<861:51:10,  1.56s/it]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

  2%|▍                           | 35069/2000000 [18:15:01<991:16:35,  1.82s/it]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

  2%|▌                           | 37830/2000000 [19:41:18<945:16:22,  1.73s/it]

Calculated padded input size per channel: (0). Kernel size: (1). Kernel size can't be greater than actual input size


  3%|▋                           | 50718/2000000 [26:26:08<970:09:07,  1.79s/it]

In [26]:
len(data)

58241

In [28]:
data[0]

'a new television show documented the lives of various people including wong hong suen cuma tanggugjawab kita juga ialah membantu rakyat malaysia'

In [29]:
import IPython.display as ipd
ipd.Audio(f'{directory}/0.mp3')

In [27]:
with open('augment-switchboard-v4.json', 'w') as fopen:
    json.dump(data, fopen)