In [2]:
import os

os.environ['CUDA_VISIBLE_DEVICES'] = '0'
os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'

In [3]:
import malaya_speech
import random
import numpy as np
import malaya
from sklearn.utils import shuffle
from sklearn.utils.random import sample_without_replacement

`pyaudio` is not available, `malaya_speech.streaming.pyaudio` is not able to use.


In [4]:
np.zeros(shape = (random.randint(1000, 16000),))

array([0., 0., 0., ..., 0., 0., 0.])

In [5]:
malaya_speech.force_alignment.transducer.available_transformer()

Unnamed: 0,Size (MB),Quantized Size (MB),Language
conformer-transducer,120,32.3,[malay]
conformer-transducer-mixed,120,32.3,"[malay, singlish]"
conformer-transducer-singlish,120,32.3,[singlish]


In [6]:
model = malaya_speech.force_alignment.transducer.transformer(model = 'conformer-transducer', device = 'gpu:0')
singlish_model = malaya_speech.force_alignment.transducer.transformer(model = 'conformer-transducer-singlish', device = 'gpu:0')

2023-03-24 23:25:06.678091: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-24 23:25:06.699786: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-03-24 23:25:06.711945: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-03-24 23:25:06.712805: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zer

In [7]:
asr = malaya_speech.stt.transducer.pt_transformer(model = 'mesolitica/conformer-medium-mixed')
_ = asr.cuda()
_ = asr.eval()

In [8]:
lm = malaya.language_model.kenlm(model = 'bahasa-wiki-news-iium-stt')

In [9]:
lm.score('tak suka hg')

-13.193544387817383

In [10]:
import json
from glob import glob
from tqdm import tqdm

with open('/home/husein/ssd1/speech-bahasa/malay-asr-train.json') as fopen:
    ms = json.load(fopen)

In [11]:
len(ms['X'])

1635599

In [12]:
with open('/home/husein/malaya-speech/singlish-stt-train.json') as fopen:
    sg = json.load(fopen)

In [13]:
len(sg['X'])

3284901

In [14]:
sg['X'][:10]

['/home/husein/ssd2/imda/wav/5-75-tfrecord-1409.wav',
 '/home/husein/ssd2/imda/wav/5-118-tfrecord-1786.wav',
 '/home/husein/ssd2/imda/wav/4-77-tfrecord-2083.wav',
 '/home/husein/ssd2/imda/wav/2-39-tfrecord-78.wav',
 '/home/husein/ssd2/imda/wav/7-63-tfrecord-730.wav',
 '/home/husein/ssd2/imda/wav/5-68-tfrecord-1531.wav',
 '/home/husein/ssd2/imda/wav/0-17-tfrecord-3636.wav',
 '/home/husein/ssd2/imda/wav/1-113-tfrecord-1496.wav',
 '/home/husein/ssd2/imda/wav/7-108-tfrecord-1945.wav',
 '/home/husein/ssd2/imda/wav/5-63-tfrecord-3858.wav']

In [15]:
from datasets import Audio

sr = 16000
minimum = int(0.3 * sr)
audio = Audio(sampling_rate=sr)

In [16]:
def groupby(alignment, length, min_threshold = 0.3):
    r = []
    g = []
    for no, row in enumerate(alignment):
        
        if no > 0 and len(r) and alignment[no]['start'] - alignment[no-1]['end'] >= min_threshold:
            g.append(r)
            r = []
        
        r.append(row)

    return g

In [17]:
directory = 'augmentation-switchboard-v9'
!mkdir {directory}

In [18]:
import torchaudio
import torch
from itertools import permutations

In [19]:
def calculate_wer(actual, hyp):
    """
    Calculate WER using `python-Levenshtein`.
    """
    import Levenshtein as Lev

    b = set(actual.split() + hyp.split())
    word2char = dict(zip(b, range(len(b))))

    w1 = [chr(word2char[w]) for w in actual.split()]
    w2 = [chr(word2char[w]) for w in hyp.split()]

    return Lev.distance(''.join(w1), ''.join(w2)) / len(actual.split())

In [None]:
data = {}
for O in tqdm(range(2000000)):
    try:
        i_ms = sample_without_replacement(len(ms['X']), 4)
        i_sg = sample_without_replacement(len(sg['X']), 4)

        groups = []
        for i in i_ms:
            x = ms['X'][i]
            v = ms['Y'][i]
            y = audio.decode_example(audio.encode_example(x))['array']
            asr_ = asr.forward([y])[0]
            
            if asr_[0][1][0] < 0.15 and calculate_wer(v, asr_[0][0]) > 0.1:
                # print(f'skip ms {i}', v, asr_[0][0], asr_[0][1][0])
                continue
                
            result = model.predict(y, v)

            if ' '.join([r['text'] for r in result['words_alignment']]) != v:
                continue

            grouped = groupby(result['words_alignment'], len(y) / sr)
            # print(grouped)
            for g in grouped:
                y_ = y[int(g[0]['start'] * sr): int(g[-1]['end'] * sr) + minimum]
                if len(y_):
                    v_ = [g_['text'] for g_ in g]
                    groups.append((y_, v_))

        for i in i_sg:
            x = sg['X'][i]
            v = sg['Y'][i]
            y = audio.decode_example(audio.encode_example(x))['array']
            asr_ = asr.forward([y])[0]
            
            if asr_[0][1][0] < 0.15 and calculate_wer(v, asr_[0][0]) > 0.1:
                # print(f'skip sg {i}', v, asr_[0][0], asr_[0][1][0])
                continue
                
            result = singlish_model.predict(y, v)

            if ' '.join([r['text'] for r in result['words_alignment']]) != v:
                continue

            grouped = groupby(result['words_alignment'], len(y) / sr)
            for g in grouped:
                y_ = y[int(g[0]['start'] * sr): int(g[-1]['end'] * sr) + minimum]
                if len(y_):
                    v_ = [g_['text'] for g_ in g]
                    groups.append((y_, v_))
                    
        if not len(groups):
            continue
            
        groups = sorted(groups, key = lambda x: len(' '.join(x[1])), reverse = True)[:7]
        
        strings = [' '.join(g[1]) for g in groups]
        perm = list(permutations(range(len(groups))))
        
        scores = []
        for p in perm:
            s = ' '.join([strings[p_] for p_ in p])
            scores.append(lm.score(s))
            
        best_perm = perm[np.argmax(scores)]
        
        l = 0
        combine_y, combine_v = [], []
        index = 0
        while l < 15 and index < len(groups):
            g = groups[best_perm[index]]
            l_ = len(g[0]) / sr
            if l_ < 1.0:
                index += 1
                continue
            r_int = random.randint(3000, 16000)
            l += l_ + (r_int / sr)
            combine_y.append(g[0] / np.abs(g[0]).max())
            combine_y.append(np.zeros(shape = (r_int,)))
            combine_v.extend(g[1])
            index += 1

        if len(combine_v):
            audio_path = f'{directory}/{O}.mp3'
            torchaudio.save(audio_path, 
                            torch.tensor(np.concatenate(combine_y).astype('float32')).unsqueeze(0), 
                            sr, format='mp3')
            data[O] = ' '.join(combine_v)
            
    except Exception as e:
        print(e)

  0%|                             | 5764/2000000 [4:20:37<1428:06:59,  2.58s/it]

Calculated padded input size per channel: (0). Kernel size: (1). Kernel size can't be greater than actual input size


  0%|                             | 6849/2000000 [5:09:27<1502:54:38,  2.71s/it]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

  1%|▏                           | 12200/2000000 [9:12:11<1164:13:35,  2.11s/it]

Argument #4: Padding size should be less than the corresponding input dimension, but got: padding (200, 200) at dimension 2 of input [1, 1, 160]


  1%|▏                          | 14442/2000000 [10:53:38<1556:30:53,  2.82s/it]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

  1%|▎                          | 21101/2000000 [15:57:20<1356:53:44,  2.47s/it]

Calculated padded input size per channel: (0). Kernel size: (1). Kernel size can't be greater than actual input size


  1%|▎                          | 27360/2000000 [20:45:33<1279:54:12,  2.34s/it]

Argument #4: Padding size should be less than the corresponding input dimension, but got: padding (200, 200) at dimension 2 of input [1, 1, 160]


  1%|▎                          | 27648/2000000 [20:58:45<1165:42:46,  2.13s/it]

Calculated padded input size per channel: (0). Kernel size: (1). Kernel size can't be greater than actual input size


  2%|▍                          | 30144/2000000 [22:54:21<1530:02:08,  2.80s/it]

Argument #4: Padding size should be less than the corresponding input dimension, but got: padding (200, 200) at dimension 2 of input [1, 1, 14]


  2%|▍                          | 31976/2000000 [24:18:33<1307:22:41,  2.39s/it]

In [52]:
len(data)

5

In [53]:
data

{0: 'jamiyah halfway house darul islah presbyterian church',
 1: 'selalu kerap berlaku dan taspos ini selain daripada kemudahan seharusnya bermesyuarat they got out of the car and took a lot of pictures as they hiked up a gentle slope towards the top ayat tentang para pengikut',
 2: 'harum manis when you kepada masyarakat maklumat yang betul',
 3: 'seventeen three u two nine zero five number sequence is it will then later take off its about fifty four minutes walk and date of birth is ten perdana menteri telah',
 4: 'punya sistem perancangan berdasarkan bukti badan bayi think i would find would be her so i just went down to her just sitting there just crying her eyes out and im like your two years of your army experience can be converted kaya nombor satu tidak ada masalah'}

In [55]:
import IPython.display as ipd
ipd.Audio(f'{directory}/2.mp3')

In [None]:
directory

In [None]:
with open('augment-switchboard-v8.json', 'w') as fopen:
    json.dump(data, fopen)