In [1]:
import os

os.environ['CUDA_VISIBLE_DEVICES'] = '1'
os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'

In [2]:
import malaya_speech
import random
import numpy as np
from sklearn.utils import shuffle
from sklearn.utils.random import sample_without_replacement

`pyaudio` is not available, `malaya_speech.streaming.pyaudio` is not able to use.


In [3]:
malaya_speech.force_alignment.transducer.available_transformer()

Unnamed: 0,Size (MB),Quantized Size (MB),Language
conformer-transducer,120,32.3,[malay]
conformer-transducer-mixed,120,32.3,"[malay, singlish]"
conformer-transducer-singlish,120,32.3,[singlish]


In [4]:
model = malaya_speech.force_alignment.transducer.transformer(model = 'conformer-transducer', device = 'gpu:0')
singlish_model = malaya_speech.force_alignment.transducer.transformer(model = 'conformer-transducer-singlish', device = 'gpu:0')

2023-03-11 13:23:48.431277: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-11 13:23:48.479569: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-03-11 13:23:48.482154: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-03-11 13:23:48.482959: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zer

In [5]:
asr = malaya_speech.stt.transducer.pt_transformer(model = 'mesolitica/conformer-medium-mixed')
_ = asr.cuda()
_ = asr.eval()

In [6]:
import json
from glob import glob
from tqdm import tqdm

with open('/home/husein/ssd1/speech-bahasa/malay-asr-train.json') as fopen:
    ms = json.load(fopen)

In [7]:
len(ms['X'])

1635599

In [8]:
with open('/home/husein/malaya-speech/singlish-stt-train.json') as fopen:
    sg = json.load(fopen)

In [9]:
len(sg['X'])

3284901

In [10]:
sg['X'][:10]

['/home/husein/ssd2/imda/wav/5-75-tfrecord-1409.wav',
 '/home/husein/ssd2/imda/wav/5-118-tfrecord-1786.wav',
 '/home/husein/ssd2/imda/wav/4-77-tfrecord-2083.wav',
 '/home/husein/ssd2/imda/wav/2-39-tfrecord-78.wav',
 '/home/husein/ssd2/imda/wav/7-63-tfrecord-730.wav',
 '/home/husein/ssd2/imda/wav/5-68-tfrecord-1531.wav',
 '/home/husein/ssd2/imda/wav/0-17-tfrecord-3636.wav',
 '/home/husein/ssd2/imda/wav/1-113-tfrecord-1496.wav',
 '/home/husein/ssd2/imda/wav/7-108-tfrecord-1945.wav',
 '/home/husein/ssd2/imda/wav/5-63-tfrecord-3858.wav']

In [11]:
sg['Y'][:10]

['and even like dogs at the shelter they just want to make you feel better',
 'so for our annual annual package right now we having promotion is just one thousand',
 'i still keep in contact with now',
 'is armenia a nice place',
 'yeah',
 'a normal hawker centre with hybrid stalls',
 'full tables of premiums can be found here',
 'suggestions okay',
 'okay so now to next call',
 'ok so she said that now the innisfree has the new orchid line which cost']

In [12]:
from datasets import Audio

sr = 16000
minimum = int(0.3 * sr)
audio = Audio(sampling_rate=sr)

In [13]:
def groupby(alignment, length, min_threshold = 0.3):
    r = []
    g = []
    for no, row in enumerate(alignment):
        
        if no > 0 and len(r) and alignment[no]['start'] - alignment[no-1]['end'] >= min_threshold:
            g.append(r)
            r = []
        
        r.append(row)

    return g

In [14]:
directory = 'augmentation-switchboard-v2'
!mkdir {directory}

In [15]:
import torchaudio
import torch

In [16]:
def calculate_wer(actual, hyp):
    """
    Calculate WER using `python-Levenshtein`.
    """
    import Levenshtein as Lev

    b = set(actual.split() + hyp.split())
    word2char = dict(zip(b, range(len(b))))

    w1 = [chr(word2char[w]) for w in actual.split()]
    w2 = [chr(word2char[w]) for w in hyp.split()]

    return Lev.distance(''.join(w1), ''.join(w2)) / len(actual.split())

In [23]:
data = {}
for O in tqdm(range(2000000)):
    try:
        i_ms = sample_without_replacement(len(ms['X']), 3)
        i_sg = sample_without_replacement(len(sg['X']), 3)

        groups = []
        for i in i_ms:
            x = ms['X'][i]
            v = ms['Y'][i]
            y = audio.decode_example(audio.encode_example(x))['array']
            asr_ = asr.forward([y])[0]
            
            if asr_[0][1][0] < 0.07 and calculate_wer(v, asr_[0][0]) > 0.1:
                # print(f'skip ms {i}', v, asr_[0][0], asr_[0][1][0])
                continue
                
            result = model.predict(y, v)

            if ' '.join([r['text'] for r in result['words_alignment']]) != v:
                continue

            grouped = groupby(result['words_alignment'], len(y) / sr)
            # print(grouped)
            for g in grouped:
                y_ = y[int(g[0]['start'] * sr): int(g[-1]['end'] * sr) + minimum]
                if len(y_):
                    v_ = [g_['text'] for g_ in g]
                    groups.append((y_, v_))

        for i in i_sg:
            x = sg['X'][i]
            v = sg['Y'][i]
            y = audio.decode_example(audio.encode_example(x))['array']
            asr_ = asr.forward([y])[0]
            
            if asr_[0][1][0] < 0.1 and calculate_wer(v, asr_[0][0]) > 0.1:
                # print(f'skip sg {i}', v, asr_[0][0], asr_[0][1][0])
                continue
                
            result = singlish_model.predict(y, v)

            if ' '.join([r['text'] for r in result['words_alignment']]) != v:
                continue

            grouped = groupby(result['words_alignment'], len(y) / sr)
            for g in grouped:
                y_ = y[int(g[0]['start'] * sr): int(g[-1]['end'] * sr) + minimum]
                if len(y_):
                    v_ = [g_['text'] for g_ in g]
                    groups.append((y_, v_))

        groups = shuffle(groups)
        l = 0
        combine_y, combine_v = [], []
        index = 0
        while l < 15 and index < len(groups):
            l_ = len(groups[index][0]) / sr
            if l_ < 1.0:
                index += 1
                continue
            l += l_
            combine_y.append(groups[index][0] / np.abs(groups[index][0]).max())
            combine_v.extend(groups[index][1])
            index += 1

        if len(combine_v):
            audio_path = f'{directory}/{O}.mp3'
            torchaudio.save(audio_path, 
                            torch.tensor(np.concatenate(combine_y).astype('float32')).unsqueeze(0), 
                            sr, format='mp3')
            data[O] = ' '.join(combine_v)
    except Exception as e:
        print(e)

  0%|                                                                                         | 529/2000000 [19:22<1401:37:42,  2.52s/it]

Argument #4: Padding size should be less than the corresponding input dimension, but got: padding (200, 200) at dimension 2 of input [1, 1, 160]


  0%|                                                                                      | 2116/2000000 [1:17:14<1104:34:42,  1.99s/it]

Argument #4: Padding size should be less than the corresponding input dimension, but got: padding (200, 200) at dimension 2 of input [1, 1, 107]


  0%|▎                                                                                     | 8167/2000000 [4:58:05<1244:45:58,  2.25s/it]

2 root error(s) found.
  (0) Invalid argument: Integer division by zero
	 [[node import/floordiv_1 (defined at home/husein/.local/lib/python3.8/site-packages/malaya_boilerplate/frozen_graph.py:384) ]]
	 [[import/non_blank_transcript/_691]]
  (1) Invalid argument: Integer division by zero
	 [[node import/floordiv_1 (defined at home/husein/.local/lib/python3.8/site-packages/malaya_boilerplate/frozen_graph.py:384) ]]
0 successful operations.
0 derived errors ignored.

Original stack trace for 'import/floordiv_1':
  File "usr/lib/python3.8/runpy.py", line 194, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "usr/lib/python3.8/runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "home/husein/.local/lib/python3.8/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "home/husein/.local/lib/python3.8/site-packages/traitlets/config/application.py", line 846, in launch_instance
    app.start()
  File "home/hus

  0%|▎                                                                                     | 8658/2000000 [5:15:55<1240:38:56,  2.24s/it]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

  0%|▍                                                                                     | 9091/2000000 [5:31:46<1027:45:48,  1.86s/it]

Argument #4: Padding size should be less than the corresponding input dimension, but got: padding (200, 200) at dimension 2 of input [1, 1, 124]


  1%|▌                                                                                     | 11665/2000000 [7:05:44<958:51:15,  1.74s/it]

CUDA out of memory. Tried to allocate 3.27 GiB (GPU 0; 23.69 GiB total capacity; 16.24 GiB already allocated; 560.62 MiB free; 19.19 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF


  1%|▌                                                                                     | 14470/2000000 [8:48:48<976:31:06,  1.77s/it]

CUDA out of memory. Tried to allocate 1.62 GiB (GPU 0; 23.69 GiB total capacity; 17.42 GiB already allocated; 560.62 MiB free; 19.19 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF


  1%|▊                                                                                   | 18498/2000000 [11:17:45<1095:22:58,  1.99s/it]

2 root error(s) found.
  (0) Invalid argument: Integer division by zero
	 [[node import/floordiv_1 (defined at home/husein/.local/lib/python3.8/site-packages/malaya_boilerplate/frozen_graph.py:384) ]]
	 [[import/non_blank_transcript/_691]]
  (1) Invalid argument: Integer division by zero
	 [[node import/floordiv_1 (defined at home/husein/.local/lib/python3.8/site-packages/malaya_boilerplate/frozen_graph.py:384) ]]
0 successful operations.
0 derived errors ignored.

Original stack trace for 'import/floordiv_1':
  File "usr/lib/python3.8/runpy.py", line 194, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "usr/lib/python3.8/runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "home/husein/.local/lib/python3.8/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "home/husein/.local/lib/python3.8/site-packages/traitlets/config/application.py", line 846, in launch_instance
    app.start()
  File "home/hus

  1%|▊                                                                                   | 20433/2000000 [12:29:57<1202:22:58,  2.19s/it]

Argument #4: Padding size should be less than the corresponding input dimension, but got: padding (200, 200) at dimension 2 of input [1, 1, 133]


  1%|▉                                                                                   | 22735/2000000 [13:54:16<1060:43:14,  1.93s/it]

Argument #4: Padding size should be less than the corresponding input dimension, but got: padding (200, 200) at dimension 2 of input [1, 1, 167]


  1%|█                                                                                    | 24082/2000000 [14:43:53<859:49:44,  1.57s/it]

2 root error(s) found.
  (0) Invalid argument: Integer division by zero
	 [[node import/floordiv_1 (defined at home/husein/.local/lib/python3.8/site-packages/malaya_boilerplate/frozen_graph.py:384) ]]
	 [[import/non_blank_transcript/_691]]
  (1) Invalid argument: Integer division by zero
	 [[node import/floordiv_1 (defined at home/husein/.local/lib/python3.8/site-packages/malaya_boilerplate/frozen_graph.py:384) ]]
0 successful operations.
0 derived errors ignored.

Original stack trace for 'import/floordiv_1':
  File "usr/lib/python3.8/runpy.py", line 194, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "usr/lib/python3.8/runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "home/husein/.local/lib/python3.8/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "home/husein/.local/lib/python3.8/site-packages/traitlets/config/application.py", line 846, in launch_instance
    app.start()
  File "home/hus

  1%|█                                                                                   | 24344/2000000 [14:53:38<1326:26:21,  2.42s/it]

Calculated padded input size per channel: (0). Kernel size: (1). Kernel size can't be greater than actual input size


  1%|█                                                                                    | 24998/2000000 [15:17:34<878:28:24,  1.60s/it]

Calculated padded input size per channel: (0). Kernel size: (1). Kernel size can't be greater than actual input size


  1%|█▏                                                                                  | 27276/2000000 [16:41:10<1117:54:24,  2.04s/it]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

  2%|█▌                                                                                   | 35636/2000000 [21:47:04<902:00:42,  1.65s/it]

Argument #4: Padding size should be less than the corresponding input dimension, but got: padding (200, 200) at dimension 2 of input [1, 1, 196]


  2%|█▌                                                                                  | 35788/2000000 [21:52:30<1094:37:08,  2.01s/it]

2 root error(s) found.
  (0) Invalid argument: Integer division by zero
	 [[node import/floordiv_1 (defined at home/husein/.local/lib/python3.8/site-packages/malaya_boilerplate/frozen_graph.py:384) ]]
	 [[import/non_blank_transcript/_691]]
  (1) Invalid argument: Integer division by zero
	 [[node import/floordiv_1 (defined at home/husein/.local/lib/python3.8/site-packages/malaya_boilerplate/frozen_graph.py:384) ]]
0 successful operations.
0 derived errors ignored.

Original stack trace for 'import/floordiv_1':
  File "usr/lib/python3.8/runpy.py", line 194, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "usr/lib/python3.8/runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "home/husein/.local/lib/python3.8/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "home/husein/.local/lib/python3.8/site-packages/traitlets/config/application.py", line 846, in launch_instance
    app.start()
  File "home/hus

  2%|█▉                                                                                  | 46287/2000000 [29:00:50<1164:41:12,  2.15s/it]

2 root error(s) found.
  (0) Invalid argument: Integer division by zero
	 [[node import/floordiv_1 (defined at home/husein/.local/lib/python3.8/site-packages/malaya_boilerplate/frozen_graph.py:384) ]]
	 [[import/non_blank_transcript/_691]]
  (1) Invalid argument: Integer division by zero
	 [[node import/floordiv_1 (defined at home/husein/.local/lib/python3.8/site-packages/malaya_boilerplate/frozen_graph.py:384) ]]
0 successful operations.
0 derived errors ignored.

Original stack trace for 'import/floordiv_1':
  File "usr/lib/python3.8/runpy.py", line 194, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "usr/lib/python3.8/runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "home/husein/.local/lib/python3.8/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "home/husein/.local/lib/python3.8/site-packages/traitlets/config/application.py", line 846, in launch_instance
    app.start()
  File "home/hus

  3%|██▊                                                                                 | 67811/2000000 [43:52:02<1249:56:36,  2.33s/it]


KeyboardInterrupt: 

In [24]:
len(data)

59802

In [25]:
data

{1: 'he also admitted tersebut berdasarkan peruntukan that more work',
 2: 'pada masa yang sama saya mohon but even without hidden surprises mereka nak promosikan the process of achieving',
 4: 'you must try b b q',
 5: 'wo zai wo de office ke yi dai laptop republic polytechnic villa francis home for the aged',
 6: 'tiada sehari pun had if i want to cancel my card at any point would there be any',
 7: 'chong kee hiong cari suasana yang lain suasana yang pada dulu kena ambil satu bosan',
 8: 'catastrophic both of these methods will have a',
 9: 'what do you like to do bekas plastik kecil digunakan bagi tujuan penanaman what can i use',
 11: 'to an unfaithful from his counselling experience and the thirty eight dollar which is the standard kita kena tulis he said people',
 12: 'mohammad nurrasyid sallallahu wasallam bersabda tak dia kata dia imam mahdi yang dinanti nantikan we will go from house to house and campaign orang duk ikut dan orang percaya dengan ajaran mana ada perempuan imam 

In [26]:
import IPython.display as ipd
ipd.Audio(f'{directory}/1.mp3')

In [27]:
with open('augment-switchboard-v2.json', 'w') as fopen:
    json.dump(data, fopen)