In [1]:
import os
# Expose no CUDA devices to this process so TensorFlow falls back to the CPU.
# This has to happen before TensorFlow is imported in the next cell.
os.environ.update({'CUDA_VISIBLE_DEVICES': ''})

In [18]:
from huggingface_hub import upload_file
import tensorflow as tf
import malaya_speech

In [3]:
import numpy as np
import six

def to_example(dictionary):
    """Build a ``tf.train.Example`` from a mapping of feature name -> values.

    Each value must be a non-empty sequence (list, tuple, 1-D NumPy array) or a
    one-shot iterator such as ``map``. The type of the first element selects
    the protobuf list that stores the whole value:

    * Python / NumPy integers -> ``Int64List``
    * Python / NumPy floats   -> ``FloatList``
    * ``str``                 -> ``BytesList`` (each item UTF-8 encoded)
    * ``bytes``               -> ``BytesList``

    Args:
        dictionary: mapping of feature name to its values.

    Returns:
        A ``tf.train.Example`` holding one feature per key.

    Raises:
        ValueError: if a value is empty, or if its first element is of an
            unsupported type.
    """
    features = {}
    for k, v in dictionary.items():
        # Iterators (map, generators, ...) are neither sized nor subscriptable,
        # so materialize them before looking at their content.
        if hasattr(v, '__next__'):
            v = list(v)

        # ``not v`` is ambiguous for multi-element NumPy arrays, so rely on
        # len() and only fall back to truthiness for unsized values (None, ...).
        try:
            is_empty = len(v) == 0
        except TypeError:
            is_empty = not v
        if is_empty:
            raise ValueError('Empty generated field: %s' % str((k, v)))

        first = v[0]
        if isinstance(first, (int, np.integer)):
            features[k] = tf.train.Feature(
                int64_list=tf.train.Int64List(value=v)
            )
        elif isinstance(first, (float, np.floating)):
            # np.float32 / np.float16 scalars do not subclass Python float,
            # hence the explicit np.floating check.
            features[k] = tf.train.Feature(
                float_list=tf.train.FloatList(value=v)
            )
        elif isinstance(first, str):
            # BytesList only stores bytes, so text is encoded as UTF-8.
            encoded = [x.encode('utf-8') for x in v]
            features[k] = tf.train.Feature(
                bytes_list=tf.train.BytesList(value=encoded)
            )
        elif isinstance(first, bytes):
            features[k] = tf.train.Feature(
                bytes_list=tf.train.BytesList(value=v)
            )
        else:
            raise ValueError(
                'Value for %s is not a recognized type; v: %s type: %s'
                % (k, str(first), str(type(first)))
            )
    return tf.train.Example(features=tf.train.Features(feature=features))

In [6]:
# Sanity check: display the TensorFlow classes this notebook uses to build
# (Example / Features) and write (TFRecordWriter) records.
tf.train.Example, tf.train.Features, tf.io.TFRecordWriter

(tensorflow.core.example.example_pb2.Example,
 tensorflow.core.example.feature_pb2.Features,
 tensorflow.python.lib.io.tf_record.TFRecordWriter)

In [7]:
from glob import glob
from tqdm import tqdm
import json
import string

In [23]:
# Character vocabulary: '' at index 0, then a-z, then 0-9, and the space last.
# NOTE(review): index 0 ('') is presumably reserved as a blank/padding id - confirm.
char_vocabs = ['', *string.ascii_lowercase, *string.digits, ' ']
len(char_vocabs)

38

In [9]:
# Recursively collect every .wav file below data/raw/clean.
sani = glob('data/raw/clean/**/*.wav', recursive = True)
# Display how many audio files were found.
len(sani)

329515

In [10]:
# Pair every wav file with the transcript stored next to it (same path with a
# .txt extension) and keep only the pairs whose transcript file is non-empty.
sanis = []
for wav_path in tqdm(sani):
    transcript_path = os.path.splitext(wav_path)[0] + '.txt'
    # Read as UTF-8 explicitly: the text is Unicode-normalized later on, and
    # the platform default encoding is not the same on every machine.
    with open(transcript_path, encoding='utf-8') as fopen:
        text = fopen.read()
    if text:
        sanis.append((wav_path, text))

len(sanis)

100%|██████████| 329515/329515 [00:02<00:00, 113178.92it/s]


329359

In [11]:
# Unzip the (wav path, transcript) pairs into two parallel tuples.
# NOTE(review): this unpacking raises ValueError when `sanis` is empty.
audios, texts = zip(*sanis)

In [12]:
import unicodedata
import re
import itertools

def preprocessing_text(string, vocabs=None):
    """Normalize a transcript to the character vocabulary.

    Steps, in order:
      1. lowercase and apply Unicode NFC normalization;
      2. replace every character outside the vocabulary with a space;
      3. collapse runs of spaces into one and strip both ends;
      4. shorten any run of the same character to at most two characters.

    Args:
        string: raw transcript text.
        vocabs: iterable of allowed characters. Defaults to the notebook-level
            ``char_vocabs`` (the original body referenced a ``vocabs`` name
            that is not defined anywhere in the notebook).

    Returns:
        The cleaned transcript.
    """
    if vocabs is None:
        vocabs = char_vocabs
    # Set membership instead of scanning the vocabulary for every character.
    allowed = set(vocabs)

    string = unicodedata.normalize('NFC', string.lower())
    string = ''.join(c if c in allowed else ' ' for c in string)
    string = re.sub(r'[ ]+', ' ', string).strip()
    # NOTE(review): the run-length cap also applies to digits ('1000' -> '100');
    # confirm this is intended.
    return ''.join(''.join(run)[:2] for _, run in itertools.groupby(string))

In [13]:
# Clean every transcript, keeping the same order as `texts`.
processed_text = list(map(preprocessing_text, tqdm(texts)))

100%|██████████| 329359/329359 [00:05<00:00, 58978.61it/s]


In [14]:
# Re-pair each wav path with its cleaned transcript.
files = [
    (audio_path, transcript)
    for audio_path, transcript in zip(audios, processed_text)
]

In [15]:
# Peek at the first two (wav path, cleaned transcript) pairs.
files[:2]

[('data/raw/clean/speech_done/cardock/7fe328841e81e17b4e855cd5e9e32294_445.wav',
  'bunyi dia tak pecah jom kita tukar masih lagi menggunakan mak yang sama tanpa'),
 ('data/raw/clean/speech_done/cardock/fd2ffe802733a167a8ab03205fc55f0f_60.wav',
  'apabila kehilangan seorang sepupu dan sahabat akibat pandemik itu hospital serdang')]

In [16]:
# Sample rate; `loop` divides the number of audio samples by it to get seconds.
sr = 16000
# Clips whose duration (len(y) / sr) exceeds this value are skipped by `loop`.
maxlen = 18
# NOTE(review): not referenced anywhere in the visible notebook.
maxlen_subwords = 100
# Transcripts shorter than this many characters are skipped by `loop`.
minlen_text = 1
# Batch counter embedded in the TFRecord filenames written by `loop`.
global_count = 0

In [26]:
import re

def loop(files):
    """Serialize one chunk of samples to a TFRecord file and upload it.

    Args:
        files: a ``(samples, index)`` pair. ``samples`` is an iterable of
            ``(wav_path, transcript)`` tuples; ``index`` is used in the output
            filename (presumably the chunk id assigned by ``mp.multiprocessing``
            - confirm against that helper).

    Side effects:
        Writes ``{index}-{global_count}.tfrecord`` in the working directory,
        uploads it to the ``huseinzol05/Khursani-Malay-TFRecord`` repository
        and removes the local copy once the upload has succeeded.
        Samples that fail to process are reported with ``print`` and skipped.
    """
    import time  # local import: only needed for the upload back-off below

    files, index = files
    output_file = f'{index}-{global_count}.tfrecord'

    # First-occurrence index of each character: same ids as list.index(),
    # without rescanning the vocabulary for every character.
    char_to_id = {}
    for idx, ch in enumerate(char_vocabs):
        char_to_id.setdefault(ch, idx)

    # The context manager guarantees the writer is flushed and closed.
    with tf.io.TFRecordWriter(output_file) as writer:
        for s in tqdm(files):
            try:
                f, t = s[0], s[1]

                # Clean the transcript first: it is cheap, and it lets unusable
                # samples be skipped before any audio is decoded.
                t = ''.join([c if c in char_to_id else ' ' for c in t])
                t = re.sub(r'[ ]+', ' ', t).strip()
                if len(t) < minlen_text:
                    continue

                # NOTE(review): assumes malaya_speech.load returns audio
                # sampled at `sr` - confirm.
                y, _ = malaya_speech.load(f)
                if (len(y) / sr) > maxlen:
                    continue

                new_t = [char_to_id[c] for c in t]
                example = to_example({'waveforms': y.tolist(),
                                      'targets': new_t,
                                      'targets_length': [len(new_t)],
                                      'lang': [0]})
                writer.write(example.SerializeToString())
            except Exception as e:
                # Best effort: one broken sample must not abort the whole chunk.
                print(e)

    # Retry the upload until it succeeds (as before), but back off between
    # attempts instead of retrying immediately in a tight loop.
    delay = 1
    while True:
        try:
            upload_file(path_or_fileobj=output_file,
                        path_in_repo=output_file,
                        repo_id='huseinzol05/Khursani-Malay-TFRecord')
            break
        except Exception as e:
            print(e)
            time.sleep(delay)
            delay = min(delay * 2, 60)

    # The file is only removed after a successful upload.
    os.remove(output_file)

In [29]:
import mp

batch_size = 20000
# `global_count` is read by `loop` to name the output files. Deriving it from
# the loop position (instead of incrementing the config value in place) makes
# this cell idempotent: re-running it always numbers the batches from 0.
for global_count, i in enumerate(range(0, len(files), batch_size)):
    batch = files[i: i + batch_size]
    mp.multiprocessing(batch, loop, cores = 8, returned = False)

100%|██████████| 2500/2500 [00:28<00:00, 88.33it/s] 
100%|██████████| 2500/2500 [00:28<00:00, 87.23it/s] 
100%|██████████| 2500/2500 [00:29<00:00, 86.07it/s] 
100%|██████████| 2500/2500 [00:29<00:00, 85.33it/s] 
100%|██████████| 2500/2500 [00:30<00:00, 82.10it/s] 
100%|██████████| 2500/2500 [00:30<00:00, 82.10it/s] 
100%|██████████| 2500/2500 [00:30<00:00, 81.91it/s] 
100%|██████████| 2500/2500 [00:30<00:00, 81.06it/s] 
100%|██████████| 2500/2500 [00:24<00:00, 101.57it/s]
100%|██████████| 2500/2500 [00:24<00:00, 100.67it/s]
100%|██████████| 2500/2500 [00:25<00:00, 99.85it/s] 
100%|██████████| 2500/2500 [00:25<00:00, 99.85it/s]
100%|██████████| 2500/2500 [00:25<00:00, 99.47it/s] 
100%|██████████| 2500/2500 [00:25<00:00, 99.39it/s] 
100%|██████████| 2500/2500 [00:25<00:00, 96.99it/s] 
100%|██████████| 2500/2500 [00:25<00:00, 96.19it/s] 
100%|██████████| 2500/2500 [00:24<00:00, 100.30it/s]
100%|██████████| 2500/2500 [00:24<00:00, 100.04it/s]
100%|██████████| 2500/2500 [00:24<00:00, 100.03

('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


100%|██████████| 2500/2500 [00:23<00:00, 106.06it/s]
100%|██████████| 2500/2500 [00:23<00:00, 104.63it/s]
100%|██████████| 2500/2500 [00:23<00:00, 104.48it/s]
100%|██████████| 2500/2500 [00:24<00:00, 103.88it/s]
100%|██████████| 2500/2500 [00:24<00:00, 103.30it/s]
100%|██████████| 2500/2500 [00:24<00:00, 103.17it/s]
100%|██████████| 2500/2500 [00:24<00:00, 102.40it/s]
100%|██████████| 2500/2500 [00:24<00:00, 101.87it/s]
100%|██████████| 2500/2500 [00:49<00:00, 50.44it/s] 
100%|██████████| 2500/2500 [00:49<00:00, 50.35it/s] 
100%|██████████| 2500/2500 [00:49<00:00, 50.33it/s] 
100%|██████████| 2500/2500 [00:49<00:00, 50.19it/s] 
100%|██████████| 2500/2500 [00:49<00:00, 50.17it/s] 
100%|██████████| 2500/2500 [00:49<00:00, 50.04it/s] 
100%|██████████| 2500/2500 [00:50<00:00, 49.20it/s] 
100%|██████████| 2500/2500 [00:51<00:00, 48.99it/s]


('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


100%|██████████| 2500/2500 [00:35<00:00, 70.03it/s] 
100%|██████████| 2500/2500 [00:35<00:00, 69.94it/s]
100%|██████████| 2500/2500 [00:35<00:00, 69.55it/s] 
100%|██████████| 2500/2500 [00:36<00:00, 69.22it/s] 
100%|██████████| 2500/2500 [00:36<00:00, 68.95it/s] 
100%|██████████| 2500/2500 [00:37<00:00, 66.04it/s] 
100%|██████████| 2500/2500 [00:39<00:00, 63.63it/s] 
100%|██████████| 2500/2500 [00:39<00:00, 62.87it/s] 
100%|██████████| 2500/2500 [00:31<00:00, 78.34it/s] 
100%|██████████| 2500/2500 [00:35<00:00, 71.42it/s]]
100%|██████████| 2500/2500 [00:35<00:00, 71.04it/s] 
100%|██████████| 2500/2500 [00:35<00:00, 69.55it/s] 
100%|██████████| 2500/2500 [00:36<00:00, 69.23it/s]]
100%|██████████| 2500/2500 [00:36<00:00, 68.78it/s] 
100%|██████████| 2500/2500 [00:36<00:00, 68.24it/s] 
100%|██████████| 2500/2500 [00:36<00:00, 67.89it/s] 
100%|██████████| 2500/2500 [00:29<00:00, 85.67it/s]]
100%|██████████| 2500/2500 [00:30<00:00, 80.74it/s]]
100%|██████████| 2500/2500 [00:31<00:00, 80.26i

In [32]:
# !wget https://huggingface.co/huseinzol05/Khursani-Malay-TFRecord/resolve/main/0-0.tfrecord

In [33]:
def parse(serialized_example):
    """Decode one serialized example and keep only the fields used downstream.

    All four stored features are parsed as variable-length features; the
    returned dict maps each kept feature name to the ``.values`` of its parsed
    result. Only names listed in ``wanted`` survive, which for the parsed
    schema means 'waveforms' and 'targets'.
    """
    schema = {
        'waveforms': tf.compat.v1.VarLenFeature(tf.float32),
        'targets': tf.compat.v1.VarLenFeature(tf.int64),
        'targets_length': tf.compat.v1.VarLenFeature(tf.int64),
        'lang': tf.compat.v1.VarLenFeature(tf.int64),
    }
    parsed = tf.compat.v1.parse_single_example(
        serialized_example, features=schema
    )

    wanted = ('waveforms', 'waveforms_length', 'targets')
    return {
        name: sparse.values
        for name, sparse in parsed.items()
        if name in wanted
    }

In [36]:
# TFRecord file(s) to read back.
tfrecords = ['0-0.tfrecord']
# Caps the interleave cycle length and sets the parallelism of the parse map.
num_cpu_threads = 2
# Used as the interleave block_length.
thread_count = 2

In [37]:
# Never interleave more files at once than there are files to read.
cycle_length = min(num_cpu_threads, len(tfrecords))

# Shuffle the file names, interleave records from the files, shuffle the
# records with a small buffer and finally decode each record with `parse`.
d = (
    tf.data.Dataset.from_tensor_slices(tf.constant(tfrecords))
    .shuffle(buffer_size=len(tfrecords))
    .interleave(
        tf.data.TFRecordDataset,
        cycle_length=cycle_length,
        block_length=thread_count,
    )
    .shuffle(buffer_size=100)
    .map(parse, num_parallel_calls=num_cpu_threads)
)

2022-05-24 17:46:43.786844: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2022-05-24 17:46:43.787482: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: huseincomel-desktop
2022-05-24 17:46:43.787545: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: huseincomel-desktop
2022-05-24 17:46:43.788308: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:200] libcuda reported version is: Not found: was unable to find libcuda.so DSO loaded into this program
2022-05-24 17:46:43.789095: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:204] kernel reported version is: 470.129.6
2022-05-24 17:46:43.797064: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F 

In [38]:
# Turn the dataset into an iterator that yields NumPy values.
# NOTE(review): `d` is rebound from a dataset to an iterator, so re-running
# this cell on its own will likely fail - consider using a separate name.
d = d.as_numpy_iterator()

2022-05-24 17:46:54.210549: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


In [39]:
# Pull one parsed record (the dict returned by `parse`) for inspection.
r = next(d)

In [42]:
# Map the target ids back to characters to check the round trip by eye.
''.join(char_vocabs[token_id] for token_id in r['targets'])

'mereka ini kau nak buat defisit kalian apa semua'