In [1]:
import os
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'mesolitica-tpu.json'

In [2]:
from glob import glob
import tensorflow as tf
from tqdm import tqdm
import malaya_speech
from malaya_speech.utils import subword
import numpy as np
import mp
from google.cloud import storage

  'Cannot import beam_search_ops from Tensorflow Addons, `deep_model` for stemmer will not available to use, make sure Tensorflow Addons version >= 0.12.0'


In [3]:
singlish = glob('part4-diff-room/wav/*.wav')
len(singlish)

489837

In [4]:
subwords = subword.load('transducer-singlish.subword')

In [5]:
import unicodedata
import re
import itertools

vocabs = [" ", "a", "e", "n", "i", "t", "o", "u", "s", "k", "r", "l", "h", "d", "m", "g", "y", "b", "p", "w", "c", "f", "j", "v", "z", "0", "1", "x", "2", "q", "5", "3", "4", "6", "9", "8", "7"]

def preprocessing_text(string):
    
    string = unicodedata.normalize('NFC', string.lower())
    string = string.replace('\'', '')
    string = ''.join([c if c in vocabs else ' ' for c in string])
    string = re.sub(r'[ ]+', ' ', string).strip()
    string = (
        ''.join(''.join(s)[:2] for _, s in itertools.groupby(string))
    )
    return string

In [6]:
def get_after_mandarin(word):
    if '<mandarin>' in word:
        w = word.split('>')[1].split(':')[1]
        return w.split('</')[0]
    else:
        return word
    
def get_before_mandarin(word):
    if '</mandarin>' in word:
        return word.split('</')[0]
    else:
        return word

def replace_paralinguistic(string, replaces = ['(ppb)', '(ppc)', '(ppl)', '(ppo)', '<UNK>', '<MANDARIN>']):
    for r in replaces:
        string = string.replace(r, ' ')
    string = string.split()
    string = [get_after_mandarin(w) for w in string]
    string = [get_before_mandarin(w) for w in string]
    string = [w for w in string if w[0] not in '<[(' and w[-1] not in '>])']
    return ' '.join(string)

In [7]:
def loop(files):
    files, index = files
    results = []
    for i in tqdm(files):
        try:
            p = i.replace('/wav','/text')
            with open(f'{p}.txt') as fopen:
                text = fopen.read()
            if len(text) < 2:
                continue
            if text[0] == '<' and text[-1] == '>':
                continue
            text = replace_paralinguistic(text)
            text = preprocessing_text(text)
            if len(text):
                results.append((i, text))
        except Exception as e:
            pass
    return results

In [8]:
loop((singlish[:10], 0))

100%|██████████| 10/10 [00:00<00:00, 943.98it/s]


[('part4-diff-room/wav/sur_0755_2510_phnd_cs-tml-0-230.wav', 'doubt irukku'),
 ('part4-diff-room/wav/sur_1183_3366_phnd_cs-chn-0-433.wav', 'so'),
 ('part4-diff-room/wav/sur_0983_2967_phnd_cs-chn-0-1354.wav',
  'like like ni ming tian yao qu club then ni today need to go like testing'),
 ('part4-diff-room/wav/sur_0010_1020_phnd_cs-chn-0-146.wav',
  'you need to turn off your phone vibrations dude')]

In [9]:
singlishs = mp.multiprocessing(singlish, loop, cores = 12)

100%|██████████| 40819/40819 [03:08<00:00, 216.96it/s] 
100%|██████████| 40819/40819 [03:07<00:00, 217.49it/s]
100%|██████████| 9/9 [00:00<00:00, 163.71it/s].40it/s]
100%|██████████| 40819/40819 [03:08<00:00, 216.65it/s]
100%|██████████| 40819/40819 [03:08<00:00, 217.00it/s]
100%|██████████| 40819/40819 [03:09<00:00, 215.94it/s]
100%|██████████| 40819/40819 [03:08<00:00, 216.17it/s]
100%|██████████| 40819/40819 [03:10<00:00, 214.58it/s] 
100%|██████████| 40819/40819 [03:10<00:00, 214.61it/s] 
100%|██████████| 40819/40819 [03:10<00:00, 214.41it/s] 
100%|██████████| 40819/40819 [03:10<00:00, 213.80it/s] 
100%|██████████| 40819/40819 [03:11<00:00, 213.44it/s] 
100%|██████████| 40819/40819 [03:11<00:00, 213.37it/s] 


In [10]:
len(singlishs)

188242

In [11]:
import six

def to_example(dictionary):
    """Helper: build tf.Example from (string -> int/float/str list) dictionary."""
    features = {}
    for (k, v) in six.iteritems(dictionary):
        if not v:
            raise ValueError('Empty generated field: %s' % str((k, v)))
        # Subtly in PY2 vs PY3, map is not scriptable in py3. As a result,
        # map objects will fail with TypeError, unless converted to a list.
        if six.PY3 and isinstance(v, map):
            v = list(v)
        if isinstance(v[0], six.integer_types) or np.issubdtype(
            type(v[0]), np.integer
        ):
            features[k] = tf.train.Feature(
                int64_list=tf.train.Int64List(value=v)
            )
        elif isinstance(v[0], float):
            features[k] = tf.train.Feature(
                float_list=tf.train.FloatList(value=v)
            )
        elif isinstance(v[0], six.string_types):
            if not six.PY2:  # Convert in python 3.
                v = [bytes(x, 'utf-8') for x in v]
            features[k] = tf.train.Feature(
                bytes_list=tf.train.BytesList(value=v)
            )
        elif isinstance(v[0], bytes):
            features[k] = tf.train.Feature(
                bytes_list=tf.train.BytesList(value=v)
            )
        else:
            raise ValueError(
                'Value for %s is not a recognized type; v: %s type: %s'
                % (k, str(v[0]), str(type(v[0])))
            )
    return tf.train.Example(features=tf.train.Features(feature=features))

In [12]:
sr = 16000
maxlen = 18
minlen_text = 1
global_count = 0

In [13]:
def loop(files):
    client = storage.Client()
    bucket = client.bucket('mesolitica-tpu-general')
    files, index = files
    output_file = f'{index}-{global_count}.tfrecord'
    writer = tf.io.TFRecordWriter(output_file)
    for s in tqdm(files):
        try:
            if len(s[1]) < minlen_text:
                continue
            y, _ = malaya_speech.load(s[0])
            if (len(y) / sr) > maxlen:
                continue
            t = subword.encode(subwords, s[1], add_blank=False)
            example = to_example({'waveforms': y.tolist(), 
                                  'targets': t, 
                                  'targets_length': [len(t)]})
            writer.write(example.SerializeToString())
        except Exception as e:
            print(e)
            pass
    writer.close()
    blob = bucket.blob(f'imda/part4-diff-room/{output_file}')
    blob.upload_from_filename(output_file)
    os.system(f'rm {output_file}')

In [14]:
loop((singlishs[:10], 0))

100%|██████████| 10/10 [00:00<00:00, 33.39it/s]


In [15]:
batch_size = 25000
for i in range(0, len(singlishs), batch_size):
    batch = singlishs[i: i + batch_size]
    mp.multiprocessing(batch, loop, cores = 6, returned = False)
    global_count += 1

  1%|▏         | 58/4166 [00:02<02:28, 27.73it/s]

zero-size array to reduction operation maximum which has no identity


 29%|██▉       | 1211/4166 [01:00<02:41, 18.29it/s]

zero-size array to reduction operation maximum which has no identity

 30%|███       | 1264/4166 [01:00<02:24, 20.14it/s]




100%|██████████| 4166/4166 [03:19<00:00, 20.89it/s]
100%|██████████| 4166/4166 [03:19<00:00, 20.83it/s]
100%|██████████| 4166/4166 [03:21<00:00, 20.68it/s]
100%|██████████| 4166/4166 [03:23<00:00, 20.49it/s]
100%|██████████| 4166/4166 [03:23<00:00, 20.47it/s]
100%|██████████| 4166/4166 [03:26<00:00, 20.21it/s]
100%|██████████| 4/4 [00:00<00:00, 28.54it/s]
  8%|▊         | 352/4166 [00:14<02:08, 29.80it/s]

zero-size array to reduction operation maximum which has no identity


 77%|███████▋  | 3198/4166 [02:14<00:41, 23.58it/s]

zero-size array to reduction operation maximum which has no identity


 98%|█████████▊| 4079/4166 [02:50<00:03, 27.10it/s]

zero-size array to reduction operation maximum which has no identity


100%|██████████| 4166/4166 [02:52<00:00, 24.14it/s]
100%|██████████| 4166/4166 [02:53<00:00, 23.99it/s]
100%|██████████| 4166/4166 [02:54<00:00, 23.93it/s]
100%|██████████| 4166/4166 [02:54<00:00, 23.88it/s]
100%|██████████| 4166/4166 [02:54<00:00, 23.88it/s]
100%|██████████| 4166/4166 [02:56<00:00, 23.66it/s]
100%|██████████| 4/4 [00:00<00:00, 58.06it/s]
100%|██████████| 4166/4166 [02:50<00:00, 24.39it/s]
100%|██████████| 4166/4166 [02:52<00:00, 24.17it/s]
100%|██████████| 4166/4166 [02:56<00:00, 23.64it/s]
100%|██████████| 4166/4166 [02:56<00:00, 23.61it/s]
100%|██████████| 4166/4166 [02:56<00:00, 23.59it/s]
100%|██████████| 4166/4166 [02:57<00:00, 23.48it/s]
100%|██████████| 4/4 [00:00<00:00, 53.25it/s]
  1%|          | 24/4166 [00:01<03:56, 17.49it/s]

zero-size array to reduction operation maximum which has no identity


 47%|████▋     | 1941/4166 [01:24<01:32, 24.16it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

100%|██████████| 4166/4166 [02:57<00:00, 23.44it/s]
100%|██████████| 4166/4166 [02:58<00:00, 23.39it/s]
100%|██████████| 4166/4166 [02:58<00:00, 23.37it/s]
100%|██████████| 4166/4166 [02:59<00:00, 23.27it/s]
100%|██████████| 4/4 [00:00<00:00, 54.93it/s]
 12%|█▏        | 501/4166 [00:22<02:03, 29.79it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (se

zero-size array to reduction operation maximum which has no identity


 58%|█████▊    | 2406/4166 [01:38<01:38, 17.90it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 19%|█▉        | 787/4166 [00:33<02:46, 20.30it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 61%|██████    | 2522/4166 [01:42<00:49, 33.27it/s]

zero-size array to reduction operation maximum which has no identity


 63%|██████▎   | 2617/4166 [01:49<01:09, 22.21it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 54%|█████▍    | 1198/2207 [00:50<00:41, 24.21it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

100%|██████████| 2207/2207 [01:27<00:00, 25.21it/s]
100%|██████████| 2207/2207 [01:29<00:00, 24.68it/s]
100%|██████████| 2207/2207 [01:29<00:00, 24.60it/s]
100%|██████████| 2207/2207 [01:30<00:00, 24.36it/s]
100%|██████████| 2207/2207 [01:30<00:00,

In [16]:
from malaya_speech.utils import tf_featurization

config = malaya_speech.config.transducer_featurizer_config
featurizer = tf_featurization.STTFeaturizer(**config)

In [17]:
n_mels = 80

def preprocess_inputs(example):
    s = featurizer.vectorize(example['waveforms'])
    mel_fbanks = tf.reshape(s, (-1, n_mels))
    example['inputs'] = mel_fbanks
    return example

def parse(serialized_example):

    data_fields = {
        'waveforms': tf.compat.v1.VarLenFeature(tf.float32),
        'targets': tf.compat.v1.VarLenFeature(tf.int64),
        'targets_length': tf.compat.v1.VarLenFeature(tf.int64),
    }
    features = tf.compat.v1.parse_single_example(
        serialized_example, features = data_fields
    )
    for k in features.keys():
        features[k] = features[k].values
        
    features = preprocess_inputs(features)

    keys = list(features.keys())
    for k in keys:
        if k not in ['waveforms', 'inputs', 'targets', 'targets_length']:
            features.pop(k, None)

    return features

def get_dataset(files, batch_size = 2, shuffle_size = 32, thread_count = 24):
    def get():
        dataset = tf.data.TFRecordDataset(files)
        dataset = dataset.shuffle(shuffle_size)
        dataset = dataset.map(parse, num_parallel_calls = thread_count)
        dataset = dataset.repeat()
        return dataset

    return get

In [21]:
files = tf.io.gfile.glob('gs://mesolitica-tpu-general/imda/part4-diff-room/*.tfrecord')
d = get_dataset(files)()
d = d.as_numpy_iterator()

In [23]:
next(d)

{'targets': array([ 99, 795,  31, 795,  78,   2,  59, 405,  22,   1, 795, 214, 556,
         19,  38, 795,  31, 795,   7, 362, 243, 248]),
 'targets_length': array([22]),
 'waveforms': array([0.00248065, 0.00290834, 0.00205295, ..., 0.00521791, 0.00560284,
        0.00577392], dtype=float32),
 'inputs': array([[-1.9488629 , -2.057597  , -2.8818624 , ..., -0.86949223,
         -0.9249324 , -0.78570694],
        [-2.5919015 , -2.1173923 , -1.8380868 , ..., -0.72417307,
         -0.86280835, -0.73799723],
        [-2.0038202 , -1.5480981 , -1.2718482 , ..., -0.7812969 ,
         -0.66750306, -0.75536823],
        ...,
        [-0.986085  , -0.83058625, -0.6588496 , ..., -0.52381754,
         -0.6392593 , -0.6377021 ],
        [-1.1207328 , -1.1697851 , -1.3221172 , ..., -0.74652433,
         -0.5505037 , -0.7463927 ],
        [-1.3293318 , -1.395412  , -1.6278265 , ..., -0.81956124,
         -0.5398238 , -0.776421  ]], dtype=float32)}