In [1]:
import os
# Export the service-account key location for this process; the Google Cloud
# clients used later in the notebook (e.g. storage.Client()) are created
# without explicit credentials and pick the key file up from this variable.
os.environ.update(
    GOOGLE_APPLICATION_CREDENTIALS='/home/ubuntu/mesolitica-tpu.json'
)

In [2]:
import tensorflow as tf
from malaya_speech.utils import subword

# Load the subword vocabulary from the local file 'transducer-singlish.subword'.
# It is passed to subword.decode() further down to turn the stored target ids
# of each record back into text.
subwords = subword.load('transducer-singlish.subword')

  'Cannot import beam_search_ops from Tensorflow Addons, `deep_model` for stemmer will not available to use, make sure Tensorflow Addons version >= 0.12.0'


In [3]:
import string

# Character vocabulary used to re-encode transcripts: the position of a
# character in this list is its id. Index 0 is the empty string, followed by
# a-z, 0-9 and finally the space character.
char_vocabs = ['', *string.ascii_lowercase, *string.digits, ' ']

In [4]:
# List every source shard in the bucket that matches
# gs://mesolitica-tpu-general/imda/<any directory>/<any name>.tfrecord.
# The resulting sequence of paths is sliced into batches by the driver cell.
files = tf.compat.v1.gfile.Glob('gs://mesolitica-tpu-general/imda/*/*.tfrecord')

In [5]:
# Conversion settings shared by the cells below.
sr = 16000  # divisor applied to the waveform length in loop() (presumably the sample rate in Hz)
maxlen = 18  # records are kept only when len(waveform) / sr is below this value
maxlen_subwords = 100  # records are kept only when they have fewer subword targets than this
minlen_text = 1  # minimum transcript length -- TODO confirm where this is meant to be applied
global_count = 0  # batch counter; used in output file names and incremented by the driver cell

In [6]:
from google.cloud import storage
import numpy as np
import six

def to_example(dictionary):
    """Build a ``tf.train.Example`` from a ``{name: values}`` dictionary.

    Every value must be a non-empty sequence. The type of its *first* element
    selects the feature kind used for the whole field:

    * Python / NumPy integers -> ``Int64List``
    * Python / NumPy floats   -> ``FloatList``
    * ``str``                 -> ``BytesList`` (UTF-8 encoded on Python 3)
    * ``bytes``               -> ``BytesList``

    Args:
        dictionary: mapping from feature name to a list, ``map`` object or
            NumPy array of values.

    Returns:
        A ``tf.train.Example`` with one feature per dictionary entry.

    Raises:
        ValueError: if a field is empty, or if its first element has a type
            that is not listed above.
    """
    features = {}
    for (k, v) in six.iteritems(dictionary):
        # In Python 3 a map object is lazy, always truthy and not
        # subscriptable. Materialise it *before* the emptiness check so that
        # an empty map is reported as an empty field instead of failing
        # later with an IndexError on v[0].
        if six.PY3 and isinstance(v, map):
            v = list(v)
        # The truth value of a NumPy array is ambiguous (or wrong for a
        # single zero element), so test its size explicitly.
        is_empty = v.size == 0 if isinstance(v, np.ndarray) else not v
        if is_empty:
            raise ValueError('Empty generated field: %s' % str((k, v)))
        first = v[0]
        if isinstance(first, six.integer_types) or np.issubdtype(
            type(first), np.integer
        ):
            features[k] = tf.train.Feature(
                int64_list=tf.train.Int64List(value=v)
            )
        # np.float32 and friends are not subclasses of the builtin float, so
        # they have to be accepted explicitly.
        elif isinstance(first, (float, np.floating)):
            features[k] = tf.train.Feature(
                float_list=tf.train.FloatList(value=v)
            )
        elif isinstance(first, six.string_types):
            if not six.PY2:  # Convert in python 3.
                v = [bytes(x, 'utf-8') for x in v]
            features[k] = tf.train.Feature(
                bytes_list=tf.train.BytesList(value=v)
            )
        elif isinstance(first, bytes):
            features[k] = tf.train.Feature(
                bytes_list=tf.train.BytesList(value=v)
            )
        else:
            raise ValueError(
                'Value for %s is not a recognized type; v: %s type: %s'
                % (k, str(first), str(type(first)))
            )
    return tf.train.Example(features=tf.train.Features(feature=features))

In [7]:
def parse(serialized_example):
    """Decode one serialized ``tf.train.Example`` into a dict of flat tensors.

    The three fields ``waveforms`` (float32), ``targets`` (int64) and
    ``targets_length`` (int64) are read as variable-length features; for each
    one only the ``.values`` of the parsed result are kept.

    Args:
        serialized_example: a single serialized example, as produced by
            iterating a ``TFRecordDataset``.

    Returns:
        Dict mapping each field name to the values tensor of that field.
    """
    field_dtypes = (
        ('waveforms', tf.float32),
        ('targets', tf.int64),
        ('targets_length', tf.int64),
    )
    spec = {
        name: tf.compat.v1.VarLenFeature(dtype) for name, dtype in field_dtypes
    }
    parsed = tf.compat.v1.parse_single_example(
        serialized_example, features=spec
    )
    # Keep only the flat values of every variable-length field.
    return {name: sparse.values for name, sparse in parsed.items()}

In [8]:
from tqdm import tqdm

def parse(serialized_example):
    """Decode one serialized ``tf.train.Example`` into a dict of flat tensors.

    NOTE: an equivalent ``parse`` is also defined in the previous cell; this
    re-definition replaces it when the cell is run.

    The three fields ``waveforms`` (float32), ``targets`` (int64) and
    ``targets_length`` (int64) are read as variable-length features; for each
    one only the ``.values`` of the parsed result are kept.

    Args:
        serialized_example: a single serialized example, as produced by
            iterating a ``TFRecordDataset``.

    Returns:
        Dict mapping each field name to the values tensor of that field.
    """
    field_dtypes = (
        ('waveforms', tf.float32),
        ('targets', tf.int64),
        ('targets_length', tf.int64),
    )
    spec = {
        name: tf.compat.v1.VarLenFeature(dtype) for name, dtype in field_dtypes
    }
    parsed = tf.compat.v1.parse_single_example(
        serialized_example, features=spec
    )
    # Keep only the flat values of every variable-length field.
    return {name: sparse.values for name, sparse in parsed.items()}

def loop(files):
    """Re-encode one group of source shards and upload the result.

    Reads the given TFRecord files, drops records whose waveform is too long
    (``len / sr >= maxlen``) or that have too many subword targets
    (``>= maxlen_subwords``), decodes the subword targets back to text,
    re-encodes the text as indices into ``char_vocabs`` and writes the records
    (plus a constant ``lang`` field of ``[1]``) to a local TFRecord file. The
    file is then uploaded to ``singlish/<name>`` in the
    ``mesolitica-tpu-general`` bucket and removed locally.

    Args:
        files: a ``(paths, index)`` pair. ``paths`` are the TFRecord files to
            read; ``index`` is used in the output file name
            ``{index}-{global_count}.tfrecord`` (presumably the worker/chunk
            number supplied by ``mp.multiprocessing`` -- confirm against mp).

    Raises:
        ValueError: if a decoded transcript contains a character that is not
            in ``char_vocabs``.
    """
    client = storage.Client()
    bucket = client.bucket('mesolitica-tpu-general')
    files, index = files
    output_file = f'{index}-{global_count}.tfrecord'

    # Build the character -> id table once instead of scanning the list with
    # char_vocabs.index() for every character of every transcript.
    # setdefault keeps the first position, matching list.index semantics.
    char_to_id = {}
    for position, char in enumerate(char_vocabs):
        char_to_id.setdefault(char, position)

    d = tf.data.TFRecordDataset(files)
    d = d.map(parse, num_parallel_calls=20)
    d = d.filter(
        lambda x: tf.less(tf.shape(x['waveforms'])[0] / sr, maxlen)
    )
    d = d.filter(
        lambda x: tf.less(tf.shape(x['targets'])[0], maxlen_subwords)
    )
    d = d.as_numpy_iterator()

    # The context manager closes the writer even if a record fails, so the
    # file handle does not leak when an exception aborts the loop.
    with tf.io.TFRecordWriter(output_file) as writer:
        for r in tqdm(d):
            # assumes subword.decode returns the transcript as a str -- TODO confirm
            t = subword.decode(subwords, r['targets'])
            try:
                new_t = [char_to_id[c] for c in t]
            except KeyError as missing:
                raise ValueError(
                    f'{missing.args[0]!r} is not in char_vocabs (transcript: {t!r})'
                ) from missing
            # Skip transcripts shorter than the configured minimum; an empty
            # one would otherwise make to_example() raise and kill the worker.
            if len(new_t) < minlen_text:
                continue
            example = to_example({'waveforms': r['waveforms'].tolist(),
                                  'targets': new_t,
                                  'targets_length': [len(new_t)],
                                  'lang': [1]})
            writer.write(example.SerializeToString())

    blob = bucket.blob(f'singlish/{output_file}')
    blob.upload_from_filename(output_file)
    # Delete the local copy only after a successful upload; no shell needed.
    os.remove(output_file)

In [10]:
import mp

In [None]:
# Walk over the source shards in groups of `batch_size` and hand each group,
# together with the `loop` worker function, to mp.multiprocessing (presumably
# one process per file, since cores equals the group size -- confirm against mp).
batch_size = 8
offset = 0
while offset < len(files):
    batch = files[offset:offset + batch_size]
    mp.multiprocessing(batch, loop, cores=len(batch), returned=False)
    # loop() reads global_count when naming its output file, so bump it after
    # every group to keep the file names of different groups distinct.
    global_count = global_count + 1
    offset += batch_size

2466it [01:43, 19.29it/s]
4166it [02:27, 28.17it/s]
4166it [02:28, 27.96it/s]
4166it [02:31, 27.44it/s]
4166it [02:36, 26.58it/s]
4166it [02:39, 26.14it/s]
4166it [02:43, 25.44it/s]
4141it [02:48, 24.53it/s]
3553it [02:07, 36.83it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

4166it [02:26, 28.37it/s]
4166it [02:27, 28.23it/s]
4166it [02:30, 27.59it/s]
4166it [02:34, 26.95it/s]
4166it [02:34, 26.94it/s]
4166it [02:35, 26.76it/s]
4166it [02:36, 26.63it/s]
4166it [02:41, 25.72it/s]
985it [00:36, 26.71it/s]]
4166it [02:21, 29.53it/s]
4166it [02:23, 29.04it/s]
4166it [02:25, 28.63it/s]
4166it [02:29, 27.79it/s]
4166it [02:31, 27.51it/s]
4166it [02:34, 26.92it/s]
4164it [02:37, 26.44it/s]
3950it [02:27, 34.55it/s]


In [None]:
# loop((files[:1], 0))

In [None]:
# client = storage.Client()
# bucket = client.bucket('mesolitica-tpu-general')

In [None]:
# output_file = '0-0.tfrecord'
# blob = bucket.blob(f'singlish/{output_file}')
# blob.upload_from_filename(output_file)
# os.system(f'rm {output_file}')