In [2]:
import os
# Point the Google Cloud client (storage.Client, used further down) at the
# service-account key file. setdefault keeps a credential path the caller has
# already exported instead of silently overriding it.
os.environ.setdefault('GOOGLE_APPLICATION_CREDENTIALS', 'mesolitica-tpu.json')

In [3]:
from glob import glob
import tensorflow as tf
from tqdm import tqdm
import malaya_speech
from malaya_speech.utils import subword
import numpy as np
import mp
from google.cloud import storage

  'Cannot import beam_search_ops from Tensorflow Addons, `deep_model` for stemmer will not available to use, make sure Tensorflow Addons version >= 0.12.0'


In [4]:
# Collect every wav path under debate/wav; the cell output is the file count.
singlish = glob('debate/wav/*.wav')
len(singlish)

234465

In [5]:
# Load the subword vocabulary that is later passed to subword.encode to turn
# cleaned transcripts into target ids.
subwords = subword.load('transducer-singlish.subword')

In [6]:
import unicodedata
import re
import itertools

vocabs = [" ", "a", "e", "n", "i", "t", "o", "u", "s", "k", "r", "l", "h", "d", "m", "g", "y", "b", "p", "w", "c", "f", "j", "v", "z", "0", "1", "x", "2", "q", "5", "3", "4", "6", "9", "8", "7"]

# Set view of `vocabs` for O(1) membership tests; `vocabs` itself stays a list
# so any code relying on its order keeps working.
_vocab_lookup = frozenset(vocabs)

def preprocessing_text(string):
    """
    Normalise a transcript to the character set listed in `vocabs`.

    Steps, in order:
      1. NFC-normalise and lowercase.
      2. Drop apostrophes (so "don't" becomes "dont").
      3. Replace every character not in `vocabs` with a space.
      4. Collapse runs of spaces into one and strip both ends.
      5. Cap any run of one repeated character at two ("aaaa" -> "aa").

    Parameters
    ----------
    string : str
        Raw transcript text.

    Returns
    -------
    str
        Cleaned text; may be empty when nothing in `string` is in `vocabs`.
    """
    string = unicodedata.normalize('NFC', string.lower())
    string = string.replace('\'', '')
    string = ''.join(c if c in _vocab_lookup else ' ' for c in string)
    string = re.sub(r'[ ]+', ' ', string).strip()
    # groupby yields maximal runs of identical characters; keep at most two of each.
    return ''.join(''.join(run)[:2] for _, run in itertools.groupby(string))

In [7]:
def get_after_mandarin(word):
    """
    Extract the text after the colon from a token carrying an opening
    `<mandarin>` tag; tokens without that tag are returned unchanged.

    For a tagged token, the result is whatever sits between the first ':'
    (following the first '>') and the next '</', if any. A tagged token with
    no ':' raises IndexError.
    """
    if '<mandarin>' not in word:
        return word
    after_tag = word.split('>')[1]
    after_colon = after_tag.split(':')[1]
    return after_colon.split('</')[0]
    
def get_before_mandarin(word):
    """
    Strip a closing `</mandarin>` tag: return the part of `word` before the
    first '</' when the token contains '</mandarin>', otherwise return the
    token unchanged.
    """
    return word.split('</')[0] if '</mandarin>' in word else word

def replace_paralinguistic(string, replaces = ('(ppb)', '(ppc)', '(ppl)', '(ppo)', '<UNK>', '<MANDARIN>')):
    """
    Remove paralinguistic markers and annotation tags from a transcript.

    Parameters
    ----------
    string : str
        Raw transcript line.
    replaces : iterable of str
        Literal markers replaced by a space before tokenising.

    Returns
    -------
    str
        Remaining tokens joined by single spaces. Tokens are split on
        whitespace, passed through get_after_mandarin and get_before_mandarin,
        and dropped when they start with one of '<[(' or end with one of '>])'.
    """
    for r in replaces:
        string = string.replace(r, ' ')
    words = (get_before_mandarin(get_after_mandarin(w)) for w in string.split())
    # Tag stripping can leave an empty token (e.g. a bare '</mandarin>');
    # skip those instead of raising IndexError on w[0].
    kept = [w for w in words if w and w[0] not in '<[(' and w[-1] not in '>])']
    return ' '.join(kept)

In [14]:
def loop(files):
    """
    Pair each wav path with its cleaned transcript.

    NOTE(review): a later cell defines another function that is also named
    `loop` (the TFRecord writer); whichever cell ran last wins.

    Parameters
    ----------
    files : tuple
        (list of wav paths, worker index). The index is unpacked but unused.

    Returns
    -------
    list of (str, str)
        (wav path, cleaned text) for every file whose transcript is non-empty
        after cleaning. Files that raise are skipped and counted; one summary
        line is printed at the end rather than one line per failure.
    """
    files, index = files
    results = []
    failures = 0
    first_error = None
    for i in tqdm(files):
        try:
            # 'debate/wav/x.wav' -> 'debate/text/x.wav', then '.txt' is appended.
            p = i.replace('/wav','/text')
            with open(f'{p}.txt') as fopen:
                text = fopen.read()
            if len(text) < 2:
                continue
            # Skip transcripts that consist of a single tag.
            # NOTE(review): read() keeps a trailing newline, so this only fires
            # when the file has none — confirm that is intended.
            if text[0] == '<' and text[-1] == '>':
                continue
            text = replace_paralinguistic(text)
            text = preprocessing_text(text)
            if len(text):
                results.append((i, text))
        except Exception as e:
            # Best-effort: keep going, but remember that something failed.
            failures += 1
            if first_error is None:
                first_error = (i, e)
    if failures:
        print(
            f'loop: skipped {failures} of {len(files)} files due to errors; '
            f'first: {first_error[0]}: {first_error[1]!r}'
        )
    return results

In [15]:
# Peek at the first few paths to confirm the glob matched the expected layout.
singlish[:10]

['debate/wav/app_4165_6329_phnd_deb-3-0-179.wav',
 'debate/wav/app_4154_6308_phnd_deb-1-0-4.wav',
 'debate/wav/app_4221_6441_phnd_deb-1-0-59.wav',
 'debate/wav/app_4238_6475_phnd_deb-1-0-170.wav',
 'debate/wav/app_4074_6148_phnd_deb-1-0-49.wav',
 'debate/wav/app_4153_6306_phnd_deb-3-0-178.wav',
 'debate/wav/app_4127_6254_phnd_deb-1-0-8.wav',
 'debate/wav/app_4228_6456_phnd_deb-2-0-138.wav',
 'debate/wav/app_4149_6297_phnd_deb-2-0-74.wav',
 'debate/wav/app_4168_6336_phnd_deb-3-0-52.wav']

In [16]:
# Smoke-test the text-cleaning loop on ten files before the full parallel run.
loop((singlish[:10], 0))

100%|██████████| 10/10 [00:00<00:00, 4980.77it/s]


[('debate/wav/app_4221_6441_phnd_deb-1-0-59.wav',
  'so theres not really a a a job issue here is just more for the opportunity or whether people want to take it in the first place'),
 ('debate/wav/app_4238_6475_phnd_deb-1-0-170.wav',
  'for brands and businesses to gain some sort of traction some sort reputation and i feel like that in a business point of view is'),
 ('debate/wav/app_4149_6297_phnd_deb-2-0-74.wav',
  'recycling so what should have to have been done what should have been done is that the latter')]

In [17]:
# Run the text-cleaning `loop` defined above over all wav paths via the
# project's mp.multiprocessing helper; the result is a list of
# (wav path, cleaned text) pairs.
singlishs = mp.multiprocessing(singlish, loop, cores = 12)

100%|██████████| 19538/19538 [02:06<00:00, 153.98it/s]
100%|██████████| 19538/19538 [02:06<00:00, 153.89it/s]
100%|██████████| 9/9 [00:00<00:00, 145.53it/s]89it/s] 
100%|██████████| 19538/19538 [02:07<00:00, 152.97it/s]
100%|██████████| 19538/19538 [02:08<00:00, 152.57it/s]
100%|██████████| 19538/19538 [02:08<00:00, 152.61it/s]
100%|██████████| 19538/19538 [02:08<00:00, 152.49it/s]
100%|██████████| 19538/19538 [02:08<00:00, 152.28it/s]
100%|██████████| 19538/19538 [02:08<00:00, 152.19it/s]
100%|██████████| 19538/19538 [02:08<00:00, 151.73it/s]
100%|██████████| 19538/19538 [02:08<00:00, 151.61it/s]
100%|██████████| 19538/19538 [02:09<00:00, 150.52it/s]
100%|██████████| 19538/19538 [02:11<00:00, 149.14it/s]


In [18]:
# Number of utterances that produced a non-empty transcript after cleaning.
len(singlishs)

126842

In [19]:
import six

def to_example(dictionary):
    """
    Build a tf.train.Example from a (str -> list of int/float/str/bytes) dictionary.

    The feature type for each key is chosen from the type of the first element:
    Python or numpy integers -> Int64List, Python or numpy floats -> FloatList,
    str (UTF-8 encoded) or bytes -> BytesList.

    Parameters
    ----------
    dictionary : dict
        Feature name -> non-empty list (or `map` object) of values.

    Returns
    -------
    tf.train.Example

    Raises
    ------
    ValueError
        If a field is empty or its first element has an unsupported type.
    """
    features = {}
    for k, v in dictionary.items():
        # map objects are not subscriptable and are always truthy, so
        # materialise them before the emptiness check and the v[0] probe.
        if isinstance(v, map):
            v = list(v)
        if not v:
            raise ValueError('Empty generated field: %s' % str((k, v)))
        first = v[0]
        if isinstance(first, (int, np.integer)):
            features[k] = tf.train.Feature(
                int64_list=tf.train.Int64List(value=v)
            )
        elif isinstance(first, (float, np.floating)):
            # np.floating covers np.float32 etc., which are not `float` subclasses.
            features[k] = tf.train.Feature(
                float_list=tf.train.FloatList(value=v)
            )
        elif isinstance(first, str):
            encoded = [x.encode('utf-8') for x in v]
            features[k] = tf.train.Feature(
                bytes_list=tf.train.BytesList(value=encoded)
            )
        elif isinstance(first, bytes):
            features[k] = tf.train.Feature(
                bytes_list=tf.train.BytesList(value=v)
            )
        else:
            raise ValueError(
                'Value for %s is not a recognized type; v: %s type: %s'
                % (k, str(first), str(type(first)))
            )
    return tf.train.Example(features=tf.train.Features(feature=features))

In [20]:
# Sample rate used to convert a waveform's sample count into seconds.
# NOTE(review): assumes malaya_speech.load returns audio at this rate — confirm.
sr = 16000
# Clips whose duration (len(y) / sr) exceeds this value are skipped.
maxlen = 18
# Transcripts shorter than this many characters are skipped.
minlen_text = 1
# Batch counter embedded in each TFRecord file name.
global_count = 0

In [21]:
def loop(files):
    """
    Encode one shard of (wav path, transcript) pairs into a TFRecord file,
    upload it to the 'mesolitica-tpu-general' bucket, and delete the local copy.

    NOTE(review): this redefines `loop`, shadowing the text-cleaning function
    of the same name defined in an earlier cell.

    Parameters
    ----------
    files : tuple
        (list of (wav path, text) pairs, worker index). The index and the
        module-level `global_count` together form the output file name.

    Returns
    -------
    None

    Items are skipped when the text is shorter than `minlen_text` or the audio
    is longer than `maxlen` seconds. Per-item errors are printed and skipped.
    """
    client = storage.Client()
    bucket = client.bucket('mesolitica-tpu-general')
    files, index = files
    output_file = f'{index}-{global_count}.tfrecord'
    # The context manager closes the writer even if an unexpected error
    # escapes the loop, so the file handle is never leaked.
    with tf.io.TFRecordWriter(output_file) as writer:
        for s in tqdm(files):
            try:
                if len(s[1]) < minlen_text:
                    continue
                y, _ = malaya_speech.load(s[0])
                if (len(y) / sr) > maxlen:
                    continue
                t = subword.encode(subwords, s[1], add_blank=False)
                example = to_example({'waveforms': y.tolist(),
                                      'targets': t,
                                      'targets_length': [len(t)]})
                writer.write(example.SerializeToString())
            except Exception as e:
                print(e)
    blob = bucket.blob(f'imda/part5-debate/{output_file}')
    blob.upload_from_filename(output_file)
    # Only reached after a successful upload; os.remove avoids spawning a shell.
    os.remove(output_file)

In [22]:
# Smoke-test the TFRecord writer on ten items. Note that this is not a dry run:
# it writes '0-<global_count>.tfrecord' and uploads it under imda/part5-debate/.
loop((singlishs[:10], 0))

100%|██████████| 10/10 [00:00<00:00, 14.62it/s]


In [23]:
# Write the dataset in batches of `batch_size` utterances; each batch is split
# across 6 worker processes, and each worker writes and uploads its own shard.
batch_size = 25000
for i in range(0, len(singlishs), batch_size):
    # `loop` reads the module-level `global_count` to name its output file.
    # Deriving it from the batch offset (rather than incrementing it) makes
    # this cell idempotent: re-running it regenerates the same file names.
    global_count = i // batch_size
    batch = singlishs[i: i + batch_size]
    mp.multiprocessing(batch, loop, cores = 6, returned = False)

 78%|███████▊  | 3249/4166 [04:19<00:54, 16.85it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 47%|████▋     | 1959/4166 [02:12<02:40, 13.72it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

100%|██████████| 4166/4166 [04:31<00:00, 15.35it/s]
100%|██████████| 4166/4166 [04:32<00:00, 15.28it/s]
100%|██████████| 4166/4166 [04:32<00:00, 15.26it/s]
100%|██████████| 4166/4166 [04:33<00:00, 15.22it/s]
100%|██████████| 4166/4166 [04:34<00:00,

In [24]:
from malaya_speech.utils import tf_featurization

# Build the speech featurizer from malaya_speech's transducer featurizer
# config; it is used below to vectorize raw waveforms.
config = malaya_speech.config.transducer_featurizer_config
featurizer = tf_featurization.STTFeaturizer(**config)

In [25]:
# Width of the last axis the featurizer output is reshaped to.
n_mels = 80

def preprocess_inputs(example):
    """
    Add an 'inputs' entry to `example`: the featurizer output for
    example['waveforms'], reshaped to (-1, n_mels).

    The dictionary is modified in place and also returned.
    """
    vectorized = featurizer.vectorize(example['waveforms'])
    example['inputs'] = tf.reshape(vectorized, (-1, n_mels))
    return example

def parse(serialized_example):
    """
    Decode one serialized example into a dict of tensors.

    Parses the variable-length 'waveforms' (float32), 'targets' (int64) and
    'targets_length' (int64) features, replaces each parsed feature by its
    `.values`, runs preprocess_inputs on the result, and finally drops any
    key other than 'waveforms', 'inputs', 'targets' and 'targets_length'.
    """
    schema = {
        'waveforms': tf.compat.v1.VarLenFeature(tf.float32),
        'targets': tf.compat.v1.VarLenFeature(tf.int64),
        'targets_length': tf.compat.v1.VarLenFeature(tf.int64),
    }
    parsed = tf.compat.v1.parse_single_example(
        serialized_example, features = schema
    )
    dense = {name: feature.values for name, feature in parsed.items()}

    dense = preprocess_inputs(dense)

    wanted = ('waveforms', 'inputs', 'targets', 'targets_length')
    # Snapshot the unwanted keys first so the dict is not resized while iterating.
    for name in [key for key in dense if key not in wanted]:
        dense.pop(name, None)

    return dense

def get_dataset(files, batch_size = 2, shuffle_size = 32, thread_count = 24):
    """
    Return a zero-argument factory that builds the input pipeline.

    The pipeline reads `files` as TFRecords, shuffles with a buffer of
    `shuffle_size`, maps `parse` with `thread_count` parallel calls, and
    repeats indefinitely. `batch_size` is accepted but not used here.
    """
    def get():
        return (
            tf.data.TFRecordDataset(files)
            .shuffle(shuffle_size)
            .map(parse, num_parallel_calls = thread_count)
            .repeat()
        )

    return get

In [26]:
# Read the uploaded shards back from the bucket and build an iterator of
# numpy examples to check that the written records parse.
files = tf.io.gfile.glob('gs://mesolitica-tpu-general/imda/part5-debate/*.tfrecord')
d = get_dataset(files)()
d = d.as_numpy_iterator()

In [None]:
# Pull one parsed example from the iterator as a sanity check.
next(d)