In [1]:
import os
# Tell the Google Cloud client (used later via `storage.Client()`) which
# credentials file to use. The path is relative, so the notebook must be run
# from the directory that contains `mesolitica-tpu.json`.
# NOTE(review): keep this key file out of version control.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'mesolitica-tpu.json'

In [2]:
from glob import glob
import tensorflow as tf
from tqdm import tqdm
import malaya_speech
from malaya_speech.utils import subword
import numpy as np
import mp
from google.cloud import storage

  'Cannot import beam_search_ops from Tensorflow Addons, `deep_model` for stemmer will not available to use, make sure Tensorflow Addons version >= 0.12.0'


In [3]:
# Every wav file of the call-centre corpus; the matching transcripts are
# looked up later by swapping '/wav' for '/text' in each path.
singlish = glob('call-centre-2/wav/*.wav')
len(singlish)

313460

In [4]:
# Subword vocabulary used further down to turn cleaned transcripts into
# integer targets (`subword.encode(subwords, ...)`).
subwords = subword.load('transducer-singlish.subword')

In [5]:
import unicodedata
import re
import itertools

# Characters allowed to survive text cleaning; anything else becomes a space.
vocabs = [" ", "a", "e", "n", "i", "t", "o", "u", "s", "k", "r", "l", "h", "d", "m", "g", "y", "b", "p", "w", "c", "f", "j", "v", "z", "0", "1", "x", "2", "q", "5", "3", "4", "6", "9", "8", "7"]

def preprocessing_text(string):
    """Normalise a transcript to the character set in `vocabs`.

    Steps, in order:
      1. lowercase and NFC-normalise,
      2. delete apostrophes,
      3. replace every character not in `vocabs` with a space,
      4. collapse runs of spaces and trim both ends,
      5. cap every run of one repeated character at two characters.

    Returns the cleaned string (possibly empty).
    """
    lowered = unicodedata.normalize('NFC', string.lower())
    no_apostrophes = lowered.replace('\'', '')
    masked = ''.join(ch if ch in vocabs else ' ' for ch in no_apostrophes)
    collapsed = re.sub(r'[ ]+', ' ', masked).strip()

    # groupby yields one group per run of identical characters; keep at most
    # the first two characters of each run ("hellooo" -> "helloo").
    pieces = []
    for _, run in itertools.groupby(collapsed):
        pieces.append(''.join(run)[:2])
    return ''.join(pieces)

In [6]:
def get_after_mandarin(word):
    """Strip an opening `<mandarin>` annotation from a single token.

    For a token containing '<mandarin>', take the text between the first and
    second '>' of the token, then the part after its first ':' (up to the next
    ':' if any), and finally cut at the first '</'.
    Example: '<mandarin>xx:ka' -> 'ka'.

    Tokens without '<mandarin>' are returned unchanged.

    Raises:
        IndexError: if a tagged token has no ':' after the tag.
    """
    if '<mandarin>' not in word:
        return word
    after_tag = word.split('>')[1]
    after_colon = after_tag.split(':')[1]
    return after_colon.split('</')[0]


def get_before_mandarin(word):
    """Drop a closing `</mandarin>` tag (and anything from the first '</' on).

    Example: 'fei</mandarin>' -> 'fei'. A token that is only '</mandarin>'
    becomes the empty string. Tokens without the closing tag are returned
    unchanged.
    """
    return word.split('</')[0] if '</mandarin>' in word else word


def replace_paralinguistic(string, replaces = ['(ppb)', '(ppc)', '(ppl)', '(ppo)', '<UNK>', '<MANDARIN>']):
    """Remove paralinguistic markers and annotation tags from a transcript.

    Args:
        string: raw transcript text.
        replaces: literal markers that are blanked out before tokenising.
            The default list is only read, never mutated.

    Returns:
        The remaining tokens joined by single spaces. Tokens that start with
        one of '<[(' or end with one of '>])' are dropped, as are tokens that
        become empty once the mandarin tags are stripped.

    Raises:
        IndexError: propagated from `get_after_mandarin` for a malformed
            '<mandarin>' token.
    """
    for marker in replaces:
        string = string.replace(marker, ' ')

    stripped = [get_before_mandarin(get_after_mandarin(w)) for w in string.split()]

    # `w and ...` guards against empty tokens (e.g. a standalone '</mandarin>'),
    # which previously raised IndexError on w[0] and made the caller discard
    # the whole utterance.
    kept = [
        w for w in stripped
        if w and w[0] not in '<[(' and w[-1] not in '>])'
    ]
    return ' '.join(kept)

In [7]:
def loop(files):
    """Pair each wav file with its cleaned transcript.

    Args:
        files: a `(paths, index)` tuple, where `paths` is a list of wav file
            paths. `index` is accepted but not used here.

    Returns:
        List of `(wav_path, cleaned_text)` tuples. A file is left out when its
        transcript is shorter than 2 characters, is entirely wrapped in
        '<...>', is empty after cleaning, or when reading/cleaning it raises.

    The transcript path is derived by replacing '/wav' with '/text' in the wav
    path and appending '.txt'.

    Processing stays best-effort (one bad file never aborts the chunk), but
    failures are tallied by exception type and reported once at the end rather
    than being swallowed silently.
    """
    from collections import Counter

    paths, _ = files
    results = []
    failures = Counter()
    for wav_path in tqdm(paths):
        try:
            transcript_path = wav_path.replace('/wav', '/text')
            with open(f'{transcript_path}.txt') as fopen:
                text = fopen.read()
            if len(text) < 2:
                continue
            if text[0] == '<' and text[-1] == '>':
                continue
            text = replace_paralinguistic(text)
            text = preprocessing_text(text)
            if len(text):
                results.append((wav_path, text))
        except Exception as e:
            # Keep going, but remember what went wrong.
            failures[type(e).__name__] += 1
    if failures:
        print(f'skipped {sum(failures.values())} file(s) due to errors: {dict(failures)}')
    return results

In [8]:
# Smoke test: run the cleaning loop on the first 10 files before the full run.
loop((singlish[:10], 0))

100%|██████████| 10/10 [00:00<00:00, 63.32it/s]


[('call-centre-2/wav/app_1136_5272_phnd_cc-ins-0-57.wav',
  'may i know how is the application process like and how long will it take for the application to be approved'),
 ('call-centre-2/wav/app_1110_5220_phnd_cc-tel-0-58.wav', 'okay okay'),
 ('call-centre-2/wav/app_1029_0026_phnd_cc-tel-0-38.wav',
  'okay nearest to you is a b c mall so you can actually go to a b c mall and actually apply it at one of our branches there alternatively you can actually go to online a b c telco dot com and you can actually use your myinfo which is your singpass to actually apply for it'),
 ('call-centre-2/wav/app_0857_0026_phnd_cc-ins-0-6.wav',
  'one two three and the last four digit of your policy number')]

In [9]:
# Clean all transcripts in parallel; `singlishs` is later indexed as
# (wav_path, text) pairs.
# NOTE(review): assumes mp.multiprocessing splits the list into chunks, calls
# `loop((chunk, index))` per chunk and concatenates the returned lists — confirm.
singlishs = mp.multiprocessing(singlish, loop, cores = 12)

100%|██████████| 26121/26121 [01:39<00:00, 262.72it/s]
100%|██████████| 8/8 [00:00<00:00, 214.97it/s].18it/s]
100%|██████████| 26121/26121 [01:39<00:00, 261.74it/s]
100%|██████████| 26121/26121 [01:39<00:00, 261.39it/s]
100%|██████████| 26121/26121 [01:40<00:00, 260.95it/s]
100%|██████████| 26121/26121 [01:40<00:00, 260.72it/s]
100%|██████████| 26121/26121 [01:40<00:00, 260.83it/s]
100%|██████████| 26121/26121 [01:40<00:00, 260.22it/s]
100%|██████████| 26121/26121 [01:40<00:00, 260.26it/s]
100%|██████████| 26121/26121 [01:40<00:00, 260.07it/s]
100%|██████████| 26121/26121 [01:40<00:00, 259.97it/s]
100%|██████████| 26121/26121 [01:40<00:00, 259.86it/s]
100%|██████████| 26121/26121 [01:40<00:00, 260.29it/s]


In [10]:
# Number of (wav_path, text) pairs that survived cleaning.
len(singlishs)

168555

In [11]:
import six

def to_example(dictionary):
    """Build a `tf.train.Example` from a {feature name: list of values} mapping.

    The feature kind is chosen from the first element of each list:
      * Python or NumPy integer -> Int64List
      * Python or NumPy float   -> FloatList
      * str                     -> BytesList (UTF-8 encoded on Python 3)
      * bytes                   -> BytesList

    Args:
        dictionary: mapping of feature name to a non-empty list (or `map`)
            of values.

    Returns:
        tf.train.Example holding one feature per key.

    Raises:
        ValueError: if a field is empty, or its first element has a type that
            is not handled above.
    """
    features = {}
    for (k, v) in six.iteritems(dictionary):
        # A map object is always truthy and is not subscriptable on Python 3,
        # so materialise it *before* the emptiness check and the v[0] lookups.
        if six.PY3 and isinstance(v, map):
            v = list(v)
        if not v:
            raise ValueError('Empty generated field: %s' % str((k, v)))
        first = v[0]
        if isinstance(first, six.integer_types) or np.issubdtype(
            type(first), np.integer
        ):
            features[k] = tf.train.Feature(
                int64_list=tf.train.Int64List(value=v)
            )
        # Accept NumPy floating scalars (e.g. np.float32) as well, mirroring
        # the NumPy-integer handling above.
        elif isinstance(first, float) or np.issubdtype(
            type(first), np.floating
        ):
            features[k] = tf.train.Feature(
                float_list=tf.train.FloatList(value=v)
            )
        elif isinstance(first, six.string_types):
            if not six.PY2:  # Convert in python 3.
                v = [bytes(x, 'utf-8') for x in v]
            features[k] = tf.train.Feature(
                bytes_list=tf.train.BytesList(value=v)
            )
        elif isinstance(first, bytes):
            features[k] = tf.train.Feature(
                bytes_list=tf.train.BytesList(value=v)
            )
        else:
            raise ValueError(
                'Value for %s is not a recognized type; v: %s type: %s'
                % (k, str(first), str(type(first)))
            )
    return tf.train.Example(features=tf.train.Features(feature=features))

In [12]:
# Settings read by the TFRecord-writing `loop` defined below.
# Divides the sample count to get a duration; presumably the sample rate in Hz — confirm.
sr = 16000
# Clips whose duration (len(y) / sr) exceeds this are skipped.
maxlen = 18
# Transcripts shorter than this many characters are skipped.
minlen_text = 1
# Batch counter embedded in output names: '{index}-{global_count}.tfrecord'.
global_count = 0

In [13]:
def loop(files):
    """Write one chunk of samples to a TFRecord file and upload it to GCS.

    NOTE: this redefines the earlier transcript-cleaning `loop`; after this
    cell runs, `loop` refers to this function.

    Args:
        files: a `(samples, index)` tuple. `samples` is a list of
            `(wav_path, text)` pairs; `index` is used in the output file name.

    Side effects:
        * writes '{index}-{global_count}.tfrecord' in the working directory,
        * uploads it to bucket 'mesolitica-tpu-general' under
          'imda/part6-call-centre-2/',
        * deletes the local file afterwards, even if the upload fails.

    A sample is skipped when its text is shorter than `minlen_text`, when
    `len(y) / sr` exceeds `maxlen`, or when loading/encoding it raises (the
    error is printed together with the wav path).

    Reads the module-level names `global_count`, `minlen_text`, `maxlen`,
    `sr` and `subwords`.
    NOTE(review): the duration check uses the module-level `sr`, not the second
    value returned by `malaya_speech.load` — assumes audio is loaded at `sr`;
    confirm.
    """
    client = storage.Client()
    bucket = client.bucket('mesolitica-tpu-general')
    samples, index = files
    output_file = f'{index}-{global_count}.tfrecord'
    try:
        # The context manager closes the writer even on an unexpected error.
        with tf.io.TFRecordWriter(output_file) as writer:
            for s in tqdm(samples):
                try:
                    if len(s[1]) < minlen_text:
                        continue
                    y, _ = malaya_speech.load(s[0])
                    if (len(y) / sr) > maxlen:
                        continue
                    t = subword.encode(subwords, s[1], add_blank=False)
                    example = to_example({'waveforms': y.tolist(),
                                          'targets': t,
                                          'targets_length': [len(t)]})
                    writer.write(example.SerializeToString())
                except Exception as e:
                    # Best-effort: report which file failed and carry on.
                    print(f'{s[0]}: {e}')
        blob = bucket.blob(f'imda/part6-call-centre-2/{output_file}')
        blob.upload_from_filename(output_file)
    finally:
        # Remove the temporary file without shelling out to `rm`.
        if os.path.exists(output_file):
            os.remove(output_file)

In [14]:
# Smoke test: write and upload a TFRecord for the first 10 samples.
# With index 0 and global_count 0 this produces '0-0.tfrecord'.
loop((singlishs[:10], 0))

100%|██████████| 10/10 [00:00<00:00, 19.21it/s]


In [15]:
# Process the dataset in batches of `batch_size` samples; each batch is fanned
# out to 6 workers running `loop`, which writes and uploads one TFRecord per chunk.
batch_size = 25000
for i in range(0, len(singlishs), batch_size):
    batch = singlishs[i: i + batch_size]
    mp.multiprocessing(batch, loop, cores = 6, returned = False)
    # Bump the counter so the next batch's '{index}-{global_count}.tfrecord'
    # names do not collide with this batch's.
    # NOTE(review): relies on worker processes seeing the updated module-level
    # `global_count` — confirm how mp.multiprocessing starts its workers.
    global_count += 1

 28%|██▊       | 1179/4166 [01:30<04:46, 10.42it/s]

zero-size array to reduction operation maximum which has no identity

 28%|██▊       | 1152/4166 [01:30<03:54, 12.85it/s]




 88%|████████▊ | 3678/4166 [04:22<00:34, 14.01it/s]

zero-size array to reduction operation maximum which has no identity

 89%|████████▉ | 3699/4166 [04:22<00:23, 19.63it/s]




100%|██████████| 4166/4166 [04:48<00:00, 14.46it/s]
 99%|█████████▉| 4138/4166 [04:49<00:01, 17.68it/s]

zero-size array to reduction operation maximum which has no identity

 99%|█████████▊| 4107/4166 [04:49<00:04, 14.74it/s]




100%|██████████| 4166/4166 [04:49<00:00, 14.39it/s]
100%|██████████| 4166/4166 [04:50<00:00, 14.34it/s]
100%|██████████| 4166/4166 [04:50<00:00, 14.33it/s]
100%|██████████| 4166/4166 [04:51<00:00, 14.31it/s]
100%|██████████| 4166/4166 [04:51<00:00, 14.27it/s]
100%|██████████| 4/4 [00:00<00:00, 39.03it/s]
  5%|▍         | 206/4166 [00:13<03:59, 16.53it/s]

zero-size array to reduction operation maximum which has no identity


 18%|█▊        | 750/4166 [00:47<03:09, 17.99it/s]

zero-size array to reduction operation maximum which has no identity


 35%|███▍      | 1438/4166 [01:33<04:22, 10.39it/s]

zero-size array to reduction operation maximum which has no identity


100%|██████████| 4166/4166 [04:47<00:00, 14.51it/s]
100%|██████████| 4166/4166 [04:48<00:00, 14.45it/s]
100%|██████████| 4166/4166 [04:48<00:00, 14.44it/s]
100%|██████████| 4166/4166 [04:49<00:00, 14.37it/s]
100%|██████████| 4166/4166 [04:50<00:00, 14.34it/s]
100%|██████████| 4166/4166 [04:50<00:00, 14.33it/s]
100%|██████████| 4/4 [00:00<00:00, 55.53it/s]
 32%|███▏      | 1329/4166 [01:22<02:26, 19.37it/s]

zero-size array to reduction operation maximum which has no identity


 56%|█████▌    | 2321/4166 [02:25<02:56, 10.44it/s]

zero-size array to reduction operation maximum which has no identity

 55%|█████▍    | 2288/4166 [02:25<02:10, 14.38it/s]




 60%|██████    | 2500/4166 [02:38<01:18, 21.11it/s]

zero-size array to reduction operation maximum which has no identity


 94%|█████████▍| 3929/4166 [04:17<00:14, 16.31it/s]

zero-size array to reduction operation maximum which has no identity

 94%|█████████▍| 3925/4166 [04:17<00:12, 18.76it/s]




100%|██████████| 4166/4166 [04:27<00:00, 15.57it/s]
100%|██████████| 4166/4166 [04:28<00:00, 15.51it/s]
100%|██████████| 4166/4166 [04:30<00:00, 15.41it/s]
100%|██████████| 4166/4166 [04:31<00:00, 15.35it/s]
100%|██████████| 4166/4166 [04:31<00:00, 15.34it/s]
100%|██████████| 4166/4166 [04:33<00:00, 15.22it/s]
100%|██████████| 4/4 [00:00<00:00, 27.63it/s]
 35%|███▌      | 1469/4166 [01:34<02:16, 19.69it/s]

zero-size array to reduction operation maximum which has no identity

 35%|███▌      | 1477/4166 [01:34<02:56, 15.21it/s]




 49%|████▉     | 2048/4166 [02:11<02:04, 16.96it/s]

zero-size array to reduction operation maximum which has no identity


 81%|████████  | 3360/4166 [03:43<00:59, 13.44it/s]

zero-size array to reduction operation maximum which has no identity

 83%|████████▎ | 3470/4166 [03:43<00:48, 14.32it/s]




 95%|█████████▍| 3941/4166 [04:08<00:16, 14.05it/s]

zero-size array to reduction operation maximum which has no identity


100%|██████████| 4166/4166 [04:22<00:00, 15.85it/s]
100%|██████████| 4166/4166 [04:24<00:00, 15.73it/s]
100%|██████████| 4166/4166 [04:26<00:00, 15.65it/s]
100%|██████████| 4166/4166 [04:26<00:00, 15.64it/s]
100%|██████████| 4166/4166 [04:28<00:00, 15.52it/s]
100%|██████████| 4166/4166 [04:30<00:00, 15.40it/s]
100%|██████████| 4/4 [00:00<00:00, 36.01it/s]
 19%|█▉        | 809/4166 [00:54<02:47, 20.02it/s]

zero-size array to reduction operation maximum which has no identity

 20%|██        | 843/4166 [00:54<03:45, 14.73it/s]




 46%|████▋     | 1930/4166 [02:08<02:07, 17.56it/s]

zero-size array to reduction operation maximum which has no identity


 52%|█████▏    | 2150/4166 [02:21<02:31, 13.31it/s]

zero-size array to reduction operation maximum which has no identity

 51%|█████▏    | 2144/4166 [02:20<02:42, 12.43it/s]




 60%|██████    | 2505/4166 [02:40<01:29, 18.63it/s]

zero-size array to reduction operation maximum which has no identity


100%|██████████| 4166/4166 [04:29<00:00, 15.44it/s]
100%|██████████| 4166/4166 [04:31<00:00, 15.35it/s]
100%|██████████| 4166/4166 [04:32<00:00, 15.28it/s]
100%|██████████| 4166/4166 [04:34<00:00, 15.18it/s]
100%|██████████| 4166/4166 [04:35<00:00, 15.14it/s]
100%|██████████| 4166/4166 [04:35<00:00, 15.11it/s]
100%|██████████| 4/4 [00:00<00:00, 37.80it/s]
 45%|████▌     | 1887/4166 [01:58<02:34, 14.75it/s]

zero-size array to reduction operation maximum which has no identity

 45%|████▍     | 1871/4166 [01:58<01:57, 19.45it/s]




 59%|█████▊    | 2445/4166 [02:36<02:22, 12.08it/s]

zero-size array to reduction operation maximum which has no identity

 59%|█████▊    | 2439/4166 [02:36<02:04, 13.87it/s]




 92%|█████████▏| 3852/4166 [04:13<00:21, 14.91it/s]

zero-size array to reduction operation maximum which has no identity

 93%|█████████▎| 3884/4166 [04:13<00:21, 13.13it/s]




100%|██████████| 4166/4166 [04:30<00:00, 15.41it/s]
100%|██████████| 4166/4166 [04:31<00:00, 15.36it/s]
100%|██████████| 4166/4166 [04:31<00:00, 15.35it/s]
100%|██████████| 4166/4166 [04:32<00:00, 15.31it/s]
100%|██████████| 4166/4166 [04:33<00:00, 15.26it/s]
100%|██████████| 4166/4166 [04:33<00:00, 15.23it/s]
100%|██████████| 4/4 [00:00<00:00, 36.77it/s]
 17%|█▋        | 529/3092 [00:32<03:05, 13.80it/s]

zero-size array to reduction operation maximum which has no identity


100%|██████████| 3092/3092 [03:15<00:00, 15.83it/s]
100%|██████████| 3092/3092 [03:20<00:00, 15.45it/s]
100%|██████████| 3092/3092 [03:20<00:00, 15.42it/s]
100%|██████████| 3092/3092 [03:20<00:00, 15.41it/s]
100%|██████████| 3092/3092 [03:21<00:00, 15.34it/s]
100%|██████████| 3092/3092 [03:21<00:00, 15.33it/s]
100%|██████████| 3/3 [00:00<00:00, 27.85it/s]


In [16]:
from malaya_speech.utils import tf_featurization

# Build the featurizer that `preprocess_inputs` uses to turn waveforms into
# features, from malaya_speech's transducer featurizer config.
config = malaya_speech.config.transducer_featurizer_config
featurizer = tf_featurization.STTFeaturizer(**config)

In [17]:
# Width of each feature row produced by `preprocess_inputs`.
n_mels = 80

def preprocess_inputs(example):
    """Add an 'inputs' entry computed from example['waveforms'].

    The waveform is passed through `featurizer.vectorize` and the result is
    reshaped to (-1, n_mels). The dict is modified in place and returned.
    """
    vectorized = featurizer.vectorize(example['waveforms'])
    example['inputs'] = tf.reshape(vectorized, (-1, n_mels))
    return example

def parse(serialized_example):
    """Decode one serialized example and attach its 'inputs' features.

    Parses 'waveforms' (float32), 'targets' and 'targets_length' (int64) as
    variable-length features, replaces each parsed sparse tensor with its
    `.values`, runs `preprocess_inputs`, and returns a dict restricted to the
    keys 'waveforms', 'inputs', 'targets' and 'targets_length'.
    """
    schema = {
        'waveforms': tf.compat.v1.VarLenFeature(tf.float32),
        'targets': tf.compat.v1.VarLenFeature(tf.int64),
        'targets_length': tf.compat.v1.VarLenFeature(tf.int64),
    }
    parsed = tf.compat.v1.parse_single_example(
        serialized_example, features = schema
    )
    # Keep only the dense values of every sparse tensor.
    for name in parsed.keys():
        parsed[name] = parsed[name].values

    parsed = preprocess_inputs(parsed)

    wanted = ['waveforms', 'inputs', 'targets', 'targets_length']
    unwanted = [name for name in parsed.keys() if name not in wanted]
    for name in unwanted:
        parsed.pop(name, None)

    return parsed

def get_dataset(files, batch_size = 2, shuffle_size = 32, thread_count = 24):
    """Return a zero-argument factory for the TFRecord input pipeline.

    Args:
        files: TFRecord file path(s) passed to `tf.data.TFRecordDataset`.
        batch_size: accepted but not used; the dataset is not batched.
        shuffle_size: shuffle buffer size.
        thread_count: `num_parallel_calls` for the `parse` map.

    Returns:
        A function that, when called, builds a dataset which shuffles the
        serialized records, maps `parse` over them and repeats indefinitely.
    """
    def get():
        return (
            tf.data.TFRecordDataset(files)
            .shuffle(shuffle_size)
            .map(parse, num_parallel_calls = thread_count)
            .repeat()
        )

    return get

In [18]:
# Sanity check: read the uploaded TFRecords back from the bucket and iterate
# over parsed examples as NumPy values.
files = tf.io.gfile.glob('gs://mesolitica-tpu-general/imda/part6-call-centre-2/*.tfrecord')
d = get_dataset(files)()
d = d.as_numpy_iterator()

In [20]:
# Pull one parsed example to inspect its keys and array contents.
next(d)

{'targets': array([459, 617, 879]),
 'targets_length': array([3]),
 'waveforms': array([ 0.        ,  0.00056001, -0.00448012, ...,  0.01456039,
         0.02090722,  0.02072055], dtype=float32),
 'inputs': array([[-2.8121428 , -2.6286774 , -2.4116654 , ..., -0.64693004,
         -0.76647955, -1.0613824 ],
        [-2.8782306 , -2.5453875 , -2.2406893 , ..., -0.86372626,
         -0.95642406, -0.97118413],
        [-2.7792926 , -2.6496296 , -2.479748  , ..., -0.7164855 ,
         -0.81425023, -0.67038786],
        ...,
        [-3.6157432 , -3.6185439 , -3.6405878 , ..., -0.50747216,
         -0.71748173, -0.9155589 ],
        [-3.2117019 , -2.8298361 , -2.5052295 , ..., -0.69619983,
         -1.0000577 , -0.77986073],
        [-2.614395  , -2.4328983 , -2.217421  , ..., -0.7227667 ,
         -1.086185  , -0.95514673]], dtype=float32)}

In [21]:
# Delete the local copy of the corpus directory.
# NOTE: destructive, and the earlier cells that read 'call-centre-2/' cannot be
# re-run after this.
!rm -rf call-centre-2