In [2]:
import os
# Point the Google Cloud client libraries at the service-account key file.
# The path is relative to the notebook's working directory and must be set
# before `storage.Client()` is constructed in the TFRecord-writing cell.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'mesolitica-tpu.json'

In [3]:
from glob import glob
import tensorflow as tf
from tqdm import tqdm
import malaya_speech
from malaya_speech.utils import subword
import numpy as np
import mp
from google.cloud import storage

  'Cannot import beam_search_ops from Tensorflow Addons, `deep_model` for stemmer will not available to use, make sure Tensorflow Addons version >= 0.12.0'


In [5]:
# Collect every WAV file under the call-centre recordings directory; the
# cell's displayed value is the number of files found.
singlish = glob('call-centre-1/wav/*.wav')
len(singlish)

388081

In [6]:
# Load the subword vocabulary that `subword.encode` uses further down to turn
# cleaned transcripts into integer target ids.
subwords = subword.load('transducer-singlish.subword')

In [7]:
import unicodedata
import re
import itertools

vocabs = [" ", "a", "e", "n", "i", "t", "o", "u", "s", "k", "r", "l", "h", "d", "m", "g", "y", "b", "p", "w", "c", "f", "j", "v", "z", "0", "1", "x", "2", "q", "5", "3", "4", "6", "9", "8", "7"]

def preprocessing_text(string):
    """Normalise a transcript down to the characters listed in `vocabs`.

    Steps, in order: lowercase and NFC-normalise, delete apostrophes, replace
    every character that is not in `vocabs` with a space, squeeze runs of
    spaces and trim the ends, then cap every run of one repeated character at
    two occurrences.

    Parameters
    ----------
    string : str
        Raw transcript text.

    Returns
    -------
    str
        The cleaned transcript (possibly empty).
    """
    lowered = unicodedata.normalize('NFC', string.lower())
    without_apostrophes = lowered.replace('\'', '')

    # Anything outside the vocabulary acts as a word separator.
    mapped = []
    for ch in without_apostrophes:
        mapped.append(ch if ch in vocabs else ' ')
    squeezed = re.sub(r'[ ]+', ' ', ''.join(mapped)).strip()

    # Keep at most two characters from each run of identical characters
    # (e.g. "ooo" -> "oo"); this applies to digits as well as letters.
    capped = []
    for _, run in itertools.groupby(squeezed):
        capped.append(''.join(run)[:2])
    return ''.join(capped)

In [8]:
def get_after_mandarin(word):
    """Strip an opening ``<mandarin>`` tag from a token.

    For a token containing ``<mandarin>``, the text following the first ``>``
    is taken; if that text contains a ``:`` the segment after the first colon
    is kept (presumably the romanised form -- confirm against the corpus
    annotation guide), and anything from ``</`` onwards is discarded. Tokens
    without the tag are returned unchanged.

    A tagged token with no colon (including a bare ``<mandarin>``) used to
    raise ``IndexError``; it now yields the tag's inner text instead, which
    may be the empty string.
    """
    if '<mandarin>' in word:
        inner = word.split('>')[1]
        parts = inner.split(':')
        # Keep the original choice (second colon-separated piece) when there
        # is one; otherwise fall back to the whole inner text.
        w = parts[1] if len(parts) > 1 else parts[0]
        return w.split('</')[0]
    else:
        return word

def get_before_mandarin(word):
    """Strip a closing ``</mandarin>`` tag from a token.

    Returns the text before the first ``</`` when the token contains
    ``</mandarin>`` (empty string for a bare closing tag); other tokens are
    returned unchanged.
    """
    if '</mandarin>' in word:
        return word.split('</')[0]
    else:
        return word

def replace_paralinguistic(string, replaces = ['(ppb)', '(ppc)', '(ppl)', '(ppo)', '<UNK>', '<MANDARIN>']):
    """Remove paralinguistic markers and annotation tags from a transcript.

    Parameters
    ----------
    string : str
        Raw transcript line.
    replaces : list of str
        Literal markers that are blanked out before tokenising. The default
        list is only read, never mutated.

    Returns
    -------
    str
        Whitespace-joined tokens with ``<mandarin>`` tags unwrapped and any
        token that starts with ``<``, ``[`` or ``(`` or ends with ``>``, ``]``
        or ``)`` removed.
    """
    for r in replaces:
        string = string.replace(r, ' ')
    words = [get_before_mandarin(get_after_mandarin(w)) for w in string.split()]
    # Tag stripping can leave empty tokens (e.g. a stand-alone `</mandarin>`);
    # drop them before indexing, otherwise `w[0]` raises IndexError and the
    # caller silently discards the whole utterance.
    kept = [w for w in words if w and w[0] not in '<[(' and w[-1] not in '>])']
    return ' '.join(kept)

In [9]:
def loop(files):
    """Pair each WAV path with its cleaned transcript.

    Parameters
    ----------
    files : tuple
        ``(paths, index)``. `paths` is an iterable of WAV file paths; `index`
        identifies the shard and is only used in the summary message.
        Presumably this is the shape `mp.multiprocessing` hands to each
        worker -- confirm against that helper.

    Returns
    -------
    list of (str, str)
        ``(wav_path, cleaned_text)`` for every file whose transcript is
        readable, is not a single ``<...>`` tag, and is non-empty after
        cleaning.

    Notes
    -----
    Processing is best-effort: a failure on one file never aborts the shard.
    Failures are counted and reported once per call instead of being
    discarded silently, so unexpected data loss is visible.
    """
    files, index = files
    results = []
    skipped = 0
    first_error = None
    for i in tqdm(files):
        try:
            # The transcript lives next to the audio: swap the `/wav`
            # directory for `/text` and append `.txt` to the full file name.
            p = i.replace('/wav','/text')
            with open(f'{p}.txt') as fopen:
                text = fopen.read()
            if len(text) < 2:
                continue
            # A transcript that is a single `<...>` tag carries no speech.
            if text[0] == '<' and text[-1] == '>':
                continue
            text = replace_paralinguistic(text)
            text = preprocessing_text(text)
            if len(text):
                results.append((i, text))
        except Exception as e:
            skipped += 1
            if first_error is None:
                first_error = (i, repr(e))
    if skipped:
        # One line per shard keeps the notebook output readable.
        print(f'shard {index}: skipped {skipped} file(s) due to errors, first: {first_error}')
    return results

In [10]:
# Smoke-test the transcript cleaner on the first ten files before running it
# over the whole corpus.
loop((singlish[:10], 0))

100%|██████████| 10/10 [00:00<00:00, 558.01it/s]


[('call-centre-1/wav/app_0489_0009_phnd_cc-hot-0-0.wav',
  'one two three hotel'),
 ('call-centre-1/wav/app_0169_3338_phnd_cc-hot-0-37.wav', 'yes i am'),
 ('call-centre-1/wav/app_0388_3776_phnd_cc-hol-0-132.wav',
  'no qualms about that its only this'),
 ('call-centre-1/wav/app_0523_0018_phnd_cc-res-0-109.wav',
  'okay sure i will put put it in as well'),
 ('call-centre-1/wav/app_0155_0001_phnd_cc-hot-0-52.wav',
  'okay move on to negative feedback and in'),
 ('call-centre-1/wav/app_0547_0001_phnd_cc-hol-0-70.wav', 'ya'),
 ('call-centre-1/wav/app_0638_4276_phnd_cc-hol-0-65.wav',
  'and my in and then seems to be not interested in my enq enquiry')]

In [11]:
# Clean every transcript in parallel.
# NOTE(review): `mp` is a local helper that is not shown here; presumably it
# splits the list across `cores` workers, calls `loop((chunk, index))` in each
# and concatenates the returned lists -- confirm.
singlishs = mp.multiprocessing(singlish, loop, cores = 12)

100%|██████████| 32340/32340 [06:34<00:00, 81.93it/s]  
100%|██████████| 1/1 [00:00<00:00, 12.39it/s]6.57it/s]
100%|██████████| 32340/32340 [06:35<00:00, 81.85it/s] 
100%|██████████| 32340/32340 [06:35<00:00, 81.81it/s] 
100%|██████████| 32340/32340 [06:35<00:00, 81.77it/s] 
100%|██████████| 32340/32340 [06:35<00:00, 81.84it/s] 
100%|██████████| 32340/32340 [06:36<00:00, 81.65it/s] 
100%|██████████| 32340/32340 [06:36<00:00, 81.65it/s] 
100%|██████████| 32340/32340 [06:36<00:00, 81.55it/s] 
100%|██████████| 32340/32340 [06:36<00:00, 81.57it/s] 
100%|██████████| 32340/32340 [06:36<00:00, 81.51it/s] 
100%|██████████| 32340/32340 [06:37<00:00, 81.41it/s] ]
100%|██████████| 32340/32340 [06:37<00:00, 81.41it/s]  


In [12]:
# Number of (wav_path, transcript) pairs that survived cleaning.
len(singlishs)

238343

In [13]:
import six

def to_example(dictionary):
    """Helper: build tf.Example from (string -> int/float/str list) dictionary.

    Parameters
    ----------
    dictionary : dict
        Maps feature names to non-empty sequences (or `map` objects) whose
        elements are all ints, floats, strings or bytes. The type of the
        first element decides the feature kind.

    Returns
    -------
    tf.train.Example

    Raises
    ------
    ValueError
        If a field is empty, or its first element is of an unsupported type.
    """
    features = {}
    for k, v in dictionary.items():
        # `map` objects are always truthy and not subscriptable, so they must
        # be materialised *before* the emptiness check and the `v[0]` probe;
        # otherwise an empty map slips through and fails with IndexError.
        if isinstance(v, map):
            v = list(v)
        if not v:
            raise ValueError('Empty generated field: %s' % str((k, v)))
        first = v[0]
        if isinstance(first, (int, np.integer)):
            features[k] = tf.train.Feature(
                int64_list=tf.train.Int64List(value=v)
            )
        elif isinstance(first, (float, np.floating)):
            # NumPy floating scalars are accepted alongside Python floats,
            # mirroring the integer branch above.
            features[k] = tf.train.Feature(
                float_list=tf.train.FloatList(value=v)
            )
        elif isinstance(first, str):
            # BytesList needs bytes; encode text as UTF-8.
            v = [bytes(x, 'utf-8') for x in v]
            features[k] = tf.train.Feature(
                bytes_list=tf.train.BytesList(value=v)
            )
        elif isinstance(first, bytes):
            features[k] = tf.train.Feature(
                bytes_list=tf.train.BytesList(value=v)
            )
        else:
            raise ValueError(
                'Value for %s is not a recognized type; v: %s type: %s'
                % (k, str(first), str(type(first)))
            )
    return tf.train.Example(features=tf.train.Features(feature=features))

In [14]:
# Sample rate used to convert a waveform's sample count into seconds
# (assumes `malaya_speech.load` returns audio at this rate -- TODO confirm).
sr = 16000
# Maximum clip duration in seconds; longer clips are skipped when writing.
maxlen = 18
# Minimum transcript length in characters; shorter transcripts are skipped.
minlen_text = 1
# Batch counter embedded in shard file names (`{index}-{global_count}.tfrecord`)
# and incremented after each batch by the driver loop.
global_count = 0

In [15]:
def loop(files):
    """Write one TFRecord shard and upload it to Google Cloud Storage.

    This definition replaces the transcript-cleaning `loop` defined earlier
    in the notebook.

    Parameters
    ----------
    files : tuple
        ``(items, index)``. `items` is an iterable of
        ``(wav_path, transcript)`` pairs; `index` is the shard index used in
        the output file name together with the module-level `global_count`.

    Returns
    -------
    None

    Notes
    -----
    Utterances whose transcript is shorter than `minlen_text` or whose audio
    is longer than `maxlen` seconds are skipped. A failure on a single
    utterance is reported (with its path) and does not abort the shard.
    The local shard file is removed afterwards even if the upload fails.
    """
    client = storage.Client()
    bucket = client.bucket('mesolitica-tpu-general')
    files, index = files
    output_file = f'{index}-{global_count}.tfrecord'
    try:
        # The context manager guarantees the writer is flushed and closed
        # before the upload, including on an unexpected exit.
        with tf.io.TFRecordWriter(output_file) as writer:
            for s in tqdm(files):
                try:
                    if len(s[1]) < minlen_text:
                        continue
                    y, _ = malaya_speech.load(s[0])
                    if (len(y) / sr) > maxlen:
                        continue
                    t = subword.encode(subwords, s[1], add_blank=False)
                    example = to_example({'waveforms': y.tolist(),
                                          'targets': t,
                                          'targets_length': [len(t)]})
                    writer.write(example.SerializeToString())
                except Exception as e:
                    # Best-effort per utterance; name the file so the bad
                    # input can be found later.
                    print(f'{s[0]}: {e}')
        blob = bucket.blob(f'imda/part6-call-centre-1/{output_file}')
        blob.upload_from_filename(output_file)
    finally:
        # Remove the temporary shard without going through a shell.
        if os.path.exists(output_file):
            os.remove(output_file)

In [16]:
# Smoke-test the TFRecord writer on ten utterances.
# NOTE(review): with index 0 and `global_count` still 0 this uploads
# `0-0.tfrecord` to the bucket; the first real batch presumably overwrites it
# under the same name -- confirm.
loop((singlishs[:10], 0))

100%|██████████| 10/10 [00:00<00:00, 17.31it/s]


In [17]:
# Write the dataset in batches of `batch_size` utterances, each batch handed
# to `mp.multiprocessing` with 6 cores and no returned values.
batch_size = 25000
for i in range(0, len(singlishs), batch_size):
    batch = singlishs[i: i + batch_size]
    mp.multiprocessing(batch, loop, cores = 6, returned = False)
    # Bump the counter that `loop` embeds in shard file names so the next
    # batch does not reuse them.
    # NOTE(review): workers read `global_count` as a module-level global,
    # which presumably relies on fork-style process start -- confirm.
    # Re-running this cell continues from the previous value instead of
    # restarting at 0.
    global_count += 1

  2%|▏         | 82/4166 [00:05<03:59, 17.04it/s]

zero-size array to reduction operation maximum which has no identity


  6%|▌         | 233/4166 [00:16<03:45, 17.42it/s]

zero-size array to reduction operation maximum which has no identity

  5%|▌         | 217/4166 [00:16<05:54, 11.14it/s]




 12%|█▏        | 499/4166 [00:34<04:13, 14.49it/s]

zero-size array to reduction operation maximum which has no identity


 15%|█▌        | 634/4166 [00:44<03:18, 17.79it/s]

zero-size array to reduction operation maximum which has no identity

 16%|█▌        | 648/4166 [00:44<03:48, 15.42it/s]




 17%|█▋        | 688/4166 [00:47<03:08, 18.47it/s]

zero-size array to reduction operation maximum which has no identity


 17%|█▋        | 717/4166 [00:49<03:40, 15.61it/s]

zero-size array to reduction operation maximum which has no identity


 21%|██▏       | 889/4166 [01:00<04:30, 12.13it/s]

zero-size array to reduction operation maximum which has no identity


 27%|██▋       | 1130/4166 [01:17<03:20, 15.15it/s]

zero-size array to reduction operation maximum which has no identity


 40%|███▉      | 1653/4166 [01:53<04:43,  8.86it/s]

zero-size array to reduction operation maximum which has no identity


 55%|█████▍    | 2290/4166 [02:38<02:15, 13.85it/s]

zero-size array to reduction operation maximum which has no identity

 55%|█████▍    | 2271/4166 [02:38<02:43, 11.61it/s]




 69%|██████▊   | 2861/4166 [03:18<01:34, 13.87it/s]

zero-size array to reduction operation maximum which has no identity


 73%|███████▎  | 3052/4166 [03:29<01:00, 18.29it/s]

zero-size array to reduction operation maximum which has no identity


 75%|███████▍  | 3118/4166 [03:34<01:14, 14.13it/s]

zero-size array to reduction operation maximum which has no identity


 78%|███████▊  | 3234/4166 [03:44<00:56, 16.46it/s]

zero-size array to reduction operation maximum which has no identity

 77%|███████▋  | 3193/4166 [03:44<01:07, 14.36it/s]




 85%|████████▌ | 3546/4166 [04:03<00:37, 16.43it/s]

zero-size array to reduction operation maximum which has no identity

 85%|████████▌ | 3546/4166 [04:03<00:49, 12.59it/s]




 90%|████████▉ | 3742/4166 [04:16<00:26, 15.73it/s]

zero-size array to reduction operation maximum which has no identity

 90%|████████▉ | 3732/4166 [04:16<00:34, 12.66it/s]




 90%|████████▉ | 3749/4166 [04:22<00:34, 12.08it/s]

zero-size array to reduction operation maximum which has no identity


100%|██████████| 4166/4166 [04:43<00:00, 14.70it/s]
100%|██████████| 4166/4166 [04:43<00:00, 14.67it/s]
100%|██████████| 4166/4166 [04:44<00:00, 14.66it/s]
100%|██████████| 4166/4166 [04:44<00:00, 14.65it/s]
100%|██████████| 4166/4166 [04:45<00:00, 14.60it/s]
100%|██████████| 4166/4166 [04:47<00:00, 14.48it/s]
100%|██████████| 4/4 [00:00<00:00, 31.72it/s]
  7%|▋         | 292/4166 [00:20<04:07, 15.63it/s]

zero-size array to reduction operation maximum which has no identity


  9%|▉         | 395/4166 [00:28<04:53, 12.85it/s]

zero-size array to reduction operation maximum which has no identity

 10%|▉         | 416/4166 [00:28<03:02, 20.60it/s]




 11%|█         | 453/4166 [00:30<03:28, 17.78it/s]

zero-size array to reduction operation maximum which has no identity


 28%|██▊       | 1148/4166 [01:17<03:19, 15.11it/s]

zero-size array to reduction operation maximum which has no identity


 29%|██▉       | 1228/4166 [01:22<02:52, 17.00it/s]

zero-size array to reduction operation maximum which has no identity


 30%|███       | 1250/4166 [01:23<02:26, 19.85it/s]

zero-size array to reduction operation maximum which has no identity


 35%|███▌      | 1467/4166 [01:39<03:07, 14.38it/s]

zero-size array to reduction operation maximum which has no identity

 35%|███▍      | 1448/4166 [01:39<03:42, 12.23it/s]




 52%|█████▏    | 2164/4166 [02:26<02:27, 13.58it/s]

zero-size array to reduction operation maximum which has no identity

 52%|█████▏    | 2180/4166 [02:26<01:42, 19.35it/s]




 59%|█████▉    | 2465/4166 [02:46<01:44, 16.34it/s]

zero-size array to reduction operation maximum which has no identity


 63%|██████▎   | 2628/4166 [02:57<01:39, 15.44it/s]

zero-size array to reduction operation maximum which has no identity

 63%|██████▎   | 2643/4166 [02:57<01:34, 16.04it/s]




 74%|███████▎  | 3067/4166 [03:32<00:47, 22.94it/s]

zero-size array to reduction operation maximum which has no identity

 73%|███████▎  | 3026/4166 [03:32<01:21, 13.94it/s]




 83%|████████▎ | 3474/4166 [04:01<01:01, 11.27it/s]

zero-size array to reduction operation maximum which has no identity


100%|██████████| 4/4 [00:00<00:00, 27.68it/s]8it/s]
  4%|▎         | 155/4166 [00:10<04:29, 14.86it/s]

zero-size array to reduction operation maximum which has no identity


 16%|█▌        | 676/4166 [00:44<03:46, 15.40it/s]

zero-size array to reduction operation maximum which has no identity

 16%|█▌        | 660/4166 [00:44<05:04, 11.50it/s]




 17%|█▋        | 721/4166 [00:48<03:27, 16.60it/s]

zero-size array to reduction operation maximum which has no identity


 21%|██        | 881/4166 [00:58<02:49, 19.42it/s]

zero-size array to reduction operation maximum which has no identity

 20%|██        | 845/4166 [00:57<03:35, 15.43it/s]




 22%|██▏       | 903/4166 [00:58<02:30, 21.71it/s]

zero-size array to reduction operation maximum which has no identity


 26%|██▌       | 1077/4166 [01:13<03:50, 13.38it/s]

zero-size array to reduction operation maximum which has no identity


 27%|██▋       | 1116/4166 [01:14<02:16, 22.34it/s]

zero-size array to reduction operation maximum which has no identity

 27%|██▋       | 1141/4166 [01:14<02:55, 17.28it/s]




 28%|██▊       | 1176/4166 [01:20<02:53, 17.21it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 74%|███████▍  | 3080/4166 [03:29<01:07, 16.11it/s]

zero-size array to reduction operation maximum which has no identity


 76%|███████▋  | 3180/4166 [03:33<01:12, 13.63it/s]

zero-size array to reduction operation maximum which has no identity


 91%|█████████ | 3788/4166 [04:14<00:21, 17.26it/s]

zero-size array to reduction operation maximum which has no identity


 94%|█████████▍| 3935/4166 [04:25<00:18, 12.76it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 31%|███       | 1300/4166 [01:24<03:36, 13.22it/s]

zero-size array to reduction operation maximum which has no identity


 31%|███       | 1300/4166 [01:25<03:09, 15.09it/s]

zero-size array to reduction operation maximum which has no identity


 34%|███▍      | 1421/4166 [01:33<02:53, 15.81it/s]

zero-size array to reduction operation maximum which has no identity


 37%|███▋      | 1538/4166 [01:42<03:06, 14.13it/s]

zero-size array to reduction operation maximum which has no identity


 57%|█████▋    | 2365/4166 [02:36<02:16, 13.23it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

100%|██████████| 4166/4166 [04:31<00:00, 15.35it/s]
100%|██████████| 4166/4166 [04:34<00:00, 15.16it/s]
100%|██████████| 4166/4166 [04:34<00:00, 15.15it/s]
100%|██████████| 4166/4166 [04:36<00:00, 15.09it/s]
100%|██████████| 4166/4166 [04:36<00:00, 15.05it/s]
100%|██████████| 4166/4166 [04:37<00:00, 15.03it/s]
100%|██████████| 4/4 [00:00<00:00, 21.79it/s]
  5%|▌         | 213/4166 [00:14<05:12, 12.65it/s]

zero-size array to reduction operation maximum which has no identity

  5%|▌         | 221/4166 [00:14<04:34, 14.36it/s]




  8%|▊         | 316/4166 [00:20<04:45, 13.49it/s]

zero-size array to reduction operation maximum which has no identity


  8%|▊         | 318/4166 [00:21<03:42, 17.27it/s]

zero-size array to reduction operation maximum which has no identity


 18%|█▊        | 753/4166 [00:49<04:20, 13.12it/s]

zero-size array to reduction operation maximum which has no identity

 17%|█▋        | 716/4166 [00:49<03:39, 15.74it/s]




 24%|██▍       | 993/4166 [01:08<03:47, 13.92it/s]]

zero-size array to reduction operation maximum which has no identity

 25%|██▍       | 1026/4166 [01:08<03:34, 14.66it/s]




 34%|███▍      | 1410/4166 [01:33<03:36, 12.74it/s]

zero-size array to reduction operation maximum which has no identity


 39%|███▊      | 1614/4166 [01:49<02:36, 16.32it/s]

zero-size array to reduction operation maximum which has no identity


 41%|████▏     | 1726/4166 [01:54<02:46, 14.68it/s]

zero-size array to reduction operation maximum which has no identity


 40%|████      | 1687/4166 [01:56<02:48, 14.74it/s]

zero-size array to reduction operation maximum which has no identity


 42%|████▏     | 1743/4166 [01:58<03:54, 10.35it/s]

zero-size array to reduction operation maximum which has no identity


 46%|████▌     | 1923/4166 [02:12<03:08, 11.89it/s]

zero-size array to reduction operation maximum which has no identity

 49%|████▉     | 2033/4166 [02:12<01:56, 18.27it/s]




 59%|█████▊    | 2442/4166 [02:45<01:37, 17.63it/s]

zero-size array to reduction operation maximum which has no identity


 60%|██████    | 2506/4166 [02:48<02:51,  9.66it/s]

zero-size array to reduction operation maximum which has no identity


 66%|██████▋   | 2770/4166 [03:12<01:31, 15.31it/s]

zero-size array to reduction operation maximum which has no identity


 87%|████████▋ | 3630/4166 [04:12<00:33, 16.07it/s]

zero-size array to reduction operation maximum which has no identity


 86%|████████▌ | 3578/4166 [04:12<00:48, 12.24it/s]

zero-size array to reduction operation maximum which has no identity


 87%|████████▋ | 3611/4166 [04:14<00:37, 14.61it/s]

zero-size array to reduction operation maximum which has no identity


 88%|████████▊ | 3686/4166 [04:19<00:29, 16.52it/s]

zero-size array to reduction operation maximum which has no identity


 93%|█████████▎| 3884/4166 [04:31<00:20, 13.70it/s]

zero-size array to reduction operation maximum which has no identity

 95%|█████████▍| 3952/4166 [04:31<00:13, 15.70it/s]




 93%|█████████▎| 3866/4166 [04:32<00:17, 17.25it/s]

zero-size array to reduction operation maximum which has no identity


100%|██████████| 4166/4166 [04:43<00:00, 14.69it/s]
100%|██████████| 4166/4166 [04:45<00:00, 14.61it/s]
100%|██████████| 4166/4166 [04:46<00:00, 14.56it/s]
100%|██████████| 4166/4166 [04:47<00:00, 14.51it/s]
100%|██████████| 4166/4166 [04:48<00:00, 14.42it/s]
100%|██████████| 4166/4166 [04:49<00:00, 14.40it/s]
100%|██████████| 4/4 [00:00<00:00, 21.94it/s]
  2%|▏         | 68/4166 [00:04<04:30, 15.17it/s]

zero-size array to reduction operation maximum which has no identity

  1%|▏         | 58/4166 [00:04<05:09, 13.27it/s]




  4%|▍         | 181/4166 [00:12<03:21, 19.75it/s]

zero-size array to reduction operation maximum which has no identity


  8%|▊         | 315/4166 [00:19<03:18, 19.43it/s]

zero-size array to reduction operation maximum which has no identity


 18%|█▊        | 768/4166 [00:52<04:44, 11.93it/s]

zero-size array to reduction operation maximum which has no identity


 38%|███▊      | 1568/4166 [01:47<02:42, 15.98it/s]

zero-size array to reduction operation maximum which has no identity


 52%|█████▏    | 2164/4166 [02:26<02:31, 13.23it/s]

zero-size array to reduction operation maximum which has no identity

 51%|█████     | 2107/4166 [02:26<01:42, 20.01it/s]




 58%|█████▊    | 2433/4166 [02:44<01:46, 16.29it/s]

zero-size array to reduction operation maximum which has no identity

 57%|█████▋    | 2381/4166 [02:44<01:57, 15.17it/s]




 63%|██████▎   | 2619/4166 [03:00<02:19, 11.08it/s]

zero-size array to reduction operation maximum which has no identity


 68%|██████▊   | 2828/4166 [03:14<01:49, 12.26it/s]

zero-size array to reduction operation maximum which has no identity


 84%|████████▎ | 3485/4166 [03:57<00:47, 14.27it/s]

zero-size array to reduction operation maximum which has no identity

 83%|████████▎ | 3466/4166 [03:57<01:13,  9.58it/s]




 88%|████████▊ | 3661/4166 [04:11<00:40, 12.33it/s]

zero-size array to reduction operation maximum which has no identity

 89%|████████▉ | 3728/4166 [04:11<00:32, 13.46it/s]




100%|██████████| 4166/4166 [04:40<00:00, 14.86it/s]
100%|██████████| 4166/4166 [04:41<00:00, 14.79it/s]
100%|██████████| 4166/4166 [04:41<00:00, 14.79it/s]
100%|██████████| 4166/4166 [04:43<00:00, 14.67it/s]
100%|██████████| 4166/4166 [04:43<00:00, 14.67it/s]
100%|██████████| 4166/4166 [04:44<00:00, 14.66it/s]
100%|██████████| 4/4 [00:00<00:00, 36.84it/s]
 15%|█▌        | 629/4166 [00:41<04:47, 12.31it/s]

zero-size array to reduction operation maximum which has no identity


 17%|█▋        | 695/4166 [00:48<03:54, 14.79it/s]

zero-size array to reduction operation maximum which has no identity


 35%|███▍      | 1454/4166 [01:37<03:14, 13.97it/s]

zero-size array to reduction operation maximum which has no identity

 35%|███▌      | 1471/4166 [01:37<02:28, 18.12it/s]




 48%|████▊     | 2000/4166 [02:12<02:32, 14.19it/s]

zero-size array to reduction operation maximum which has no identity


 53%|█████▎    | 2215/4166 [02:27<02:18, 14.04it/s]

zero-size array to reduction operation maximum which has no identity

 54%|█████▎    | 2238/4166 [02:27<01:52, 17.13it/s]




 63%|██████▎   | 2620/4166 [02:56<01:22, 18.83it/s]

zero-size array to reduction operation maximum which has no identity


 79%|███████▉  | 3290/4166 [03:38<00:40, 21.64it/s]

zero-size array to reduction operation maximum which has no identity

 78%|███████▊  | 3239/4166 [03:38<01:11, 12.95it/s]




 78%|███████▊  | 3248/4166 [03:38<01:09, 13.29it/s]

zero-size array to reduction operation maximum which has no identity


 82%|████████▏ | 3428/4166 [03:50<00:47, 15.62it/s]

zero-size array to reduction operation maximum which has no identity


 87%|████████▋ | 3640/4166 [04:04<00:36, 14.39it/s]

zero-size array to reduction operation maximum which has no identity

 88%|████████▊ | 3682/4166 [04:04<00:43, 11.15it/s]




 88%|████████▊ | 3686/4166 [04:05<00:31, 15.29it/s]

zero-size array to reduction operation maximum which has no identity


 89%|████████▉ | 3707/4166 [04:10<00:35, 12.89it/s]

zero-size array to reduction operation maximum which has no identity

 90%|█████████ | 3753/4166 [04:10<00:24, 16.77it/s]




 94%|█████████▍| 3910/4166 [04:20<00:12, 21.09it/s]

zero-size array to reduction operation maximum which has no identity

 94%|█████████▎| 3901/4166 [04:20<00:17, 15.02it/s]




100%|██████████| 4166/4166 [04:36<00:00, 15.05it/s]
100%|██████████| 4166/4166 [04:38<00:00, 14.97it/s]
100%|██████████| 4166/4166 [04:39<00:00, 14.92it/s]
100%|██████████| 4166/4166 [04:39<00:00, 14.92it/s]
100%|██████████| 4166/4166 [04:39<00:00, 14.89it/s]
100%|██████████| 4166/4166 [04:39<00:00, 14.89it/s]
100%|██████████| 4/4 [00:00<00:00, 41.88it/s]
 17%|█▋        | 688/4166 [00:46<03:38, 15.91it/s]

zero-size array to reduction operation maximum which has no identity


 20%|█▉        | 821/4166 [00:55<04:54, 11.37it/s]

zero-size array to reduction operation maximum which has no identity


 21%|██▏       | 892/4166 [01:00<04:06, 13.30it/s]

zero-size array to reduction operation maximum which has no identity

 21%|██        | 878/4166 [01:00<03:59, 13.70it/s]




 43%|████▎     | 1801/4166 [01:59<02:50, 13.89it/s]

zero-size array to reduction operation maximum which has no identity


 43%|████▎     | 1791/4166 [02:00<02:24, 16.48it/s]

zero-size array to reduction operation maximum which has no identity


 51%|█████     | 2133/4166 [02:22<02:22, 14.29it/s]

zero-size array to reduction operation maximum which has no identity


 52%|█████▏    | 2161/4166 [02:25<05:06,  6.54it/s]

zero-size array to reduction operation maximum which has no identity


 59%|█████▉    | 2448/4166 [02:52<02:10, 13.12it/s]

zero-size array to reduction operation maximum which has no identity

 60%|██████    | 2509/4166 [02:52<01:51, 14.85it/s]




 95%|█████████▌| 3968/4166 [04:35<00:11, 17.48it/s]

zero-size array to reduction operation maximum which has no identity


100%|██████████| 4166/4166 [04:44<00:00, 14.64it/s]
100%|██████████| 4166/4166 [04:48<00:00, 14.44it/s]
100%|██████████| 4166/4166 [04:48<00:00, 14.43it/s]
100%|██████████| 4166/4166 [04:49<00:00, 14.41it/s]
100%|██████████| 4166/4166 [04:49<00:00, 14.40it/s]
100%|██████████| 4166/4166 [04:50<00:00, 14.32it/s]
100%|██████████| 4/4 [00:00<00:00, 24.74it/s]
  9%|▊         | 360/4166 [00:25<03:58, 15.97it/s]

zero-size array to reduction operation maximum which has no identity

  9%|▊         | 357/4166 [00:25<05:21, 11.87it/s]




  9%|▉         | 366/4166 [00:26<05:07, 12.37it/s]

zero-size array to reduction operation maximum which has no identity


 11%|█         | 446/4166 [00:30<03:53, 15.90it/s]

zero-size array to reduction operation maximum which has no identity

 11%|█         | 446/4166 [00:30<04:48, 12.88it/s]




 14%|█▍        | 586/4166 [00:39<08:33,  6.97it/s]

zero-size array to reduction operation maximum which has no identity

 14%|█▍        | 573/4166 [00:38<06:48,  8.80it/s]




 15%|█▌        | 643/4166 [00:44<04:22, 13.43it/s]

zero-size array to reduction operation maximum which has no identity


 28%|██▊       | 1183/4166 [01:19<03:25, 14.48it/s]

zero-size array to reduction operation maximum which has no identity

 28%|██▊       | 1157/4166 [01:19<03:01, 16.61it/s]




 36%|███▌      | 1506/4166 [01:41<02:33, 17.35it/s]

zero-size array to reduction operation maximum which has no identity

 36%|███▌      | 1505/4166 [01:42<02:35, 17.15it/s]




 48%|████▊     | 1985/4166 [02:14<02:41, 13.51it/s]

zero-size array to reduction operation maximum which has no identity

 48%|████▊     | 2006/4166 [02:14<02:18, 15.65it/s]




 49%|████▉     | 2033/4166 [02:16<02:39, 13.39it/s]

zero-size array to reduction operation maximum which has no identity


 56%|█████▋    | 2347/4166 [02:36<02:07, 14.27it/s]

zero-size array to reduction operation maximum which has no identity


 76%|███████▌  | 3166/4166 [03:35<01:12, 13.89it/s]

zero-size array to reduction operation maximum which has no identity


 97%|█████████▋| 4040/4166 [04:35<00:07, 16.33it/s]

zero-size array to reduction operation maximum which has no identity


100%|██████████| 4166/4166 [04:38<00:00, 14.97it/s]
100%|██████████| 4166/4166 [04:39<00:00, 14.92it/s]
100%|██████████| 4166/4166 [04:43<00:00, 14.71it/s]
100%|██████████| 4166/4166 [04:43<00:00, 14.69it/s]
100%|██████████| 4166/4166 [04:44<00:00, 14.66it/s]
100%|██████████| 4166/4166 [04:44<00:00, 14.64it/s]
100%|██████████| 4/4 [00:00<00:00, 40.40it/s]
  4%|▍         | 95/2223 [00:06<02:53, 12.26it/s]]

zero-size array to reduction operation maximum which has no identity

  4%|▎         | 81/2223 [00:06<02:41, 13.24it/s]




  5%|▌         | 115/2223 [00:08<02:46, 12.67it/s]

zero-size array to reduction operation maximum which has no identity

  6%|▌         | 133/2223 [00:08<02:30, 13.86it/s]




 31%|███       | 680/2223 [00:43<01:32, 16.75it/s]

zero-size array to reduction operation maximum which has no identity

 29%|██▉       | 644/2223 [00:43<01:56, 13.57it/s]




 34%|███▍      | 764/2223 [00:50<01:55, 12.65it/s]

zero-size array to reduction operation maximum which has no identity


 38%|███▊      | 847/2223 [00:56<01:21, 16.91it/s]

zero-size array to reduction operation maximum which has no identity


 39%|███▉      | 869/2223 [00:58<01:29, 15.16it/s]

zero-size array to reduction operation maximum which has no identity

 39%|███▉      | 870/2223 [00:58<01:30, 15.00it/s]




 80%|███████▉  | 1778/2223 [02:00<00:27, 16.06it/s]

zero-size array to reduction operation maximum which has no identity

 80%|███████▉  | 1772/2223 [02:00<00:30, 14.60it/s]




 88%|████████▊ | 1967/2223 [02:12<00:16, 15.56it/s]

zero-size array to reduction operation maximum which has no identity


100%|██████████| 2223/2223 [02:28<00:00, 14.97it/s]
100%|██████████| 2223/2223 [02:28<00:00, 14.92it/s]
100%|██████████| 2223/2223 [02:29<00:00, 14.89it/s]
100%|██████████| 2223/2223 [02:29<00:00, 14.84it/s]
100%|██████████| 2223/2223 [02:29<00:00, 14.83it/s]
100%|██████████| 2223/2223 [02:30<00:00, 14.78it/s]
100%|██████████| 5/5 [00:00<00:00, 57.10it/s]


In [18]:
from malaya_speech.utils import tf_featurization

# Build the speech featurizer from malaya_speech's transducer featurizer
# config; `preprocess_inputs` uses it to turn waveforms into model inputs.
config = malaya_speech.config.transducer_featurizer_config
featurizer = tf_featurization.STTFeaturizer(**config)

In [19]:
# Width of each feature frame produced by the featurizer.
n_mels = 80

def preprocess_inputs(example):
    """Add an ``'inputs'`` entry holding the featurized waveform.

    The waveform under ``example['waveforms']`` is passed through
    `featurizer.vectorize` and reshaped to ``(-1, n_mels)``. The dictionary
    is updated in place and also returned.
    """
    vectorized = featurizer.vectorize(example['waveforms'])
    example['inputs'] = tf.reshape(vectorized, (-1, n_mels))
    return example

def parse(serialized_example):
    """Decode one serialized example into a dictionary of dense tensors.

    The record is parsed with variable-length ``waveforms`` (float32),
    ``targets`` (int64) and ``targets_length`` (int64) fields, each sparse
    result is replaced by its ``.values``, and `preprocess_inputs` adds the
    ``inputs`` entry. Only the keys ``waveforms``, ``inputs``, ``targets``
    and ``targets_length`` are kept in the returned dictionary.
    """
    schema = {
        name: tf.compat.v1.VarLenFeature(dtype)
        for name, dtype in (
            ('waveforms', tf.float32),
            ('targets', tf.int64),
            ('targets_length', tf.int64),
        )
    }
    sparse = tf.compat.v1.parse_single_example(
        serialized_example, features = schema
    )
    # VarLenFeature parses to sparse tensors; keep only the stored values.
    features = {name: tensor.values for name, tensor in sparse.items()}

    features = preprocess_inputs(features)

    wanted = ('waveforms', 'inputs', 'targets', 'targets_length')
    for name in [key for key in features if key not in wanted]:
        features.pop(name, None)

    return features

def get_dataset(files, batch_size = 2, shuffle_size = 32, thread_count = 24):
    """Return a zero-argument factory for the TFRecord input pipeline.

    Calling the returned function builds a dataset that reads `files`,
    shuffles with a buffer of `shuffle_size`, maps `parse` over the records
    with `thread_count` parallel calls, and repeats indefinitely.
    `batch_size` is accepted but not used by the pipeline.
    """
    def get():
        return (
            tf.data.TFRecordDataset(files)
            .shuffle(shuffle_size)
            .map(parse, num_parallel_calls = thread_count)
            .repeat()
        )

    return get

In [20]:
# List the uploaded shards directly from GCS, build the input pipeline and
# iterate it as NumPy values to sanity-check what was written.
files = tf.io.gfile.glob('gs://mesolitica-tpu-general/imda/part6-call-centre-1/*.tfrecord')
d = get_dataset(files)()
d = d.as_numpy_iterator()

In [21]:
# Pull one parsed example to inspect its fields.
next(d)

{'targets': array([  8, 795,  69,  16,   8, 795,   1,   9, 172, 432,   4,  34,  29,
        795, 106, 407, 278,  11,  16,  69, 168, 876, 795, 168, 279, 305]),
 'targets_length': array([26]),
 'waveforms': array([0.02579204, 0.01462226, 0.00355402, ..., 0.00863119, 0.01045898,
        0.00781885], dtype=float32),
 'inputs': array([[-3.259451  , -3.2115316 , -3.1417866 , ..., -0.4800198 ,
         -0.79051363, -0.6523645 ],
        [-3.0369134 , -2.9883745 , -2.9177563 , ..., -0.71144384,
         -0.59316975, -0.6146458 ],
        [-3.3031645 , -2.9903703 , -2.7309544 , ..., -0.7843823 ,
         -0.5416616 , -0.39892134],
        ...,
        [-4.557473  , -4.1015425 , -3.8034575 , ..., -0.93814534,
         -0.85005796, -0.51457846],
        [-3.3352256 , -2.9848642 , -2.713126  , ..., -0.8856256 ,
         -0.53133416, -0.7989203 ],
        [-3.8264012 , -3.278942  , -2.9650037 , ..., -0.65680426,
         -0.5076165 , -0.9370476 ]], dtype=float32)}