In [1]:
import os

os.environ['CUDA_VISIBLE_DEVICES'] = ''
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '/home/husein/t5/prepare/mesolitica-tpu.json'

In [2]:
import malaya_speech.train.model.conformer as conformer
import malaya_speech.train.model.transducer as transducer
import malaya_speech
import tensorflow as tf
import numpy as np
import json
from glob import glob
from sklearn.utils import shuffle

In [3]:
import string

char_vocabs = [''] + list(string.ascii_lowercase + string.digits) + [' ']
subwords_malay = malaya_speech.subword.load('bahasa-512.subword')
subwords_singlish = malaya_speech.subword.load('singlish-512.subword')
langs = [subwords_malay, subwords_singlish]
len_vocab = [l.vocab_size for l in langs]
config = malaya_speech.config.conformer_base_encoder_config

In [4]:
featurizer = malaya_speech.tf_featurization.STTFeaturizer(
    normalize_per_feature = True
)

In [5]:
def get_subwords(ids, lang):
    lang = lang[0]
    text = ''.join([char_vocabs[c] for c in ids])
    t = malaya_speech.subword.encode(
        langs[lang], text, add_blank=False
    )
    t = np.array(t) + sum(len_vocab[:lang])
    return t.astype(np.int32)


def preprocess_inputs(example):
    s = featurizer.vectorize(example['waveforms'])
    mel_fbanks = tf.reshape(s, (-1, n_mels))
    length = tf.cast(tf.shape(mel_fbanks)[0], tf.int32)
    length = tf.expand_dims(length, 0)
    example['inputs'] = mel_fbanks
    example['inputs_length'] = length

    t = tf.compat.v1.numpy_function(get_subwords, [example['targets'], example['lang']], tf.int32)
    t = tf.reshape(t, (-1,))
    example['targets'] = t
    length = tf.cast(tf.shape(t)[0], tf.int32)
    length = tf.expand_dims(length, 0)
    example['targets_length'] = length

    return example

def parse(serialized_example):

    data_fields = {
        'waveforms': tf.compat.v1.VarLenFeature(tf.float32),
        'targets': tf.compat.v1.VarLenFeature(tf.int64),
        'targets_length': tf.compat.v1.VarLenFeature(tf.int64),
        'lang': tf.compat.v1.VarLenFeature(tf.int64),
    }
    features = tf.compat.v1.parse_single_example(
        serialized_example, features=data_fields
    )
    for k in features.keys():
        features[k] = features[k].values

    features = preprocess_inputs(features)

    keys = list(features.keys())
    for k in keys:
        if k not in ['waveforms', 'inputs', 'inputs_length', 'targets', 'targets_length']:
            features.pop(k, None)

    return features

def pop(features):
    features.pop('waveforms', None)
    return features

In [6]:
n_mels = 80
sr = 16000
maxlen = 18
minlen_text = 1


def get_dataset(
    files,
    batch_size = 3,
    shuffle_size = 20,
    thread_count = 24,
    maxlen_feature = 1800,
    num_cpu_threads=6,
):
    def get():
        d = tf.data.Dataset.from_tensor_slices(tf.constant(files))
        cycle_length = min(num_cpu_threads, len(files))
        d = d.interleave(
            tf.data.TFRecordDataset,
            cycle_length=cycle_length,
            block_length=thread_count)
        d = d.map(parse, num_parallel_calls=thread_count)
        d = d.map(pop, num_parallel_calls=thread_count)
        d = d.padded_batch(
            batch_size,
            padded_shapes={
                'inputs': tf.TensorShape([None, n_mels]),
                'inputs_length': tf.TensorShape([None]),
                'targets': tf.TensorShape([None]),
                'targets_length': tf.TensorShape([None]),
            },
            padding_values={
                'inputs': tf.constant(0, dtype=tf.float32),
                'inputs_length': tf.constant(0, dtype=tf.int32),
                'targets': tf.constant(0, dtype=tf.int32),
                'targets_length': tf.constant(0, dtype=tf.int32),
            },
        )
        return d

    return get

In [7]:
with open('2mixed-train-test.json') as fopen:
    dataset = json.load(fopen)

In [8]:
dev_dataset = get_dataset(dataset['test'])()

In [9]:
features = dev_dataset.make_one_shot_iterator().get_next()
features

Instructions for updating:
Use `for ... in dataset:` to iterate over a dataset. If using `tf.estimator`, return the `Dataset` object directly from your input function. As a last resort, you can use `tf.compat.v1.data.make_one_shot_iterator(dataset)`.


{'targets': <tf.Tensor 'IteratorGetNext:2' shape=(?, ?) dtype=int32>,
 'targets_length': <tf.Tensor 'IteratorGetNext:3' shape=(?, ?) dtype=int32>,
 'inputs': <tf.Tensor 'IteratorGetNext:0' shape=(?, ?, 80) dtype=float32>,
 'inputs_length': <tf.Tensor 'IteratorGetNext:1' shape=(?, ?) dtype=int32>}

In [10]:
training = True

In [11]:
config = malaya_speech.config.conformer_base_encoder_config
config['dropout'] = 0.0
conformer_model = conformer.Model(
    kernel_regularizer = None, bias_regularizer = None, **config
)
decoder_config = malaya_speech.config.conformer_base_decoder_config
decoder_config['embed_dropout'] = 0.0
transducer_model = transducer.rnn.Model(
    conformer_model, vocabulary_size = sum(len_vocab), **decoder_config
)
targets_length = features['targets_length'][:, 0]
v = tf.expand_dims(features['inputs'], -1)
z = tf.zeros((tf.shape(features['targets'])[0], 1), dtype = tf.int32)
c = tf.concat([z, features['targets']], axis = 1)

logits = transducer_model([v, c, targets_length + 1], training = training)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [12]:
decoded = transducer_model.greedy_decoder(v, features['inputs_length'][:, 0], training = training)

In [13]:
decoded = decoded[0]

In [14]:
sess = tf.Session()
sess.run(tf.global_variables_initializer())
var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
saver = tf.train.Saver(var_list = var_list)
saver.restore(sess, 'asr-base-conformer-transducer-2mixed/model.ckpt-1125000')

INFO:tensorflow:Restoring parameters from asr-base-conformer-transducer-2mixed/model.ckpt-1125000


In [15]:
import re
from malaya_speech.utils.subword import decode

def decode_multilanguage(row, langs):
    
    if not len(row):
        return ''

    len_vocab = [l.vocab_size for l in langs]

    def get_index_multilanguage(r):
        for i in range(len(langs)):
            sum_v = sum(len_vocab[:i + 1])
            if r < sum(len_vocab[:i + 1]):
                return i, r - sum(len_vocab[:i])

    last_index, v = get_index_multilanguage(row[0])
    d, q = [], [v]
    for r in row[1:]:
        index, v = get_index_multilanguage(r)
        if index != last_index:
            d.append(decode(langs[last_index], q))
            q = [v]
            last_index = index
        else:
            q.append(v)
    if len(q):
        d.append(decode(langs[last_index], q))
    d = re.sub(r'[ ]+', ' ', ' '.join(d)).strip()
    d = d.replace(' lah', 'lah')
    return d

In [16]:
wer, cer = [], []
pairs = []
index = 0
while True:
    try:
        r = sess.run([decoded, features['targets']])
        for no, row in enumerate(r[0]):
            try:
                d = decode_multilanguage(row[row > 0], langs)
                t = decode_multilanguage(r[1][no], langs)
                wer.append(malaya_speech.metrics.calculate_wer(t, d))
                cer.append(malaya_speech.metrics.calculate_cer(t, d))
                pairs.append((d, t))
            except Exception as e:
                print('inside', e)
        index += 1
    except Exception as e:
        print(e)
        break

End of sequence
	 [[node IteratorGetNext (defined at /home/husein/.local/lib/python3.6/site-packages/tensorflow_core/python/framework/ops.py:1748) ]]

Original stack trace for 'IteratorGetNext':
  File "/usr/lib/python3.6/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/usr/lib/python3.6/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/home/husein/.local/lib/python3.6/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/home/husein/.local/lib/python3.6/site-packages/traitlets/config/application.py", line 664, in launch_instance
    app.start()
  File "/home/husein/.local/lib/python3.6/site-packages/ipykernel/kernelapp.py", line 563, in start
    self.io_loop.start()
  File "/home/husein/.local/lib/python3.6/site-packages/tornado/platform/asyncio.py", line 148, in start
    self.asyncio_loop.run_forever()
  File "/usr/lib/python3.6/asyncio/base_events.py", line 438, in run_forever
    self._r

In [17]:
np.mean(wer), np.mean(cer)

(0.137671741927627, 0.07175076849988724)

In [18]:
for no, row in enumerate(r[0]):
    try:
        d = decode_multilanguage(row[row > 0], langs)
        t = decode_multilanguage(r[1][no], langs)
        print(d)
        print(t)
        print()
    except:
        pass

at the heart of this is the question of why people arent voluntarily stepping up
at the heart of this is the question of why people arent voluntarily stepping up



In [19]:
pairs

[('tangi aku recently lambo', 'tangan aku disentuh lembut'),
 ('sampaikan ada orang yang menegur aku dengan bahasa kasarnya don t khuneo it is seen to be passivitic',
  'sampai kan ada orang yang menegur aku dengan bahasa kasarnya don t you know it is a sin to be pessimistic'),
 ('kenapa tidak kita mengusir ob', 'kenapa tidak kita mengusir obummer'),
 ('sama kalau kekasih kau ikut jantan lain kan seven hantaran 20 kau tu',
  'sama kalau kekasih kau ikut jantan lain kan saving duit hantaran 20k kau tu'),
 ('laki laki bernama anda itu segera mengalirkan pandangannya kelip kemudian',
  'laki laki bernama andra itu segera mengalihkan pandangannya ke libra kemudian'),
 ('kenapa saya berurusan dengan aktif internet biasanya meninggalkan keputusan yang tidak baik',
  'kenapa saya berurusan dengan atheis internet biasanya meninggalkan keputusan yang tidak baik'),
 ('dan perlukan duit untuk operation dalam kadar segera',
  'dan perlukan duit untuk operation dalam kadar segera'),
 ('tokong adala