In [1]:
import os

# Expose only GPU index 3 to this process. This is set in the first cell,
# before TensorFlow is imported in the next one.
os.environ['CUDA_VISIBLE_DEVICES'] = '3'

In [2]:
import malaya_speech.train.model.alconformer as conformer
import malaya_speech.train.model.transducer as transducer
import malaya_speech
import tensorflow as tf
import numpy as np
import json
from glob import glob






The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.




In [3]:
# Subword tokenizer loaded from 'malaya-speech.tokenizer'. Its vocab_size sizes
# the transducer model and it is used to decode hypotheses during evaluation.
subwords = malaya_speech.subword.load('malaya-speech.tokenizer')

In [4]:
# Feature extractor applied to raw waveforms inside preprocess_inputs
# (through featurizer.vectorize).
featurizer = malaya_speech.tf_featurization.STTFeaturizer(
    normalize_per_feature = True
)

In [5]:
# Character vocabulary used to decode reference transcripts: the list stored in
# the JSON file followed by three extra symbols.
with open('malaya-speech-sst-vocab.json') as vocab_file:
    base_vocab = json.load(vocab_file)
unique_vocab = base_vocab + ['{', '}', '[']

In [6]:
# Width of the feature matrix produced per frame; also used for the padded
# shape of 'inputs' in get_dataset.
n_mels = 80

def preprocess_inputs(example):
    """Attach features and their frame count to a parsed example.

    ``featurizer.vectorize`` is applied to ``example['waveforms']`` and the
    result is reshaped to ``(-1, n_mels)`` and stored under ``'inputs'``.
    The number of rows of that matrix is stored under ``'inputs_length'`` as
    a one-element int32 tensor. The dict is modified in place and returned.
    """
    features = tf.reshape(featurizer.vectorize(example['waveforms']), (-1, n_mels))
    num_frames = tf.cast(tf.shape(features)[0], tf.int32)

    example['inputs'] = features
    # Keep the length as a rank-1 tensor so it can be batched with padded_batch.
    example['inputs_length'] = tf.expand_dims(num_frames, 0)
    return example

def parse(serialized_example):
    """Turn one serialized example into the tensors used for evaluation.

    The record is parsed for a float32 ``'waveforms'`` and an int64
    ``'targets'`` variable-length feature, each is reduced to its ``.values``,
    and ``preprocess_inputs`` then adds ``'inputs'`` and ``'inputs_length'``.

    Returns:
        A dict holding only the keys ``'waveforms'``, ``'inputs'``,
        ``'inputs_length'`` and ``'targets'``.
    """
    schema = {
        'waveforms': tf.VarLenFeature(tf.float32),
        'targets': tf.VarLenFeature(tf.int64),
    }
    parsed = tf.parse_single_example(serialized_example, features = schema)

    # Variable-length features come back as sparse tensors; keep their values.
    features = {name: sparse.values for name, sparse in parsed.items()}
    features = preprocess_inputs(features)

    # Drop anything that is not one of the four expected entries.
    wanted = ('waveforms', 'inputs', 'inputs_length', 'targets')
    for name in list(features.keys()):
        if name not in wanted:
            features.pop(name, None)

    return features

def get_dataset(
    path,
    batch_size = 32,
    thread_count = 24,
):
    """Create a factory for the padded, batched dataset read from TFRecords.

    Args:
        path: glob pattern selecting the TFRecord files to read.
        batch_size: number of examples per padded batch.
        thread_count: forwarded as ``num_parallel_calls`` to ``Dataset.map``.

    Returns:
        A zero-argument function that builds and returns the dataset. Each
        element is a dict with ``'waveforms'``, ``'inputs'``,
        ``'inputs_length'`` and ``'targets'``, zero-padded within the batch.
    """
    def get():
        # glob() gives no ordering guarantee, so sort the matches: the files
        # are then listed in the same order on every run and machine, which
        # keeps per-batch output (such as the last batch's hypotheses)
        # reproducible.
        files = sorted(glob(path))
        dataset = tf.data.TFRecordDataset(files)
        dataset = dataset.map(parse, num_parallel_calls = thread_count)
        dataset = dataset.padded_batch(
            batch_size,
            padded_shapes = {
                'waveforms': tf.TensorShape([None]),
                'inputs': tf.TensorShape([None, n_mels]),
                'inputs_length': tf.TensorShape([None]),
                'targets': tf.TensorShape([None]),
            },
            padding_values = {
                'waveforms': tf.constant(0, dtype = tf.float32),
                'inputs': tf.constant(0, dtype = tf.float32),
                'inputs_length': tf.constant(0, dtype = tf.int32),
                'targets': tf.constant(0, dtype = tf.int64),
            },
        )
        return dataset

    return get

# get_dataset returns a factory, hence the trailing call: this builds the
# dev-set dataset immediately from every file matching the pattern.
dev_dataset = get_dataset(
    '../speech-bahasa/bahasa-asr-test/data/bahasa-asr-dev-*'
)()





In [7]:
# Despite its name, `iterator` holds the dict of next-batch tensors returned by
# get_next(), not the iterator object (see the repr printed below).
iterator = dev_dataset.make_one_shot_iterator().get_next()
iterator

Instructions for updating:
Use `for ... in dataset:` to iterate over a dataset. If using `tf.estimator`, return the `Dataset` object directly from your input function. As a last resort, you can use `tf.compat.v1.data.make_one_shot_iterator(dataset)`.


{'targets': <tf.Tensor 'IteratorGetNext:2' shape=(?, ?) dtype=int64>,
 'waveforms': <tf.Tensor 'IteratorGetNext:3' shape=(?, ?) dtype=float32>,
 'inputs': <tf.Tensor 'IteratorGetNext:0' shape=(?, ?, 80) dtype=float32>,
 'inputs_length': <tf.Tensor 'IteratorGetNext:1' shape=(?, ?) dtype=int32>}

In [8]:
# Shape check only: adds a trailing axis to the batched features and displays
# the resulting tensor; the result is not stored.
tf.expand_dims(iterator['inputs'], -1)

<tf.Tensor 'ExpandDims:0' shape=(?, ?, 80, 1) dtype=float32>

In [9]:
# Build the model: a conformer encoder from the library's base encoder config,
# wrapped in an RNN transducer that uses the base decoder config and the
# subword tokenizer's vocabulary size.
config = malaya_speech.config.conformer_base_encoder_config
conformer_model = conformer.Model(**config)
decoder_config = malaya_speech.config.conformer_base_decoder_config
transducer_model = transducer.rnn.Model(
    conformer_model, vocabulary_size = subwords.vocab_size, **decoder_config
)

In [10]:
# Placeholder features of shape [None, None, 80] with a trailing axis added;
# `expand` is the first input of the model call further down.
i = tf.placeholder(tf.float32, [None, None, 80])
expand = tf.expand_dims(i, -1)

In [11]:
# Placeholder for integer token ids, with one column of zeros prepended to
# every row; `c` is the second input of the model call further down.
# NOTE(review): presumably 0 is the blank/start token id - confirm.
p = tf.placeholder(tf.int32, [None, None])
z = tf.zeros((tf.shape(p)[0], 1),dtype=tf.int32)
c = tf.concat([z, p], axis = 1)
c

<tf.Tensor 'concat:0' shape=(?, ?) dtype=int32>

In [12]:
# Call the full model once on the placeholder inputs.
# NOTE(review): presumably this call is what creates the model variables that
# the checkpoint restore below fills in - confirm.
logits = transducer_model([expand, c], training = True)
logits

Instructions for updating:
If using Keras pass *_constraint arguments to layers.



<tf.Tensor 'transducer/transducer_joint/transducer_joint_vocab/BiasAdd:0' shape=(?, ?, ?, 1019) dtype=float32>

In [13]:
# Open an interactive session and initialise every global variable; the
# following cell then restores the global variables from a checkpoint.
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())

In [14]:
# Restore all global variables currently in the graph from the step-130000
# checkpoint of the alconformer transducer.
var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
saver = tf.train.Saver(var_list = var_list)
saver.restore(sess, 'asr-base-alconformer-transducer/model.ckpt-130000')

INFO:tensorflow:Restoring parameters from asr-base-alconformer-transducer/model.ckpt-130000


In [15]:
# Encoder output for the current dataset batch, re-exported under the fixed
# tensor name 'encoded'.
# NOTE(review): training = True is passed although this graph is used for
# evaluation - confirm this is intentional.
encoded = transducer_model.encoder(tf.expand_dims(iterator['inputs'], -1), training = True)
encoded = tf.identity(encoded, name = 'encoded')

In [16]:
# Single-step decoder graph driven through placeholders: one encoder frame
# (a vector of size config['dmodel']), a token id, and a state tensor.
encoded_placeholder = tf.placeholder(tf.float32, [config['dmodel']], name = 'encoded_placeholder')
predicted_placeholder = tf.placeholder(tf.int32, None, name = 'predicted_placeholder')
# Static shape of the prediction network's initial state, used to size the
# state placeholder.
t = transducer_model.predict_net.get_initial_state().shape
states_placeholder = tf.placeholder(tf.float32, [int(i) for i in t], name = 'states_placeholder')

# NOTE(review): training = True is passed here as well although the step is
# used for decoding - confirm this is intentional.
ytu, new_states = transducer_model.decoder_inference(
    encoded=encoded_placeholder,
    predicted=predicted_placeholder,
    states=states_placeholder,
    training = True
)
# Give both outputs fixed tensor names.
ytu = tf.identity(ytu, name = 'ytu')
new_states = tf.identity(new_states, name = 'new_states')
ytu, new_states

(<tf.Tensor 'ytu:0' shape=(1019,) dtype=float32>,
 <tf.Tensor 'new_states:0' shape=(1, 2, 1, 640) dtype=float32>)

In [17]:
# Prediction network's initial state under the fixed tensor name
# 'initial_states'; it is evaluated with sess.run before decoding each batch.
initial_states = transducer_model.predict_net.get_initial_state()
initial_states = tf.identity(initial_states, name = 'initial_states')

In [18]:
import collections
import numpy as np
import tensorflow as tf

# One search hypothesis: accumulated score, token ids emitted so far, and the
# decoder state to resume from.
BeamHypothesis = collections.namedtuple(
    'BeamHypothesis', ['score', 'prediction', 'states']
)


def transducer(
    enc,
    total,
    initial_states,
    encoded_placeholder,
    predicted_placeholder,
    states_placeholder,
    ytu,
    new_states,
    sess,
    beam_width = 10,
    norm_score = True,
):
    """Beam-search decode one utterance using a single-step decoder graph.

    For each of the first ``total`` frames, the best-scoring pending
    hypothesis is repeatedly expanded with one ``sess.run`` of the decoder
    step. Output index 0 keeps the hypothesis unchanged (same prediction and
    state) and moves it to the next frame; every other index ``k`` appends
    ``k`` to the prediction and adopts the newly returned state. A frame ends
    once more than ``beam_width`` hypotheses have taken index 0.

    Args:
        enc: per-frame encoder outputs; ``enc[i]`` is fed for frame ``i``.
        total: number of frames to decode.
        initial_states: state value of the starting hypothesis.
        encoded_placeholder: feed key for the current frame.
        predicted_placeholder: feed key for the last token of the hypothesis.
        states_placeholder: feed key for the hypothesis state.
        ytu: fetch whose result holds one score per output index.
        new_states: fetch whose result is the updated state.
        sess: object exposing ``run(fetches, feed_dict = ...)``.
        beam_width: per-frame stopping threshold and the number of hypotheses
            kept by the final ranking.
        norm_score: rank final hypotheses by ``score / len(prediction)``
            instead of raw score.

    Returns:
        The token-id list of the top-ranked hypothesis (starts with ``0``).
    """
    fetches = [ytu, new_states]
    emitted = [
        BeamHypothesis(score = 0.0, prediction = [0], states = initial_states)
    ]

    for frame in range(total):
        # Hypotheses carried over from the previous frame seed this one.
        pending, emitted = emitted, []
        while True:
            parent = max(pending, key = lambda hyp: hyp.score)
            pending.remove(parent)
            step_scores, step_states = sess.run(
                fetches,
                feed_dict = {
                    encoded_placeholder: enc[frame],
                    predicted_placeholder: parent.prediction[-1],
                    states_placeholder: parent.states,
                },
            )
            for token in range(step_scores.shape[0]):
                candidate_score = parent.score + float(step_scores[token])
                if token == 0:
                    # Index 0: nothing is emitted, the parent advances as-is.
                    emitted.append(parent._replace(score = candidate_score))
                else:
                    pending.append(
                        BeamHypothesis(
                            score = candidate_score,
                            prediction = parent.prediction + [int(token)],
                            states = step_states,
                        )
                    )
            if len(emitted) > beam_width:
                break

    if norm_score:
        def rank(hyp):
            return hyp.score / len(hyp.prediction)
    else:
        def rank(hyp):
            return hyp.score

    kept_hyps = sorted(emitted, key = rank, reverse = True)[:beam_width]
    return kept_hyps[0].prediction

In [19]:
from tqdm import tqdm

# Evaluate the whole dev set: decode every utterance of every batch and collect
# one word error rate and one character error rate per utterance.
wer, cer, index = [], [], 0
while True:
    try:
        # Fetch the encoder output and the raw batch in a single run so both
        # come from the same dataset batch.
        encoded_, f  = sess.run([encoded, iterator])
    except tf.errors.OutOfRangeError:
        # The one-shot iterator is exhausted: every batch has been scored.
        # Only this condition ends the loop; any other failure (decoding,
        # metrics, Ctrl-C) now propagates instead of silently truncating
        # the evaluation.
        break

    # Frame counts after the encoder's time subsampling.
    padded_lens_ = f['inputs_length'][:,0] // conformer_model.conv_subsampling.time_reduction_factor
    s = sess.run(initial_states)

    # Hypotheses for this batch only (reset on every iteration).
    results = []
    # Loop variables use their own names so they do not rebind the
    # notebook-level `i` and `t` defined in earlier cells.
    for row in tqdm(range(len(encoded_))):
        r = transducer(
            enc = encoded_[row],
            total = padded_lens_[row],
            initial_states = s,
            encoded_placeholder = encoded_placeholder,
            predicted_placeholder = predicted_placeholder,
            states_placeholder = states_placeholder,
            ytu = ytu,
            new_states = new_states,
            sess = sess,
            beam_width = 1,
        )
        results.append(malaya_speech.subword.decode(subwords, r))

    for row, d in enumerate(results):
        # Reference text: decode the target ids and strip padding markers.
        reference = malaya_speech.char.decode(f['targets'][row], lookup = unique_vocab).replace('<PAD>', '')
        wer.append(malaya_speech.metrics.calculate_wer(reference, d))
        cer.append(malaya_speech.metrics.calculate_cer(reference, d))
    print(f'loop {index}')
    index += 1

100%|██████████| 32/32 [00:49<00:00,  1.55s/it]


loop 0


100%|██████████| 32/32 [00:46<00:00,  1.45s/it]


loop 1


100%|██████████| 32/32 [00:46<00:00,  1.46s/it]


loop 2


100%|██████████| 32/32 [00:54<00:00,  1.70s/it]


loop 3


100%|██████████| 32/32 [00:46<00:00,  1.46s/it]


loop 4


100%|██████████| 32/32 [00:46<00:00,  1.46s/it]


loop 5


100%|██████████| 32/32 [00:47<00:00,  1.48s/it]


loop 6


100%|██████████| 32/32 [00:44<00:00,  1.40s/it]


loop 7


100%|██████████| 32/32 [00:44<00:00,  1.40s/it]


loop 8


100%|██████████| 32/32 [00:50<00:00,  1.59s/it]


loop 9


100%|██████████| 32/32 [00:50<00:00,  1.57s/it]


loop 10


100%|██████████| 32/32 [00:46<00:00,  1.44s/it]


loop 11


100%|██████████| 32/32 [00:48<00:00,  1.53s/it]


loop 12


100%|██████████| 32/32 [00:44<00:00,  1.40s/it]


loop 13


100%|██████████| 32/32 [00:40<00:00,  1.25s/it]


loop 14


100%|██████████| 32/32 [00:51<00:00,  1.60s/it]


loop 15


100%|██████████| 31/31 [00:39<00:00,  1.28s/it]

loop 16





In [22]:
# Mean word error rate and mean character error rate over every utterance
# scored by the evaluation loop.
np.mean(wer), np.mean(cer)

(0.30567287550582595, 0.12267994291449753)

In [23]:
# `results` is reset for every batch in the evaluation loop, so this displays
# only the hypotheses of the final batch.
results

['assalamualaikum siapa pernah kena tinggi',
 'kamu sihatkan',
 'meruny ibrahkan segera keluar dari ruangan mawi',
 'kebanyakannya pertama di tepi tepi yang berhampiran dengan telinganya sudah mulai putih',
 'time dan isub menerbitkan laporan muke depan yang menggambarkan presiden itu sedang hanyut dan tidak mampu',
 'bergaul serapan semua serius kadang-kadang bergaduh gaduh sehebat itu',
 'karena tahu laki-laki itu adalah kak kelas',
 'dan tiap kali dia hendak menerangkannya kepada nahi dah',
 'jika kabinet yang mantap dapat dibentuk masa depan mereka berdua cerah',
 'kayanya sudah cukup aku di sekepit ini masakan abang ke',
 'tak saja kan gitu pak guru memang betullah',
 'menggalakkan usaha baru dan dari cipta kebanyakan enjoy yang dibentuk oleh graduan graduan muda pesantren biasanya dengan bantuan ulama progresif',
 'nenek suka naik kereta meta',
 'aku takut segala-galanya',
 'mengingatkan seni',
 'hatinya dan terus membaca kalimat selanjutnya',
 'indonesia dan media antarabangsa s