In [1]:
import os

# An empty CUDA_VISIBLE_DEVICES hides every GPU from CUDA, so the rest of the
# notebook runs on CPU. It is set here, before TensorFlow is imported in the
# next cell, so it is in place before any device initialisation happens.
os.environ['CUDA_VISIBLE_DEVICES'] = ''

In [2]:
import malaya_speech.train.model.jasper as jasper
import malaya_speech
import tensorflow as tf
import numpy as np
import json






The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.




In [3]:
# Read the vocabulary list from disk, then append three extra symbols.
# The result sizes the output layer and is the lookup table used when
# turning predicted ids back into text.
with open('malaya-speech-sst-vocab.json') as fopen:
    unique_vocab = json.load(fopen)
unique_vocab += ['{', '}', '[']

In [4]:
# Feature extractor that is applied to each raw waveform inside the graph
# (see the first while_loop below).
# NOTE(review): what it emits is defined in malaya_speech; the later
# set_shape((None, None, 80)) implies 80 values per frame — confirm.
featurizer = malaya_speech.tf_featurization.STTFeaturizer(
    normalize_per_feature = True
)

In [5]:
# Graph inputs. No explicit names are passed, so TensorFlow assigns the default
# names 'Placeholder' and 'Placeholder_1'; the frozen-graph cells further down
# look these tensors up by exactly those names.
X = tf.placeholder(tf.float32, [None, None])  # rank-2 float input; row i is sliced as X[i, :X_len[i]] below
X_len = tf.placeholder(tf.int32, [None])  # one int per row of X, used as the slice length

In [6]:
# Stage 1: run the featurizer on every row of X inside the graph.
batch_size = tf.shape(X)[0]
# infer_shape = False allows each element written to `features` to have its own shape.
features = tf.TensorArray(dtype = tf.float32, size = batch_size, dynamic_size = True, infer_shape = False)
features_len = tf.TensorArray(dtype = tf.int32, size = batch_size)

init_state = (0, features, features_len)

def condition(i, features, features_len):
    # Keep looping while there are rows of X left.
    return i < batch_size

def body(i, features, features_len):
    # Featurize only the first X_len[i] values of row i and record the size of
    # the result along axis 0.
    f = featurizer(X[i, :X_len[i]])
    f_len = tf.shape(f)[0]
    return i + 1, features.write(i, f), features_len.write(i, f_len)

_, features, features_len = tf.while_loop(condition, body, init_state)
# From here on `features_len` is a stacked int32 tensor, no longer a TensorArray.
features_len = features_len.stack()

# Stage 2: pad every feature matrix along axis 0 to the longest one so they can
# be stacked into a single batch tensor.
padded_features = tf.TensorArray(dtype = tf.float32, size = batch_size)
maxlen = tf.reduce_max(features_len)

init_state = (0, padded_features)

# NOTE: `condition` and `body` are redefined below and replace the first pair;
# the first loop has already been built at this point.
def condition(i, padded_features):
    return i < batch_size

def body(i, padded_features):
    # `features` is the TensorArray returned by the first while_loop.
    f = features.read(i)
    # Pad only at the end of axis 0; the second axis is left untouched.
    f = tf.pad(f, [[0, maxlen - tf.shape(f)[0]], [0,0]])
    return i + 1, padded_features.write(i, f)

_, padded_features = tf.while_loop(condition, body, init_state)
padded_features = padded_features.stack()
# Restore static shape information that the loop cannot infer: last axis is 80.
padded_features.set_shape((None, None, 80))

In [7]:
# Build the Jasper network in inference mode on the padded features.
model = jasper.Model(padded_features, features_len, training = False)
# One output unit per vocabulary entry plus one extra class.
# NOTE(review): the extra class is presumably the CTC blank — confirm.
logits = tf.layers.dense(model.logits['outputs'], len(unique_vocab) + 1)
seq_lens = model.logits['src_length']
# Swap the first two axes.
# NOTE(review): presumably batch-major -> time-major, the layout that
# tf.nn.ctc_beam_search_decoder takes — confirm against jasper.Model.
logits = tf.transpose(logits, [1, 0, 2])
# Give both tensors fixed names; the frozen-graph cells fetch them by name.
logits = tf.identity(logits, name = 'logits')
seq_lens = tf.identity(seq_lens, name = 'seq_lens')


Instructions for updating:
Use `tf.keras.layers.Conv1D` instead.
Instructions for updating:
Please use `layer.__call__` method instead.
Instructions for updating:
Use keras.layers.BatchNormalization instead.  In particular, `tf.control_dependencies(tf.GraphKeys.UPDATE_OPS)` should not be used (consult the `tf.keras.layers.batch_normalization` documentation).
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use keras.layers.Dense instead.


In [8]:
# Beam-search CTC decoding over the transposed logits; only the single best
# path (top_paths=1) is requested.
# NOTE(review): with merge_repeated=True the decoder also collapses repeated
# labels in its output, which can drop genuinely doubled characters — confirm
# this is intended before changing it, since it alters the decoded text.
decoded = tf.nn.ctc_beam_search_decoder(logits, seq_lens, beam_width=100, top_paths=1, merge_repeated=True)
# decoded[0][0] is the best path as a SparseTensor. tf.cast replaces the
# deprecated tf.to_int32 and accepts the SparseTensor directly.
preds = tf.sparse.to_dense(tf.cast(decoded[0][0], tf.int32))
preds = tf.identity(preds, 'preds')

Instructions for updating:
Use `tf.cast` instead.


In [9]:
# InteractiveSession installs itself as the default session on construction.
sess = tf.InteractiveSession()
# Give every variable an initial value; the checkpoint restore in the next cell
# then overwrites the variables it contains.
sess.run(tf.global_variables_initializer())

In [10]:
# Restore every global variable of the graph from the trained checkpoint.
var_lists = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
saver = tf.train.Saver(var_list = var_lists)
saver.restore(sess, 'asr-jasper-ctc/model.ckpt-120000')

INFO:tensorflow:Restoring parameters from asr-jasper-ctc/model.ckpt-120000


In [11]:
# Sample recordings used to sanity-check the restored model. Paths are
# relative to the notebook's working directory.
files = [
    'savewav_2020-11-26_22-36-06_294832.wav',
    'savewav_2020-11-26_22-40-56_929661.wav',
    'download.wav',
    'husein-zolkepli.wav',
    'mas-aisyah.wav',
    'khalil-nooh.wav',
    'wadi-annuar.wav',
    '675.wav',
    '664.wav',
    'shafiqah-idayu.wav'
]

# Keep only the first element of each load() result.
# NOTE(review): presumably the waveform, with the sample rate second — confirm.
ys = [malaya_speech.load(f)[0] for f in files]

In [12]:
# sequence_1d returns the batched waveforms and, because return_len = True,
# a second value that is fed as X_len.
# NOTE(review): presumably the original length of each waveform — confirm.
padded, lens = malaya_speech.padding.sequence_1d(ys, return_len = True)
# NOTE: `decoded` is rebound here from the decoder op to the fetched value of `preds`.
decoded = sess.run(preds, feed_dict = {X: padded, X_len: lens})

In [13]:
# Turn each row of predicted ids back into text and strip the '<PAD>' marker.
results = [
    malaya_speech.char.decode(row, lookup = unique_vocab).replace('<PAD>', '')
    for row in decoded
]
results

['elo nama saya sin saya tak suka mandi kata saya masan',
 'elonam saya musin saya suka mandi saya mandi kitiap hari',
 'lo aelasalamualaikum h du tentu awarsianal sekolah malaysia',
 'testing nama saya musin binza kply',
 'sebut perkatan angka',
 'tolong sebut anti kata',
 'jadi dalam perjalanan ini dunia yang susah ini ketika nabi mengajar muadzkin jabar tadi ni alah maini',
 'ini dan melalui kenyatan mesej itu mastura menegaskan',
 'pilihan tepat apabila dia kini lebih berani dan',
 'naman saya safkahi dayu']

In [14]:
# A Saver built without var_list covers all saveable objects in the graph.
# save() also writes a '.meta' graph file next to the checkpoint, which
# freeze_graph() below imports. NOTE: this rebinds `saver` from the restore cell.
saver = tf.train.Saver()
saver.save(sess, 'asr-jasper-ctc-output/model.ckpt')

'asr-jasper-ctc-output/model.ckpt'

In [15]:
# Collect the names of the graph nodes handed to freeze_graph() as outputs:
# every node whose op type mentions 'Variable' or whose name contains one of
# the wanted markers, minus anything whose name contains an excluded marker.
# ('t_logits' is already matched by 'logits'; it is kept to mirror the
# original filter exactly.)
strings = ','.join(
    n.name
    for n in tf.get_default_graph().as_graph_def().node
    if (
        'Variable' in n.op
        or any(
            marker in n.name
            for marker in ('Placeholder', 'logits', 't_logits', 'seq_lens', 'alphas', 'self/Softmax')
        )
    )
    and not any(
        marker in n.name
        for marker in ('adam', 'beta', 'global_step', 'Assign')
    )
)
strings.split(',')

['Placeholder',
 'Placeholder_1',
 'w2l_encoder/conv11/kernel',
 'w2l_encoder/conv11/bn/gamma',
 'w2l_encoder/conv11/bn/moving_mean',
 'w2l_encoder/conv11/bn/moving_variance',
 'w2l_encoder/conv21/kernel',
 'w2l_encoder/conv21/bn/gamma',
 'w2l_encoder/conv21/bn/moving_mean',
 'w2l_encoder/conv21/bn/moving_variance',
 'w2l_encoder/conv22/kernel',
 'w2l_encoder/conv22/bn/gamma',
 'w2l_encoder/conv22/bn/moving_mean',
 'w2l_encoder/conv22/bn/moving_variance',
 'w2l_encoder/conv23/kernel',
 'w2l_encoder/conv23/bn/gamma',
 'w2l_encoder/conv23/bn/moving_mean',
 'w2l_encoder/conv23/bn/moving_variance',
 'w2l_encoder/conv24/kernel',
 'w2l_encoder/conv24/bn/gamma',
 'w2l_encoder/conv24/bn/moving_mean',
 'w2l_encoder/conv24/bn/moving_variance',
 'w2l_encoder/conv25/res/kernel',
 'w2l_encoder/conv25/res_bn/gamma',
 'w2l_encoder/conv25/res_bn/moving_mean',
 'w2l_encoder/conv25/res_bn/moving_variance',
 'w2l_encoder/conv25/kernel',
 'w2l_encoder/conv25/bn/gamma',
 'w2l_encoder/conv25/bn/moving_mean'

In [16]:
def freeze_graph(model_dir, output_node_names):
    """Freeze the latest checkpoint found in `model_dir` into one GraphDef file.

    The checkpoint's meta graph is imported into a fresh graph, its variables
    are restored and converted to constants, and the result is written as
    'frozen_model.pb' in the same directory as the checkpoint file.

    Args:
        model_dir: directory containing the checkpoint state file.
        output_node_names: comma-separated node names to keep as outputs.

    Raises:
        AssertionError: if `model_dir` does not exist or holds no checkpoint.
    """
    if not tf.gfile.Exists(model_dir):
        raise AssertionError(
            "Export directory doesn't exists. Please specify an export "
            'directory: %s' % model_dir
        )

    checkpoint = tf.train.get_checkpoint_state(model_dir)
    # get_checkpoint_state returns None when the directory has no checkpoint
    # state; fail with a clear message instead of an AttributeError.
    if checkpoint is None or not checkpoint.model_checkpoint_path:
        raise AssertionError(
            'No checkpoint found in directory: %s' % model_dir
        )
    input_checkpoint = checkpoint.model_checkpoint_path

    # Build the output path with os.path so a checkpoint path without a
    # directory part yields 'frozen_model.pb' rather than '/frozen_model.pb'.
    output_graph = os.path.join(
        os.path.dirname(input_checkpoint), 'frozen_model.pb'
    )
    clear_devices = True
    with tf.Session(graph = tf.Graph()) as sess:
        saver = tf.train.import_meta_graph(
            input_checkpoint + '.meta', clear_devices = clear_devices
        )
        saver.restore(sess, input_checkpoint)
        output_graph_def = tf.graph_util.convert_variables_to_constants(
            sess,
            tf.get_default_graph().as_graph_def(),
            output_node_names.split(','),
        )
        with tf.gfile.GFile(output_graph, 'wb') as f:
            f.write(output_graph_def.SerializeToString())
        print('%d ops in the final graph.' % len(output_graph_def.node))

In [17]:
# Freeze the checkpoint saved above, keeping the node names collected in `strings`.
freeze_graph('asr-jasper-ctc-output', strings)

INFO:tensorflow:Restoring parameters from asr-jasper-ctc-output/model.ckpt
Instructions for updating:
Use `tf.compat.v1.graph_util.convert_variables_to_constants`
Instructions for updating:
Use `tf.compat.v1.graph_util.extract_sub_graph`
INFO:tensorflow:Froze 317 variables.
INFO:tensorflow:Converted 317 variables to const ops.
2227 ops in the final graph.


In [18]:
def load_graph(frozen_graph_filename):
    """Read a serialized GraphDef from disk and import it into a new graph.

    tf.import_graph_def is called without a name, so the imported nodes get
    TensorFlow's default 'import/' prefix.

    Args:
        frozen_graph_filename: path of the serialized GraphDef file.

    Returns:
        The graph that the GraphDef was imported into.
    """
    with tf.gfile.GFile(frozen_graph_filename, 'rb') as pb_file:
        serialized = pb_file.read()

    parsed_def = tf.GraphDef()
    parsed_def.ParseFromString(serialized)

    with tf.Graph().as_default() as imported:
        tf.import_graph_def(parsed_def)
    return imported

In [19]:
# Smoke-test the frozen graph: look the input and output tensors up by name
# (under the 'import/' prefix) and run the same batch through them.
g = load_graph('asr-jasper-ctc-output/frozen_model.pb')
x, x_lens = (
    g.get_tensor_by_name('import/Placeholder:0'),
    g.get_tensor_by_name('import/Placeholder_1:0'),
)
logits, seq_lens = (
    g.get_tensor_by_name('import/logits:0'),
    g.get_tensor_by_name('import/seq_lens:0'),
)
test_sess = tf.InteractiveSession(graph = g)
result = test_sess.run(
    fetches = [logits, seq_lens],
    feed_dict = {x: padded, x_lens: lens},
)
result



[array([[[-3.3292957e+01, -3.2445419e+01, -3.7555838e+00, ...,
          -3.2113750e+01, -3.2316597e+01,  5.0320716e+00],
         [-4.6810215e+01, -4.6434803e+01, -2.1730406e+00, ...,
          -4.5917953e+01, -4.7858700e+01,  1.2713997e+01],
         [-5.9905017e+02, -6.3005969e+02, -2.8487329e+01, ...,
          -6.2107782e+02, -6.5952893e+02,  1.8589340e+02],
         ...,
         [-9.0203941e+01, -8.6267799e+01, -1.8135012e+01, ...,
          -8.5835487e+01, -8.9654182e+01,  2.2634726e+01],
         [-6.2843258e+01, -6.2855827e+01, -4.3905849e+00, ...,
          -6.2142593e+01, -6.5919212e+01,  2.0061319e+01],
         [-6.2147823e+01, -6.2112484e+01, -9.0273991e+00, ...,
          -6.0437145e+01, -6.3704704e+01,  1.9047682e+01]],
 
        [[-3.0000391e+01, -2.9715731e+01,  5.9763722e+00, ...,
          -2.9776182e+01, -2.9387861e+01,  5.3286834e+00],
         [-4.4422771e+01, -4.3698792e+01, -6.3771725e-01, ...,
          -4.4004436e+01, -4.5599876e+01,  1.1608844e+01],
       

In [20]:
from tensorflow.tools.graph_transforms import TransformGraph

In [21]:
# Graph Transform Tool passes, applied in the order listed, to the frozen graph.
# NOTE(review): fallback_min / fallback_max are passed straight through to
# quantize_weights; their exact effect is defined by that transform — confirm.
transforms = ['add_default_attributes',
             'remove_nodes(op=Identity, op=CheckNumerics, op=Dropout)',
             'fold_batch_norms',
             'fold_old_batch_norms',
             'quantize_weights(fallback_min=-10, fallback_max=10)',
             'strip_unused_nodes',
             'sort_by_execution_order']

pb = 'asr-jasper-ctc-output/frozen_model.pb'

input_graph_def = tf.GraphDef()
# tf.gfile.GFile replaces the deprecated tf.gfile.FastGFile.
with tf.gfile.GFile(pb, 'rb') as f:
    input_graph_def.ParseFromString(f.read())

# Inputs and outputs are given by node name, without the 'import/' prefix or
# ':0' suffix, because they refer to the GraphDef itself.
transformed_graph_def = TransformGraph(
    input_graph_def,
    ['Placeholder', 'Placeholder_1'],
    ['logits', 'seq_lens'],
    transforms,
)

with tf.gfile.GFile(f'{pb}.quantized', 'wb') as f:
    f.write(transformed_graph_def.SerializeToString())

Instructions for updating:
Use tf.gfile.GFile.


In [22]:
# Smoke-test the transformed ('.quantized') graph on the same batch so its
# outputs can be compared by eye with the frozen graph's.
g = load_graph(f'{pb}.quantized')
x, x_lens = (
    g.get_tensor_by_name('import/Placeholder:0'),
    g.get_tensor_by_name('import/Placeholder_1:0'),
)
logits, seq_lens = (
    g.get_tensor_by_name('import/logits:0'),
    g.get_tensor_by_name('import/seq_lens:0'),
)
test_sess = tf.InteractiveSession(graph = g)
result = test_sess.run(
    fetches = [logits, seq_lens],
    feed_dict = {x: padded, x_lens: lens},
)
result

[array([[[-3.3248798e+01, -3.2405487e+01, -3.6843336e+00, ...,
          -3.2103840e+01, -3.2358120e+01,  5.0366917e+00],
         [-4.6723331e+01, -4.6406281e+01, -2.1444829e+00, ...,
          -4.5919094e+01, -4.7721859e+01,  1.2709956e+01],
         [-5.6169916e+02, -5.8984106e+02, -3.6937653e+01, ...,
          -5.7866638e+02, -6.2131317e+02,  1.7969835e+02],
         ...,
         [-8.9468178e+01, -8.6062126e+01, -1.8182365e+01, ...,
          -8.5407333e+01, -8.9305550e+01,  2.2751455e+01],
         [-6.3474026e+01, -6.3783276e+01, -4.3441787e+00, ...,
          -6.2886776e+01, -6.6856819e+01,  2.0204243e+01],
         [-6.2643658e+01, -6.2713943e+01, -9.0655861e+00, ...,
          -6.1027390e+01, -6.4340340e+01,  1.9271667e+01]],
 
        [[-2.9900059e+01, -2.9717604e+01,  6.1208420e+00, ...,
          -2.9752611e+01, -2.9377031e+01,  5.3496475e+00],
         [-4.4413883e+01, -4.3682888e+01, -5.0062120e-01, ...,
          -4.4031059e+01, -4.5550652e+01,  1.1634651e+01],
       