In [1]:
import os

# Set before TensorFlow is imported: an empty CUDA_VISIBLE_DEVICES exposes no GPU
# to the process, so the export graph below is built and run on CPU.
os.environ['CUDA_VISIBLE_DEVICES'] = ''

In [2]:
import malaya_speech.train.model.conformer as conformer
import malaya_speech.train.model.transducer as transducer
import malaya_speech
import tensorflow as tf
import numpy as np

In [3]:
# One subword tokenizer per language. The order of `langs` is significant: the
# decoding helpers later in this notebook map a model output id to a language by
# walking the cumulative vocabulary sizes in exactly this order.
subwords_malay = malaya_speech.subword.load('bahasa-512.subword')
subwords_singlish = malaya_speech.subword.load('singlish-512.subword')
subwords_mandarin = malaya_speech.subword.load('mandarin-512.subword')
langs = [subwords_malay, subwords_singlish, subwords_mandarin]
# Per-language vocabulary sizes, in the same order as `langs`.
len_vocab = [tokenizer.vocab_size for tokenizer in langs]

In [4]:
# Waveform -> feature-matrix transform applied inside the graph (see the
# per-utterance loop in the next cell).
featurizer = malaya_speech.tf_featurization.STTFeaturizer(normalize_per_feature = True)

In [5]:
# Graph inputs. X is a rank-2 float32 batch (presumably zero-padded raw waveforms,
# one row per utterance - confirm against the feeding code). X_len holds one int32
# length per row and is used in the next cell to slice each row before featurization.
X = tf.compat.v1.placeholder(tf.float32, [None, None], name = 'X_placeholder')
X_len = tf.compat.v1.placeholder(tf.int32, [None], name = 'X_len_placeholder')

In [6]:
# In-graph featurization of a variable-length batch, done in two while-loops:
#   1. featurize each row of X (cut to its own length) and remember its frame count;
#   2. zero-pad every feature matrix along axis 0 to the longest one and stack them.
batch_size = tf.shape(X)[0]
# infer_shape = False lets the array hold feature matrices of different lengths.
features = tf.TensorArray(dtype = tf.float32, size = batch_size, dynamic_size = True, infer_shape = False)
features_len = tf.TensorArray(dtype = tf.int32, size = batch_size)

init_state = (0, features, features_len)

# Loop 1 predicate: iterate over every row of the batch.
def condition(i, features, features_len):
    return i < batch_size

# Loop 1 body: featurize row i (sliced to X_len[i]) and store the features and
# the size of their first dimension.
def body(i, features, features_len):
    f = featurizer(X[i, :X_len[i]])
    f_len = tf.shape(f)[0]
    return i + 1, features.write(i, f), features_len.write(i, f_len)

_, features, features_len = tf.while_loop(condition, body, init_state)
features_len = features_len.stack()
padded_features = tf.TensorArray(dtype = tf.float32, size = batch_size)
padded_lens = tf.TensorArray(dtype = tf.int32, size = batch_size)
# Longest feature matrix in the batch; every item is padded up to this length.
maxlen = tf.reduce_max(features_len)

init_state = (0, padded_features, padded_lens)

# NOTE: `condition` and `body` are redefined here for the second loop and shadow
# the first pair; the first loop has already been built, so this is safe but
# order-dependent.
def condition(i, padded_features, padded_lens):
    return i < batch_size

# Loop 2 body: read item i, record its unpadded length, then pad with zeros at the
# end of axis 0 only (the second axis is left untouched).
def body(i, padded_features, padded_lens):
    f = features.read(i)
    len_f = tf.shape(f)[0]
    f = tf.pad(f, [[0, maxlen - tf.shape(f)[0]], [0,0]])
    return i + 1, padded_features.write(i, f), padded_lens.write(i, len_f)

_, padded_features, padded_lens = tf.while_loop(condition, body, init_state)
padded_features = padded_features.stack()
padded_lens = padded_lens.stack()
# Static shapes are lost inside the loops; restore what is known.
padded_lens.set_shape((None,))
# 80 is asserted as the size of the last feature axis (presumably the featurizer's
# number of feature bins - confirm against STTFeaturizer).
padded_features.set_shape((None, None, 80))
# Append a trailing axis of size 1, giving a rank-4 tensor.
padded_features = tf.expand_dims(padded_features, -1)
padded_features, padded_lens

(<tf.Tensor 'ExpandDims:0' shape=(?, ?, 80, 1) dtype=float32>,
 <tf.Tensor 'TensorArrayStack_2/TensorArrayGatherV3:0' shape=(?,) dtype=int32>)

In [7]:
# Give both tensors fixed node names ('padded_features', 'padded_lens') so they can
# be looked up by name in the frozen graph.
padded_features = tf.identity(padded_features, name = 'padded_features')
padded_lens = tf.identity(padded_lens, name = 'padded_lens')

In [8]:
# Work on copies: the originals are module-level dicts inside malaya_speech, and
# editing them in place would silently change the library defaults for anything
# else running in this kernel (and make re-running this cell order-dependent).
config = dict(malaya_speech.config.conformer_base_encoder_config)
# Dropout is disabled for the exported graph.
config['dropout'] = 0.0
conformer_model = conformer.Model(**config)
decoder_config = dict(malaya_speech.config.conformer_base_decoder_config)
decoder_config['embed_dropout'] = 0.0
# The output vocabulary is the concatenation of all per-language vocabularies.
transducer_model = transducer.rnn.Model(
    conformer_model, vocabulary_size = sum(len_vocab), **decoder_config
)

In [9]:
# Decoder-side inputs fed to the transducer model call below.
# `p` is a rank-2 int32 batch (presumably target subword ids - confirm); `p_len`
# holds one int32 per row.
p = tf.compat.v1.placeholder(tf.int32, [None, None])
# A single column of zeros is prepended to every row of `p` (presumably the
# blank/start id - confirm).
z = tf.zeros((tf.shape(p)[0], 1),dtype=tf.int32)
c = tf.concat([z, p], axis = 1)
p_len = tf.compat.v1.placeholder(tf.int32, [None])
c

<tf.Tensor 'concat:0' shape=(?, ?) dtype=int32>

In [10]:
# Flag passed as `training=` to every model call below (full model, greedy decoder,
# encoder, decoder_inference).
# NOTE(review): this notebook exports an inference graph, yet the flag is True.
# Dropout is zeroed in the model config above, but any other training-dependent
# layer would still run in training mode - confirm this is intentional.
training = True

In [11]:
# Run the full transducer once on (features, prepended targets, target lengths).
# Presumably this first call is what creates the model variables that the
# checkpoint restore below fills in - confirm.
logits = transducer_model([padded_features, c, p_len], training = training)
logits

Instructions for updating:
If using Keras pass *_constraint arguments to layers.

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


<tf.Tensor 'transducer/transducer_joint/transducer_joint_vocab/BiasAdd:0' shape=(?, ?, ?, 1543) dtype=float32>

In [12]:
# TF1-style session on the default graph. Variables are initialised here and then
# replaced by the checkpoint values restored in the next cell.
sess = tf.Session()
sess.run(tf.global_variables_initializer())

In [13]:
# Restore every global variable currently in the graph from the training checkpoint.
var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
saver = tf.train.Saver(var_list = var_list)
saver.restore(sess, 'asr-base-conformer-transducer-3mixed/model.ckpt-1625000')

INFO:tensorflow:Restoring parameters from asr-base-conformer-transducer-3mixed/model.ckpt-1625000


In [14]:
# Batched greedy decoding over the padded features. Only element 0 of the decoder's
# return value is kept, and it is exported under the node name 'greedy_decoder'.
decoded = transducer_model.greedy_decoder(padded_features, padded_lens, training = training)
decoded = tf.identity(decoded[0], name = 'greedy_decoder')
decoded

<tf.Tensor 'greedy_decoder:0' shape=(?, ?) dtype=int32>

In [15]:
# Encoder output for the whole batch, exported under the node name 'encoded'.
encoded = transducer_model.encoder(padded_features, training = training)
encoded = tf.identity(encoded, name = 'encoded')
# Placeholders for running a single decoder step from outside the graph:
#   encoded_placeholder   - one float32 vector of size config['dmodel']
#   predicted_placeholder - int32 with unspecified shape (presumably the previously
#                           emitted token id - confirm)
#   states_placeholder    - float32 with the static shape of the prediction
#                           network's initial state
encoded_placeholder = tf.placeholder(tf.float32, [config['dmodel']], name = 'encoded_placeholder')
predicted_placeholder = tf.placeholder(tf.int32, None, name = 'predicted_placeholder')
t = transducer_model.predict_net.get_initial_state().shape
states_placeholder = tf.placeholder(tf.float32, [int(i) for i in t], name = 'states_placeholder')

# One decoder step driven by the three placeholders above.
ytu, new_states = transducer_model.decoder_inference(
    encoded=encoded_placeholder,
    predicted=predicted_placeholder,
    states=states_placeholder,
    training = training
)

# Fixed node names so both step outputs can be fetched from the frozen graph.
ytu = tf.identity(ytu, name = 'ytu')
new_states = tf.identity(new_states, name = 'new_states')
ytu, new_states

(<tf.Tensor 'ytu:0' shape=(1543,) dtype=float32>,
 <tf.Tensor 'new_states:0' shape=(1, 2, 1, 640) dtype=float32>)

In [16]:
# Export the prediction network's initial state under the node name
# 'initial_states', so a step-wise decoder can fetch its starting state from the graph.
initial_states = transducer_model.predict_net.get_initial_state()
initial_states = tf.identity(initial_states, name = 'initial_states')

In [17]:
# sess = tf.Session()
# sess.run(tf.global_variables_initializer())

In [18]:
# var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
# saver = tf.train.Saver(var_list = var_list)
# saver.restore(sess, 'asr-small-conformer-transducer/model.ckpt-325000')

In [19]:
# Sample recordings used to smoke-test the graph before export.
files = [
    'speech/record/savewav_2020-11-26_22-36-06_294832.wav',
    'speech/record/savewav_2020-11-26_22-40-56_929661.wav',
    'speech/record/675.wav',
    'speech/record/664.wav',
    'mandarin-test/597.wav',
    'mandarin-test/584.wav',
    'speech/example-speaker/husein-zolkepli.wav',
    'speech/example-speaker/mas-aisyah.wav',
    'speech/example-speaker/khalil-nooh.wav',
    'speech/example-speaker/shafiqah-idayu.wav',
    'speech/khutbah/wadi-annuar.wav',
    'singlish0.wav',
    'singlish1.wav',
    'singlish2.wav',
    'singlish3.wav',
    'singlish4.wav'
]

# Number of zero samples added before / after every waveform.
front_pad = 200
back_pad = 2000

# Element 0 of what malaya_speech.load returns is kept for each file.
inputs = [malaya_speech.load(path)[0] for path in files]
padded, lens = malaya_speech.padding.sequence_1d(inputs, return_len = True)

# Surround the already right-padded batch with blocks of zeros on both sides and
# extend every recorded length by the same amount.
front = np.zeros(shape = (len(inputs), front_pad))
back = np.zeros(shape = (len(inputs), back_pad))
padded = np.concatenate([front, padded, back], axis = -1)
lens = [length + front_pad + back_pad for length in lens]

In [20]:
# import collections
# import numpy as np
# import tensorflow as tf

# BeamHypothesis = collections.namedtuple(
#     'BeamHypothesis', ('score', 'prediction', 'states')
# )


# def transducer(
#     enc,
#     total,
#     initial_states,
#     encoded_placeholder,
#     predicted_placeholder,
#     states_placeholder,
#     ytu,
#     new_states,
#     sess,
#     beam_width = 10,
#     norm_score = True,
# ):
#     kept_hyps = [
#         BeamHypothesis(score = 0.0, prediction = [0], states = initial_states)
#     ]
#     B = kept_hyps
#     for i in range(total):
#         A = B
#         B = []
#         while True:
#             y_hat = max(A, key = lambda x: x.score)
#             A.remove(y_hat)
#             ytu_, new_states_ = sess.run(
#                 [ytu, new_states],
#                 feed_dict = {
#                     encoded_placeholder: enc[i],
#                     predicted_placeholder: y_hat.prediction[-1],
#                     states_placeholder: y_hat.states,
#                 },
#             )
#             for k in range(ytu_.shape[0]):
#                 beam_hyp = BeamHypothesis(
#                     score = (y_hat.score + float(ytu_[k])),
#                     prediction = y_hat.prediction,
#                     states = y_hat.states,
#                 )
#                 if k == 0:
#                     B.append(beam_hyp)
#                 else:
#                     beam_hyp = BeamHypothesis(
#                         score = beam_hyp.score,
#                         prediction = (beam_hyp.prediction + [int(k)]),
#                         states = new_states_,
#                     )
#                     A.append(beam_hyp)
#             if len(B) > beam_width:
#                 break
#     if norm_score:
#         kept_hyps = sorted(
#             B, key = lambda x: x.score / len(x.prediction), reverse = True
#         )[:beam_width]
#     else:
#         kept_hyps = sorted(B, key = lambda x: x.score, reverse = True)[
#             :beam_width
#         ]
#     return kept_hyps[0].prediction

In [21]:
import re
from malaya_speech.utils.subword import decode

def decode_multilanguage(row, langs):
    """
    Decode ids from the concatenated multi-language vocabulary into one string.

    The global id space is the per-language vocabularies laid end to end in the
    order of `langs`. Consecutive ids belonging to the same language are decoded
    together with that language's tokenizer and the pieces are joined with spaces.

    Parameters
    ----------
    row: sequence of int
        Global subword ids. Must support `len()` and indexing/slicing.
    langs: list
        Tokenizer objects exposing `vocab_size`, in vocabulary order.

    Returns
    -------
    result: str
        Decoded text with runs of spaces collapsed; '' for an empty `row`.

    Notes
    -----
    An id at or above the total vocabulary size is not handled: the lookup returns
    None and unpacking it raises TypeError.
    """
    # Imported locally under a private alias: a later cell of this notebook defines
    # its own one-argument `decode`, which shadows the library decoder at module
    # level and would break this function if it were called again afterwards.
    from malaya_speech.utils.subword import decode as _decode_subwords

    if not len(row):
        return ''

    # offsets[i] is the first global id owned by langs[i]; offsets[i + 1] is one
    # past its last id. Built once instead of re-summing for every id.
    offsets = [0]
    for tokenizer in langs:
        offsets.append(offsets[-1] + tokenizer.vocab_size)

    def get_index_multilanguage(r):
        # Returns (language index, id local to that language).
        for i in range(len(langs)):
            if r < offsets[i + 1]:
                return i, r - offsets[i]

    last_index, v = get_index_multilanguage(row[0])
    d, q = [], [v]
    for r in row[1:]:
        index, v = get_index_multilanguage(r)
        if index != last_index:
            # Language switch: decode the finished run and start a new one.
            d.append(_decode_subwords(langs[last_index], q))
            q = [v]
            last_index = index
        else:
            q.append(v)
    if len(q):
        d.append(_decode_subwords(langs[last_index], q))
    d = re.sub(r'[ ]+', ' ', ' '.join(d)).strip()
    # Re-attach the particle 'lah' to the preceding word.
    d = d.replace(' lah', 'lah')
    return d

In [22]:
%%time

# Run batched greedy decoding and print one transcript per input file.
# `row[row > 0]` drops every id <= 0 before decoding (0 presumably being the
# blank/padding id - confirm).
r = sess.run(decoded, feed_dict = {X: padded, X_len: lens})
for row in r:
    print(decode_multilanguage(row[row > 0], langs))

kalau nama saya musim saya tak suka mandi kata saya masak
hello sorry sorry so
ini melalui kementerian mesin itu struktu
pilihan tepat apabila dia kini lebih berani dan
gei wo lai ge zhang jie zui xin de ge
wo xiang ting su bing dao de jie mu
testing number would say being a caply
uncle
dollar anti
nama saya tak hidaya
jadi dalam perjalanan ini ini yang susah ini ketika nabi mengajar muaz bin jabal tadi ni allah
and then see how they bring and film okay actually
later to your s
seven seven more

then after that she brought the living some month
CPU times: user 44.9 s, sys: 11.1 s, total: 56 s
Wall time: 6.07 s


In [23]:
# %%time

# encoded_, padded_lens_  = sess.run([encoded, padded_lens], feed_dict = {X: padded, X_len: lens})
# padded_lens_ = padded_lens_ // conformer_model.conv_subsampling.time_reduction_factor
# s = sess.run(initial_states)

# for i in range(len(encoded_)):
#     r = transducer(
#         enc = encoded_[i],
#         total = padded_lens_[i],
#         initial_states = s,
#         encoded_placeholder = encoded_placeholder,
#         predicted_placeholder = predicted_placeholder,
#         states_placeholder = states_placeholder,
#         ytu = ytu,
#         new_states = new_states,
#         sess = sess,
#         beam_width = 1,
#     )
    
#     print(decode_multilanguage(r, langs))

In [24]:
# Per-utterance lengths integer-divided by the encoder's conv-subsampling time
# reduction factor (presumably the number of encoder output frames - confirm).
l = padded_lens // transducer_model.encoder.conv_subsampling.time_reduction_factor
# NOTE(review): this calls the encoder a second time and rebinds the Python name
# `encoded`, which until here pointed at the tensor named 'encoded'. The named
# graph node itself is unaffected.
encoded = transducer_model.encoder(padded_features, training = training)
# Greedy decoding for the first utterance of the batch only (index 0), starting
# from the prediction network's initial state.
g = transducer_model._perform_greedy(encoded[0], l[0],
                                tf.constant(0, dtype = tf.int32),
                                transducer_model.predict_net.get_initial_state())
g

Hypothesis(index=<tf.Tensor 'while_3/Exit_1:0' shape=() dtype=int32>, prediction=<tf.Tensor 'TensorArrayStack_3/TensorArrayGatherV3:0' shape=(?,) dtype=int32>, states=<tf.Tensor 'while_3/Exit_3:0' shape=(1, 2, 1, 640) dtype=float32>, alignment=<tf.Tensor 'TensorArrayStack_4/TensorArrayGatherV3:0' shape=(?, 1543) dtype=float32>)

In [25]:
# Build per-token start times for the first utterance of the batch.
indices = g.prediction
minus_one = -1 * tf.ones_like(indices, dtype=tf.int32)
blank_like = 0 * tf.ones_like(indices, dtype=tf.int32)
# Map -1 entries to the blank id 0. tf.equal is required here: with the TF1 graph
# API used in this notebook, `indices == minus_one` is a Python identity test that
# evaluates to the constant False, which turned this tf.where into a no-op.
indices = tf.where(tf.equal(indices, minus_one), blank_like, indices)
# Only X_len[0] is used, matching the single utterance decoded above.
num_samples = tf.cast(X_len[0], dtype=tf.float32)
total_time_reduction_factor = featurizer.frame_step
# One timestamp per feature frame, converted from samples to seconds.
stime = tf.range(0, num_samples, delta=total_time_reduction_factor, dtype=tf.float32)
stime /= tf.cast(featurizer.sample_rate, dtype=tf.float32)
# Subsample the frame times so they line up with the decoder steps.
# NOTE(review): the stride is an integer division of the two lengths; it would be 0
# if `indices` were longer than `stime` - confirm that cannot happen.
stime = stime[::tf.shape(stime)[0] // tf.shape(indices)[0]]
stime.set_shape((None,))
# Keep only the steps that emitted a non-blank token, together with their times.
non_blank = tf.where(tf.not_equal(indices, 0))
non_blank_transcript = tf.gather_nd(indices, non_blank)
non_blank_stime = tf.gather_nd(stime, non_blank)
# Fixed node names so both outputs can be fetched from the frozen graph.
non_blank_transcript = tf.identity(non_blank_transcript, name = 'non_blank_transcript')
non_blank_stime = tf.identity(non_blank_stime, name = 'non_blank_stime')

In [26]:
%%time

# r[0]: non-blank token ids, r[1]: their start times in seconds. Both describe the
# first utterance of the batch only. Note that this rebinds `r` from cell 22.
r = sess.run([non_blank_transcript, non_blank_stime], feed_dict = {X: padded, X_len: lens})

CPU times: user 22.9 s, sys: 2 s, total: 24.9 s
Wall time: 5 s


In [27]:
r

[array([ 74, 139, 230,   1,  65,   1,  54,  42, 123,  65,   1,  10, 290,
         32,  17, 290,  58,  52,  41,   1,  65,   1,  78, 355, 365],
       dtype=int32),
 array([0.56     , 0.68     , 1.2800001, 1.4000001, 1.5200001, 1.7600001,
        1.96     , 2.0400002, 2.2800002, 3.1200001, 3.2400002, 3.3200002,
        3.44     , 3.5200002, 3.6000001, 3.68     , 3.7200003, 3.88     ,
        5.1200004, 5.1600003, 5.32     , 5.5200005, 5.7200003, 5.92     ,
        5.9600005], dtype=float32)]

In [28]:
def decode_multilanguage_subwords(tokenizers, ids):
    """
    Map ids from the concatenated multi-language vocabulary to their subwords.

    Consecutive ids that belong to the same language form one run; each run is
    converted id by id with that language's tokenizer.

    Parameters
    -----------
    tokenizers: List[object]
        List of tokenizer objects exposing `vocab_size` and `_id_to_subword`,
        in vocabulary order.
    ids: List[int]
        Global subword ids.

    Returns
    --------
    result: List[List]
        One inner list of subwords per same-language run, in input order.
        For empty `ids` the empty string '' is returned (kept for backward
        compatibility).

    Notes
    -----
    An id at or above the total vocabulary size is not handled: the lookup returns
    None and unpacking it raises TypeError.
    """

    if not len(ids):
        return ''

    # offsets[i] is the first global id owned by tokenizers[i].
    offsets = [0]
    for tokenizer in tokenizers:
        offsets.append(offsets[-1] + tokenizer.vocab_size)

    def get_index_multilanguage(r):
        # Returns (language index, id local to that language).
        for i in range(len(tokenizers)):
            if r < offsets[i + 1]:
                return i, r - offsets[i]

    def to_subwords(lang_index, local_ids):
        # The tokenizer lookup is shifted by one relative to the model ids, the
        # same `v - 1` used where this notebook calls _id_to_subword on raw ids.
        # The original `q - 1` applied the shift to the whole list and raised
        # TypeError.
        tokenizer = tokenizers[lang_index]
        return [tokenizer._id_to_subword(v - 1) for v in local_ids]

    last_index, v = get_index_multilanguage(ids[0])
    d, q = [], [v]
    for r in ids[1:]:
        index, v = get_index_multilanguage(r)
        if index != last_index:
            # Language switch: convert the finished run and start a new one.
            d.append(to_subwords(last_index, q))
            q = [v]
            last_index = index
        else:
            q.append(v)
    if len(q):
        d.append(to_subwords(last_index, q))
    return d

In [29]:
def get_index_multilanguage(r, tokenizers, len_vocab):
    """
    Locate a global subword id inside the concatenated per-language vocabularies.

    Parameters
    ----------
    r: int
        Global subword id.
    tokenizers: list
        Tokenizer objects; only their count is used here.
    len_vocab: list of int
        Vocabulary size of each tokenizer, in the same order.

    Returns
    -------
    result: Tuple[int, int] or None
        (index of the owning tokenizer, id local to that tokenizer), or None when
        `r` is not below the total vocabulary size.
    """
    lower = 0
    # Walk the cumulative boundaries once instead of re-summing the size list on
    # every iteration. zip() stops at the shorter of the two sequences.
    for i, size in zip(range(len(tokenizers)), len_vocab):
        upper = lower + size
        if r < upper:
            return i, r - lower
        lower = upper
    return None

In [30]:
get_index_multilanguage(1050, langs, len_vocab)

(2, 23)

In [31]:
# First, naive alignment attempt: one entry per emitted token, each followed by a
# None separator. `words`/`indices` are rebuilt by the subword-aware decode() later
# in this notebook.
words, indices = [], []
for no, ids in enumerate(r[0]):
    last_index, v = get_index_multilanguage(ids, langs, len_vocab)
    # Tokenizer lookup is shifted by one relative to the model's ids.
    w = langs[last_index]._id_to_subword(v - 1)
    # Byte-level subwords come back as bytes; isinstance also covers bytes
    # subclasses and matches the check used by decode() further down.
    if isinstance(w, bytes):
        w = w.decode()
    words.extend([w, None])
    indices.extend([no, None])

In [32]:
words, indices

(['kal',
  None,
  'au_',
  None,
  'nam',
  None,
  'a_',
  None,
  'say',
  None,
  'a_',
  None,
  'mu',
  None,
  'si',
  None,
  'm_',
  None,
  'say',
  None,
  'a_',
  None,
  'tak',
  None,
  ' ',
  None,
  'su',
  None,
  'ka',
  None,
  ' ',
  None,
  'man',
  None,
  'di_',
  None,
  'kat',
  None,
  'a_',
  None,
  'say',
  None,
  'a_',
  None,
  'mas',
  None,
  'a',
  None,
  'k',
  None],
 [0,
  None,
  1,
  None,
  2,
  None,
  3,
  None,
  4,
  None,
  5,
  None,
  6,
  None,
  7,
  None,
  8,
  None,
  9,
  None,
  10,
  None,
  11,
  None,
  12,
  None,
  13,
  None,
  14,
  None,
  15,
  None,
  16,
  None,
  17,
  None,
  18,
  None,
  19,
  None,
  20,
  None,
  21,
  None,
  22,
  None,
  23,
  None,
  24,
  None])

In [33]:
import six
from malaya_speech.utils import text_encoder

def _trim_underscore_and_tell(token):
    if token.endswith('_'):
        return token[:-1], True
    return token, False

def decode(ids):
    """
    Turn model output ids into display pieces plus a parallel position list.

    Each id is resolved to a language and a local id through the notebook-level
    `get_index_multilanguage`, `langs` and `len_vocab`, then converted with that
    language's `_id_to_subword`.

    Parameters
    ----------
    ids: sequence of int
        Global subword ids. They are first passed through `text_encoder.pad_decr`
        (presumably drops padding and shifts ids down by one - confirm).

    Returns
    -------
    result: Tuple[List[str], List[Optional[int]]]
        `subwords_` - text pieces; a ' ' piece marks a word boundary.
        positions  - for each piece, the index of the id it came from, or None
        for a ' ' piece.
    """
    subword_ids = text_encoder.pad_decr(ids)

    subwords_ = []
    positions = []
    # Byte-level pieces are buffered until the next regular subword (or the end).
    pending_text = []
    pending_positions = []

    def flush_pending():
        # Move the buffered byte pieces to the output and empty the buffers.
        subwords_.extend(pending_text)
        positions.extend(pending_positions)
        del pending_text[:]
        del pending_positions[:]

    for no, subword_id in enumerate(subword_ids):
        lang_index, local_id = get_index_multilanguage(subword_id, langs, len_vocab)
        subword = langs[lang_index]._id_to_subword(local_id)

        if not isinstance(subword, six.binary_type):
            # Regular subword: emit any buffered bytes first, then the piece itself.
            flush_pending()
            trimmed, add_space = _trim_underscore_and_tell(subword)
            positions.append(no)
            subwords_.append(trimmed)
            if add_space:
                # A trailing '_' on the subword stands for a following space.
                subwords_.append(' ')
                positions.append(None)
            continue

        # Byte-encoded piece: decode it on its own; a space byte is a separator.
        pending_text.append(subword.decode('utf-8', 'replace'))
        pending_positions.append(None if subword == b' ' else no)

    flush_pending()

    return subwords_, positions

# Rebuild `words`/`indices` from the non-blank token ids of the first utterance;
# the two lists are parallel, so their lengths must match.
words, indices = decode(r[0])
len(words), len(indices)

(33, 33)

In [34]:
def combined_indices(subwords, ids, l, reduction_factor = 160, sample_rate = 16000):
    """
    Merge subword pieces into words with start/end times.

    Parameters
    ----------
    subwords: List[str]
        Text pieces; a piece whose entry in `ids` is None is a word separator.
    ids: List[Optional[int]]
        Parallel to `subwords`: index into `l` for each piece, or None.
    l: sequence of float
        Start time in seconds of every emitted token.
    reduction_factor: int, optional (default=160)
        Together with `sample_rate` gives the duration added after the last
        piece of a word: `reduction_factor / sample_rate` seconds.
    sample_rate: int, optional (default=16000)

    Returns
    -------
    result: List[dict]
        One {'text', 'start', 'end'} dict per word, times rounded to 4 decimals.
    """
    tail = reduction_factor / sample_rate
    result, temp_l, temp_r = [], [], []

    def flush():
        # A leading separator or two separators in a row leave nothing buffered;
        # the original indexed temp_r[0] anyway and raised IndexError.
        if temp_l:
            result.append({
                'text': ''.join(temp_l),
                'start': round(temp_r[0], 4),
                'end': round(temp_r[-1] + tail, 4),
            })
        del temp_l[:]
        del temp_r[:]

    for i, subword in enumerate(subwords):
        if ids[i] is not None:
            temp_l.append(subword)
            temp_r.append(l[ids[i]])
        else:
            flush()

    # Last word when the sequence does not end with a separator.
    flush()

    return result

In [35]:
combined_indices(words, indices, r[1])

[{'text': 'kalau', 'start': 0.56, 'end': 0.69},
 {'text': 'nama', 'start': 1.28, 'end': 1.41},
 {'text': 'saya', 'start': 1.52, 'end': 1.77},
 {'text': 'musim', 'start': 1.96, 'end': 2.29},
 {'text': 'saya', 'start': 3.12, 'end': 3.25},
 {'text': 'tak', 'start': 3.32, 'end': 3.33},
 {'text': 'suka', 'start': 3.52, 'end': 3.61},
 {'text': 'mandi', 'start': 3.72, 'end': 3.89},
 {'text': 'kata', 'start': 5.12, 'end': 5.17},
 {'text': 'saya', 'start': 5.32, 'end': 5.53},
 {'text': 'masak', 'start': 5.72, 'end': 5.97}]

In [36]:
# Save the session's variables to a new checkpoint; freeze_graph() below is
# pointed at this directory.
saver = tf.train.Saver()
saver.save(sess, 'output-base-stack-3mixed-conformer/model.ckpt')

'output-base-stack-3mixed-conformer/model.ckpt'

In [37]:
# Comma-separated names of the graph nodes to keep when freezing. A node is selected
# when its op type or name matches one of the inclusion patterns and its name
# contains none of the excluded substrings (optimizer slots, global step,
# assign/read ops, and anything with 'Gather' in its name).
strings = ','.join(
    [
        n.name
        for n in tf.get_default_graph().as_graph_def().node
        if ('Variable' in n.op
        or 'gather' in n.op.lower()
        or 'placeholder' in n.name
        or 'encoded' in n.name
        or 'decoder' in n.name
        or 'ytu' in n.name
        or 'new_states' in n.name
        or 'padded_' in n.name
        or 'initial_states' in n.name
        or 'non_blank' in n.name)
        and 'adam' not in n.name
        and 'global_step' not in n.name
        and 'Assign' not in n.name
        and 'ReadVariableOp' not in n.name
        and 'Gather' not in n.name
    ]
)
# Displayed as a list for inspection; `strings` itself stays a single string.
strings.split(',')

['X_placeholder',
 'X_len_placeholder',
 'padded_features',
 'padded_lens',
 'transducer/transducer_prediction/transducer_prediction_embedding/embeddings',
 'greedy_decoder',
 'encoded',
 'encoded_placeholder',
 'predicted_placeholder',
 'states_placeholder',
 'ytu',
 'new_states',
 'initial_states',
 'non_blank_transcript',
 'non_blank_stime']

In [38]:
def freeze_graph(model_dir, output_node_names):
    """
    Freeze the latest checkpoint in `model_dir` into a single GraphDef file.

    The checkpoint's meta graph is imported into a fresh graph, its variables are
    restored and converted to constants, and the result is written to
    `frozen_model.pb` next to the checkpoint.

    Parameters
    ----------
    model_dir: str
        Directory containing the checkpoint and its `.meta` file.
    output_node_names: str
        Comma-separated names of the nodes to keep as outputs.

    Raises
    ------
    AssertionError
        If `model_dir` does not exist or holds no checkpoint.
    """

    if not tf.gfile.Exists(model_dir):
        raise AssertionError(
            "Export directory doesn't exists. Please specify an export "
            'directory: %s' % model_dir
        )

    # get_checkpoint_state returns None when the directory has no checkpoint state
    # file; fail with a clear message instead of an AttributeError.
    checkpoint = tf.train.get_checkpoint_state(model_dir)
    if checkpoint is None or not checkpoint.model_checkpoint_path:
        raise AssertionError('No checkpoint found in directory: %s' % model_dir)
    input_checkpoint = checkpoint.model_checkpoint_path

    # os.path handles a checkpoint path without a directory part; the previous
    # string split produced '/frozen_model.pb' (filesystem root) in that case.
    output_graph = os.path.join(os.path.dirname(input_checkpoint), 'frozen_model.pb')
    clear_devices = True
    with tf.Session(graph = tf.Graph()) as sess:
        saver = tf.train.import_meta_graph(
            input_checkpoint + '.meta', clear_devices = clear_devices
        )
        saver.restore(sess, input_checkpoint)
        output_graph_def = tf.graph_util.convert_variables_to_constants(
            sess,
            tf.get_default_graph().as_graph_def(),
            output_node_names.split(','),
        )
        with tf.gfile.GFile(output_graph, 'wb') as f:
            f.write(output_graph_def.SerializeToString())
        print('%d ops in the final graph.' % len(output_graph_def.node))

In [39]:
# Freeze the checkpoint saved above, keeping the nodes listed in `strings`.
freeze_graph('output-base-stack-3mixed-conformer', strings)

INFO:tensorflow:Restoring parameters from output-base-stack-3mixed-conformer/model.ckpt
Instructions for updating:
Use `tf.compat.v1.graph_util.convert_variables_to_constants`
Instructions for updating:
Use `tf.compat.v1.graph_util.extract_sub_graph`
INFO:tensorflow:Froze 561 variables.
INFO:tensorflow:Converted 561 variables to const ops.
27392 ops in the final graph.


In [40]:
def load_graph(frozen_graph_filename):
    """
    Load a serialized GraphDef from `frozen_graph_filename` into a new tf.Graph.

    The graph is imported with tf.import_graph_def's default arguments; the cells
    that use the result address its tensors as 'import/<node name>:0'.

    Returns
    -------
    result: tf.Graph
    """
    graph_def = tf.GraphDef()
    with tf.gfile.GFile(frozen_graph_filename, 'rb') as f:
        graph_def.ParseFromString(f.read())

    graph = tf.Graph()
    with graph.as_default():
        tf.import_graph_def(graph_def)

    return graph

In [41]:
# Reload the frozen model as a sanity check.
# NOTE: this rebinds `g`, which earlier in the notebook held the greedy-decoding
# Hypothesis; cells that need that object must be run before this one.
g = load_graph('output-base-stack-3mixed-conformer/frozen_model.pb')

In [42]:
# Names of the placeholder nodes fed at inference time.
input_nodes = [
    'X_placeholder',
    'X_len_placeholder',
    'encoded_placeholder',
    'predicted_placeholder',
    'states_placeholder',
]
# Names of the named tensors fetched at inference time.
output_nodes = [
    'greedy_decoder',
    'encoded',
    'ytu',
    'new_states',
    'padded_features',
    'padded_lens',
    'initial_states',
    'non_blank_transcript',
    'non_blank_stime'
]
# Resolve every name inside the reloaded graph, where nodes are addressed with an
# 'import/' prefix. NOTE: `inputs` previously held the loaded audio arrays and is
# rebound here.
inputs = {n: g.get_tensor_by_name(f'import/{n}:0') for n in input_nodes}
outputs = {n: g.get_tensor_by_name(f'import/{n}:0') for n in output_nodes}

In [43]:
# test_sess = tf.Session(graph = g)

In [44]:
# r = test_sess.run(outputs['greedy_decoder'], feed_dict = {inputs['X_placeholder']: padded, 
#                                                           inputs['X_len_placeholder']: lens})

In [45]:
# for row in r:
#     print(malaya_speech.subword.decode(subwords, row[row > 0]))

In [46]:
# encoded_, padded_lens_, s  = test_sess.run([outputs['encoded'], outputs['padded_lens'], outputs['initial_states']], 
#                                         feed_dict = {inputs['X_placeholder']: padded, 
#                                                      inputs['X_len_placeholder']: lens})

# padded_lens_ = padded_lens_ // conformer_model.conv_subsampling.time_reduction_factor

In [47]:
# i = 0
# r = transducer(
#     enc = encoded_[i],
#     total = padded_lens_[i],
#     initial_states = s,
#     encoded_placeholder = inputs['encoded_placeholder'],
#     predicted_placeholder = inputs['predicted_placeholder'],
#     states_placeholder = inputs['states_placeholder'],
#     ytu = outputs['ytu'],
#     new_states = outputs['new_states'],
#     sess = test_sess,
#     beam_width = 1,
# )

# malaya_speech.subword.decode(subwords, r)

In [48]:
from tensorflow.tools.graph_transforms import TransformGraph

In [49]:
# Graph Transform Tool pipeline applied to the frozen model, ending in weight
# quantization. The transforms are applied in list order.
transforms = ['add_default_attributes',
             'remove_nodes(op=Identity, op=CheckNumerics, op=Dropout)',
             'fold_batch_norms',
             'fold_old_batch_norms',
             'quantize_weights(fallback_min=-10, fallback_max=10)',
             'strip_unused_nodes',
             'sort_by_execution_order']

# Redefined here so this cell does not depend on the reload/sanity-check cells.
input_nodes = [
    'X_placeholder',
    'X_len_placeholder',
    'encoded_placeholder',
    'predicted_placeholder',
    'states_placeholder',
]
output_nodes = [
    'greedy_decoder',
    'encoded',
    'ytu',
    'new_states',
    'padded_features',
    'padded_lens',
    'initial_states',
    'non_blank_transcript',
    'non_blank_stime'
]

pb = 'output-base-stack-3mixed-conformer/frozen_model.pb'

input_graph_def = tf.GraphDef()
# tf.gfile.GFile replaces the deprecated tf.gfile.FastGFile (TensorFlow's own
# deprecation notice asks for this) and matches the rest of the notebook.
with tf.gfile.GFile(pb, 'rb') as f:
    input_graph_def.ParseFromString(f.read())

transformed_graph_def = TransformGraph(
    input_graph_def, input_nodes, output_nodes, transforms
)

# The quantized graph is written next to the original with a '.quantized' suffix.
with tf.gfile.GFile(f'{pb}.quantized', 'wb') as f:
    f.write(transformed_graph_def.SerializeToString())

Instructions for updating:
Use tf.gfile.GFile.


In [50]:
# g = load_graph('output-base-stack-3mixed-conformer/frozen_model.pb.quantized')

In [51]:
# inputs = {n: g.get_tensor_by_name(f'import/{n}:0') for n in input_nodes}
# outputs = {n: g.get_tensor_by_name(f'import/{n}:0') for n in output_nodes}
# test_sess = tf.Session(graph = g)

In [52]:
# r = test_sess.run(outputs['greedy_decoder'], feed_dict = {inputs['X_placeholder']: padded, 
#                                                           inputs['X_len_placeholder']: lens})

In [53]:
# for row in r:
#     print(malaya_speech.subword.decode(subwords, row[row > 0]))

In [54]:
# encoded_, padded_lens_, s  = test_sess.run([outputs['encoded'], outputs['padded_lens'], outputs['initial_states']], 
#                                         feed_dict = {inputs['X_placeholder']: padded, 
#                                                      inputs['X_len_placeholder']: lens})

# padded_lens_ = padded_lens_ // conformer_model.conv_subsampling.time_reduction_factor

In [55]:
# i = 0
# r = transducer(
#     enc = encoded_[i],
#     total = padded_lens_[i],
#     initial_states = s,
#     encoded_placeholder = inputs['encoded_placeholder'],
#     predicted_placeholder = inputs['predicted_placeholder'],
#     states_placeholder = inputs['states_placeholder'],
#     ytu = outputs['ytu'],
#     new_states = outputs['new_states'],
#     sess = test_sess,
#     beam_width = 1,
# )

# malaya_speech.subword.decode(subwords, r)

In [56]:
# Upload credentials are read from the environment, never hard-coded; a missing
# variable raises KeyError here, before any upload is attempted.
b2_application_key_id = os.environ['b2_application_key_id']
b2_application_key = os.environ['b2_application_key']

In [57]:
# Explicit names instead of `import *`, so the origin of every name is visible and
# the wildcard cannot overwrite notebook variables.
from b2sdk.v1 import B2Api, InMemoryAccountInfo

# Account info is kept in memory only; nothing is persisted to disk by this object.
info = InMemoryAccountInfo()
b2_api = B2Api(info)
application_key_id = b2_application_key_id
application_key = b2_application_key
b2_api.authorize_account("production", application_key_id, application_key)
# Metadata attached to every uploaded file below.
file_info = {'how': 'good-file'}
b2_bucket = b2_api.get_bucket_by_name('malaya-speech-model')

In [58]:
# Pack the whole export directory (checkpoint, frozen and quantized graphs) into
# one gzip-compressed archive.
directory = 'output-base-stack-3mixed-conformer'
tar = 'output-base-stack-3mixed-conformer.tar.gz'
# os.system returns the command's exit status; it is only displayed, not checked,
# so a failed tar does not stop the notebook.
os.system(f'tar -czvf {tar} {directory}')

0

In [59]:
# Upload the archive to the bucket under 'pretrained/<archive name>'.
outPutname = f'pretrained/{tar}'
b2_bucket.upload_local_file(
    local_file=tar,
    file_name=outPutname,
    file_infos=file_info,
)

<b2sdk.file_version.FileVersionInfo at 0x7fdbfc14e828>

In [60]:
!rm {tar}

In [61]:
# Upload the frozen (unquantized) graph under its remote model path.
file = 'output-base-stack-3mixed-conformer/frozen_model.pb'
outPutname = 'speech-to-text-transducer/conformer-stack-3mixed/model.pb'
b2_bucket.upload_local_file(
    local_file=file,
    file_name=outPutname,
    file_infos=file_info,
)

<b2sdk.file_version.FileVersionInfo at 0x7fdbfc377080>

In [62]:
# Upload the quantized graph; remotely it is also named 'model.pb' but lives under
# the '-quantized' model directory.
file = 'output-base-stack-3mixed-conformer/frozen_model.pb.quantized'
outPutname = 'speech-to-text-transducer/conformer-stack-3mixed-quantized/model.pb'
b2_bucket.upload_local_file(
    local_file=file,
    file_name=outPutname,
    file_infos=file_info,
)

<b2sdk.file_version.FileVersionInfo at 0x7fdc787a8f98>

In [63]:
!rm -rf output-base-stack-3mixed-conformer