In [1]:
import os

# Set CUDA_VISIBLE_DEVICES to an empty string before TensorFlow is imported
# (presumably so this export runs on CPU only — confirm).
os.environ['CUDA_VISIBLE_DEVICES'] = ''

In [2]:
from malaya_speech.train.model import hubert, ctc
from malaya_speech.train.model.conformer.model import Model as ConformerModel
import malaya_speech
import tensorflow as tf
import numpy as np
import json
from glob import glob
import string

In [3]:
# Character vocabulary: index 0 is the empty string, then a-z, then 0-9,
# and finally the space character.
unique_vocab = ['', *string.ascii_lowercase, *string.digits, ' ']
len(unique_vocab)

38

In [4]:
# Graph inputs. X is later fed the padded waveform batch and X_len the
# per-item lengths returned by malaya_speech.padding.sequence_1d.
# The explicit node names are what the frozen graph is queried with.
X = tf.compat.v1.placeholder(tf.float32, [None, None], name = 'X_placeholder')
X_len = tf.compat.v1.placeholder(tf.int32, [None], name = 'X_len_placeholder')

In [5]:
training = True


class Encoder:
    """Adapter that builds a Conformer from a config mapping and exposes it as a callable."""

    def __init__(self, config):
        # Keep the raw config around and instantiate the Conformer from it.
        self.config = config
        self.encoder = ConformerModel(**config)

    def __call__(self, x, input_mask, training = True):
        """Run the wrapped encoder on `x`; `input_mask` is accepted but not used."""
        conformer = self.encoder
        return conformer(x, training = training)

In [6]:
# NOTE(review): this is the config object exposed by malaya_speech.config, not
# a copy, so the two assignments below modify that shared object in place.
config_conformer = malaya_speech.config.conformer_tiny_encoder_config
config_conformer['subsampling']['type'] = 'none'
config_conformer['dropout'] = 0.0
encoder = Encoder(config_conformer)
# HuBERT configuration with every dropout-related option set to 0.0.
cfg = hubert.HuBERTConfig(
    extractor_mode='layer_norm',
    dropout=0.0,
    attention_dropout=0.0,
    encoder_layerdrop=0.0,
    dropout_input=0.0,
    dropout_features=0.0,
    final_dim=128,
)
# Third argument: a list of strings made of 3 special tokens plus '0'..'99'.
model = hubert.Model(cfg, encoder, ['pad', 'eos', 'unk'] + [str(i) for i in range(100)])
# X_len is passed as `padding_mask`; see hubert.Model for the exact meaning of
# `features_only` and `mask`.
r = model(X, padding_mask = X_len, features_only = True, mask = False)
# Project to len(unique_vocab) + 1 classes; the extra class lines up with the
# '_' label the decoder cell appends at index len(unique_vocab).
logits = tf.layers.dense(r['x'], len(unique_vocab) + 1)
# Per-item valid length: number of positions where r['padding_mask'] is False.
seq_lens = tf.reduce_sum(
    tf.cast(tf.logical_not(r['padding_mask']), tf.int32), axis = 1
)


Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
Use keras.layers.Dense instead.
Instructions for updating:
Please use `layer.__call__` method instead.


In [7]:
# Swap the first two axes of `logits` (perm [1, 0, 2]) and give both outputs
# fixed node names so they can be looked up by name after freezing.
# NOTE(review): `logits` is reassigned from itself, so running this cell twice
# transposes again — run it exactly once per graph build.
logits = tf.transpose(logits, [1, 0, 2])
logits = tf.identity(logits, name = 'logits')
seq_lens = tf.identity(seq_lens, name = 'seq_lens')

In [29]:
from pyctcdecode import build_ctcdecoder
import kenlm

# Decoder labels are the character vocabulary plus a trailing '_' entry whose
# index (len(unique_vocab)) is passed as the CTC token index.
ctc_labels = unique_vocab + ['_']
blank_index = len(unique_vocab)

kenlm_model = kenlm.Model('out.trie.klm')
decoder = build_ctcdecoder(
    ctc_labels,
    kenlm_model,
    alpha=0.2,
    beta=1.0,
    ctc_token_idx=blank_index,
)

In [15]:
# Apply the [1, 0, 2] permutation again (undoing the earlier transpose of
# `logits`) and take a softmax over the last axis to get probabilities.
logits_t = tf.nn.softmax(tf.transpose(logits, [1, 0, 2]))

In [16]:
# Sample recordings used to sanity-check the exported model.
files = [
    'speech/record/savewav_2020-11-26_22-36-06_294832.wav',
    'speech/record/savewav_2020-11-26_22-40-56_929661.wav',
    'speech/record/675.wav',
    'speech/record/664.wav',
    'speech/example-speaker/husein-zolkepli.wav',
    'speech/example-speaker/mas-aisyah.wav',
    'speech/example-speaker/khalil-nooh.wav',
    'speech/example-speaker/shafiqah-idayu.wav',
    'speech/khutbah/wadi-annuar.wav',
]

# Keep only the first element returned by malaya_speech.load for each file,
# then pad the batch and keep the original lengths.
ys = []
for wav_path in files:
    ys.append(malaya_speech.load(wav_path)[0])
padded, lens = malaya_speech.padding.sequence_1d(ys, return_len = True)

In [19]:
# NOTE(review): `sess` is created in the checkpoint-restore cell that appears
# further down in this file (execution count In [8]); that cell must run first,
# so this notebook does not survive Restart & Run All in file order.
r = sess.run([logits_t, seq_lens], feed_dict = {X: padded, X_len: lens})

In [20]:
# Shape of the softmax output and the per-item valid lengths.
r[0].shape, r[1]

((9, 499, 39),
 array([299, 250, 279, 200, 281, 143, 196, 175, 499], dtype=int32))

In [21]:
# Probabilities of the first item, trimmed to its valid length.
r[0][0,:r[1][0]]

array([[1.04469749e-11, 9.53841954e-05, 1.79659619e-04, ...,
        4.99524366e-10, 7.41766598e-06, 9.97852087e-01],
       [1.78250800e-11, 3.19799554e-04, 3.49451111e-05, ...,
        9.40413747e-10, 7.24381389e-05, 9.96281087e-01],
       [2.35856970e-08, 1.06788284e-04, 1.74917616e-02, ...,
        2.54488441e-06, 9.51646070e-05, 2.51933992e-01],
       ...,
       [1.77272070e-13, 7.85718748e-06, 3.84613941e-07, ...,
        1.65338210e-09, 1.56802955e-07, 9.99935985e-01],
       [1.58478008e-13, 1.04536739e-05, 2.63559798e-07, ...,
        1.10291642e-09, 1.71716664e-07, 9.99942064e-01],
       [1.27815510e-13, 1.04950568e-05, 1.85754615e-07, ...,
        6.03708306e-10, 4.97670612e-07, 9.99933362e-01]], dtype=float32)

In [32]:
# Decode item 3 with the KenLM-backed decoder, using only the first r[1][3]
# time steps (its valid length).
out = decoder.decode_beams(r[0][3,:r[1][3]], prune_history=True)
out

[('pilihan tepat apabila dia kini lebih berani dan',
  <kenlm.State at 0x7f1db04272b0>,
  [('pilihan', (13, 28)),
   ('tepat', (33, 47)),
   ('apabila', (55, 73)),
   ('dia', (104, 109)),
   ('kini', (118, 126)),
   ('lebih', (130, 140)),
   ('berani', (144, 162)),
   ('dan', (175, 186))],
  -0.7227686417719876,
  -1.4217007573524127)]

In [8]:
sess = tf.Session()

# Initialise every variable first, then overwrite the global variables with
# the values stored in the trained checkpoint.
init_op = tf.global_variables_initializer()
sess.run(init_op)

var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
saver = tf.train.Saver(var_list = var_list)
saver.restore(sess, 'hubert-conformer-tiny-ctc-char/model.ckpt-1080000')

INFO:tensorflow:Restoring parameters from hubert-conformer-tiny-ctc-char/model.ckpt-1080000


In [9]:
# Re-save the restored weights into the export directory; freeze_graph later
# reads the checkpoint (and its .meta file) from this location.
saver = tf.train.Saver()
saver.save(sess, 'output-hubert-conformer-tiny-ctc/model.ckpt')

'output-hubert-conformer-tiny-ctc/model.ckpt'

In [10]:
def _is_export_node(node):
    """Return True when a graph node should be kept as an output of the frozen graph."""
    wanted = (
        'Variable' in node.op
        or 'gather' in node.op.lower()
        or 'placeholder' in node.name
        or 'logits' in node.name
        or 'seq_lens' in node.name
    )
    # Optimizer state, bookkeeping and auxiliary ops are never exported.
    excluded_tags = ('adam', 'global_step', 'Assign', 'ReadVariableOp', 'Gather')
    excluded = any(tag in node.name for tag in excluded_tags)
    return wanted and not excluded


graph_nodes = tf.get_default_graph().as_graph_def().node
strings = ','.join(node.name for node in graph_nodes if _is_export_node(node))
strings.split(',')

['X_placeholder',
 'X_len_placeholder',
 'mask_emb',
 'label_embs_concat',
 'dense/kernel',
 'dense/bias',
 'logits',
 'seq_lens']

In [11]:
def freeze_graph(model_dir, output_node_names):
    """Freeze the latest checkpoint found in `model_dir` into `frozen_model.pb`.

    The checkpoint's meta graph is imported into a fresh graph, its variables
    are converted to constants, and the resulting GraphDef is written next to
    the checkpoint as `frozen_model.pb`.

    Args:
        model_dir: directory containing the checkpoint state and `.meta` file.
        output_node_names: comma-separated node names to keep as graph outputs.

    Raises:
        AssertionError: if `model_dir` does not exist or holds no checkpoint.
    """
    if not tf.gfile.Exists(model_dir):
        raise AssertionError(
            "Export directory doesn't exists. Please specify an export "
            'directory: %s' % model_dir
        )

    # get_checkpoint_state returns None when the directory has no checkpoint
    # state file; fail with a clear message instead of an AttributeError.
    checkpoint = tf.train.get_checkpoint_state(model_dir)
    if checkpoint is None or not checkpoint.model_checkpoint_path:
        raise AssertionError(
            'No checkpoint found in directory: %s' % model_dir
        )
    input_checkpoint = checkpoint.model_checkpoint_path

    # os.path handles a bare checkpoint name correctly; splitting on '/' by
    # hand would have produced '/frozen_model.pb' at the filesystem root.
    output_graph = os.path.join(
        os.path.dirname(input_checkpoint), 'frozen_model.pb'
    )
    clear_devices = True
    # Entering the session also makes its new graph the default graph, so
    # tf.get_default_graph() below refers to the imported meta graph.
    with tf.Session(graph = tf.Graph()) as sess:
        saver = tf.train.import_meta_graph(
            input_checkpoint + '.meta', clear_devices = clear_devices
        )
        saver.restore(sess, input_checkpoint)
        output_graph_def = tf.graph_util.convert_variables_to_constants(
            sess,
            tf.get_default_graph().as_graph_def(),
            output_node_names.split(','),
        )
        with tf.gfile.GFile(output_graph, 'wb') as f:
            f.write(output_graph_def.SerializeToString())
        print('%d ops in the final graph.' % len(output_graph_def.node))

In [12]:
# Freeze the re-saved checkpoint, keeping the nodes listed in `strings`.
freeze_graph('output-hubert-conformer-tiny-ctc', strings)

INFO:tensorflow:Restoring parameters from output-hubert-conformer-tiny-ctc/model.ckpt
Instructions for updating:
Use `tf.compat.v1.graph_util.convert_variables_to_constants`
Instructions for updating:
Use `tf.compat.v1.graph_util.extract_sub_graph`
INFO:tensorflow:Froze 303 variables.
INFO:tensorflow:Converted 303 variables to const ops.
4971 ops in the final graph.


In [13]:
def load_graph(frozen_graph_filename):
    """Read a serialized GraphDef from disk and import it into a new tf.Graph.

    tf.import_graph_def is called without a `name`; the lookups in this
    notebook address the imported tensors under the `import/` prefix.

    Args:
        frozen_graph_filename: path to the serialized GraphDef file.

    Returns:
        The tf.Graph holding the imported nodes.
    """
    graph_def = tf.GraphDef()
    with tf.gfile.GFile(frozen_graph_filename, 'rb') as pb_file:
        graph_def.ParseFromString(pb_file.read())

    graph = tf.Graph()
    with graph.as_default():
        tf.import_graph_def(graph_def)
    return graph

In [16]:
# Load the frozen (non-quantized) graph that freeze_graph wrote.
g = load_graph('output-hubert-conformer-tiny-ctc/frozen_model.pb')

In [17]:
# Node names of the frozen graph's entry and exit points.
input_nodes = ['X_placeholder', 'X_len_placeholder']
output_nodes = ['logits', 'seq_lens']

# Map each node name to its first output tensor inside the imported graph.
inputs, outputs = {}, {}
for n in input_nodes:
    inputs[n] = g.get_tensor_by_name(f'import/{n}:0')
for n in output_nodes:
    outputs[n] = g.get_tensor_by_name(f'import/{n}:0')

In [18]:
# Separate session bound to the imported frozen graph.
test_sess = tf.Session(graph = g)

In [19]:
# Smoke test: run the frozen graph on the same padded batch. This overwrites
# `r`; the result is not inspected in the cells shown here.
r = test_sess.run(outputs['logits'], feed_dict = {inputs['X_placeholder']: padded, 
                                                          inputs['X_len_placeholder']: lens})

In [21]:
from tensorflow.tools.graph_transforms import TransformGraph

In [22]:
# Graph Transform Tool passes applied, in order, to the frozen graph.
transforms = [
    'add_default_attributes',
    'remove_nodes(op=Identity, op=CheckNumerics, op=Dropout)',
    'fold_batch_norms',
    'fold_old_batch_norms',
    'quantize_weights(fallback_min=-10, fallback_max=10)',
    'strip_unused_nodes',
    'sort_by_execution_order',
]

pb = 'output-hubert-conformer-tiny-ctc/frozen_model.pb'

# tf.gfile.FastGFile is deprecated (see the warning this cell used to emit);
# use tf.gfile.GFile, consistent with the rest of this notebook.
input_graph_def = tf.GraphDef()
with tf.gfile.GFile(pb, 'rb') as f:
    input_graph_def.ParseFromString(f.read())

transformed_graph_def = TransformGraph(
    input_graph_def, input_nodes, output_nodes, transforms
)

# The quantized graph is written next to the original with a '.quantized' suffix.
with tf.gfile.GFile(f'{pb}.quantized', 'wb') as f:
    f.write(transformed_graph_def.SerializeToString())

Instructions for updating:
Use tf.gfile.GFile.


In [26]:
# Reload the quantized graph written by the previous cell to confirm it parses.
# The path must point at the tiny export directory; the earlier '-large-' path
# referred to a directory this notebook never writes.
g = load_graph('output-hubert-conformer-tiny-ctc/frozen_model.pb.quantized')

In [2]:
!rm output-hubert-conformer-tiny-ctc.tar.gz

In [3]:
# Pack the whole export directory (checkpoint, frozen and quantized graphs)
# into the v2 archive that is uploaded below.
!tar -czvf output-hubert-conformer-tiny-ctc-v2.tar.gz output-hubert-conformer-tiny-ctc

output-hubert-conformer-tiny-ctc/
output-hubert-conformer-tiny-ctc/model.ckpt.index
output-hubert-conformer-tiny-ctc/model.ckpt.data-00000-of-00001
output-hubert-conformer-tiny-ctc/frozen_model.pb.quantized
output-hubert-conformer-tiny-ctc/checkpoint
output-hubert-conformer-tiny-ctc/model.ckpt.meta
output-hubert-conformer-tiny-ctc/frozen_model.pb


In [5]:
import os

# Read the B2 credentials from the environment rather than hard-coding them;
# a missing variable raises KeyError here.
b2_application_key_id = os.environ['b2_application_key_id']
b2_application_key = os.environ['b2_application_key']

In [6]:
# Import only the names used here instead of `import *`, so the notebook
# namespace is not polluted and the origin of each name is explicit.
from b2sdk.v1 import B2Api, InMemoryAccountInfo

# Authorize against the production realm with in-memory account info.
info = InMemoryAccountInfo()
b2_api = B2Api(info)
application_key_id = b2_application_key_id
application_key = b2_application_key
b2_api.authorize_account("production", application_key_id, application_key)
# Metadata attached to every file uploaded below.
file_info = {'how': 'good-file'}
b2_bucket = b2_api.get_bucket_by_name('malaya-speech-model')

In [7]:
# Upload the full export archive under the bucket's pretrained/ prefix.
key = 'output-hubert-conformer-tiny-ctc-v2.tar.gz'
outPutname = "pretrained/output-hubert-conformer-tiny-ctc-v2.tar.gz"
b2_bucket.upload_local_file(
    local_file=key,
    file_name=outPutname,
    file_infos=file_info,
)

<b2sdk.file_version.FileVersionInfo at 0x7f682de10e48>

In [8]:
# Upload the frozen (non-quantized) graph as the model.pb of the
# hubert-conformer-tiny speech-to-text entry.
file = 'output-hubert-conformer-tiny-ctc/frozen_model.pb'
outPutname = 'speech-to-text-ctc/hubert-conformer-tiny/model.pb'
b2_bucket.upload_local_file(
    local_file=file,
    file_name=outPutname,
    file_infos=file_info,
)

<b2sdk.file_version.FileVersionInfo at 0x7f6819b3f160>

In [9]:
# Upload the quantized graph as the model.pb of the
# hubert-conformer-tiny-quantized speech-to-text entry.
file = 'output-hubert-conformer-tiny-ctc/frozen_model.pb.quantized'
outPutname = 'speech-to-text-ctc/hubert-conformer-tiny-quantized/model.pb'
b2_bucket.upload_local_file(
    local_file=file,
    file_name=outPutname,
    file_infos=file_info,
)

<b2sdk.file_version.FileVersionInfo at 0x7f6819bf1358>

In [10]:
# Clean up the local export directory and archive once the uploads are done.
!rm -rf output-hubert-conformer-tiny-ctc output-hubert-conformer-tiny-ctc-v2.tar.gz