In [1]:
import os

# Both variables are set before tensorflow / malaya_speech are imported in the next cell.
# An empty CUDA_VISIBLE_DEVICES hides every GPU from CUDA, so the export runs on CPU.
os.environ['CUDA_VISIBLE_DEVICES'] = ''
# NOTE(review): presumably switches malaya's storage backend to Hugging Face — confirm.
os.environ['MALAYA_USE_HUGGINGFACE'] = 'true'

In [2]:
import malaya_speech
import tensorflow as tf
import malaya_speech.train.model.conformer as conformer

In [3]:
# Build the in-graph preprocessing: raw waveforms -> per-utterance features ->
# one batch padded to a common length.
featurizer = malaya_speech.tf_featurization.STTFeaturizer(
    normalize_per_feature = True
)
# X holds a batch of waveforms (2-D float32); X_len holds how many leading
# samples of each row are valid (row i is sliced as X[i, :X_len[i]] below).
X = tf.compat.v1.placeholder(tf.float32, [None, None], name = 'X_placeholder')
X_len = tf.compat.v1.placeholder(tf.int32, [None], name = 'X_len_placeholder')
batch_size = tf.shape(X)[0]
# infer_shape = False: elements are not forced to share one static shape,
# which is needed because each utterance yields a different number of frames.
features = tf.TensorArray(dtype = tf.float32, size = batch_size, dynamic_size = True, infer_shape = False)
features_len = tf.TensorArray(dtype = tf.int32, size = batch_size)

init_state = (0, features, features_len)

def condition(i, features, features_len):
    """Loop predicate: keep iterating while i is below the batch size."""
    return i < batch_size

def body(i, features, features_len):
    """Featurize row i of X (trimmed to X_len[i]) and record its frame count."""
    f = featurizer(X[i, :X_len[i]])
    f_len = tf.shape(f)[0]
    return i + 1, features.write(i, f), features_len.write(i, f_len)

# Pass 1: featurize every row and collect the per-row lengths.
_, features, features_len = tf.while_loop(condition, body, init_state)
features_len = features_len.stack()
padded_features = tf.TensorArray(dtype = tf.float32, size = batch_size)
padded_lens = tf.TensorArray(dtype = tf.int32, size = batch_size)
# Longest feature sequence in the batch; every row is padded up to this.
maxlen = tf.reduce_max(features_len)

init_state = (0, padded_features, padded_lens)

# NOTE: `condition` and `body` are redefined here and shadow the pair above.
# The first tf.while_loop has already been built, so the rebinding is harmless.
def condition(i, padded_features, padded_lens):
    """Loop predicate: keep iterating while i is below the batch size."""
    return i < batch_size

def body(i, padded_features, padded_lens):
    """Zero-pad feature i along axis 0 up to maxlen; keep its pre-padding length."""
    f = features.read(i)
    len_f = tf.shape(f)[0]
    f = tf.pad(f, [[0, maxlen - tf.shape(f)[0]], [0,0]])
    return i + 1, padded_features.write(i, f), padded_lens.write(i, len_f)

# Pass 2: pad every feature matrix to maxlen so they can be stacked into one tensor.
_, padded_features, padded_lens = tf.while_loop(condition, body, init_state)
padded_features = padded_features.stack()
padded_lens = padded_lens.stack()
# Re-attach static shape information: rank-1 lengths, and a last feature
# dimension of 80 for the stacked features.
padded_lens.set_shape((None,))
padded_features.set_shape((None, None, 80))
# Add a trailing axis of size 1, giving shape (?, ?, 80, 1).
padded_features = tf.expand_dims(padded_features, -1)
padded_features, padded_lens

(<tf.Tensor 'ExpandDims:0' shape=(?, ?, 80, 1) dtype=float32>,
 <tf.Tensor 'TensorArrayStack_2/TensorArrayGatherV3:0' shape=(?,) dtype=int32>)

In [4]:
# Wrap both preprocessing outputs in identity ops so they carry explicit graph names.
padded_features = tf.identity(padded_features, name = 'padded_features')
padded_lens = tf.identity(padded_lens, name = 'padded_lens')

In [5]:
# Instantiate the conformer encoder from the library's base encoder config,
# with kernel and bias regularizers disabled.
config = malaya_speech.config.conformer_base_encoder_config
conformer_model = conformer.Model(
    kernel_regularizer=None, bias_regularizer=None, **config
)

In [6]:
# Apply the encoder to the padded feature batch; the result is indexed as a
# rank-3 tensor (seq[:, 0:1, :]) in the next cell.
seq = conformer_model(padded_features)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


In [7]:
# Pool the encoder output by taking only position 0 along axis 1, then project
# it to `embedding_dim` with a linear (no activation) Dense layer.
embedding_dim = 512
# Slice [0:1] keeps the axis; squeeze then drops it.
first_token_tensor = tf.squeeze(seq[:, 0:1, :], axis=1)
pooled_output = tf.keras.layers.Dense(embedding_dim, activation=None,
                                   use_bias=True, trainable=True)(first_token_tensor)
pooled_output

<tf.Tensor 'dense/BiasAdd:0' shape=(?, 512) dtype=float32>

In [8]:
# L2-normalize each embedding along axis 1 (the 512-wide embedding axis).
y = tf.keras.layers.Lambda(lambda x: tf.keras.backend.l2_normalize(x, 1))(pooled_output)
y

<tf.Tensor 'lambda/l2_normalize:0' shape=(?, 512) dtype=float32>

In [9]:
# Give the final tensor the graph name 'logits'; later cells use that name as
# the export/output node. Despite the name, this is the L2-normalized embedding.
y = tf.identity(y, name = 'logits')
y

<tf.Tensor 'logits:0' shape=(?, 512) dtype=float32>

In [10]:
# Open a session on the default graph and initialize every global variable;
# the next cell overwrites these values from the checkpoint.
sess = tf.Session()
sess.run(tf.global_variables_initializer())

In [11]:
# Restore all global variables of the graph from the trained checkpoint.
var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
saver = tf.train.Saver(var_list = var_list)
saver.restore(sess, 'conformer-base-voxceleb/model.ckpt-1675000')

INFO:tensorflow:Restoring parameters from conformer-base-voxceleb/model.ckpt-1675000


In [13]:
# Load one sample recording to smoke-test the graph.
f = '1.wav'
# The second return value is discarded.
# NOTE(review): presumably load() returns (samples, sample_rate) — confirm.
y_, _ = malaya_speech.load(f)

In [15]:
%%time

# Run the restored graph on a batch of one waveform and show its embedding
# (index [0] selects the single batch row).
sess.run(y, feed_dict = {X: [y_], X_len: [len(y_)]})[0]

CPU times: user 473 ms, sys: 125 ms, total: 598 ms
Wall time: 128 ms


array([-3.42943408e-02,  1.74337998e-02, -5.96995614e-02,  5.15927970e-02,
        3.78360674e-02,  1.11374259e-02, -1.16238995e-02, -2.29922291e-02,
       -4.52205613e-02, -3.83505672e-02, -4.44434397e-02,  3.16496752e-02,
        7.34405369e-02, -1.33032724e-02, -3.65533531e-02,  3.79834063e-02,
        1.60339251e-02,  8.20604563e-02, -3.70756001e-03,  7.90271536e-02,
        2.28232960e-03, -1.27660821e-03, -3.98752540e-02, -3.08089145e-03,
        1.07098734e-02, -2.10002940e-02, -1.23413485e-02,  1.56773627e-02,
        4.62571941e-02, -6.75439695e-03,  1.66410021e-02, -1.38333337e-02,
        1.52159669e-02,  3.68148647e-02, -5.00210002e-02, -2.63048355e-02,
        5.99954370e-03, -3.39824910e-04, -4.28575762e-02, -9.72654521e-02,
        5.74219273e-03, -1.09113203e-02, -5.23459502e-02,  9.86790359e-02,
        8.79026800e-02,  3.51844132e-02,  3.17510590e-02,  2.14424580e-02,
        1.88377630e-02, -3.91232520e-02,  5.81526197e-02, -7.44219869e-02,
        8.56701937e-03, -

In [16]:
# Re-save the session under output-conformer-base/; freeze_graph() below reads
# this checkpoint and its accompanying .meta file.
saver = tf.train.Saver()
saver.save(sess, 'output-conformer-base/model.ckpt')

'output-conformer-base/model.ckpt'

In [17]:
# Substrings of a node name that mark it as wanted / unwanted for export.
_KEEP_IF_IN_NAME = ('placeholder', 'logits')
_DROP_IF_IN_NAME = ('adam', 'global_step', 'Assign', 'ReadVariableOp', 'Gather')


def _is_export_node(node):
    """Return True when a GraphDef node should be listed as a frozen-graph output.

    A node is wanted when its op type mentions 'Variable' or (case-insensitively)
    'gather', or when its name mentions 'placeholder' or 'logits'. It is then
    rejected if its name contains any of the optimizer / bookkeeping markers.
    """
    wanted = (
        'Variable' in node.op
        or 'gather' in node.op.lower()
        or any(marker in node.name for marker in _KEEP_IF_IN_NAME)
    )
    return wanted and not any(marker in node.name for marker in _DROP_IF_IN_NAME)


# Comma-joined node names; freeze_graph() expects this single-string form.
export_names = [
    node.name
    for node in tf.get_default_graph().as_graph_def().node
    if _is_export_node(node)
]
strings = ','.join(export_names)
strings.split(',')

['X_placeholder', 'X_len_placeholder', 'logits']

In [18]:
def freeze_graph(model_dir, output_node_names):
    """Freeze the latest checkpoint found in `model_dir` into a single GraphDef file.

    The checkpoint's meta graph is imported into a fresh graph, its variables are
    restored and converted to constants, and the result is written as
    `frozen_model.pb` in the same directory as the checkpoint file.

    Args:
        model_dir: directory containing the checkpoint state and `.meta` file.
        output_node_names: comma-separated node names to keep as graph outputs.

    Raises:
        AssertionError: if `model_dir` does not exist or holds no checkpoint.
    """
    if not tf.gfile.Exists(model_dir):
        raise AssertionError(
            "Export directory doesn't exists. Please specify an export "
            'directory: %s' % model_dir
        )

    checkpoint = tf.train.get_checkpoint_state(model_dir)
    # get_checkpoint_state returns None when the directory has no checkpoint
    # state; fail with a clear message instead of an AttributeError.
    if checkpoint is None or not checkpoint.model_checkpoint_path:
        raise AssertionError(
            'No checkpoint found in directory: %s' % model_dir
        )
    input_checkpoint = checkpoint.model_checkpoint_path

    # os.path handles a checkpoint path without a directory component; the
    # previous split('/') logic produced '/frozen_model.pb' (filesystem root)
    # in that case.
    checkpoint_dir = os.path.dirname(input_checkpoint)
    output_graph = os.path.join(checkpoint_dir, 'frozen_model.pb')

    # Work in a fresh graph so the notebook's default graph is not touched.
    with tf.Session(graph = tf.Graph()) as sess:
        # clear_devices drops device placements recorded at save time.
        saver = tf.train.import_meta_graph(
            input_checkpoint + '.meta', clear_devices = True
        )
        saver.restore(sess, input_checkpoint)
        output_graph_def = tf.graph_util.convert_variables_to_constants(
            sess,
            tf.get_default_graph().as_graph_def(),
            output_node_names.split(','),
        )
        with tf.gfile.GFile(output_graph, 'wb') as f:
            f.write(output_graph_def.SerializeToString())
        print('%d ops in the final graph.' % len(output_graph_def.node))

In [19]:
# Freeze the checkpoint saved above, keeping the nodes selected in `strings`.
freeze_graph('output-conformer-base', strings)

INFO:tensorflow:Restoring parameters from output-conformer-base/model.ckpt
Instructions for updating:
Use `tf.compat.v1.graph_util.convert_variables_to_constants`
Instructions for updating:
Use `tf.compat.v1.graph_util.extract_sub_graph`
INFO:tensorflow:Froze 584 variables.
INFO:tensorflow:Converted 584 variables to const ops.
9608 ops in the final graph.


In [20]:
def load_graph(frozen_graph_filename):
    """Read a serialized GraphDef from disk and import it into a new tf.Graph.

    `tf.import_graph_def` is called without a name, so the imported nodes are
    addressed with the 'import/' prefix (as the tensor lookups after this do).

    Args:
        frozen_graph_filename: path to the serialized frozen GraphDef.

    Returns:
        The newly created graph containing the imported nodes.
    """
    frozen_def = tf.GraphDef()
    with tf.gfile.GFile(frozen_graph_filename, 'rb') as pb_file:
        frozen_def.ParseFromString(pb_file.read())

    graph = tf.Graph()
    with graph.as_default():
        tf.import_graph_def(frozen_def)
    return graph

In [21]:
# Reload the frozen model into its own graph to verify the export.
g = load_graph('output-conformer-base/frozen_model.pb')

In [22]:
# Node names to feed and fetch in the frozen graph.
input_nodes = ['X_placeholder', 'X_len_placeholder']
output_nodes = ['logits']

# Map each node name to its first output tensor (':0'); imported nodes live
# under the 'import/' prefix.
inputs = dict(
    (n, g.get_tensor_by_name(f'import/{n}:0')) for n in input_nodes
)
outputs = dict(
    (n, g.get_tensor_by_name(f'import/{n}:0')) for n in output_nodes
)

In [25]:
# Separate session bound to the reloaded frozen graph.
test_sess = tf.Session(graph = g)

In [26]:
# Run the frozen graph on the same waveform as the live graph above, so the two
# embeddings can be compared by eye.
test_sess.run(outputs['logits'], feed_dict = {inputs['X_placeholder']: [y_], 
                                                          inputs['X_len_placeholder']: [len(y_)]})

array([[-3.42765599e-02,  1.76603459e-02, -5.95744103e-02,
         5.17317727e-02,  3.78416143e-02,  1.10196769e-02,
        -1.17707234e-02, -2.30666585e-02, -4.52118404e-02,
        -3.82358618e-02, -4.45492789e-02,  3.14630568e-02,
         7.36526474e-02, -1.33846123e-02, -3.66452113e-02,
         3.78675610e-02,  1.59521252e-02,  8.21145326e-02,
        -3.83047829e-03,  7.90980905e-02,  2.30665226e-03,
        -1.29932910e-03, -3.99338938e-02, -3.07450001e-03,
         1.08003030e-02, -2.09528394e-02, -1.24771409e-02,
         1.57241803e-02,  4.62129563e-02, -6.75419858e-03,
         1.66078005e-02, -1.38759408e-02,  1.53301060e-02,
         3.68314274e-02, -5.01642451e-02, -2.62113549e-02,
         5.97903179e-03, -2.13958847e-04, -4.28931899e-02,
        -9.74427089e-02,  5.56918001e-03, -1.08987261e-02,
        -5.24564758e-02,  9.87501815e-02,  8.78688022e-02,
         3.53347510e-02,  3.18520144e-02,  2.14252993e-02,
         1.89526957e-02, -3.92914526e-02,  5.82395568e-0

In [23]:
from tensorflow.tools.graph_transforms import TransformGraph

In [24]:
# Rewrite the frozen graph with the Graph Transform Tool and write the result
# next to the original as '<pb>.quantized'. The transform strings are passed
# verbatim to TransformGraph and applied in the listed order.
transforms = ['add_default_attributes',
             'remove_nodes(op=Identity, op=CheckNumerics, op=Dropout)',
             'fold_batch_norms',
             'fold_old_batch_norms',
             'quantize_weights(fallback_min=-10, fallback_max=10)',
             'strip_unused_nodes',
             'sort_by_execution_order']

# Redeclared here (same values as the frozen-graph test cell) so this cell does
# not depend on that cell having been run.
input_nodes = [
    'X_placeholder',
    'X_len_placeholder',
]
output_nodes = [
    'logits'
]

pb = 'output-conformer-base/frozen_model.pb'

input_graph_def = tf.GraphDef()
# tf.gfile.GFile replaces the deprecated tf.gfile.FastGFile. The handle is not
# named `f`, so the notebook-level `f` (the sample wav path) is not clobbered.
with tf.gfile.GFile(pb, 'rb') as frozen_file:
    input_graph_def.ParseFromString(frozen_file.read())

transformed_graph_def = TransformGraph(input_graph_def,
                                       input_nodes,
                                       output_nodes, transforms)

with tf.gfile.GFile(f'{pb}.quantized', 'wb') as quantized_file:
    quantized_file.write(transformed_graph_def.SerializeToString())

Instructions for updating:
Use tf.gfile.GFile.


In [29]:
# Read the Backblaze B2 credentials from the environment so they never appear in
# the notebook; a missing variable raises KeyError here.
b2_application_key_id = os.environ['b2_application_key_id']
b2_application_key = os.environ['b2_application_key']

In [30]:
# Import only the two names this cell uses instead of `from b2sdk.v1 import *`,
# so the notebook namespace is not polluted and their origin is explicit.
from b2sdk.v1 import B2Api, InMemoryAccountInfo

# Keep account/session info in memory only.
info = InMemoryAccountInfo()
b2_api = B2Api(info)
application_key_id = b2_application_key_id
application_key = b2_application_key
b2_api.authorize_account("production", application_key_id, application_key)
# Metadata attached to every file uploaded by the cells below.
file_info = {'how': 'good-file'}
b2_bucket = b2_api.get_bucket_by_name('malaya-speech-model')

In [27]:
# Archive the whole export directory (uncompressed tar) for upload.
!tar -cf output-conformer-base.tar output-conformer-base

In [31]:
# Upload the export-directory archive to the B2 bucket under `outPutname`.
file = 'output-conformer-base.tar'
outPutname = 'pretrained/output-conformer-base-speaker-embedding.tar'
b2_bucket.upload_local_file(
    local_file=file,
    file_name=outPutname,
    file_infos=file_info,
)

<b2sdk.file_version.FileVersionInfo at 0x7f33bc3dceb8>

In [32]:
# Upload the frozen (unquantized) graph to the B2 bucket under `outPutname`.
file = 'output-conformer-base/frozen_model.pb'
outPutname = 'speaker-vector/conformer-base/model.pb'
b2_bucket.upload_local_file(
    local_file=file,
    file_name=outPutname,
    file_infos=file_info,
)


<b2sdk.file_version.FileVersionInfo at 0x7f33bc3930f0>

In [33]:
# Upload the quantized graph to the B2 bucket; it is stored as 'model.pb' under
# the '-quantized' prefix.
file = 'output-conformer-base/frozen_model.pb.quantized'
outPutname = 'speaker-vector/conformer-base-quantized/model.pb'
b2_bucket.upload_local_file(
    local_file=file,
    file_name=outPutname,
    file_infos=file_info,
)


<b2sdk.file_version.FileVersionInfo at 0x7f33bc5863c8>

In [34]:
from malaya_boilerplate.huggingface import upload_dict

In [35]:
# Publish the export-directory archive through malaya_boilerplate's upload_dict.
# NOTE(review): files_mapping presumably maps local path -> destination filename
# in the target model repository — confirm against upload_dict.
files_mapping = {'output-conformer-base.tar': 'output-conformer-base.tar'}
upload_dict(model = 'pretrained-speaker-embedding', files_mapping = files_mapping)

In [36]:
# Publish the frozen (unquantized) graph through upload_dict.
# NOTE(review): presumably stored as 'model.pb' in the target model — confirm.
files_mapping = {'output-conformer-base/frozen_model.pb': 'model.pb'}
upload_dict(model = 'speaker-vector-conformer-base', files_mapping = files_mapping)

In [37]:
# Publish the quantized graph through upload_dict, to the '-quantized' model.
# NOTE(review): presumably stored as 'model.pb' in the target model — confirm.
files_mapping = {'output-conformer-base/frozen_model.pb.quantized': 'model.pb'}
upload_dict(model = 'speaker-vector-conformer-base-quantized', files_mapping = files_mapping)