In [1]:
from constants import NUM_FBANKS, NUM_FRAMES

In [2]:
import numpy as np
import tensorflow.keras.backend as K
from tensorflow.keras import layers
from tensorflow.keras import regularizers
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.layers import Conv2D
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Lambda, Dense
from tensorflow.keras.layers import Reshape
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
import tensorflow as tf

In [3]:
class DeepSpeakerModel:
    """Builds the residual-CNN trunk (strided conv stages followed by identity blocks).

    Only the layer-building helpers are used by this notebook. The full Keras
    model is *not* constructed here: ``self.m`` stays ``None`` until a caller
    assigns a model, and ``keras_model``/``get_weights``/``set_weights`` are
    only meaningful after that.
    """

    # Original author's note on the input channels (kept as an open question):
    # "I thought it was 3 but maybe energy is added at a 4th dimension.
    #  Would be better to have 4 dimensions:
    #  MFCC, DIFF(MFCC), DIFF(DIFF(MFCC)), ENERGIES (probably tiled across the
    #  frequency domain). This seems to help match the parameter counts."

    def __init__(self):
        # Running index that gives every clipped-ReLU Lambda layer a unique name.
        self.clipped_relu_count = 0
        # These two attributes are read by the accessor methods below but were
        # never initialised; give them well-defined defaults.
        self.m = None
        self.include_softmax = False

    def keras_model(self):
        """Return the wrapped Keras model (``None`` if none has been assigned)."""
        return self.m

    def get_weights(self):
        """Return the wrapped model's weights, without the softmax head if present.

        When ``self.include_softmax`` is true the last two entries
        (``W_softmax`` and ``b_softmax``) are removed from the returned list.
        """
        w = self.m.get_weights()
        if self.include_softmax:
            # last 2 are the W_softmax and b_softmax.
            for _ in range(2):
                w.pop()
        return w

    def clipped_relu(self, inputs):
        """Apply ``min(max(y, 0), 20)`` through a uniquely named Lambda layer."""
        relu = Lambda(lambda y: K.minimum(K.maximum(y, 0), 20),
                      name=f'clipped_relu_{self.clipped_relu_count}')(inputs)
        self.clipped_relu_count += 1
        return relu

    def _conv_bn(self, inputs, filters, kernel_size, strides, name):
        """Linear 'same'-padded Conv2D named ``name`` followed by BatchNormalization ``name + '_bn'``."""
        x = Conv2D(filters,
                   kernel_size=kernel_size,
                   strides=strides,
                   activation=None,
                   padding='same',
                   kernel_initializer='glorot_uniform',
                   # Passed positionally so the call does not depend on the
                   # keyword name of the L2 factor.
                   kernel_regularizer=regularizers.l2(0.0001),
                   name=name)(inputs)
        return BatchNormalization(name=name + '_bn')(x)

    def identity_block(self, input_tensor, kernel_size, filters, stage, block):
        """Residual block: two conv+BN+clipped-ReLU steps, skip-add, clipped ReLU.

        Layer names are derived from ``stage`` and ``block``
        (``res{stage}_{block}_branch_2a`` / ``_2b``, plus ``_bn`` suffixes).
        """
        conv_name_base = f'res{stage}_{block}_branch'

        x = self._conv_bn(input_tensor, filters, kernel_size, 1, conv_name_base + '_2a')
        x = self.clipped_relu(x)

        x = self._conv_bn(x, filters, kernel_size, 1, conv_name_base + '_2b')
        x = self.clipped_relu(x)

        # Skip connection; input_tensor must already have `filters` channels
        # for the element-wise add to be valid.
        x = layers.add([x, input_tensor])
        return self.clipped_relu(x)

    def conv_and_res_block(self, inp, filters, stage):
        """Stride-2 5x5 conv (+BN, clipped ReLU) followed by three identity blocks."""
        conv_name = 'conv{}-s'.format(filters)
        # TODO: why kernel_regularizer?
        o = self._conv_bn(inp, filters, 5, 2, conv_name)
        o = self.clipped_relu(o)
        for i in range(3):
            o = self.identity_block(o, kernel_size=3, filters=filters, stage=stage, block=i)
        return o

    def cnn_component(self, inp):
        """Stack four conv+residual stages with 64, 128, 256 and 512 filters."""
        x = inp
        for stage, filters in enumerate((64, 128, 256, 512), start=1):
            x = self.conv_and_res_block(x, filters, stage=stage)
        return x

    def set_weights(self, w):
        """Assign ``w[i]`` to the i-th layer of the wrapped model, logging each layer.

        ``w`` is paired with ``self.m.layers`` via ``zip``, so extra entries on
        either side are silently ignored.
        """
        # The notebook never defines a module-level `logger`; using the
        # undefined name raised NameError right after the first layer was set.
        import logging
        log = logging.getLogger(__name__)
        for layer, layer_w in zip(self.m.layers, w):
            layer.set_weights(layer_w)
            log.info(f'Setting weights for [{layer.name}]...')
            
# Shared builder instance; the `Model` class in the next cell calls
# `deepspeaker.cnn_component(...)` on it.
deepspeaker = DeepSpeakerModel()

In [4]:
class Model:
    """TF1 inference graph: feature placeholder -> CNN trunk -> 512-d unit-norm vector.

    NOTE(review): this definition rebinds the name `Model`, hiding the
    `tensorflow.keras.models.Model` imported in the imports cell.

    Attributes set by the constructor:
        X: float32 placeholder of shape (None, None, NUM_FBANKS, 1).
        logits: output tensor named 'logits' (the L2-normalised Dense output).
    """

    def __init__(self):
        # The placeholder is created without an explicit name, so TensorFlow
        # assigns its default one; the frozen-graph cells look it up by name.
        self.X = tf.placeholder(tf.float32, (None, None, NUM_FBANKS, 1))

        conv_features = deepspeaker.cnn_component(self.X)
        # 2048 presumably equals 512 filters times the remaining frequency bins
        # after four stride-2 stages -- TODO confirm against NUM_FBANKS.
        frame_features = Reshape((-1, 2048))(conv_features)
        # Average over axis 1 of the reshaped tensor.
        pooled = Lambda(lambda y: K.mean(y, axis=1), name='average')(frame_features)
        projected = Dense(512, name='affine')(pooled)
        unit_norm = Lambda(lambda y: K.l2_normalize(y, axis=1), name='ln')(projected)

        # tf.identity gives the output a stable node name for export.
        self.logits = tf.identity(unit_norm, name = 'logits')
        print(self.logits)

In [5]:
# Statement order matters in this cell.
# Clear the default graph first so re-running the cell starts from an empty graph.
tf.reset_default_graph()
sess = tf.InteractiveSession()
# Constructing Model creates the placeholder and all layers.
model = Model()
# Give every variable an initial value before anything is restored over it.
sess.run(tf.global_variables_initializer())
# Snapshot of the global variables that exist at this point.
var_lists = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Tensor("logits:0", shape=(?, 512), dtype=float32)


In [6]:
# Restore the variables collected in `var_lists` from the checkpoint on disk,
# replacing the freshly initialised values in `sess`.
saver = tf.train.Saver(var_list = var_lists)
saver.restore(sess, 'out/vggvox.ckpt')

INFO:tensorflow:Restoring parameters from out/vggvox.ckpt


In [8]:
from glob import glob

# All .wav files in the current working directory.
# NOTE(review): glob() does not guarantee any ordering, so `wavs[0]` and the
# row/column order of the similarity matrix further down can differ between
# runs or machines; wrap in sorted() if a stable order is needed.
wavs = glob('*.wav')
wavs

['mas-aisyah.wav',
 'shafiqah-idayu.wav',
 'husein-zolkepli.wav',
 'khalil-nooh.wav']

In [9]:
from audio import read_mfcc
import numpy as np
from constants import SAMPLE_RATE, NUM_FRAMES
# NOTE(review): this tuple is evaluated and discarded -- a notebook cell only
# echoes its last expression, so these constants are never displayed here.
SAMPLE_RATE, NUM_FRAMES

# Sanity check: shape of the feature matrix read_mfcc returns for the first file.
read_mfcc(wavs[0], SAMPLE_RATE).shape

(166, 64)

In [10]:
def f(file):
    """Turn one audio file into a single-item batch for the model.

    Calls ``read_mfcc(file, SAMPLE_RATE)``, appends a trailing axis of size 1
    and wraps the result in a leading axis of size 1.

    Args:
        file: Path handed to ``read_mfcc``.

    Returns:
        ``numpy.ndarray`` with shape ``(1, *features.shape, 1)``.
    """
    features = read_mfcc(file, SAMPLE_RATE)
    with_channel_axis = np.expand_dims(features, -1)
    return np.array([with_channel_axis])

In [11]:
# One single-item input batch per audio file, in the same order as `wavs`.
mfccs = [f(wav_path) for wav_path in wavs]

In [12]:
# Sanity check: `f` adds a leading and a trailing axis of size 1 to the features.
mfccs[0].shape

(1, 166, 64, 1)

In [13]:
def pred(mfcc):
    """Evaluate ``model.logits`` in the notebook-level session for one input batch.

    Args:
        mfcc: Value fed to the ``model.X`` placeholder.

    Returns:
        Whatever ``sess.run`` returns for ``model.logits``.
    """
    feed = {model.X: mfcc}
    return sess.run(model.logits, feed_dict = feed)

In [14]:
# One model output per input batch, kept in the same order as `mfccs`.
r = [pred(batch) for batch in mfccs]

In [15]:
# Join the per-file outputs along the first axis into a single array.
# NOTE(review): this rebinds `r` from a list to an array, so the cell is not
# idempotent -- running it a second time without re-running the previous cell
# would concatenate the rows of the already-joined array.
r = np.concatenate(r)

In [17]:
from scipy.spatial.distance import cdist

# cdist(..., metric='cosine') returns cosine *distance*; subtracting it from 1
# gives the pairwise cosine similarity between all rows of `r`.
1 - cdist(r, r, metric='cosine')

array([[1.        , 0.32191291, 0.19461663, 0.23876474],
       [0.32191291, 1.        , 0.24097232, 0.23889481],
       [0.19461663, 0.24097232, 1.        , 0.33842044],
       [0.23876474, 0.23889481, 0.33842044, 1.        ]])

In [20]:
# Re-save the restored session under a new checkpoint prefix; `freeze_graph`
# below reads this checkpoint together with its '.meta' file.
# NOTE: this rebinds `saver`, replacing the Saver created for restoring.
saver = tf.train.Saver()
saver.save(sess, 'deep-speaker-out/model.ckpt')

'deep-speaker-out/model.ckpt'

In [21]:
# Comma-separated names of the graph nodes handed to `freeze_graph` as outputs.
# A node is kept when its op contains 'Variable' or its name contains one of
# the wanted fragments, unless its name also contains an excluded fragment.
strings = ','.join(
    node.name
    for node in tf.get_default_graph().as_graph_def().node
    if (
        'Variable' in node.op
        or any(
            wanted in node.name
            for wanted in ('Placeholder', 'logits', 'alphas', 'self/Softmax')
        )
    )
    and not any(
        excluded in node.name
        for excluded in ('adam', 'beta', 'global_step', 'Assign')
    )
)

In [22]:
def freeze_graph(model_dir, output_node_names):
    """Freeze the checkpoint recorded in ``model_dir`` into ``frozen_model.pb``.

    The checkpoint's meta graph is imported into a fresh graph, its variables
    are restored and converted to constants, and the resulting ``GraphDef`` is
    written next to the checkpoint. The number of ops in the frozen graph is
    printed.

    Args:
        model_dir: Directory containing the checkpoint state file.
        output_node_names: Comma-separated node names to keep as outputs.

    Raises:
        AssertionError: If ``model_dir`` does not exist, or if no checkpoint
            state can be read from it.
    """
    import os

    if not tf.gfile.Exists(model_dir):
        raise AssertionError(
            "Export directory doesn't exists. Please specify an export "
            'directory: %s' % model_dir
        )

    checkpoint = tf.train.get_checkpoint_state(model_dir)
    # get_checkpoint_state returns None when the directory holds no checkpoint
    # state; fail with a clear message instead of an AttributeError on None.
    if checkpoint is None or not checkpoint.model_checkpoint_path:
        raise AssertionError(
            'No checkpoint found in directory: %s' % model_dir
        )
    input_checkpoint = checkpoint.model_checkpoint_path

    # Write the frozen graph beside the checkpoint. os.path.dirname keeps this
    # correct when the checkpoint path has no directory part (joining the split
    # path with '/' produced '/frozen_model.pb' in that case).
    output_graph = os.path.join(
        os.path.dirname(input_checkpoint), 'frozen_model.pb'
    )

    with tf.Session(graph = tf.Graph()) as freeze_sess:
        saver = tf.train.import_meta_graph(
            input_checkpoint + '.meta', clear_devices = True
        )
        saver.restore(freeze_sess, input_checkpoint)
        output_graph_def = tf.graph_util.convert_variables_to_constants(
            freeze_sess,
            tf.get_default_graph().as_graph_def(),
            output_node_names.split(','),
        )
        with tf.gfile.GFile(output_graph, 'wb') as graph_file:
            graph_file.write(output_graph_def.SerializeToString())
        print('%d ops in the final graph.' % len(output_graph_def.node))

In [23]:
# Freeze the checkpoint saved above, keeping the nodes listed in `strings`.
freeze_graph('deep-speaker-out', strings)

INFO:tensorflow:Restoring parameters from deep-speaker-out/model.ckpt
Instructions for updating:
Use `tf.compat.v1.graph_util.convert_variables_to_constants`
Instructions for updating:
Use `tf.compat.v1.graph_util.extract_sub_graph`
INFO:tensorflow:Froze 170 variables.
INFO:tensorflow:Converted 170 variables to const ops.
1238 ops in the final graph.


In [24]:
def load_graph(frozen_graph_filename, **kwargs):
    """Load a serialized ``GraphDef`` file into a new ``tf.Graph`` and return it.

    Before the import, stateful ops in the graph definition are rewritten in
    place (workaround taken from
    https://github.com/onnx/tensorflow-onnx/issues/77#issuecomment-445066091):

    * ``RefSwitch`` becomes ``Switch``, and ``'/read'`` is appended to each of
      its inputs whose name contains ``'moving_'``;
    * ``AssignSub`` becomes ``Sub`` and ``AssignAdd`` becomes ``Add``, with the
      ``use_locking`` attribute removed;
    * ``Assign`` becomes ``Identity``, with ``use_locking`` and
      ``validate_shape`` removed; when it has exactly two inputs only the
      second one is kept.

    Args:
        frozen_graph_filename: Path of the serialized ``GraphDef`` file.
        **kwargs: Accepted but ignored.

    Returns:
        The new ``tf.Graph``. ``tf.import_graph_def`` is called without a
        ``name`` argument, so the imported nodes carry its default prefix.
    """
    with tf.gfile.GFile(frozen_graph_filename, 'rb') as graph_file:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(graph_file.read())

    # Stateful in-place arithmetic ops and their stateless replacements.
    arithmetic_ops = {'AssignSub': 'Sub', 'AssignAdd': 'Add'}

    for node in graph_def.node:
        if node.op == 'RefSwitch':
            node.op = 'Switch'
            # `range`, not `xrange`: the latter does not exist in Python 3 and
            # raised NameError as soon as a RefSwitch node was encountered.
            for index in range(len(node.input)):
                if 'moving_' in node.input[index]:
                    node.input[index] = node.input[index] + '/read'
        elif node.op in arithmetic_ops:
            node.op = arithmetic_ops[node.op]
            if 'use_locking' in node.attr:
                del node.attr['use_locking']
        elif node.op == 'Assign':
            node.op = 'Identity'
            for attr_name in ('use_locking', 'validate_shape'):
                if attr_name in node.attr:
                    del node.attr[attr_name]
            # Keep only the assigned value as the single Identity input.
            if len(node.input) == 2:
                node.input[0] = node.input[1]
                del node.input[1]

    with tf.Graph().as_default() as graph:
        tf.import_graph_def(graph_def)
    return graph


In [25]:
# Load the frozen graph and look up its input and output tensors by name.
# The 'import/' prefix comes from tf.import_graph_def, which load_graph calls
# without a `name` argument.
g = load_graph('deep-speaker-out/frozen_model.pb')
x = g.get_tensor_by_name('import/Placeholder:0')
logits = g.get_tensor_by_name('import/logits:0')

In [26]:
# Separate session bound to the frozen graph `g` (not the graph used by `sess`).
test_sess = tf.InteractiveSession(graph = g)



In [27]:
# Smoke test of the export: run the frozen graph on the first file's features.
test_sess.run(logits, feed_dict = {x: mfccs[0]})

array([[-5.52379973e-02,  2.57481169e-02, -6.28858060e-02,
         4.04050201e-02, -2.26131728e-04,  5.66876568e-02,
         1.18383556e-03, -4.22953954e-03, -6.06642552e-02,
        -5.22606596e-02,  1.16289295e-02,  1.49290841e-02,
         6.79407595e-03, -5.07222116e-02,  2.65879724e-02,
        -1.28893480e-02, -1.13314763e-03, -2.55147684e-02,
        -1.35199614e-02, -2.59618666e-02,  2.09014192e-02,
        -1.54670011e-02, -2.34773718e-02,  4.52116244e-02,
        -8.32604337e-03,  2.89497431e-02,  5.44677228e-02,
         1.08679058e-02, -3.14984769e-02,  7.59129599e-02,
        -4.80681919e-02,  3.53878178e-02, -1.00509912e-01,
         5.17397560e-03, -1.94912236e-02,  6.09882176e-02,
         2.42635068e-02, -3.76226730e-03,  3.01365778e-02,
        -2.33786886e-05,  1.33099407e-01, -3.13142762e-02,
        -2.57999636e-02,  3.96155901e-02,  4.48607504e-02,
        -1.67979449e-02, -2.63978336e-02,  1.43880742e-02,
         5.40309884e-02,  2.99838297e-02,  5.72429821e-0