In [1]:
from constants import NUM_FBANKS, NUM_FRAMES

In [8]:
import numpy as np
import tensorflow.keras.backend as K
from tensorflow.keras import layers
from tensorflow.keras import regularizers
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.layers import Conv2D
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Lambda, Dense
from tensorflow.keras.layers import Reshape
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
import tensorflow as tf

In [12]:
class DeepSpeakerModel:

    # I thought it was 3 but maybe energy is added at a 4th dimension.
    # would be better to have 4 dimensions:
    # MFCC, DIFF(MFCC), DIFF(DIFF(MFCC)), ENERGIES (probably tiled across the frequency domain).
    # this seems to help match the parameter counts.
    
    def __init__(self):
        self.clipped_relu_count = 0

    def keras_model(self):
        return self.m

    def get_weights(self):
        w = self.m.get_weights()
        if self.include_softmax:
            w.pop()  # last 2 are the W_softmax and b_softmax.
            w.pop()
        return w

    def clipped_relu(self, inputs):
        relu = Lambda(lambda y: K.minimum(K.maximum(y, 0), 20), name=f'clipped_relu_{self.clipped_relu_count}')(inputs)
        self.clipped_relu_count += 1
        return relu

    def identity_block(self, input_tensor, kernel_size, filters, stage, block):
        conv_name_base = f'res{stage}_{block}_branch'

        x = Conv2D(filters,
                   kernel_size=kernel_size,
                   strides=1,
                   activation=None,
                   padding='same',
                   kernel_initializer='glorot_uniform',
                   kernel_regularizer=regularizers.l2(l=0.0001),
                   name=conv_name_base + '_2a')(input_tensor)
        x = BatchNormalization(name=conv_name_base + '_2a_bn')(x)
        x = self.clipped_relu(x)

        x = Conv2D(filters,
                   kernel_size=kernel_size,
                   strides=1,
                   activation=None,
                   padding='same',
                   kernel_initializer='glorot_uniform',
                   kernel_regularizer=regularizers.l2(l=0.0001),
                   name=conv_name_base + '_2b')(x)
        x = BatchNormalization(name=conv_name_base + '_2b_bn')(x)

        x = self.clipped_relu(x)

        x = layers.add([x, input_tensor])
        x = self.clipped_relu(x)
        return x

    def conv_and_res_block(self, inp, filters, stage):
        conv_name = 'conv{}-s'.format(filters)
        # TODO: why kernel_regularizer?
        o = Conv2D(filters,
                   kernel_size=5,
                   strides=2,
                   activation=None,
                   padding='same',
                   kernel_initializer='glorot_uniform',
                   kernel_regularizer=regularizers.l2(l=0.0001), name=conv_name)(inp)
        o = BatchNormalization(name=conv_name + '_bn')(o)
        o = self.clipped_relu(o)
        for i in range(3):
            o = self.identity_block(o, kernel_size=3, filters=filters, stage=stage, block=i)
        return o

    def cnn_component(self, inp):
        x = self.conv_and_res_block(inp, 64, stage=1)
        x = self.conv_and_res_block(x, 128, stage=2)
        x = self.conv_and_res_block(x, 256, stage=3)
        x = self.conv_and_res_block(x, 512, stage=4)
        return x

    def set_weights(self, w):
        for layer, layer_w in zip(self.m.layers, w):
            layer.set_weights(layer_w)
            logger.info(f'Setting weights for [{layer.name}]...')
            
deepspeaker = DeepSpeakerModel()

In [13]:
class Model:
    def __init__(self):
        self.X = tf.placeholder(tf.float32, (None, None, NUM_FBANKS, 1))
        x = deepspeaker.cnn_component(self.X)
        x = Reshape((-1, 2048))(x)
        x = Lambda(lambda y: K.mean(y, axis=1), name='average')(x)
        x = Dense(512, name='affine')(x)
        x = Lambda(lambda y: K.l2_normalize(y, axis=1), name='ln')(x)
        self.logits = x
        print(self.logits)

In [14]:
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model()
sess.run(tf.global_variables_initializer())
var_lists = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)

Tensor("ln/l2_normalize:0", shape=(?, 512), dtype=float32)


In [15]:
saver = tf.train.Saver(var_list = var_lists)
saver.restore(sess, 'out/vggvox.ckpt')

INFO:tensorflow:Restoring parameters from out/vggvox.ckpt


In [16]:
from glob import glob

wavs = glob('*.wav')
wavs

['khalil-nooh.wav',
 'husein-zolkepli.wav',
 'mas-aisyah.wav',
 'shafiqah-idayu.wav']

In [18]:
from audio import read_mfcc
import numpy as np
from constants import SAMPLE_RATE, NUM_FRAMES
SAMPLE_RATE, NUM_FRAMES

read_mfcc(wavs[0], SAMPLE_RATE).shape

(183, 64)

In [19]:
def f(file):
    return np.array([np.expand_dims(read_mfcc(file, SAMPLE_RATE), -1)])

In [20]:
mfccs = [f(file) for file in wavs]

In [23]:
def pred(mfcc):
    return sess.run(model.logits, feed_dict = {model.X: mfcc})

In [24]:
r = [pred(mfcc) for mfcc in mfccs]

In [25]:
r = np.concatenate(r)
r.shape

(4, 512)

In [28]:
from scipy.spatial.distance import cdist

cdist(r, r, metric='cosine')

array([[0.        , 0.66157947, 0.76123523, 0.76110501],
       [0.66157947, 0.        , 0.80538329, 0.75902755],
       [0.76123523, 0.80538329, 0.        , 0.678087  ],
       [0.76110501, 0.75902755, 0.678087  , 0.        ]])

In [29]:
!tar -zcf deep-speaker.tar.gz out