In [1]:
import os

# An empty CUDA_VISIBLE_DEVICES hides every GPU from CUDA, so the graph export
# in this notebook runs on CPU only. This is set before TensorFlow is imported.
os.environ['CUDA_VISIBLE_DEVICES'] = ''

In [2]:
import librosa
import numpy as np
from scipy.signal import lfilter, butter
import decimal
import math

# NOTE(review): not referenced anywhere else in the visible notebook; presumably
# the number of frequency bins (it matches num_fft = 512 and the placeholder
# height used further down) -- confirm or remove.
dimension = 512

# for VGGVox v1
def round_half_up(number):
    """Round `number` to the nearest integer, sending exact .5 ties away from zero.

    Python's built-in `round` uses banker's rounding (ties to even); this helper
    goes through `decimal` with ROUND_HALF_UP instead.

    Args:
        number: value accepted by `decimal.Decimal` (int, float, str, ...).

    Returns:
        The rounded value as a plain `int`.
    """
    whole_unit = decimal.Decimal('1')
    exact_value = decimal.Decimal(number)
    rounded = exact_value.quantize(whole_unit, rounding = decimal.ROUND_HALF_UP)
    return int(rounded)


# for VGGVox v1
def remove_dc_and_dither(sin, sample_rate):
    """Remove the DC component of a signal and add a tiny amount of dither.

    The DC offset is removed with the first-order filter
    ``y[n] = x[n] - x[n-1] + alpha * y[n-1]``; afterwards low-level random
    noise (sum of two uniform draws, centred on zero) is added.

    Args:
        sin: 1-D array of input samples.
        sample_rate: sampling rate in Hz; only 16000 and 8000 are supported.

    Returns:
        Array of the same length as `sin` holding the filtered, dithered signal.

    Raises:
        ValueError: if `sample_rate` is neither 16 kHz nor 8 kHz.
    """
    if sample_rate == 16e3:
        alpha = 0.99
    elif sample_rate == 8e3:
        alpha = 0.999
    else:
        # Raise instead of print + exit(1): exiting would terminate the whole
        # interpreter / notebook kernel because of one bad argument.
        raise ValueError('Sample rate must be 16kHz or 8kHz only')
    sin = lfilter([1, -1], [1, -alpha], sin)
    # Sum of two uniform [0, 1) draws minus 1 -> triangular noise in (-1, 1).
    dither = (
        np.random.random_sample(len(sin))
        + np.random.random_sample(len(sin))
        - 1
    )
    spow = np.std(dither)
    sout = sin + 1e-6 * spow * dither
    return sout


# for VGGVox v1
def preemphasis(signal, coeff = 0.95):
    """Apply a first-order pre-emphasis filter: y[n] = x[n] - coeff * x[n-1].

    Args:
        signal: array of samples; the first sample is passed through unchanged.
        coeff: pre-emphasis coefficient (0 disables the filter).

    Returns:
        Flat array with the same number of samples as `signal`.
    """
    first_sample = np.ravel(signal[0])
    filtered_tail = np.ravel(signal[1:] - coeff * signal[:-1])
    return np.concatenate((first_sample, filtered_tail))


# for VGGVox v1
def rolling_window(a, window, step = 1):
    """Return sliding windows over the last axis of `a`, keeping every `step`-th one.

    The windows are built with `as_strided`, i.e. without copying the samples.

    Args:
        a: input array.
        window: number of samples per window.
        step: keep only every `step`-th window (hop size between windows).

    Returns:
        Array whose last two axes are (number of kept windows, window).
    """
    # http://ellisvalentiner.com/post/2017-03-21-np-strides-trick
    n_windows = a.shape[-1] - window + 1
    view_shape = a.shape[:-1] + (n_windows, window)
    # Moving to the next window and to the next sample inside a window are both
    # one element along the last axis, hence the repeated trailing stride.
    view_strides = a.strides + (a.strides[-1],)
    all_windows = np.lib.stride_tricks.as_strided(
        a, shape = view_shape, strides = view_strides
    )
    return all_windows[::step]


# for VGGVox v1
def framesig(
    sig,
    frame_len,
    frame_step,
    winfunc = lambda x: np.ones((x,)),
    stride_trick = True,
):
    """Cut a 1-D signal into overlapping frames and multiply each by a window.

    The signal is zero-padded at the end so that the last frame is complete.

    Args:
        sig: 1-D sequence of samples.
        frame_len: frame length in samples (rounded half-up to an integer).
        frame_step: hop between consecutive frame starts, in samples
            (rounded half-up to an integer).
        winfunc: callable taking the frame length and returning the window
            applied to every frame; defaults to a rectangular window of ones.
        stride_trick: when True the frames are built with `rolling_window`
            (strided view); otherwise they are gathered with an explicit
            index matrix. Both paths produce the same frames.

    Returns:
        Array of shape (numframes, frame_len) holding the windowed frames.
    """
    # The module imports numpy as `np`; the name `numpy` is not defined here,
    # so every numpy call below goes through `np`.
    slen = len(sig)
    frame_len = round_half_up(frame_len)
    frame_step = round_half_up(frame_step)
    if slen <= frame_len:
        numframes = 1
    else:
        numframes = 1 + int(
            math.ceil((1.0 * slen - frame_len) / frame_step)
        )  # LV

    # Length the signal must have so that `numframes` full frames fit.
    padlen = int((numframes - 1) * frame_step + frame_len)

    zeros = np.zeros((padlen - slen,))
    padsignal = np.concatenate((sig, zeros))
    if stride_trick:
        # 1-D window; broadcasting applies it to every frame.
        win = winfunc(frame_len)
        frames = rolling_window(
            padsignal, window = frame_len, step = frame_step
        )
    else:
        # Row i of `indices` holds the sample positions of frame i.
        indices = (
            np.tile(np.arange(0, frame_len), (numframes, 1))
            + np.tile(
                np.arange(0, numframes * frame_step, frame_step),
                (frame_len, 1),
            ).T
        )
        indices = np.array(indices, dtype = np.int32)
        frames = padsignal[indices]
        win = np.tile(winfunc(frame_len), (numframes, 1))

    return frames * win


# for VGGVox v1
def normalize_frames(m, epsilon = 1e-12):
    """Standardise every row of `m` to zero mean and unit standard deviation.

    Args:
        m: iterable of 1-D arrays (e.g. the rows of a 2-D array).
        epsilon: lower bound for the standard deviation, so that constant rows
            do not cause a division by zero.

    Returns:
        Array built from the normalised rows.
    """
    normalized_rows = []
    for row in m:
        spread = max(np.std(row), epsilon)
        normalized_rows.append((row - np.mean(row)) / spread)
    return np.array(normalized_rows)


# for VGGVox v1
def vggvox_v1(
    signal,
    sample_rate = 16000,
    preemphasis_alpha = 0.97,
    frame_len = 0.005,
    frame_step = 0.0005,
    num_fft = 512,
    buckets = None,
    **kwargs
):
    """Turn a waveform into a per-bin normalised FFT magnitude spectrogram.

    Pipeline: scale by 2**15 -> DC removal + dither -> pre-emphasis ->
    Hamming-windowed framing -> |FFT| -> each frequency bin standardised
    over time.

    Args:
        signal: 1-D numpy array of samples (copied, never modified in place).
            NOTE(review): the 2**15 scaling suggests float samples in [-1, 1]
            -- confirm against callers.
        sample_rate: sampling rate in Hz (must be accepted by
            `remove_dc_and_dither`).
        preemphasis_alpha: pre-emphasis coefficient.
        frame_len: frame length in seconds.
        frame_step: hop between frames in seconds.
        num_fft: FFT size; also the number of rows of the result.
        buckets: optional collection of allowed frame counts. When given, the
            result is centre-cropped to the largest bucket that fits.
        **kwargs: accepted and ignored.

    Returns:
        Array of shape (num_fft, n_frames). Without `buckets` it is padded
        with zeros to at least 100 frames and cast to float32; with `buckets`
        it is the centre crop in the dtype produced by the normalisation.
    """
    scaled = signal.copy()
    scaled *= 2 ** 15
    filtered = preemphasis(
        remove_dc_and_dither(scaled, sample_rate), coeff = preemphasis_alpha
    )
    windowed = framesig(
        filtered,
        frame_len = frame_len * sample_rate,
        frame_step = frame_step * sample_rate,
        winfunc = np.hamming,
    )
    magnitudes = abs(np.fft.fft(windowed, n = num_fft))
    # Transposed so that rows are frequency bins and columns are frames.
    spectrogram = normalize_frames(magnitudes.T)

    if not buckets:
        n_frames = spectrogram.shape[1]
        if n_frames < 100:
            missing = 100 - n_frames
            spectrogram = np.pad(
                spectrogram, ((0, 0), (0, missing)), 'constant'
            )
        return spectrogram.astype('float32')

    n_frames = spectrogram.shape[1]
    # Largest bucket that still fits into the available frames.
    rsize = max(k for k in buckets if k <= n_frames)
    rstart = int((n_frames - rsize) / 2)
    return spectrogram[:, rstart : rstart + rsize]

In [3]:
import tensorflow as tf
from tensorflow.keras.layers import Input, GlobalAveragePooling2D, Reshape
from tensorflow.keras.layers import (
    Conv2D,
    ZeroPadding2D,
    MaxPooling2D,
    AveragePooling2D,
)
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.layers import Lambda, Activation
from tensorflow.keras.models import Model
import tensorflow.keras.backend as K


def conv_bn_pool(
    inp_tensor,
    layer_idx,
    conv_filters,
    conv_kernel_size,
    conv_strides,
    conv_pad,
    pool = '',
    pool_size = (2, 2),
    pool_strides = None,
    conv_layer_prefix = 'conv',
):
    """Block of layers: ZeroPad --> Conv --> BatchNorm --> ReLU --> optional pool.

    Every layer is named after `layer_idx` (pad<i>, <prefix><i>, bn<i>,
    relu<i>, mpool<i> / apool<i>).

    Args:
        inp_tensor: input tensor of the block.
        layer_idx: index used in the layer names.
        conv_filters: number of convolution filters.
        conv_kernel_size: convolution kernel size.
        conv_strides: convolution strides.
        conv_pad: explicit zero padding applied before the 'valid' convolution.
        pool: 'max', 'avg', or anything else for no pooling.
        pool_size: pooling window size.
        pool_strides: pooling strides.
        conv_layer_prefix: name prefix of the convolution layer.

    Returns:
        Output tensor of the block.
    """
    padded = ZeroPadding2D(padding = conv_pad, name = 'pad{}'.format(layer_idx))(
        inp_tensor
    )
    convolved = Conv2D(
        filters = conv_filters,
        kernel_size = conv_kernel_size,
        strides = conv_strides,
        padding = 'valid',
        name = '{}{}'.format(conv_layer_prefix, layer_idx),
    )(padded)
    # NOTE(review): momentum = 1.0 presumably keeps the moving statistics
    # fixed -- confirm this is intended.
    normed = BatchNormalization(
        epsilon = 1e-5, momentum = 1.0, name = 'bn{}'.format(layer_idx)
    )(convolved)
    activated = Activation('relu', name = 'relu{}'.format(layer_idx))(normed)

    if pool == 'max':
        pool_cls, pool_name = MaxPooling2D, 'mpool{}'
    elif pool == 'avg':
        pool_cls, pool_name = AveragePooling2D, 'apool{}'
    else:
        return activated

    return pool_cls(
        pool_size = pool_size,
        strides = pool_strides,
        name = pool_name.format(layer_idx),
    )(activated)


# Block of layers: Conv --> BatchNorm --> ReLU --> Dynamic average pool (fc6 -> apool6 only)
def conv_bn_dynamic_apool(
    inp_tensor,
    layer_idx,
    conv_filters,
    conv_kernel_size,
    conv_strides,
    conv_pad,
    conv_layer_prefix = 'conv',
):
    """ZeroPad --> Conv --> BatchNorm --> ReLU --> global average pool.

    The global pooling collapses both spatial axes whatever their size; the
    result is reshaped back to a 4-D (batch, 1, 1, conv_filters) tensor.

    Args:
        inp_tensor: input tensor of the block.
        layer_idx: index used in the layer names.
        conv_filters: number of convolution filters.
        conv_kernel_size: convolution kernel size.
        conv_strides: convolution strides.
        conv_pad: explicit zero padding applied before the 'valid' convolution.
        conv_layer_prefix: name prefix of the convolution layer.

    Returns:
        Tensor of shape (batch, 1, 1, conv_filters).
    """
    padded = ZeroPadding2D(padding = conv_pad, name = 'pad{}'.format(layer_idx))(
        inp_tensor
    )
    convolved = Conv2D(
        filters = conv_filters,
        kernel_size = conv_kernel_size,
        strides = conv_strides,
        padding = 'valid',
        name = '{}{}'.format(conv_layer_prefix, layer_idx),
    )(padded)
    normed = BatchNormalization(
        epsilon = 1e-5, momentum = 1.0, name = 'bn{}'.format(layer_idx)
    )(convolved)
    activated = Activation('relu', name = 'relu{}'.format(layer_idx))(normed)
    pooled = GlobalAveragePooling2D(name = 'gapool{}'.format(layer_idx))(
        activated
    )
    return Reshape((1, 1, conv_filters), name = 'reshape{}'.format(layer_idx))(
        pooled
    )


class Resnet1D(Model):
    """Convolutional embedding network built from conv/bn/relu/pool blocks.

    Despite the class name, the visible layer stack has no residual
    connections: five conv blocks, a conv 'fc6' with global average pooling,
    a 1x1 'fc7', L2 normalisation and a final 1x1 'fc8' convolution.

    All layers are created inside `call` with fixed names, so `call` builds a
    fresh set of layers every time it runs.
    """

    def __init__(self, params = None, is_training = False):
        # `params` and `is_training` are accepted but not used.
        super(Resnet1D, self).__init__()

    def call(self, inputs, training = None, mask = None):
        """Build the network on `inputs['features_input']`.

        Args:
            inputs: dict holding the input tensor under 'features_input'.
            training: unused.
            mask: unused.

        Returns:
            Output of the 'fc8' layer, shape (batch, 1, 1, 1024).
        """
        x = inputs['features_input']

        # conv1 .. conv5, in order; keyword arguments for conv_bn_pool.
        conv_stack = (
            dict(
                layer_idx = 1,
                conv_filters = 96,
                conv_kernel_size = (7, 7),
                conv_strides = (2, 2),
                conv_pad = (1, 1),
                pool = 'max',
                pool_size = (3, 3),
                pool_strides = (2, 2),
            ),
            dict(
                layer_idx = 2,
                conv_filters = 256,
                conv_kernel_size = (5, 5),
                conv_strides = (2, 2),
                conv_pad = (1, 1),
                pool = 'max',
                pool_size = (3, 3),
                pool_strides = (2, 2),
            ),
            dict(
                layer_idx = 3,
                conv_filters = 384,
                conv_kernel_size = (3, 3),
                conv_strides = (1, 1),
                conv_pad = (1, 1),
            ),
            dict(
                layer_idx = 4,
                conv_filters = 256,
                conv_kernel_size = (3, 3),
                conv_strides = (1, 1),
                conv_pad = (1, 1),
            ),
            dict(
                layer_idx = 5,
                conv_filters = 256,
                conv_kernel_size = (3, 3),
                conv_strides = (1, 1),
                conv_pad = (1, 1),
                pool = 'max',
                pool_size = (5, 3),
                pool_strides = (3, 2),
            ),
        )
        for block_kwargs in conv_stack:
            x = conv_bn_pool(x, **block_kwargs)

        # fc6: (9, 1) convolution followed by global average pooling.
        x = conv_bn_dynamic_apool(
            x,
            layer_idx = 6,
            conv_filters = 4096,
            conv_kernel_size = (9, 1),
            conv_strides = (1, 1),
            conv_pad = (0, 0),
            conv_layer_prefix = 'fc',
        )
        # fc7: 1x1 convolution, no pooling.
        x = conv_bn_pool(
            x,
            layer_idx = 7,
            conv_filters = 1024,
            conv_kernel_size = (1, 1),
            conv_strides = (1, 1),
            conv_pad = (0, 0),
            conv_layer_prefix = 'fc',
        )
        # L2-normalise along the channel axis.
        x = Lambda(lambda y: K.l2_normalize(y, axis = 3), name = 'norm')(x)
        fc8 = Conv2D(
            filters = 1024,
            kernel_size = (1, 1),
            strides = (1, 1),
            padding = 'valid',
            name = 'fc8',
        )
        return fc8(x)

In [4]:
class Model:
    """TF1 graph wrapper: input placeholder -> Resnet1D -> 2-way dense logits.

    NOTE(review): this class reuses the name `Model`, shadowing the Keras
    `Model` imported earlier in the notebook for every later cell.

    Attributes are set in `__init__`:
      X      -- float32 placeholder of shape [None, 512, None, 1]
      logits -- [batch, 2] tensor exposed in the graph under the name 'logits'
    """

    def __init__(self):
        self.X = tf.placeholder(tf.float32, [None, 512, None, 1])
        feature_dict = {'features_input': self.X}
        backbone = Resnet1D(is_training = True)

        # `call` is invoked directly instead of `backbone(...)`.
        # NOTE(review): switching to `__call__` would presumably change the
        # variable names and break restoring the checkpoint -- confirm first.
        feature_map = backbone.call(feature_dict)
        # Drop the two singleton spatial axes: (batch, 1, 1, C) -> (batch, C).
        embedding = feature_map[:, 0, 0, :]
        class_scores = tf.layers.dense(embedding, 2)
        self.logits = tf.identity(class_scores, name = 'logits')

In [6]:
# Training checkpoint whose weights are restored into the graph built below
# (presumably taken at step 170000, judging by the file name).
ckpt_path = 'output-vggvox-v1-vad/model.ckpt-170000'

In [7]:
# Start from an empty default graph, then build the model inside it.
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model()
# Give every variable an initial value; the checkpoint restore in a later cell
# replaces these values with the trained weights.
sess.run(tf.global_variables_initializer())

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
Use keras.layers.Dense instead.
Instructions for updating:
Please use `layer.__call__` method instead.


In [8]:
# Restore all global variables of the freshly built graph from `ckpt_path`.
var_lists = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
saver = tf.train.Saver(var_list = var_lists)
saver.restore(sess, ckpt_path)

INFO:tensorflow:Restoring parameters from output-vggvox-v1-vad/model.ckpt-170000


In [9]:
# Re-save the restored weights into the 'vggvox-v1' directory; freeze_graph()
# later reads the checkpoint and its '.meta' graph from there.
saver = tf.train.Saver()
saver.save(sess, 'vggvox-v1/model.ckpt')

'vggvox-v1/model.ckpt'

In [10]:
# Comma-separated names of the graph nodes handed to freeze_graph() as outputs.
# A node is selected when its op type contains 'Variable' or its name contains
# one of the wanted fragments, and its name contains none of the excluded ones.
strings = ','.join(
    node.name
    for node in tf.get_default_graph().as_graph_def().node
    if (
        'Variable' in node.op
        or any(
            wanted in node.name
            for wanted in ('Placeholder', 'logits', 'alphas', 'self/Softmax')
        )
    )
    and not any(
        excluded in node.name
        for excluded in ('adam', 'beta', 'global_step', 'Assign')
    )
)

In [11]:
def freeze_graph(model_dir, output_node_names):
    """Freeze the latest checkpoint in `model_dir` into a single GraphDef file.

    The checkpoint's meta graph is imported into a fresh graph, the weights
    are restored, all variables reachable from the output nodes are converted
    to constants and the result is written to 'frozen_model.pb' next to the
    checkpoint. The number of ops in the frozen graph is printed.

    Args:
        model_dir: directory containing the checkpoint state and files.
        output_node_names: comma-separated names of the nodes to keep as
            outputs of the frozen graph.

    Raises:
        AssertionError: if `model_dir` does not exist or holds no checkpoint
            state.
    """
    if not tf.gfile.Exists(model_dir):
        raise AssertionError(
            "Export directory doesn't exists. Please specify an export "
            'directory: %s' % model_dir
        )

    # get_checkpoint_state() returns None when the directory has no valid
    # checkpoint state; fail with a clear message instead of AttributeError.
    checkpoint = tf.train.get_checkpoint_state(model_dir)
    if checkpoint is None or not checkpoint.model_checkpoint_path:
        raise AssertionError(
            'No checkpoint state found in directory: %s' % model_dir
        )
    input_checkpoint = checkpoint.model_checkpoint_path

    # os.path keeps a checkpoint path without a directory part relative
    # ('frozen_model.pb') instead of pointing at the filesystem root.
    output_graph = os.path.join(
        os.path.dirname(input_checkpoint), 'frozen_model.pb'
    )
    with tf.Session(graph = tf.Graph()) as sess:
        # clear_devices drops device placements recorded in the meta graph.
        saver = tf.train.import_meta_graph(
            input_checkpoint + '.meta', clear_devices = True
        )
        saver.restore(sess, input_checkpoint)
        output_graph_def = tf.graph_util.convert_variables_to_constants(
            sess,
            tf.get_default_graph().as_graph_def(),
            output_node_names.split(','),
        )
        with tf.gfile.GFile(output_graph, 'wb') as f:
            f.write(output_graph_def.SerializeToString())
        print('%d ops in the final graph.' % len(output_graph_def.node))

In [12]:
# Freeze the re-saved checkpoint, keeping the nodes listed in `strings`;
# the result is written next to the checkpoint as 'frozen_model.pb'.
freeze_graph('vggvox-v1', strings)

INFO:tensorflow:Restoring parameters from vggvox-v1/model.ckpt
Instructions for updating:
Use `tf.compat.v1.graph_util.convert_variables_to_constants`
Instructions for updating:
Use `tf.compat.v1.graph_util.extract_sub_graph`
INFO:tensorflow:Froze 46 variables.
INFO:tensorflow:Converted 46 variables to const ops.
324 ops in the final graph.


In [13]:
# def load_graph(frozen_graph_filename):
#     with tf.gfile.GFile(frozen_graph_filename, 'rb') as f:
#         graph_def = tf.GraphDef()
#         graph_def.ParseFromString(f.read())
#     with tf.Graph().as_default() as graph:
#         tf.import_graph_def(graph_def)
#     return graph

def load_graph(frozen_graph_filename, **kwargs):
    """Load a frozen GraphDef file into a new graph.

    Before importing, stateful training ops left in the graph are rewritten
    into stateless equivalents (RefSwitch -> Switch, AssignSub -> Sub,
    AssignAdd -> Add, Assign -> Identity) so the graph can be imported.

    Args:
        frozen_graph_filename: path of the serialized GraphDef ('.pb') file.
        **kwargs: accepted and ignored.

    Returns:
        A new graph containing the imported nodes. `import_graph_def` is
        called without a name, so it applies its default name prefix.
    """
    with tf.gfile.GFile(frozen_graph_filename, 'rb') as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())

    # https://github.com/onnx/tensorflow-onnx/issues/77#issuecomment-445066091
    for node in graph_def.node:
        if node.op == 'RefSwitch':
            node.op = 'Switch'
            # `range` instead of the Python-2-only `xrange`.
            for index in range(len(node.input)):
                # Read the moving statistics through their '/read' node.
                if 'moving_' in node.input[index]:
                    node.input[index] = node.input[index] + '/read'
        elif node.op == 'AssignSub':
            node.op = 'Sub'
            if 'use_locking' in node.attr:
                del node.attr['use_locking']
        elif node.op == 'AssignAdd':
            node.op = 'Add'
            if 'use_locking' in node.attr:
                del node.attr['use_locking']
        elif node.op == 'Assign':
            node.op = 'Identity'
            if 'use_locking' in node.attr:
                del node.attr['use_locking']
            if 'validate_shape' in node.attr:
                del node.attr['validate_shape']
            # Identity takes one input: keep the assigned value, drop the target.
            if len(node.input) == 2:
                node.input[0] = node.input[1]
                del node.input[1]

    with tf.Graph().as_default() as graph:
        tf.import_graph_def(graph_def)
    return graph

In [15]:
# Sanity check: reload the frozen graph and look up its input and output tensors.
# load_graph() imports the nodes without an explicit name, hence the 'import/'
# prefix in front of the original node names.
g = load_graph('vggvox-v1/frozen_model.pb')
x = g.get_tensor_by_name('import/Placeholder:0')
logits = g.get_tensor_by_name('import/logits:0')