In [1]:
import os

os.environ['CUDA_VISIBLE_DEVICES'] = ''

In [2]:
from tensorflow.keras import layers
from tensorflow.keras.regularizers import l2
from tensorflow.keras.layers import Activation, Conv1D, Conv2D, Input, Lambda
from tensorflow.keras.layers import BatchNormalization, Flatten, Dense, Reshape
from tensorflow.keras.layers import MaxPooling2D, AveragePooling2D, GlobalAveragePooling2D

In [3]:
weight_decay = 1e-4

In [4]:
def identity_block_2D(input_tensor, kernel_size, filters, stage, block, trainable=True):
    """The identity block is the block that has no conv layer at shortcut.
    # Arguments
        input_tensor: input tensor
        kernel_size: default 3, the kernel size of middle conv layer at main path
        filters: list of integers, the filterss of 3 conv layer at main path
        stage: integer, current stage label, used for generating layer names
        block: 'a','b'..., current block label, used for generating layer names
    # Returns
        Output tensor for the block.
    """
    filters1, filters2, filters3 = filters
    bn_axis = 3

    conv_name_1 = 'conv' + str(stage) + '_' + str(block) + '_1x1_reduce'
    bn_name_1 = 'conv' + str(stage) + '_' + str(block) + '_1x1_reduce/bn'
    x = Conv2D(filters1, (1, 1),
               kernel_initializer='orthogonal',
               use_bias=False,
               trainable=trainable,
               kernel_regularizer=l2(weight_decay),
               name=conv_name_1)(input_tensor)
    x = BatchNormalization(axis=bn_axis, trainable=trainable, name=bn_name_1)(x)
    x = Activation('relu')(x)

    conv_name_2 = 'conv' + str(stage) + '_' + str(block) + '_3x3'
    bn_name_2 = 'conv' + str(stage) + '_' + str(block) + '_3x3/bn'
    x = Conv2D(filters2, kernel_size,
               padding='same',
               kernel_initializer='orthogonal',
               use_bias=False,
               trainable=trainable,
               kernel_regularizer=l2(weight_decay),
               name=conv_name_2)(x)
    x = BatchNormalization(axis=bn_axis, trainable=trainable, name=bn_name_2)(x)
    x = Activation('relu')(x)

    conv_name_3 = 'conv' + str(stage) + '_' + str(block) + '_1x1_increase'
    bn_name_3 = 'conv' + str(stage) + '_' + str(block) + '_1x1_increase/bn'
    x = Conv2D(filters3, (1, 1),
               kernel_initializer='orthogonal',
               use_bias=False,
               trainable=trainable,
               kernel_regularizer=l2(weight_decay),
               name=conv_name_3)(x)
    x = BatchNormalization(axis=bn_axis, trainable=trainable, name=bn_name_3)(x)

    x = layers.add([x, input_tensor])
    x = Activation('relu')(x)
    return x


def conv_block_2D(input_tensor, kernel_size, filters, stage, block, strides=(2, 2), trainable=True):
    """A block that has a conv layer at shortcut.
    # Arguments
        input_tensor: input tensor
        kernel_size: default 3, the kernel size of middle conv layer at main path
        filters: list of integers, the filterss of 3 conv layer at main path
        stage: integer, current stage label, used for generating layer names
        block: 'a','b'..., current block label, used for generating layer names
    # Returns
        Output tensor for the block.
    Note that from stage 3, the first conv layer at main path is with strides=(2,2)
    And the shortcut should have strides=(2,2) as well
    """
    filters1, filters2, filters3 = filters
    bn_axis = 3

    conv_name_1 = 'conv' + str(stage) + '_' + str(block) + '_1x1_reduce'
    bn_name_1 = 'conv' + str(stage) + '_' + str(block) + '_1x1_reduce/bn'
    x = Conv2D(filters1, (1, 1),
               strides=strides,
               kernel_initializer='orthogonal',
               use_bias=False,
               trainable=trainable,
               kernel_regularizer=l2(weight_decay),
               name=conv_name_1)(input_tensor)
    x = BatchNormalization(axis=bn_axis, trainable=trainable, name=bn_name_1)(x)
    x = Activation('relu')(x)

    conv_name_2 = 'conv' + str(stage) + '_' + str(block) + '_3x3'
    bn_name_2 = 'conv' + str(stage) + '_' + str(block) + '_3x3/bn'
    x = Conv2D(filters2, kernel_size, padding='same',
               kernel_initializer='orthogonal',
               use_bias=False,
               trainable=trainable,
               kernel_regularizer=l2(weight_decay),
               name=conv_name_2)(x)
    x = BatchNormalization(axis=bn_axis, trainable=trainable, name=bn_name_2)(x)
    x = Activation('relu')(x)

    conv_name_3 = 'conv' + str(stage) + '_' + str(block) + '_1x1_increase'
    bn_name_3 = 'conv' + str(stage) + '_' + str(block) + '_1x1_increase/bn'
    x = Conv2D(filters3, (1, 1),
               kernel_initializer='orthogonal',
               use_bias=False,
               trainable=trainable,
               kernel_regularizer=l2(weight_decay),
               name=conv_name_3)(x)
    x = BatchNormalization(axis=bn_axis, trainable=trainable, name=bn_name_3)(x)

    conv_name_4 = 'conv' + str(stage) + '_' + str(block) + '_1x1_proj'
    bn_name_4 = 'conv' + str(stage) + '_' + str(block) + '_1x1_proj/bn'
    shortcut = Conv2D(filters3, (1, 1), strides=strides,
                      kernel_initializer='orthogonal',
                      use_bias=False,
                      trainable=trainable,
                      kernel_regularizer=l2(weight_decay),
                      name=conv_name_4)(input_tensor)
    shortcut = BatchNormalization(axis=bn_axis, trainable=trainable, name=bn_name_4)(shortcut)

    x = layers.add([x, shortcut])
    x = Activation('relu')(x)
    return x


def resnet_2D_v1(inputs, mode='train'):
    bn_axis = 3
#     if mode == 'train':
#         inputs = Input(shape=input_dim, name='input')
#     else:
#         inputs = Input(shape=(input_dim[0], None, input_dim[-1]), name='input')
    # ===============================================
    #            Convolution Block 1
    # ===============================================
    x1 = Conv2D(64, (7, 7),
                kernel_initializer='orthogonal',
                use_bias=False, trainable=True,
                kernel_regularizer=l2(weight_decay),
                padding='same',
                name='conv1_1/3x3_s1')(inputs)

    x1 = BatchNormalization(axis=bn_axis, name='conv1_1/3x3_s1/bn', trainable=True)(x1)
    x1 = Activation('relu')(x1)
    x1 = MaxPooling2D((2, 2), strides=(2, 2))(x1)

    # ===============================================
    #            Convolution Section 2
    # ===============================================
    x2 = conv_block_2D(x1, 3, [48, 48, 96], stage=2, block='a', strides=(1, 1), trainable=True)
    x2 = identity_block_2D(x2, 3, [48, 48, 96], stage=2, block='b', trainable=True)

    # ===============================================
    #            Convolution Section 3
    # ===============================================
    x3 = conv_block_2D(x2, 3, [96, 96, 128], stage=3, block='a', trainable=True)
    x3 = identity_block_2D(x3, 3, [96, 96, 128], stage=3, block='b', trainable=True)
    x3 = identity_block_2D(x3, 3, [96, 96, 128], stage=3, block='c', trainable=True)
    # ===============================================
    #            Convolution Section 4
    # ===============================================
    x4 = conv_block_2D(x3, 3, [128, 128, 256], stage=4, block='a', trainable=True)
    x4 = identity_block_2D(x4, 3, [128, 128, 256], stage=4, block='b', trainable=True)
    x4 = identity_block_2D(x4, 3, [128, 128, 256], stage=4, block='c', trainable=True)
    # ===============================================
    #            Convolution Section 5
    # ===============================================
    x5 = conv_block_2D(x4, 3, [256, 256, 512], stage=5, block='a', trainable=True)
    x5 = identity_block_2D(x5, 3, [256, 256, 512], stage=5, block='b', trainable=True)
    x5 = identity_block_2D(x5, 3, [256, 256, 512], stage=5, block='c', trainable=True)
    y = MaxPooling2D((3, 1), strides=(2, 1), name='mpool2')(x5)
    return inputs, y


def resnet_2D_v2(inputs, mode='train'):
    bn_axis = 3
#     if mode == 'train':
#         inputs = Input(shape=input_dim, name='input')
#     else:
#         inputs = Input(shape=(input_dim[0], None, input_dim[-1]), name='input')
    # ===============================================
    #            Convolution Block 1
    # ===============================================
    x1 = Conv2D(64, (7, 7), strides=(2, 2),
                kernel_initializer='orthogonal',
                use_bias=False, trainable=True,
                kernel_regularizer=l2(weight_decay),
                padding='same',
                name='conv1_1/3x3_s1')(inputs)

    x1 = BatchNormalization(axis=bn_axis, name='conv1_1/3x3_s1/bn', trainable=True)(x1)
    x1 = Activation('relu')(x1)
    x1 = MaxPooling2D((2, 2), strides=(2, 2))(x1)

    # ===============================================
    #            Convolution Section 2
    # ===============================================
    x2 = conv_block_2D(x1, 3, [64, 64, 256], stage=2, block='a', strides=(1, 1), trainable=True)
    x2 = identity_block_2D(x2, 3, [64, 64, 256], stage=2, block='b', trainable=True)
    x2 = identity_block_2D(x2, 3, [64, 64, 256], stage=2, block='c', trainable=True)
    # ===============================================
    #            Convolution Section 3
    # ===============================================
    x3 = conv_block_2D(x2, 3, [128, 128, 512], stage=3, block='a', trainable=True)
    x3 = identity_block_2D(x3, 3, [128, 128, 512], stage=3, block='b', trainable=True)
    x3 = identity_block_2D(x3, 3, [128, 128, 512], stage=3, block='c', trainable=True)
    # ===============================================
    #            Convolution Section 4
    # ===============================================
    x4 = conv_block_2D(x3, 3, [256, 256, 1024], stage=4, block='a', strides=(1, 1), trainable=True)
    x4 = identity_block_2D(x4, 3, [256, 256, 1024], stage=4, block='b', trainable=True)
    x4 = identity_block_2D(x4, 3, [256, 256, 1024], stage=4, block='c', trainable=True)
    # ===============================================
    #            Convolution Section 5
    # ===============================================
    x5 = conv_block_2D(x4, 3, [512, 512, 2048], stage=5, block='a', trainable=True)
    x5 = identity_block_2D(x5, 3, [512, 512, 2048], stage=5, block='b', trainable=True)
    x5 = identity_block_2D(x5, 3, [512, 512, 2048], stage=5, block='c', trainable=True)
    y = MaxPooling2D((3, 1), strides=(2, 1), name='mpool2')(x5)
    return inputs, y

In [5]:
import tensorflow.keras as keras
import tensorflow as tf
import tensorflow.keras.backend as K

In [11]:
class VladPooling(keras.layers.Layer):
    '''
    This layer follows the NetVlad, GhostVlad
    '''
    def __init__(self, mode, k_centers, g_centers=0, **kwargs):
        self.k_centers = k_centers
        self.g_centers = g_centers
        self.mode = mode
        super(VladPooling, self).__init__(**kwargs)

    def build(self, input_shape):
        self.cluster = self.add_weight(shape=[self.k_centers+self.g_centers, input_shape[0][-1]],
                                       name='centers',
                                       initializer='orthogonal')
        self.built = True

    def compute_output_shape(self, input_shape):
        assert input_shape
        return (input_shape[0][0], self.k_centers*input_shape[0][-1])

    def call(self, x):
        # feat : bz x W x H x D, cluster_score: bz X W x H x clusters.
        feat, cluster_score = x
        num_features = feat.shape[-1]

        # softmax normalization to get soft-assignment.
        # A : bz x W x H x clusters
        max_cluster_score = K.max(cluster_score, -1, keepdims=True)
        exp_cluster_score = K.exp(cluster_score - max_cluster_score)
        A = exp_cluster_score / K.sum(exp_cluster_score, axis=-1, keepdims = True)

        # Now, need to compute the residual, self.cluster: clusters x D
        A = K.expand_dims(A, -1)    # A : bz x W x H x clusters x 1
        feat_broadcast = K.expand_dims(feat, -2)    # feat_broadcast : bz x W x H x 1 x D
        feat_res = feat_broadcast - self.cluster    # feat_res : bz x W x H x clusters x D
        weighted_res = tf.multiply(A, feat_res)     # weighted_res : bz x W x H x clusters x D
        cluster_res = K.sum(weighted_res, [1, 2])

        if self.mode == 'gvlad':
            cluster_res = cluster_res[:, :self.k_centers, :]

        cluster_l2 = K.l2_normalize(cluster_res, -1)
        outputs = K.reshape(cluster_l2, [-1, int(self.k_centers) * int(num_features)])
        return outputs


def vggvox_resnet2d_icassp(inputs, num_class=8631, mode='train', args=None):
    
    # python predict.py --gpu 1 --net resnet34s --ghost_cluster 2 
    # --vlad_cluster 8 --loss softmax --resume
    
    net='resnet34s'
    loss='softmax'
    vlad_clusters=8
    ghost_clusters=2
    bottleneck_dim=512
    aggregation = 'gvlad'
    mgpu = 0

    if net == 'resnet34s':
        inputs, x = resnet_2D_v1(inputs, mode=mode)
    else:
        inputs, x = resnet_2D_v2(inputs, mode=mode)
    print(x)
    # ===============================================
    #            Fully Connected Block 1
    # ===============================================
    x_fc = keras.layers.Conv2D(bottleneck_dim, (7, 1),
                               strides=(1, 1),
                               activation='relu',
                               kernel_initializer='orthogonal',
                               use_bias=True, trainable=True,
                               kernel_regularizer=keras.regularizers.l2(weight_decay),
                               bias_regularizer=keras.regularizers.l2(weight_decay),
                               name='x_fc')(x)
    print(x_fc)

    # ===============================================
    #            Feature Aggregation
    # ===============================================
    if aggregation == 'avg':
        if mode == 'train':
            x = keras.layers.AveragePooling2D((1, 5), strides=(1, 1), name='avg_pool')(x)
            x = keras.layers.Reshape((-1, bottleneck_dim))(x)
        else:
            x = keras.layers.GlobalAveragePooling2D(name='avg_pool')(x)
            x = keras.layers.Reshape((1, bottleneck_dim))(x)

    elif aggregation == 'vlad':
        x_k_center = keras.layers.Conv2D(vlad_clusters, (7, 1),
                                         strides=(1, 1),
                                         kernel_initializer='orthogonal',
                                         use_bias=True, trainable=True,
                                         kernel_regularizer=keras.regularizers.l2(weight_decay),
                                         bias_regularizer=keras.regularizers.l2(weight_decay),
                                         name='vlad_center_assignment')(x)
        x = VladPooling(k_centers=vlad_clusters, mode='vlad', name='vlad_pool')([x_fc, x_k_center])

    elif aggregation == 'gvlad':
        x_k_center = keras.layers.Conv2D(vlad_clusters+ghost_clusters, (7, 1),
                                         strides=(1, 1),
                                         kernel_initializer='orthogonal',
                                         use_bias=True, trainable=True,
                                         kernel_regularizer=keras.regularizers.l2(weight_decay),
                                         bias_regularizer=keras.regularizers.l2(weight_decay),
                                         name='gvlad_center_assignment')(x)
        print(x_k_center)
        x = VladPooling(k_centers=vlad_clusters, g_centers=ghost_clusters, mode='gvlad', name='gvlad_pool')([x_fc, x_k_center])
        print(x)

    else:
        raise IOError('==> unknown aggregation mode')

    # ===============================================
    #            Fully Connected Block 2
    # ===============================================
    x = keras.layers.Dense(bottleneck_dim, activation='relu',
                           kernel_initializer='orthogonal',
                           use_bias=True, trainable=True,
                           kernel_regularizer=keras.regularizers.l2(weight_decay),
                           bias_regularizer=keras.regularizers.l2(weight_decay),
                           name='fc6')(x)
    
    x_l2 = keras.layers.Lambda(lambda x: K.l2_normalize(x, 1))(x)
    y = keras.layers.Dense(num_class,
                           kernel_initializer='orthogonal',
                           use_bias=False, trainable=True,
                           kernel_constraint=keras.constraints.unit_norm(),
                           kernel_regularizer=keras.regularizers.l2(weight_decay),
                           bias_regularizer=keras.regularizers.l2(weight_decay),
                           name='prediction')(x_l2)

    if mode == 'eval':
        y = keras.layers.Lambda(lambda x: keras.backend.l2_normalize(x, 1))(x)
        
    return y

#     model = keras.models.Model(inputs, y, name='vggvox_resnet2D_{}_{}'.format(loss, aggregation))

#     if mode == 'train':
#         if mgpu > 1:
#             model = ModelMGPU(model, gpus=mgpu)
#         # set up optimizer.
#         if args.optimizer == 'adam':  opt = keras.optimizers.Adam(lr=1e-3)
#         elif args.optimizer =='sgd':  opt = keras.optimizers.SGD(lr=0.1, momentum=0.9, decay=0.0, nesterov=True)
#         else: raise IOError('==> unknown optimizer type')
#         model.compile(optimizer=opt, loss=trnloss, metrics=['acc'])
#     return model

In [12]:
class Model:
    def __init__(self):
        self.X = tf.placeholder(tf.float32, [None, 257, None, 1])
        
        params = {'dim': (257, None, 1),
            'nfft': 512,
            'spec_len': 250,
            'win_length': 400,
            'hop_length': 160,
            'n_classes': 5994,
            'sampling_rate': 16000,
            'normalize': True,
        }
        self.logits = vggvox_resnet2d_icassp(self.X, num_class=2, mode='train')
        self.logits = tf.identity(self.logits, name = 'logits')

In [13]:
ckpt_path = 'v2/vggvox.ckpt'

In [14]:
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model()
sess.run(tf.global_variables_initializer())



Tensor("mpool2/MaxPool:0", shape=(?, 7, ?, 512), dtype=float32)
Tensor("x_fc/Relu:0", shape=(?, 1, ?, 512), dtype=float32)
Tensor("gvlad_center_assignment/BiasAdd:0", shape=(?, 1, ?, 10), dtype=float32)
Tensor("gvlad_pool/Reshape:0", shape=(?, 4096), dtype=float32)


In [9]:
model.logits

<tf.Tensor 'logits:0' shape=(?, 2) dtype=float32>

In [14]:
var_lists = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
var_lists = [v for v in var_lists if 'prediction' not in v.name]

In [15]:
var_lists

[<tf.Variable 'conv1_1/3x3_s1/kernel:0' shape=(7, 7, 1, 64) dtype=float32>,
 <tf.Variable 'conv1_1/3x3_s1/bn/gamma:0' shape=(64,) dtype=float32>,
 <tf.Variable 'conv1_1/3x3_s1/bn/beta:0' shape=(64,) dtype=float32>,
 <tf.Variable 'conv1_1/3x3_s1/bn/moving_mean:0' shape=(64,) dtype=float32>,
 <tf.Variable 'conv1_1/3x3_s1/bn/moving_variance:0' shape=(64,) dtype=float32>,
 <tf.Variable 'conv2_a_1x1_reduce/kernel:0' shape=(1, 1, 64, 48) dtype=float32>,
 <tf.Variable 'conv2_a_1x1_reduce/bn/gamma:0' shape=(48,) dtype=float32>,
 <tf.Variable 'conv2_a_1x1_reduce/bn/beta:0' shape=(48,) dtype=float32>,
 <tf.Variable 'conv2_a_1x1_reduce/bn/moving_mean:0' shape=(48,) dtype=float32>,
 <tf.Variable 'conv2_a_1x1_reduce/bn/moving_variance:0' shape=(48,) dtype=float32>,
 <tf.Variable 'conv2_a_3x3/kernel:0' shape=(3, 3, 48, 48) dtype=float32>,
 <tf.Variable 'conv2_a_3x3/bn/gamma:0' shape=(48,) dtype=float32>,
 <tf.Variable 'conv2_a_3x3/bn/beta:0' shape=(48,) dtype=float32>,
 <tf.Variable 'conv2_a_3x3/bn/

In [16]:
var_lists = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
saver = tf.train.Saver(var_list = var_lists)
saver.restore(sess, ckpt_path)

INFO:tensorflow:Restoring parameters from v2/vggvox.ckpt


In [13]:
import librosa
import numpy as np

# ===============================================
#       code from Arsha for loading data.
# ===============================================
def load_wav(vid_path, sr, mode='train'):
    wav, sr_ret = librosa.load(vid_path, sr=sr)
    assert sr_ret == sr
    if mode == 'train':
        extended_wav = np.append(wav, wav)
        if np.random.random() < 0.3:
            extended_wav = extended_wav[::-1]
        return extended_wav
    else:
        extended_wav = np.append(wav, wav[::-1])
        return extended_wav


def lin_spectogram_from_wav(wav, hop_length, win_length, n_fft=1024):
    linear = librosa.stft(wav, n_fft=n_fft, win_length=win_length, hop_length=hop_length) # linear spectrogram
    return linear.T


def load_data(wav, win_length=400, sr=16000, hop_length=160, n_fft=512, spec_len=250, mode='train'):
    # wav = load_wav(path, sr=sr, mode=mode)
    linear_spect = lin_spectogram_from_wav(wav, hop_length, win_length, n_fft)
    mag, _ = librosa.magphase(linear_spect)  # magnitude
    mag_T = mag.T
    freq, time = mag_T.shape
    if mode == 'train':
        if time < spec_len:
            spec_mag = np.pad(mag_T, ((0, 0), (0, spec_len - time)), 'constant')
        else:
            spec_mag = mag_T
    else:
        spec_mag = mag_T
    # preprocessing, subtract mean, divided by time-wise var
    mu = np.mean(spec_mag, 0, keepdims=True)
    std = np.std(spec_mag, 0, keepdims=True)
    return (spec_mag - mu) / (std + 1e-5)


In [14]:
load_data(np.random.normal(size = (16000 * 10))).shape

(257, 1001)

In [11]:
from glob import glob
import numpy as np

files = glob('data/wav/enroll/*.wav')
len(files)

3

In [12]:
from glob import glob
import numpy as np

files = glob('data/wav/enroll/*.wav')
wavs = [load_data(wav, mode = 'eval') for wav in files]
[wav.shape for wav in wavs]

[(257, 2538), (257, 3006), (257, 1936)]

In [13]:
def pred(x):
    return sess.run(model.logits, feed_dict = {model.X: np.expand_dims([x], -1)})

r = [pred(wav) for wav in wavs]
r = np.concatenate(r)
r.shape

(3, 512)

In [14]:
from scipy.spatial.distance import cdist

cdist(r, r, metric='cosine')

array([[0.00000000e+00, 3.62801496e-01, 3.06282490e-01],
       [3.62801496e-01, 2.22044605e-16, 3.22202758e-01],
       [3.06282490e-01, 3.22202758e-01, 2.22044605e-16]])

In [15]:
import random

files = glob('../voxceleb/aac/**/*.m4a', recursive = True)
files = random.sample(files, 10000)
len(files)

10000

In [1]:
!rm -rf test-*.wav pickle-*.pkl

In [17]:
import pickle
import mp
from tqdm import tqdm
from pydub import AudioSegment

def loop(args):
    files = args[0]
    index = args[1]
    results = []
    for file in tqdm(files):
        
        audio = AudioSegment.from_file(file[1])
        audio.set_frame_rate(16000).set_channels(1).export(f'test-{index}.wav', format="wav")
        l = load_data(f'test-{index}.wav', mode = 'eval')
        results.append((file[0], file[1], l))
        
    with open(f'pickle-{index}.pkl', 'wb') as fopen:
        pickle.dump(results, fopen)

files_index = [(no, f) for no, f in enumerate(files)]
r = mp.multiprocessing(files_index, loop, cores = 50)

100%|██████████| 200/200 [02:30<00:00,  1.33it/s]
100%|██████████| 200/200 [02:30<00:00,  1.37it/s]
 98%|█████████▊| 197/200 [02:30<00:02,  1.45it/s]
100%|██████████| 200/200 [02:31<00:00,  1.32it/s]
 98%|█████████▊| 195/200 [02:31<00:03,  1.32it/s]
100%|██████████| 200/200 [02:32<00:00,  1.32it/s]
100%|██████████| 200/200 [02:32<00:00,  1.31it/s]
100%|██████████| 200/200 [02:32<00:00,  1.31it/s]
 98%|█████████▊| 197/200 [02:32<00:02,  1.22it/s]
100%|██████████| 200/200 [02:32<00:00,  1.10it/s]
100%|██████████| 200/200 [02:32<00:00,  1.31it/s]
100%|██████████| 200/200 [02:32<00:00,  1.31it/s]
100%|██████████| 200/200 [02:32<00:00,  1.31it/s]
 98%|█████████▊| 197/200 [02:32<00:02,  1.22it/s]
100%|██████████| 200/200 [02:32<00:00,  1.31it/s]
100%|█████████▉| 199/200 [02:33<00:00,  1.24it/s]
100%|██████████| 200/200 [02:33<00:00,  1.31it/s]
100%|█████████▉| 199/200 [02:33<00:00,  1.26it/s]
100%|██████████| 200/200 [02:33<00:00,  1.30it/s]

100%|██████████| 200/200 [02:33<00:00,  1.30it/s]

TypeError: 'NoneType' object is not iterable

In [18]:
!rm -rf test-*.wav

In [19]:
import pandas as pd

df = pd.read_csv('../voxceleb/vox2_meta.csv')
df = df[df['Set '] == 'test ']
speakers = df['VoxCeleb2 ID '].unique().tolist()
speakers = [s.strip() for s in speakers]

In [20]:
from collections import defaultdict

speakers_idx = defaultdict(list)

for speaker in speakers:
    for file in files:
        if speaker in file:
            speakers_idx[speaker].append(file)

In [21]:
from tqdm import tqdm


k = 10
labels = []

def get_id(file):
    return file.split('/')[3]

for file in tqdm(files):
    left_speaker = get_id(file)
    for speaker in speakers:
        if left_speaker == speaker:
            label = 1
        else:
            label = 0
        samples = random.sample(speakers_idx[speaker], min(k, len(speakers_idx[speaker])))
        for s in samples:
            labels.append((label, file, s))

100%|██████████| 10000/10000 [00:17<00:00, 566.86it/s]


In [22]:
random.shuffle(labels)

In [23]:
import itertools
import pickle

pickles = glob('pickle-*.pkl')

pooled = []
for p in pickles:
    with open(p, 'rb') as fopen:
        pooled.append(pickle.load(fopen))
        
pooled = list(itertools.chain(*pooled))

In [24]:
mapping = {i[1]: pred(i[2]) for i in tqdm(pooled)}

100%|██████████| 10000/10000 [04:36<00:00, 36.11it/s]


In [25]:
scores, ls = [], []

for i in tqdm(range(len(labels))):
    ls.append(labels[i][0])
    scores.append(np.sum(mapping[labels[i][1]][0] * mapping[labels[i][2]][0]))

100%|██████████| 11710000/11710000 [02:17<00:00, 85377.21it/s]


In [26]:
def calculate_eer(y, y_score):
    
    from scipy.optimize import brentq
    from sklearn.metrics import roc_curve
    from scipy.interpolate import interp1d

    fpr, tpr, thresholds = roc_curve(y, y_score, pos_label=1)
    eer = brentq(lambda x : 1. - x - interp1d(fpr, tpr)(x), 0., 1.)
    thresh = interp1d(fpr, thresholds)(eer)
    return eer, thresh

In [27]:
calculate_eer(ls, scores)

(0.044729718189581553, array(0.76060986))