In [1]:
# !wget https://f000.backblazeb2.com/file/malay-dataset/voxceleb/voxceleb2-test-sample.json
# !wget https://f000.backblazeb2.com/file/malay-dataset/voxceleb/voxceleb2-test-labels.pkl

In [2]:
import os

# Process-wide settings, applied before TensorFlow / malaya_speech are imported
# in the next cell. An empty CUDA_VISIBLE_DEVICES exposes no GPU to CUDA, so
# the notebook runs on CPU. MALAYA_USE_HUGGINGFACE is read by the malaya
# libraries (presumably selects HuggingFace as the download source - confirm).
os.environ.update({
    'CUDA_VISIBLE_DEVICES': '',
    'MALAYA_USE_HUGGINGFACE': 'true',
})

In [3]:
import json
import pickle

import malaya_speech
import malaya_speech.train.model.conformer as conformer
import numpy as np
import tensorflow as tf
from tqdm import tqdm

In [4]:
# Load the VoxCeleb2 test-sample JSON (same file name as the commented-out
# download in the first cell, read here from a local directory).
with open('/home/husein/youtube/voxceleb2-test-sample.json') as sample_fh:
    sample_files = json.load(sample_fh)

In [5]:
# Load the verification trial list. Later cells index each entry as
# [0] = target label and [1], [2] = the two audio files being compared.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only run this against a trusted copy of voxceleb2-test-labels.pkl.
with open('/home/husein/youtube/voxceleb2-test-labels.pkl', 'rb') as labels_fh:
    labels = pickle.load(labels_fh)

In [6]:
# Speech featurizer from malaya_speech, with per-feature normalisation enabled.
featurizer = malaya_speech.tf_featurization.STTFeaturizer(
    normalize_per_feature = True
)
# Graph inputs: X is a float32 rank-2 batch (one row per utterance; rows are
# presumably padded to a common width), X_len holds each row's valid length.
X = tf.compat.v1.placeholder(tf.float32, [None, None], name = 'X_placeholder')
X_len = tf.compat.v1.placeholder(tf.int32, [None], name = 'X_len_placeholder')
batch_size = tf.shape(X)[0]
# Rows are trimmed to different lengths before featurization, so `features`
# is created with infer_shape = False to accept differently shaped elements.
features = tf.TensorArray(dtype = tf.float32, size = batch_size, dynamic_size = True, infer_shape = False)
features_len = tf.TensorArray(dtype = tf.int32, size = batch_size)

# Loop state: (row index, per-row features, per-row feature lengths).
init_state = (0, features, features_len)

def condition(i, features, features_len):
    """Loop guard: keep iterating while row index ``i`` is inside the batch."""
    return i < batch_size

def body(i, features, features_len):
    """Featurize row ``i`` (trimmed to ``X_len[i]``) and store the result and its length."""
    f = featurizer(X[i, :X_len[i]])
    f_len = tf.shape(f)[0]
    return i + 1, features.write(i, f), features_len.write(i, f_len)

_, features, features_len = tf.while_loop(condition, body, init_state)
# Turn the collected lengths into a single tensor; `features` itself stays a
# TensorArray and is read element by element further down in this cell.
features_len = features_len.stack()
# Second pass: zero-pad every per-row feature matrix along axis 0 to the
# longest one in the batch so they can be stacked into one dense tensor.
padded_features = tf.TensorArray(dtype = tf.float32, size = batch_size)
padded_lens = tf.TensorArray(dtype = tf.int32, size = batch_size)
maxlen = tf.reduce_max(features_len)

# Loop state: (row index, padded features, unpadded lengths).
init_state = (0, padded_features, padded_lens)

# The callbacks get their own names so they do not shadow the `condition` /
# `body` functions defined for the featurization loop earlier in this cell.
def pad_condition(i, padded_features, padded_lens):
    """Loop guard: keep iterating while row index ``i`` is inside the batch."""
    return i < batch_size

def pad_body(i, padded_features, padded_lens):
    """Pad row ``i`` with trailing zeros up to ``maxlen`` and record its original length."""
    f = features.read(i)
    len_f = tf.shape(f)[0]
    # Pad only at the end of axis 0; the second axis is left untouched.
    f = tf.pad(f, [[0, maxlen - len_f], [0, 0]])
    return i + 1, padded_features.write(i, f), padded_lens.write(i, len_f)

_, padded_features, padded_lens = tf.while_loop(pad_condition, pad_body, init_state)
padded_features = padded_features.stack()
padded_lens = padded_lens.stack()
# Pin the static shapes explicitly: lengths are rank 1, features are rank 3
# with a last dimension of 80 (presumably the featurizer's feature count - confirm).
padded_lens.set_shape((None,))
padded_features.set_shape((None, None, 80))
# Append a trailing axis of size 1, giving a rank-4 tensor.
padded_features = tf.expand_dims(padded_features, -1)
padded_features, padded_lens

(<tf.Tensor 'ExpandDims:0' shape=(?, ?, 80, 1) dtype=float32>,
 <tf.Tensor 'TensorArrayStack_2/TensorArrayGatherV3:0' shape=(?,) dtype=int32>)

In [7]:
# Wrap both tensors in identity ops with explicit names ('padded_features',
# 'padded_lens'), replacing the auto-generated op names they had so far.
padded_features = tf.identity(padded_features, name = 'padded_features')
padded_lens = tf.identity(padded_lens, name = 'padded_lens')

In [8]:
# Build the Conformer model from malaya_speech's `conformer_tiny_encoder_config`,
# passing no kernel or bias regularizer.
config = malaya_speech.config.conformer_tiny_encoder_config
conformer_model = conformer.Model(
    kernel_regularizer=None, bias_regularizer=None, **config
)

In [9]:
# Apply the Conformer model to the padded feature batch. `seq` is indexed as a
# rank-3 tensor in the following cell.
seq = conformer_model(padded_features)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


In [10]:
# Utterance embedding head: take the encoder output at index 0 of axis 1 and
# project it linearly (no activation, with bias) to `embedding_dim` values.
embedding_dim = 512
first_token_tensor = tf.squeeze(seq[:, 0:1, :], axis=1)
projection = tf.keras.layers.Dense(
    embedding_dim,
    activation=None,
    use_bias=True,
    trainable=True,
)
pooled_output = projection(first_token_tensor)
pooled_output

<tf.Tensor 'dense/BiasAdd:0' shape=(?, 512) dtype=float32>

In [11]:
# L2-normalise each embedding along axis 1. With unit-length vectors, the
# dot product computed in the scoring cell is the cosine similarity.
y = tf.keras.layers.Lambda(lambda x: tf.keras.backend.l2_normalize(x, 1))(pooled_output)
y

<tf.Tensor 'lambda/l2_normalize:0' shape=(?, 512) dtype=float32>

In [12]:
# Open a graph-mode session and initialise every global variable. The TF1
# entry points are reached through tf.compat.v1, matching the placeholders
# created with tf.compat.v1.placeholder when the graph was built.
sess = tf.compat.v1.Session()
sess.run(tf.compat.v1.global_variables_initializer())

In [13]:
# Restore all global variables from the trained checkpoint. The TF1 APIs are
# reached through tf.compat.v1, matching the tf.compat.v1.placeholder calls
# used when the graph was built.
var_list = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.GLOBAL_VARIABLES)
saver = tf.compat.v1.train.Saver(var_list = var_list)
saver.restore(sess, 'conformer-tiny-voxceleb/model.ckpt-1250002')

INFO:tensorflow:Restoring parameters from conformer-tiny-voxceleb/model.ckpt-1250002


In [14]:
# Every distinct audio file referenced by the trial list (all elements after
# the label in each entry). Sorting makes the order - and therefore
# unique_files[0] and the embedding loop's progress - reproducible between
# runs, which a bare list(set(...)) does not guarantee.
# Assumes the entries are mutually comparable (they are passed to
# malaya_speech.load below, so presumably path strings - confirm).
unique_files = sorted({path for trial in labels for path in trial[1:]})

In [15]:
# Load a single file to use as input for the timed forward pass below.
# malaya_speech.load returns a pair; only the first element is used
# (presumably the waveform, with the second being the sample rate - confirm).
f = unique_files[0]
y_, _ = malaya_speech.load(f)

In [16]:
%%time

# One forward pass on a batch of a single utterance, fetching both the raw
# projection and its L2-normalised version; X_len carries that row's length.
sess.run([pooled_output, y], feed_dict = {X: [y_], X_len: [len(y_)]})

CPU times: user 2.05 s, sys: 71.3 ms, total: 2.12 s
Wall time: 1.83 s


[array([[-0.27380615,  1.607637  , -3.288402  ,  5.8010764 , -3.437999  ,
         -1.6338171 , -4.731016  ,  1.6971414 , -0.32794702, -2.806893  ,
          1.2548681 , -0.25408542,  3.234968  ,  1.2450624 , -0.59769577,
         -0.27951103,  1.6363953 , -1.9033945 ,  6.985413  , -5.0791793 ,
         -2.2417417 ,  1.174528  ,  1.3528355 , -2.1078784 ,  3.1129322 ,
          3.296452  ,  4.5632777 ,  8.034816  ,  3.383371  , -6.3480396 ,
         -1.6283656 ,  2.9684076 , -5.8292046 , -4.7081738 ,  1.6194836 ,
         10.09145   , -8.597065  , -1.7172351 ,  1.8048071 , -0.7224511 ,
         -0.07827301, -1.9072077 , -0.22544223, -1.5183424 , -3.5404978 ,
          4.245466  , -1.0593115 ,  1.9407766 , -0.7766394 , -8.006143  ,
         -7.1185207 , -0.30076015, -3.4765565 , -5.9257374 , -6.3389893 ,
         -2.4199643 ,  0.633995  ,  4.055876  , -0.75152355,  6.7704477 ,
          3.3773806 ,  0.23319803,  4.494203  ,  1.4519562 , -1.4837874 ,
         -2.5624576 , -3.3343213 ,  4.

In [17]:
# Embed every unique file once, one utterance per session call, and keep the
# normalised embedding keyed by file. An explicit loop is used so that
# `vectors` keeps whatever has been computed if the cell is interrupted.
vectors = {}
for wav_path in tqdm(unique_files):
    waveform, _unused = malaya_speech.load(wav_path)
    batch_embedding = sess.run(y, feed_dict = {X: [waveform], X_len: [len(waveform)]})
    # The batch holds a single row; store that row.
    vectors[wav_path] = batch_embedding[0]

 19%|█▊        | 6737/36237 [14:13<1:14:02,  6.64it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 45%|████▍     | 16128/36237 [34:33<47:05,  7.12it/s]  IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 63%|██████▎   | 22742/36237 [48:29<28:53,  7.79it/s]  IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--Notebo

In [18]:
scores, ls = [], []

# Score each trial as the dot product of its two embeddings (they are
# L2-normalised, so this is the cosine similarity). `np` comes from the
# notebook's imports cell.
for trial in tqdm(labels):
    target, left, right = trial[0], trial[1], trial[2]
    # Trials with a missing embedding are skipped, e.g. when the embedding
    # loop was interrupted before covering every file.
    if left in vectors and right in vectors:
        ls.append(target)
        scores.append(np.sum(vectors[left] * vectors[right]))

100%|██████████| 5900000/5900000 [01:07<00:00, 87852.55it/s] 


In [19]:
# Number of trials actually scored; equals len(labels) when none were skipped.
len(scores)

5900000

In [20]:
def calculate_eer(y, y_score):
    """Compute the equal error rate (EER) and the score threshold at that point.

    The EER is the point on the ROC curve where the false-positive rate
    equals the false-negative rate, i.e. where ``1 - fpr == tpr``.

    Parameters
    ----------
    y : array-like
        Ground-truth labels; ``1`` marks the positive class.
    y_score : array-like
        Scores, one per label, passed to ``sklearn.metrics.roc_curve``.

    Returns
    -------
    eer : float
        False-positive rate at which ``1 - fpr == tpr``.
    thresh : numpy.ndarray
        0-d array holding the score threshold linearly interpolated at ``eer``.

    Raises
    ------
    ValueError
        Propagated from ``brentq`` when ``1 - x - tpr(x)`` has the same sign
        at ``x = 0`` and ``x = 1``.
    """
    from scipy.interpolate import interp1d
    from scipy.optimize import brentq
    from sklearn.metrics import roc_curve

    fpr, tpr, thresholds = roc_curve(y, y_score, pos_label=1)

    # Build the ROC interpolator once; the root finder evaluates it many
    # times and the curve can contain a very large number of points.
    tpr_at = interp1d(fpr, tpr)
    eer = brentq(lambda x: 1. - x - tpr_at(x), 0., 1.)

    # NOTE(review): roc_curve's first threshold is an artificial value, so
    # `thresh` is unreliable if the EER falls on the first ROC segment -
    # confirm against the installed scikit-learn version.
    thresh = interp1d(fpr, thresholds)(eer)
    return eer, thresh

In [21]:
# Equal error rate and the matching score threshold over all scored trials.
calculate_eer(ls, scores)

(0.08687999999999804, array(0.40151393))