In [1]:
# One-time download of the VoxCeleb2 test sample list and labels (kept commented out).
# NOTE(review): wget saves into the current working directory, while the loading cells
# below read from /home/husein/youtube/ — move the files or adjust those paths.
# !wget https://f000.backblazeb2.com/file/malay-dataset/voxceleb/voxceleb2-test-sample.json
# !wget https://f000.backblazeb2.com/file/malay-dataset/voxceleb/voxceleb2-test-labels.pkl

In [2]:
import os

# Environment switches, applied before the TensorFlow / malaya_speech imports below.
os.environ.update({
    # An empty device list hides every GPU from CUDA, so the graph runs on CPU.
    'CUDA_VISIBLE_DEVICES': '',
    # Presumably consulted by malaya_speech when it is imported — TODO confirm.
    'MALAYA_USE_HUGGINGFACE': 'true',
})

In [3]:
import malaya_speech
import json
import pickle
from tqdm import tqdm
import tensorflow as tf
import malaya_speech.train.model.conformer as conformer

In [4]:
# Parse the VoxCeleb2 test sample list from its JSON file (hard-coded local path).
with open('/home/husein/youtube/voxceleb2-test-sample.json') as sample_fp:
    sample_files = json.loads(sample_fp.read())

In [5]:
# Deserialize the label rows used for scoring further down (hard-coded local path).
# NOTE(review): unpickling can execute arbitrary code — only load this file from a trusted source.
with open('/home/husein/youtube/voxceleb2-test-labels.pkl', 'rb') as labels_fp:
    labels = pickle.loads(labels_fp.read())

In [6]:
# Graph-mode input pipeline: raw waveforms -> per-utterance features -> one zero-padded
# batch tensor of shape (batch, time, 80, 1) plus the unpadded feature lengths.
featurizer = malaya_speech.tf_featurization.STTFeaturizer(
    normalize_per_feature = True
)
# X holds one waveform per row; X_len holds the number of valid samples in each row.
X = tf.compat.v1.placeholder(tf.float32, [None, None], name = 'X_placeholder')
X_len = tf.compat.v1.placeholder(tf.int32, [None], name = 'X_len_placeholder')
batch_size = tf.shape(X)[0]
# infer_shape = False because each utterance yields a different number of frames.
features = tf.TensorArray(dtype = tf.float32, size = batch_size, dynamic_size = True, infer_shape = False)
features_len = tf.TensorArray(dtype = tf.int32, size = batch_size)

init_state = (0, features, features_len)

# Pass 1: featurize every row on its own and record its frame count.
def condition(i, features, features_len):
    return i < batch_size

def body(i, features, features_len):
    # Slice row i down to its true length so trailing padding is not featurized.
    f = featurizer(X[i, :X_len[i]])
    f_len = tf.shape(f)[0]
    return i + 1, features.write(i, f), features_len.write(i, f_len)

_, features, features_len = tf.while_loop(condition, body, init_state)
features_len = features_len.stack()
padded_features = tf.TensorArray(dtype = tf.float32, size = batch_size)
padded_lens = tf.TensorArray(dtype = tf.int32, size = batch_size)
# Longest feature sequence in the batch; every entry is padded up to this length.
maxlen = tf.reduce_max(features_len)

init_state = (0, padded_features, padded_lens)

# Pass 2: `condition` and `body` are deliberately redefined here for the padding loop.
def condition(i, padded_features, padded_lens):
    return i < batch_size

def body(i, padded_features, padded_lens):
    f = features.read(i)
    # Length is taken before padding, so padded_lens holds the true frame counts.
    len_f = tf.shape(f)[0]
    # Zero-pad at the end of the time axis only; the feature axis is left untouched.
    f = tf.pad(f, [[0, maxlen - tf.shape(f)[0]], [0,0]])
    return i + 1, padded_features.write(i, f), padded_lens.write(i, len_f)

_, padded_features, padded_lens = tf.while_loop(condition, body, init_state)
padded_features = padded_features.stack()
padded_lens = padded_lens.stack()
# Restore the static shape information that was lost inside the while loops.
padded_lens.set_shape((None,))
padded_features.set_shape((None, None, 80))
# Append a trailing axis of size 1, giving (batch, time, 80, 1).
padded_features = tf.expand_dims(padded_features, -1)
# Last expression of the cell: display both tensors.
padded_features, padded_lens

(<tf.Tensor 'ExpandDims:0' shape=(?, ?, 80, 1) dtype=float32>,
 <tf.Tensor 'TensorArrayStack_2/TensorArrayGatherV3:0' shape=(?,) dtype=int32>)

In [7]:
# Re-expose both pipeline outputs through identity ops with fixed, explicit graph names.
padded_features = tf.identity(padded_features, name = 'padded_features')
padded_lens = tf.identity(padded_lens, name = 'padded_lens')

In [8]:
# Build the Conformer encoder from malaya_speech's base encoder config, with kernel and
# bias regularizers explicitly disabled.
config = malaya_speech.config.conformer_base_encoder_config
conformer_model = conformer.Model(
    kernel_regularizer=None, bias_regularizer=None, **config
)

In [9]:
# Run the encoder on the padded batch. Only the features are passed — padded_lens is not —
# so for batches larger than one the model also sees the zero-padded frames.
seq = conformer_model(padded_features)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


In [10]:
# Width of the speaker embedding produced by the projection below.
embedding_dim = 512
# Keep only the first time step of the encoder output: slice axis 1 to length 1, then drop it.
first_token_tensor = tf.squeeze(seq[:, 0:1, :], axis=1)
# Linear projection (no activation) of that time step to `embedding_dim` units.
# NOTE(review): the layer uses Keras' default name ('dense' in the displayed tensor); the
# checkpoint restored later presumably stores its weights under that name — confirm before renaming.
pooled_output = tf.keras.layers.Dense(embedding_dim, activation=None,
                                   use_bias=True, trainable=True)(first_token_tensor)
pooled_output

<tf.Tensor 'dense/BiasAdd:0' shape=(?, 512) dtype=float32>

In [11]:
# L2-normalise each embedding along axis 1, so every row of `y` has unit length and a dot
# product between two rows is their cosine similarity.
y = tf.keras.layers.Lambda(lambda x: tf.keras.backend.l2_normalize(x, 1))(pooled_output)
y

<tf.Tensor 'lambda/l2_normalize:0' shape=(?, 512) dtype=float32>

In [12]:
# Create the session through the tf.compat.v1 namespace, matching the placeholders built
# earlier in this notebook (the top-level aliases are deprecated in late TF 1.x releases).
# The session is intentionally left open: later cells reuse `sess` for inference.
sess = tf.compat.v1.Session()
# Give every global variable an initial value; the next cell overwrites them from the checkpoint.
sess.run(tf.compat.v1.global_variables_initializer())

In [13]:
# Restore every global variable of the graph from the trained checkpoint. The calls go
# through tf.compat.v1, consistent with the placeholders created in the pipeline cell.
var_list = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.GLOBAL_VARIABLES)
saver = tf.compat.v1.train.Saver(var_list = var_list)
# Checkpoint prefix is relative to the notebook's working directory.
saver.restore(sess, 'conformer-base-voxceleb/model.ckpt-1675000')

INFO:tensorflow:Restoring parameters from conformer-base-voxceleb/model.ckpt-1675000


In [14]:
# Every file referenced by at least one label row; element 0 of a row is the label itself,
# the remaining elements are the files being compared.
# sorted() makes the order reproducible: iterating a bare set of strings changes between
# kernel runs (hash randomisation), which made `unique_files[0]` and the embedding loop
# order differ from run to run.
unique_files = sorted({path for trial in labels for path in trial[1:]})

In [15]:
# Load a single file as a sample input for the timing check in the next cell.
f = unique_files[0]
# Only the first value returned by malaya_speech.load is kept (the waveform fed to X below);
# the second is discarded — presumably the sample rate, TODO confirm.
y_, _ = malaya_speech.load(f)

In [16]:
%%time

# Embed the single sample as a batch of one; [0] picks that only row of the result.
sess.run(y, feed_dict = {X: [y_], X_len: [len(y_)]})[0]

CPU times: user 3.67 s, sys: 209 ms, total: 3.88 s
Wall time: 3 s


array([-6.89724609e-02, -3.41668800e-02,  1.17649697e-02, -1.69185493e-02,
       -5.91135807e-02, -2.39978246e-02,  5.83983085e-04,  4.95173261e-02,
       -1.70719624e-02, -1.10947946e-02,  4.80053276e-02, -3.50449383e-02,
        5.55856079e-02,  4.35475670e-02, -2.47186664e-02, -1.64312571e-02,
        8.25982243e-02, -5.65200374e-02,  2.26350799e-02, -3.72865200e-02,
       -2.73708701e-02,  3.23672295e-02, -1.50604751e-02,  4.46702018e-02,
        2.56585553e-02, -2.41122022e-02,  3.23291798e-03, -5.92382066e-02,
        4.89351638e-02,  1.85456909e-02, -3.37917097e-02,  5.19451462e-02,
       -3.47043835e-02, -8.33684206e-03, -2.23122630e-02,  6.37616357e-03,
        4.61544609e-04,  1.05786426e-02,  7.47852549e-02,  2.40048976e-03,
        3.59942131e-02,  7.58860633e-02,  7.15874061e-02, -7.22227395e-02,
       -3.76559794e-02,  6.61615357e-02,  3.20036970e-02, -2.82124709e-02,
        4.59266976e-02,  8.52411555e-04,  2.66959537e-02,  3.37800793e-02,
       -1.07908081e-02, -

In [17]:
# Embed every unique file, one file per session call (batch of one, so no padding is
# involved), and key the resulting embedding by the file's path.
vectors = {}
for f in tqdm(unique_files):
    waveform, _ = malaya_speech.load(f)
    vectors[f] = sess.run(y, feed_dict = {X: [waveform], X_len: [len(waveform)]})[0]

 28%|██▊       | 9982/36237 [39:33<1:49:35,  3.99it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 50%|████▉     | 17975/36237 [1:10:30<1:09:22,  4.39it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 67%|██████▋   | 24402/36237 [1:35:07<41:16,  4.78it/s]  IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--No

In [18]:
import numpy as np

# Score each label row: row[0] is the label, row[1] and row[2] are the two files compared.
# The score is the dot product of the two L2-normalised embeddings; rows whose files were
# not embedded are skipped.
scores, ls = [], []

for trial in tqdm(labels):
    left, right = trial[1], trial[2]
    if left in vectors and right in vectors:
        ls.append(trial[0])
        scores.append(np.sum(vectors[left] * vectors[right]))

100%|██████████| 5900000/5900000 [00:51<00:00, 114230.16it/s]


In [19]:
# Sanity check: number of scored label rows and number of embedded files.
len(scores), len(vectors)

(5900000, 36237)

In [20]:
def calculate_eer(y, y_score):
    """
    Compute the equal error rate (EER) for binary labels and scores.

    Parameters
    ----------
    y : array-like
        Ground-truth labels; 1 is treated as the positive class.
    y_score : array-like
        One score per label, passed to `roc_curve` as the decision score.

    Returns
    -------
    eer : float
        False-positive rate at the point where it equals 1 - true-positive rate.
    thresh : numpy.ndarray
        0-d array with the score threshold linearly interpolated at that point.

    Raises
    ------
    ValueError
        Propagated from `brentq` when no sign change is found on [0, 1].
    """
    from scipy.optimize import brentq
    from sklearn.metrics import roc_curve
    from scipy.interpolate import interp1d

    fpr, tpr, thresholds = roc_curve(y, y_score, pos_label=1)
    # Build the ROC interpolator once; it was previously rebuilt on every evaluation
    # the root finder made, which is wasted work on large ROC curves.
    tpr_at = interp1d(fpr, tpr)
    # Root of (1 - x) - tpr(x): the false-positive rate x at which FNR == FPR.
    eer = brentq(lambda x: 1. - x - tpr_at(x), 0., 1.)
    thresh = interp1d(fpr, thresholds)(eer)
    return eer, thresh

In [21]:
# Equal error rate and the matching score threshold over all scored label rows.
calculate_eer(ls, scores)

(0.06906000000015512, array(0.46428116))