### Robustness of a Speaker Verifier against Fake Inputs

In [1]:
from sklearn.metrics import roc_curve
import tensorflow as tf
import numpy as np
import random
import sys
import os

%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib

import pandas as pd
pd.options.display.float_format = '{:,.4f}'.format

In [2]:
# NOTE(review): `!` executes the command in a temporary subshell, so whatever `module load`
# changes in that shell's environment is presumably gone when the command returns and is not
# seen by the kernel or by later cells — TODO confirm ffmpeg is actually reachable afterwards.
! module load ffmpeg/intel/3.2.2

In [3]:
# Make the project checkout importable for the `models.*` / `helpers.*` imports that follow.
# The membership check keeps the cell idempotent: re-running it no longer appends the same
# entry to sys.path again and again.
project_root = "/beegfs/mm11333/dl-master-voices/"
if project_root not in sys.path:
    sys.path.append(project_root)

In [4]:
from models.verifier.vggvox import VggVox
from models.verifier.xvector import XVector
from models.verifier.resnet50vox import ResNet50Vox
from models.verifier.resnet34vox import ResNet34Vox

In [5]:
from helpers.audio import decode_audio, get_tf_spectrum, get_tf_filterbanks

In [6]:
from helpers.dataset import load_mv_data

#### > Retrieve thresholds

In [7]:
# Verifier checkpoints to evaluate, written as '<architecture>/<version>'.
nets = [
    'xvector/v001',
    'vggvox/v004',
    'resnet34vox/v001',
    'resnet50vox/v001',
]

In [8]:
def find_thr_far(far, value):
    """Return the index of the entry of `far` that lies closest to `value`.

    Args:
        far: array of false-acceptance rates, e.g. the first array returned by `roc_curve`.
        value: target rate to look up (the notebook uses 0.01).

    Returns:
        Index into `far` (and therefore into the matching thresholds array) of the
        closest entry; the first one wins on ties.
    """
    distance_to_target = np.abs(value - far)
    return distance_to_target.argmin()

In [9]:
# Read the per-network trial results CSV; the 'target' and 'similarity' columns of each
# frame are used further down to derive the decision thresholds.
results_file = 'test_vox1_sv_test.csv'
vox1_test_results = dict()
for net in nets:
    results_path = os.path.join('../data/pt_models', net, results_file)
    vox1_test_results[net] = pd.read_csv(results_path)

In [10]:
# Derive two decision thresholds per network from its ROC curve:
#   thrs_eer  - threshold where false-accept and false-reject rates are closest (EER point)
#   thrs_far1 - threshold whose false-accept rate is closest to 1%
thrs_eer, thrs_far1 = {}, {}
for net in nets:
    trials = vox1_test_results[net]
    far, tpr, thresholds = roc_curve(trials['target'].values, trials['similarity'].values)
    frr = 1 - tpr
    idx_eer = np.argmin(np.abs(far - frr))
    idx_far1 = find_thr_far(far, 0.01)
    thrs_eer[net], thrs_far1[net] = thresholds[idx_eer], thresholds[idx_far1]
    print(net, thrs_eer[net], thrs_far1[net], sep='\t')

xvector/v001	0.8433434963226318	0.8793687224388123
vggvox/v004	0.999350905418396	0.9997615218162536
resnet34vox/v001	0.8419269919395447	0.886158287525177
resnet50vox/v001	0.9969538450241088	0.9979440569877625


#### > Load speaker verifiers

In [11]:
# --- Verifier under test -------------------------------------------------------------
# Key into `nets` / the threshold dicts; also selects the model class and version to load.
net = 'resnet34vox/v001'

# --- Audio settings (handed to the model constructor; sample_rate is also the decode rate)
# `n_seconds` used to be assigned twice in this cell; it is now defined exactly once so the
# two copies cannot drift apart.
n_seconds = 3
sample_rate = 16000
# 'spectrum' selects get_tf_spectrum as the feature extractor, anything else get_tf_filterbanks.
mode = 'spectrum'

# --- Arguments forwarded to model.build(...) -------------------------------------------
classes = 5205
loss = 'softmax'
aggregation = 'gvlad'
vlad_clusters = 12
ghost_clusters = 2
weight_decay = 1e-4

# --- Data locations (passed to load_mv_data / np.load below) ----------------------------
mv_base_path = '/beegfs/mm11333/data/voxceleb2'
mv_meta = '../data/ad_voxceleb12/vox2_mv_data.npz'
audio_meta = '../data/ad_voxceleb12/vox12_meta_data.csv'

In [12]:
# Maps the architecture part of a '<architecture>/<version>' identifier to its model class.
available_nets = {
    'xvector': XVector,
    'vggvox': VggVox,
    'resnet50vox': ResNet50Vox,
    'resnet34vox': ResNet34Vox,
}

In [13]:
# Instantiate the verifier selected by `net` ('<architecture>/v<id>'), build it for inference
# and load its weights.
net_parts = net.split('/')
verifier_cls = available_nets[net_parts[0]]
version_id = int(net_parts[1].replace('v', ''))
model = verifier_cls(id=version_id, n_seconds=n_seconds, sample_rate=sample_rate)
model.build(
    classes=classes,
    loss=loss,
    aggregation=aggregation,
    vlad_clusters=vlad_clusters,
    ghost_clusters=ghost_clusters,
    weight_decay=weight_decay,
    training_phase=False,
)
# Prefix the model directory with '.' so that './data/...' becomes '../data/...'
# (compare the "created" and "loaded" lines in the cell output).
model.dir = '.' + model.dir
model.load()

> created model folder ./data/pt_models/resnet34vox/v001
> building resnet34vox model on 5205 classes
> built resnet34vox model on 5205 classes
> loading resnet34vox model
> loaded model from ../data/pt_models/resnet34vox/v001


#### > Test speaker verifiers against gan samples

In [14]:
# Load the master-voice test data; the four returned values are the audio samples, their
# labels, and the two collections used below to tell male from female users.
(x_mv_test,
 y_mv_test,
 male_x_mv_test,
 female_x_mv_test) = load_mv_data(mv_meta, mv_base_path, audio_meta)

Loading master voice data
> found 100000 paths from 1000 users
> loaded 10000 / 10000 audio files


In [15]:
# Sanity check on a single pair: similarity between one generated sample and one VoxCeleb1
# test utterance.
to_features = get_tf_spectrum if mode == 'spectrum' else get_tf_filterbanks
l2_norm = tf.keras.layers.Lambda(lambda emb: tf.keras.backend.l2_normalize(emb, 1))

audio_1 = decode_audio(
    '/beegfs/mm11333/dl-master-voices/data/vs_mv_data/gan_f-f_sv/v0/audio_original_female_0.wav',
    tgt_sample_rate=sample_rate,
).reshape((1, -1, 1))
audio_2 = decode_audio(
    '/beegfs/mm11333/data/voxceleb1/test/id10293/gegIAYxfpVA/00008.wav',
    tgt_sample_rate=sample_rate,
).reshape((1, -1, 1))

input_1 = to_features(audio_1)
input_2 = to_features(audio_2)
embs_1 = l2_norm(model.embed(input_1))
embs_2 = l2_norm(model.embed(input_2))

# Dot(normalize=True) yields the cosine similarity of the two embeddings.
pair_similarity = tf.keras.layers.Dot(axes=1, normalize=True)([embs_1, embs_2])
sims_1 = float(pair_similarity[0][0])

In [16]:
# Show the pair's similarity next to both thresholds and whether it reaches each of them.
(
    sims_1,
    thrs_eer[net],
    thrs_far1[net],
    sims_1 >= thrs_eer[net],
    sims_1 >= thrs_far1[net],
)

(0.7585431933403015, 0.8419269919395447, 0.886158287525177, False, False)

In [17]:
# Read the master-voice metadata archive and resolve its test paths against the dataset root.
mv_analysis_data = np.load(mv_meta)
mv_labels = mv_analysis_data['y_test']
mv_paths = list(
    map(lambda rel_path: os.path.join(mv_base_path, rel_path), mv_analysis_data['x_test'])
)
n_mv_users = len(np.unique(mv_labels))
print('> found', len(mv_paths), 'paths from', n_mv_users, 'users')

> found 100000 paths from 1000 users


In [18]:
# Impersonation rate of one probe audio against the female users of the master-voice test data.
# A user counts as impersonated when the probe's best similarity over that user's block of 10
# utterances reaches the EER threshold of the selected verifier.
imps_users = 0   # users matched at the EER threshold so far
curs_users = 0   # users evaluated so far
gan_audio_path = '/beegfs/mm11333/dl-master-voices/data/vs_mv_data/real_f-f_mv/v0/mv_00.wav'
# NOTE(review): derived from mv_paths / mv_labels but never used below — the loop slices
# x_mv_test in fixed groups of 10. Kept so the notebook's global state is unchanged.
samples_per_user = int(len(mv_paths) // len(np.unique(mv_labels)))

to_features = get_tf_spectrum if mode == 'spectrum' else get_tf_filterbanks
l2_norm = tf.keras.layers.Lambda(lambda emb: tf.keras.backend.l2_normalize(emb, 1))

# The probe never changes, so decode / featurise / embed it once instead of once per
# utterance (assumes decode_audio, the feature extractor and model.embed are deterministic).
audio_1 = decode_audio(gan_audio_path, tgt_sample_rate=sample_rate).reshape((1, -1, 1))
input_1 = to_features(audio_1)
embs_1 = l2_norm(model.embed(input_1))

for class_index, _ in enumerate(np.unique(y_mv_test)):
    if class_index not in female_x_mv_test:
        continue
    class_audio = x_mv_test[class_index*10:(class_index+1)*10]
    all_sims = []
    for audio_2 in class_audio:
        input_2 = to_features(audio_2)
        embs_2 = l2_norm(model.embed(input_2))
        # Dot(normalize=True) L2-normalises again, so the result is the cosine similarity.
        all_sims.append(float(tf.keras.layers.Dot(axes=1, normalize=True)([embs_1, embs_2])[0][0]))
    # Best-case match for the probe against this user.
    sims_1 = np.max(all_sims)
    if sims_1 >= thrs_eer[net]:
        imps_users += 1
    curs_users += 1
    print(imps_users / curs_users, '>', 'User', class_index, 'Sex', 'M' if class_index in male_x_mv_test else 'F', '(', class_index*10, (class_index+1)*10-1, ')', sims_1, thrs_eer[net], thrs_far1[net], sims_1 >= thrs_eer[net], sims_1 >= thrs_far1[net])

0.0 > User 0 Sex F ( 0 9 ) 0.8362574577331543 0.8419269919395447 0.886158287525177 False False
0.5 > User 3 Sex F ( 30 39 ) 0.8711940050125122 0.8419269919395447 0.886158287525177 True False
0.6666666666666666 > User 4 Sex F ( 40 49 ) 0.8577773571014404 0.8419269919395447 0.886158287525177 True False
0.75 > User 6 Sex F ( 60 69 ) 0.8523212671279907 0.8419269919395447 0.886158287525177 True False
0.6 > User 8 Sex F ( 80 89 ) 0.8238371014595032 0.8419269919395447 0.886158287525177 False False
0.6666666666666666 > User 9 Sex F ( 90 99 ) 0.8920130729675293 0.8419269919395447 0.886158287525177 True True
0.5714285714285714 > User 13 Sex F ( 130 139 ) 0.8327767252922058 0.8419269919395447 0.886158287525177 False False
0.625 > User 14 Sex F ( 140 149 ) 0.8973631262779236 0.8419269919395447 0.886158287525177 True True
0.6666666666666666 > User 16 Sex F ( 160 169 ) 0.8798030614852905 0.8419269919395447 0.886158287525177 True False
0.7 > User 18 Sex F ( 180 189 ) 0.8932753205299377 0.841926991939

In [None]:
# Impersonation rate of one probe audio against the VoxCeleb1 test speakers.
# For every speaker, 10 videos are sampled at random and one utterance is picked from each;
# the speaker counts as impersonated when the probe's best similarity over those utterances
# reaches the EER threshold of the selected verifier.
# NOTE(review): `random` is not seeded here, so the sampled utterances differ between runs.
imps_users = 0   # speakers matched at the EER threshold so far
curs_users = 0   # speakers evaluated so far
gan_audio_path = '/beegfs/mm11333/dl-master-voices/data/vs_mv_data/real_f-f_mv/v0/mv_00.wav'
# NOTE(review): not used below; kept so the notebook's global state is unchanged.
samples_per_user = int(len(mv_paths) // len(np.unique(mv_labels)))
vox1_test_dir = '/beegfs/mm11333/data/voxceleb1/test'

to_features = get_tf_spectrum if mode == 'spectrum' else get_tf_filterbanks
l2_norm = tf.keras.layers.Lambda(lambda emb: tf.keras.backend.l2_normalize(emb, 1))

# The probe never changes, so decode / featurise / embed it once instead of once per
# utterance (assumes decode_audio, the feature extractor and model.embed are deterministic).
audio_1 = decode_audio(gan_audio_path, tgt_sample_rate=sample_rate).reshape((1, -1, 1))
input_1 = to_features(audio_1)
embs_1 = l2_norm(model.embed(input_1))

for user in os.listdir(vox1_test_dir):
    try:
        user_dir = os.path.join(vox1_test_dir, user)
        # random.sample raises ValueError for speakers with fewer than 10 videos; those
        # speakers are skipped by the handler below.
        videos = random.sample(os.listdir(user_dir), 10)
        paths = []
        for video in videos:
            video_dir = os.path.join(user_dir, video)
            paths.append(os.path.join(video_dir, random.choice(os.listdir(video_dir))))
        all_sims = []
        for audio_index, path_2 in enumerate(paths):
            audio_2 = decode_audio(path_2, tgt_sample_rate=sample_rate).reshape((1, -1, 1))
            input_2 = to_features(audio_2)
            embs_2 = l2_norm(model.embed(input_2))
            # Dot(normalize=True) L2-normalises again, so the result is the cosine similarity.
            all_sims.append(float(tf.keras.layers.Dot(axes=1, normalize=True)([embs_1, embs_2])[0][0]))
            print(user, audio_index, path_2, all_sims[-1])
        # Best-case match for the probe against this speaker.
        sims_1 = np.max(all_sims)
        if sims_1 >= thrs_eer[net]:
            imps_users += 1
        curs_users += 1
        print(imps_users / curs_users, '>', 'User', user, sims_1, thrs_eer[net], thrs_far1[net], sims_1 >= thrs_eer[net], sims_1 >= thrs_far1[net])
    except Exception as error:
        # Still best-effort per speaker, but no longer swallows KeyboardInterrupt / SystemExit
        # and reports why the speaker was skipped.
        print('> skipped', user, '-', repr(error))