# Benchmark of Speaker Verification Models

In [1]:
from sklearn.metrics import roc_curve
import numpy as np
import os

%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib

import pandas as pd
pd.options.display.float_format = '{:,.4f}'.format

## Parameters

In [2]:
nets = ['xvector/v001', 'vggvox/v004', 'resnet34vox/v001', 'resnet50vox/v001']

## Benchmark on Vox1-Test

In [3]:
def find_thr_far(far, value):
    return np.argmin(np.abs(value - far))

In [4]:
def format_val(x):
    return str('{:2f}'.format(x))

In [5]:
def count_far(targets, similarities, thr):
    fars = 0
    count = 0
    for t, s in zip(targets, similarities):
        if t == 0:
            if s >= thr:
                fars += 1
            count += 1
    return fars / count

In [6]:
def count_frr(targets, similarities, thr):
    frrs = 0
    count = 0
    for t, s in zip(targets, similarities):
        if t == 1:
            if s < thr:
                frrs += 1
            count += 1
    return frrs / count

In [7]:
vox1_test_results = {}
for net in nets:
    vox1_test_results[net] = pd.read_csv(os.path.join('../data/pt_models', net, 'test_vox1_sv_test.csv'))

In [8]:
results = {}
for net in nets:
    far, tpr, thresholds = roc_curve(vox1_test_results[net]['target'].values, vox1_test_results[net]['similarity'].values)
    frr = 1 - tpr
    idx_eer = np.argmin(np.abs(far - frr))
    idx_far1 = find_thr_far(far, 0.01)
    print(net, thresholds[idx_eer])
    results[net] = [format_val(np.mean([far[idx_eer], frr[idx_eer]])), format_val(far[idx_eer]), format_val(frr[idx_eer]), format_val(thresholds[idx_eer]), 
                    format_val(far[idx_far1]), format_val(frr[idx_far1]), format_val(thresholds[idx_far1]), len(vox1_test_results[net].index)]

xvector/v001 0.8433434963226318
vggvox/v004 0.999350905418396
resnet34vox/v001 0.8419269919395447
resnet50vox/v001 0.9969538450241088


In [14]:
df = pd.DataFrame.from_dict(results, orient='index', columns=['eer', 'eer-far', 'eer-frr', 'eer-thr', 'far1-far', 'far1-frr', 'far1-thr', 'no-trials'])
columns = [('EER','eer'), ('EER','eer-far'), ('EER','eer-frr'), ('EER','eer-thr'), ('FAR1%','far1-far'), ('FAR1%','far1-frr'), ('FAR1%','far1-thr'), ('', 'no-trials')]
df.columns = pd.MultiIndex.from_tuples(columns)
df.style.set_properties(**{'width':'10em', 'text-align':'center'}).set_table_styles([dict(selector="th", props=[('text-align', 'center')])])

Unnamed: 0_level_0,EER,EER,EER,EER,FAR1%,FAR1%,FAR1%,Unnamed: 8_level_0
Unnamed: 0_level_1,eer,eer-far,eer-frr,eer-thr,far1-far,far1-frr,far1-thr,no-trials
resnet34vox/v001,0.088309,0.088282,0.088335,0.841927,0.009968,0.327041,0.886158,37720
xvector/v001,0.065403,0.065376,0.065429,0.843343,0.010021,0.222481,0.879369,37720
resnet50vox/v001,0.06315,0.06315,0.06315,0.996954,0.010021,0.270467,0.997944,37720
vggvox/v004,0.343187,0.343213,0.34316,0.996056,0.010021,0.916914,0.998519,37720


## Benchmark on Vox2-MasterVoice-Train

#### Individual Benchmark

In [92]:
setups = [(1, 'raw'), (10, 'any'), (10, 'avg')]

In [93]:
vox2_mv_train_results = {}
for template_dim, policy_name in setups:
    vox2_mv_train_results[policy_name] = {}
    for net in nets:
        vox2_mv_train_results[policy_name][net] = pd.read_csv(os.path.join('../data/pt_models', net, 'test_vox2_mv_train_' + policy_name + '_' + str(template_dim) + '.csv'))
        if policy_name == 'avg':
            vox2_mv_train_results[policy_name][net]['score'] = vox2_mv_train_results[policy_name][net]['score'].apply(lambda x: float(x.split('(')[1].split(',')[0]))
        else:
            vox2_mv_train_results[policy_name][net]['score'] = vox2_mv_train_results[policy_name][net]['score'].apply(lambda x: float(x))

In [94]:
results = {}
for template_dim, policy_name in setups:
    results[policy_name] = {}
    for net in nets:
        far, tpr, thresholds = roc_curve(vox2_mv_train_results[policy_name][net]['target'].values, vox2_mv_train_results[policy_name][net]['score'].values)
        frr = 1 - tpr
        idx_eer = np.argmin(np.abs(far - frr))
        idx_far1 = find_thr_far(far, 0.01)
        results[policy_name][net] = [policy_name] + [net] + [format_val(np.mean([far[idx_eer], frr[idx_eer]])), format_val(far[idx_eer]), format_val(frr[idx_eer]), format_val(thresholds[idx_eer]), 
                                     format_val(far[idx_far1]), format_val(frr[idx_far1]), format_val(thresholds[idx_far1]), len(vox1_test_results[net].index)]

In [95]:
dfs = []
for template_dim, policy_name in setups:
    df = pd.DataFrame.from_dict(results[policy_name], orient='index', columns=['policy', 'net', 'eer', 'eer-far', 'eer-frr', 'eer-thr', 'far1-far', 'far1-frr', 'far1-thr', 'no-trials']) 
    columns = [('-', 'policy'), ('-', 'net'), ('EER','eer'), ('EER','eer-far'), ('EER','eer-frr'), ('EER','eer-thr'), ('FAR1%','far1-far'), ('FAR1%','far1-frr'), ('FAR1%','far1-thr'), ('', 'no-trials')]
    df.columns = pd.MultiIndex.from_tuples(columns)
    dfs.append(df)

In [96]:
df = pd.concat(dfs).reset_index()
del df['index']
df.style.set_properties(**{'width':'10em', 'text-align':'center'}).set_table_styles([dict(selector="th", props=[('text-align', 'center')])])

Unnamed: 0_level_0,-,-,EER,EER,EER,EER,FAR1%,FAR1%,FAR1%,Unnamed: 10_level_0
Unnamed: 0_level_1,policy,net,eer,eer-far,eer-frr,eer-thr,far1-far,far1-frr,far1-thr,no-trials
0,raw,resnet34vox/v001,0.111,0.1112,0.1108,0.842719,0.01,0.4272,0.897524,37720
1,raw,xvector/v001,0.0846,0.0848,0.0844,0.828743,0.01,0.242,0.873502,37720
2,raw,resnet50vox/v001,0.0914,0.0912,0.0916,0.996621,0.01,0.332,0.997906,37720
3,raw,vggvox/v004,0.2824,0.2824,0.2824,0.975831,0.01,0.8892,0.989723,37720
4,any,resnet34vox/v001,0.0456,0.0456,0.0456,0.89511,0.0104,0.136,0.917708,37720
5,any,xvector/v001,0.032,0.032,0.032,0.879298,0.0096,0.0672,0.900059,37720
6,any,resnet50vox/v001,0.0352,0.0352,0.0352,0.998031,0.01,0.09,0.998446,37720
7,any,vggvox/v004,0.2172,0.2172,0.2172,0.985993,0.01,0.8148,0.992898,37720
8,avg,resnet34vox/v001,0.0572,0.0572,0.0572,0.89502,0.01,0.1948,0.922614,37720
9,avg,xvector/v001,0.0428,0.0424,0.0432,0.880241,0.01,0.086,0.902676,37720


#### Benchmark based on Raw Policy thresholds

In [77]:
setups = [(1, 'raw'), (10, 'any'), (10, 'avg')]

In [78]:
vox2_mv_train_results = {}
for template_dim, policy_name in setups:
    vox2_mv_train_results[policy_name] = {}
    for net in nets:
        vox2_mv_train_results[policy_name][net] = pd.read_csv(os.path.join('../data/pt_models', net, 'test_vox2_mv_train_' + policy_name + '_' + str(template_dim) + '.csv'))
        if policy_name == 'avg':
            vox2_mv_train_results[policy_name][net]['score'] = vox2_mv_train_results[policy_name][net]['score'].apply(lambda x: float(x.split('(')[1].split(',')[0]))
        else:
            vox2_mv_train_results[policy_name][net]['score'] = vox2_mv_train_results[policy_name][net]['score'].apply(lambda x: float(x))

In [79]:
ref_thrs = {}
for net in nets:
    far, tpr, thresholds = roc_curve(vox2_mv_train_results['raw'][net]['target'].values, vox2_mv_train_results['raw'][net]['score'].values)
    frr = 1 - tpr
    idx_eer = np.argmin(np.abs(far - frr))
    idx_far1 = find_thr_far(far, 0.01)
    ref_thrs[net] = {'eer': thresholds[idx_eer], 'far1': thresholds[idx_far1]}

In [80]:
results = {}
for template_dim, policy_name in setups:
    results[policy_name] = {}
    for net in nets:
        far_eer = count_far(vox2_mv_train_results[policy_name][net]['target'].values, vox2_mv_train_results[policy_name][net]['score'].values, ref_thrs[net]['eer'])
        frr_eer = count_frr(vox2_mv_train_results[policy_name][net]['target'].values, vox2_mv_train_results[policy_name][net]['score'].values, ref_thrs[net]['eer'])
        far_far1 = count_far(vox2_mv_train_results[policy_name][net]['target'].values, vox2_mv_train_results[policy_name][net]['score'].values, ref_thrs[net]['far1'])
        frr_far1 = count_frr(vox2_mv_train_results[policy_name][net]['target'].values, vox2_mv_train_results[policy_name][net]['score'].values, ref_thrs[net]['far1'])
        results[policy_name][net] = [policy_name] + [net] + [format_val(far_eer), format_val(frr_eer), format_val(ref_thrs[net]['eer']), 
                                     format_val(far_far1), format_val(frr_far1), format_val(ref_thrs[net]['far1']), len(vox1_test_results[net].index)]

In [81]:
dfs = []
for template_dim, policy_name in setups:
    df = pd.DataFrame.from_dict(results[policy_name], orient='index', columns=['policy', 'net', 'eer-far', 'eer-frr', 'eer-thr', 'far1-far', 'far1-frr', 'far1-thr', 'no-trials']) 
    columns = [('-', 'policy'), ('-', 'net'), ('EER','eer-far'), ('EER','eer-frr'), ('EER','eer-thr'), ('FAR1%','far1-far'), ('FAR1%','far1-frr'), ('FAR1%','far1-thr'), ('', 'no-trials')]
    df.columns = pd.MultiIndex.from_tuples(columns)
    dfs.append(df)

In [82]:
df = pd.concat(dfs).reset_index()
del df['index']
df.style.set_properties(**{'width':'10em', 'text-align':'center'}).set_table_styles([dict(selector="th", props=[('text-align', 'center')])])

Unnamed: 0_level_0,-,-,EER,EER,EER,FAR1%,FAR1%,FAR1%,Unnamed: 9_level_0
Unnamed: 0_level_1,policy,net,eer-far,eer-frr,eer-thr,far1-far,far1-frr,far1-thr,no-trials
0,raw,resnet34vox/v001,0.1112,0.1108,0.842719,0.01,0.4272,0.897524,37720
1,raw,xvector/v001,0.0848,0.0844,0.828743,0.01,0.242,0.873502,37720
2,raw,resnet50vox/v001,0.0912,0.0916,0.996621,0.01,0.332,0.997906,37720
3,raw,vggvox/v004,0.2824,0.2824,0.975831,0.01,0.8892,0.989723,37720
4,any,resnet34vox/v001,0.268,0.0076,0.842719,0.0404,0.054,0.897524,37720
5,any,xvector/v001,0.2104,0.0072,0.828743,0.0408,0.0236,0.873502,37720
6,any,resnet50vox/v001,0.2764,0.0056,0.996621,0.0476,0.0272,0.997906,37720
7,any,vggvox/v004,0.6552,0.014,0.975831,0.0764,0.5108,0.989723,37720
8,avg,resnet34vox/v001,0.2528,0.0116,0.842719,0.052,0.0636,0.897524,37720
9,avg,xvector/v001,0.2044,0.012,0.828743,0.0524,0.0356,0.873502,37720


#### Benchmark on Gender based on Raw Policy Thresholds

In [89]:
results = {}
for template_dim, policy_name in setups:
    results[policy_name] = {}
    for net in nets:
        results[policy_name][net] = []
        for gender in ['male', 'female']:
            gender_result = vox2_mv_train_results[policy_name][net][vox2_mv_train_results[policy_name][net]['gender']==gender]
            far_eer = count_far(gender_result['target'].values, gender_result['score'].values, ref_thrs[net]['eer'])
            frr_eer = count_frr(gender_result['target'].values, gender_result['score'].values, ref_thrs[net]['eer'])
            far_far1 = count_far(gender_result['target'].values, gender_result['score'].values, ref_thrs[net]['far1'])
            frr_far1 = count_frr(gender_result['target'].values, gender_result['score'].values, ref_thrs[net]['far1'])
            results[policy_name][net] += [format_val(far_eer), format_val(frr_eer), format_val(far_far1), format_val(frr_far1)]
        results[policy_name][net] = [policy_name] + [net] + results[policy_name][net]

In [90]:
dfs = []
for template_dim, policy_name in setups:
    df = pd.DataFrame.from_dict(results[policy_name], orient='index', columns=['policy', 'net', 'm-eer-far', 'm-eer-frr', 'm-far1-far', 'm-far1-frr', 'f-eer-far', 'f-eer-frr', 'f-far1-far', 'f-far1-frr']) 
    columns = [('-', 'policy'), ('-', 'net'), ('MALE','m-eer-far'), ('MALE','m-eer-frr'), ('MALE','m-far1-far'), ('MALE','m-far1-frr'), 
               ('FEMALE','f-eer-far'), ('FEMALE','f-eer-frr'), ('FEMALE','f-far1-far'), ('FEMALE','f-far1-frr')]
    df.columns = pd.MultiIndex.from_tuples(columns)
    dfs.append(df)

In [91]:
df = pd.concat(dfs).reset_index()
del df['index']
df.style.set_properties(**{'width':'10em', 'text-align':'center'}).set_table_styles([dict(selector="th", props=[('text-align', 'center')])])

Unnamed: 0_level_0,-,-,MALE,MALE,MALE,MALE,FEMALE,FEMALE,FEMALE,FEMALE
Unnamed: 0_level_1,policy,net,m-eer-far,m-eer-frr,m-far1-far,m-far1-frr,f-eer-far,f-eer-frr,f-far1-far,f-far1-frr
0,raw,resnet34vox/v001,0.052464,0.147854,0.00318,0.502385,0.170692,0.073269,0.016908,0.351047
1,raw,xvector/v001,0.056864,0.082047,0.004874,0.250203,0.111899,0.086682,0.014972,0.234043
2,raw,resnet50vox/v001,0.062549,0.102424,0.003909,0.349492,0.121212,0.080262,0.01638,0.313677
3,raw,vggvox/v004,0.223263,0.322404,0.007026,0.913349,0.344545,0.240361,0.013126,0.863823
4,any,resnet34vox/v001,0.180655,0.010392,0.008793,0.071942,0.355484,0.004804,0.072058,0.036029
5,any,xvector/v001,0.156375,0.007217,0.018444,0.019246,0.264166,0.007183,0.063049,0.027933
6,any,resnet50vox/v001,0.223567,0.008071,0.020178,0.029056,0.328311,0.003172,0.074544,0.025377
7,any,vggvox/v004,0.582665,0.021669,0.040128,0.568218,0.727273,0.00638,0.11244,0.453748
8,avg,resnet34vox/v001,0.1568,0.0168,0.0168,0.0832,0.3488,0.0064,0.0872,0.044
9,avg,xvector/v001,0.152597,0.017045,0.032468,0.040584,0.254732,0.007098,0.071767,0.030757
