# Benchmarking Speaker Models

This notebook allows you to test pre-trained speaker models in terms of equal error rate and impersonation rate against master voices.

In [None]:
from IPython.display import display, HTML
from sklearn.metrics import roc_curve
import pandas as pd
import numpy as np
import pickle
import os

%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib

In [None]:
# Show floats in displayed DataFrames with a thousands separator and four decimal places.
pd.options.display.float_format = '{:,.4f}'.format

### Parameters

- **test_set**: id of the dataset against which equal error rate is computed ('vox1' or 'vox2'); please refer to speaker model testing. 
- **nets** ['{net1}/{vxxx}', '{net2}/{vxxx}', ..., '{netn}/{vxxx}']: comma-separated list of speaker models to test
- **tars** [None, 1.0, 0.1]: comma-separated list of false acceptance levels to test (None stands for EER level; other common values are 1.0 and 0.1)
- **pols** ['avg', 'any']: comma-separated list of verification policies to test
- **thrs_types** [None, 'avg', 'any']: comma-separated list of threshold types against which master voice impersonation is tested (None stands for raw 1-1 comparisons)  

In [None]:
# Dataset id ('vox1' or 'vox2'); it selects the 'scores_<test_set>_test.csv' file loaded for each speaker model.
test_set = 'vox1'

In [None]:
# Speaker models to benchmark, each written as '<net>/<version>'; the same string is the model's sub-path under '../data/vs_mv_models'.
nets = ['vggvox/v003','resnet50/v003']

In [None]:
# False acceptance levels to evaluate, in percent; None selects the equal-error-rate operating point.
tars = [None, 1.0]

In [None]:
# Verification policies to evaluate; each one selects the 'mvcmp_<pol>' folder of master voice comparisons.
pols = ['avg', 'any']

In [None]:
# Threshold types for the speaker-model benchmark; None means the raw 1-1 trial scores are used without grouping.
thrs_types = [None]

### Speaker Model Performance

Given a list of similarity scores and verification labels (0:diff-user pair, 1:same-user pair), this function computes the verification threshold, FAR, and FRR at a given target_fa false acceptance level. If target_fa=None, this function computes threshold, FAR, and FRR at the equal error rate.

In [None]:
def tuneThreshold(scores, labels, target_fa=None):
    """Pick a verification threshold from the ROC curve of the given trials.

    Args:
        scores: similarity score of each trial pair.
        labels: verification label of each trial pair (1 is the positive class).
        target_fa: desired false acceptance rate, in percent. When None, the
            point where FAR and FRR are closest (equal error rate) is used.

    Returns:
        A (threshold, far, frr) tuple taken at the selected ROC point, with
        far and frr expressed in percent.
    """
    far, tpr, thresholds = roc_curve(labels, scores, pos_label=1)
    # Work in percent so that target_fa is compared on the same scale.
    frr = (1 - tpr) * 100
    far = far * 100
    if target_fa is not None:
        # Explicit None check: a target of 0 is a valid request and must not
        # silently fall through to the EER branch.
        idx = np.nanargmin(np.absolute(target_fa - far))
    else:
        idx = np.nanargmin(np.absolute(frr - far))
    return thresholds[idx], far[idx], frr[idx]

We define two functions in order to compute false acceptance and false rejection rates at a given verification threshold.

In [None]:
def count_far(labels, scores, thr):
    """Return the false acceptance rate, in percent, at threshold `thr`.

    Only trials labelled 0 are considered; a trial counts as a false
    acceptance when its score is greater than or equal to `thr`.
    Raises ZeroDivisionError when no trial is labelled 0.
    """
    negative_scores = [score for label, score in zip(labels, scores) if label == 0]
    accepted = sum(1 for score in negative_scores if score >= thr)
    return accepted / len(negative_scores) * 100

In [None]:
def count_frr(labels, scores, thr):
    """Return the false rejection rate, in percent, at threshold `thr`.

    Only trials labelled 1 are considered; a trial counts as a false
    rejection when its score is strictly below `thr`.
    Raises ZeroDivisionError when no trial is labelled 1.
    """
    positive_scores = [score for label, score in zip(labels, scores) if label == 1]
    rejected = sum(1 for score in positive_scores if score < thr)
    return rejected / len(positive_scores) * 100

We read the csv files computed under the speaker model testing procedure, for all the specified speaker models. Each file includes verification labels and corresponding similarity scores. 

In [None]:
# Per-model table of test trials, keyed by net id, with columns 'label' and 'score'.
vox1_test_results = {}
for net in nets:
    scores_path = os.path.join('../data/vs_mv_models', net, 'scores_' + test_set + '_test.csv')
    net_trials = pd.read_csv(scores_path)
    # Drop any column whose header starts with 'Unnamed' before renaming the remaining two.
    net_trials = net_trials.loc[:, ~net_trials.columns.str.contains('^Unnamed')]
    net_trials.columns = ['label', 'score']
    vox1_test_results[net] = net_trials

For each speaker model, we also retrieve the training history (e.g., loss, accuracy). 

In [None]:
# Per-model training history, keyed by net id, read from each model's 'history.csv'.
train_history_results = {}
for net in nets:
    history = pd.read_csv(os.path.join('../data/vs_mv_models', net, 'history.csv'))
    # Keep every column except those whose header starts with 'Unnamed'.
    unnamed_cols = history.columns.str.contains('^Unnamed')
    train_history_results[net] = history.loc[:, ~unnamed_cols]

For each speaker model and each user within the corresponding list of trial pairs, we compute the maximum similarity score for same-user pairs and different-user pairs. 

In [None]:
def groupScores(scores, labels, size=8):
    """Collapse each user's block of trial scores into two scores.

    Scores are consumed in consecutive blocks of `size` entries (8 by default,
    the number of trial pairs per user in the vox1_test_results CSVs). For
    every block, the maximum of the odd positions is emitted with label 0 and
    the maximum of the even positions with label 1. `labels` is only returned
    unchanged on the early-exit path; it is not used to split the scores.

    Returns:
        (scores, labels) untouched when `thrs_type` is None, otherwise the
        grouped (scores, labels) lists.
    """
    # NOTE(review): `thrs_type` is not a parameter; it is read from module scope.
    if thrs_type is None:
        return scores, labels
    grp_scores, grp_labels = [], []
    for start in range(0, len(scores), size):
        user_scores = scores[start:start + size]
        # Odd positions are treated as different-user pairs, even positions as same-user pairs.
        best_diff_user = np.max(user_scores[1::2])
        best_same_user = np.max(user_scores[0::2])
        grp_scores.extend([best_diff_user, best_same_user])
        grp_labels.extend([0, 1])
    return grp_scores, grp_labels

The following snippet creates and populates two dictionaries:
- **ress** will include the speaker recognition performance for each thrs_type, tar, and net, as a list [mean of far and frr, far, frr, thr, no_trials, loss, acc];
- **thrs** will include the thresholds for each thrs_type, tar, net (threshold value in [-1, 1]). 

In [None]:
# ress[thrs_type][tar][net] -> [mean of far and frr, far, frr, thr, no_trials, loss, acc]
# thrs[thrs_type][tar][net] -> verification threshold used for that row
ress = {}
thrs = {}
for thrs_type in thrs_types: # For all the threshold types, e.g., [None, 'avg', 'any']
    ress[thrs_type] = {}
    thrs[thrs_type] = {}
    for tar in tars: # For all the false acceptance levels, e.g., [None, 1.0, 0.1]
        ress[thrs_type][tar] = {}
        thrs[thrs_type][tar] = {}
        for net in nets: # For all the speaker models to test
            scores = vox1_test_results[net]['score'].values
            labels = vox1_test_results[net]['label'].values
            loss = train_history_results[net]['loss'].values[-1] # Training loss at the last epoch
            acc = train_history_results[net]['acc'].values[-1] # Training accuracy at the last epoch
            # The threshold is always tuned on the raw 1-1 trials, so the grouped case
            # no longer depends on thrs[None] having been filled in an earlier iteration.
            thr, far, frr = tuneThreshold(scores, labels, tar)
            if thrs_type is not None:
                # Re-measure FAR/FRR on per-user grouped scores at that same threshold.
                # thrs_type must not be passed here: the third positional parameter of
                # groupScores is the group size, not the threshold type.
                grp_scores, grp_labels = groupScores(scores, labels)
                far = count_far(grp_labels, grp_scores, thr)
                frr = count_frr(grp_labels, grp_scores, thr)
            thrs[thrs_type][tar][net] = thr
            ress[thrs_type][tar][net] = [np.mean([far, frr]), far, frr, thr, len(vox1_test_results[net].index), loss, acc]

We print the speaker recognition performance for each speaker model at different thrs_types and false acceptance levels.

In [None]:
# One table per (threshold type, false acceptance level); rows are speaker models.
metric_names = ['eer', 'far', 'frr', 'thr', 'no-trials', 'loss', 'acc']
for thrs_type in thrs_types:
    thrs_label = 'raw' if thrs_type is None else thrs_type
    for tar in tars:
        level_label = 'EER' if tar is None else 'FAR@' + str(tar)
        tar_label = thrs_label + '  ' + level_label
        df = pd.DataFrame.from_dict(ress[thrs_type][tar], orient='index', columns=metric_names)
        # Put the table title above the metric names as a two-level header.
        df.columns = pd.MultiIndex.from_tuples([(tar_label, metric) for metric in metric_names])
        # NOTE(review): the Styler returned here is discarded, and the HTML below comes
        # from df.to_html(), so these properties do not reach the rendered table.
        df.style.set_properties(**{'width':'10em', 'text-align':'center'})
        df.sort_index(inplace=True)
        display(HTML(df.to_html()))
        print()

### Speaker Impersonation Performance

The following code will allow you to compute the impersonation rate of all the considered speaker models against master voices.

Given a master voice csv file with similarity scores, paths of compared audio, and user's gender, this function will compute the impersonation rate for male and female users. Specifically, it returns six values:
- **imp_m**: number of male users who have been impersonated
- **imp_f**: number of female users who have been impersonated
- **user_ids_m**: list of male user ids who have been impersonated  
- **user_ids_f**: list of female user ids who have been impersonated
- **tot_m**: number of male users
- **tot_f**: number of female users

In [None]:
def computeImpersonation(mv_csv_file, thr, size=10):
    """Count the users impersonated by one master voice file.

    The CSV must provide 'score' and 'gender' columns. Rows are consumed in
    consecutive groups of `size`; each group is one user, identified by the
    group's position in the file. A user's gender is read from the first row
    of the group, and the user counts as impersonated when at least one score
    in the group is greater than or equal to `thr`. Users whose gender is
    neither 'm' nor 'f' are left out of every count.

    Args:
        mv_csv_file: path of the master voice comparison CSV.
        thr: verification threshold applied to the similarity scores.
        size: number of consecutive rows belonging to each user.

    Returns:
        (imp_m, imp_f, user_ids_m, user_ids_f, tot_m, tot_f): impersonated
        male/female user counts, the ids of those users, and the total number
        of male/female users.
    """
    df = pd.read_csv(mv_csv_file)
    # Pull the two columns out once instead of doing label lookups per row.
    genders = df['gender'].values
    scores = df['score'].values
    imp_m, tot_m = 0, 0
    imp_f, tot_f = 0, 0
    user_ids_f, user_ids_m = [], []
    for start in range(0, len(df), size):
        user_id = start // size
        gender = genders[start]
        impersonated = bool((scores[start:start + size] >= thr).any())
        if gender == 'f':
            tot_f += 1
            if impersonated:
                imp_f += 1
                user_ids_f.append(user_id)
        elif gender == 'm':
            tot_m += 1
            if impersonated:
                imp_m += 1
                user_ids_m.append(user_id)
    # Sanity check written without a division, so a file with users of a single
    # gender (or no rows at all) no longer fails with ZeroDivisionError.
    assert imp_m <= tot_m and imp_f <= tot_f
    return imp_m, imp_f, user_ids_m, user_ids_f, tot_m, tot_f

The following snippet will create a dictionary with the impersonation results for different policies, false acceptance levels, speaker models, master voice sets, and master voice files. Specifically, the resulting mv_test_results will include the imp_m, imp_f, user_ids_m, user_ids_f, tot_m, tot_f for each master voice file. See what the function computeImpersonation returns. 

In [None]:
# Upper bound on the numeric suffix of the master voice filenames that get evaluated.
max_no_samples_per_mvset = 10
# Number of similarity scores grouped per user when the policy is not 'avg'.
no_templates_per_user = 10

In [None]:
# mv_test_results[pol][tar][net][mvset][version][mvsam] -> tuple returned by computeImpersonation
mv_test_results = {}
for i1, pol in enumerate(pols): # For each verification policy
    mv_test_results[pol] = {}
    # With the 'avg' policy every row is one user; otherwise no_templates_per_user scores are combined per user
    score_group_size = 1 if pol == 'avg' else no_templates_per_user
    for i2, tar in enumerate(tars): # For each false acceptance level
        mv_test_results[pol][tar] = {}
        for i3, net in enumerate(nets): # For each speaker model
            net_results = mv_test_results[pol][tar][net] = {}
            dp = os.path.join('../data/vs_mv_models', net, 'mvcmp_' + pol)
            # Directory listings are taken once per level instead of once per processed file
            mvsets = os.listdir(dp)
            for i4, mvset in enumerate(mvsets): # For each master voice set
                net_results[mvset] = {}
                versions = os.listdir(os.path.join(dp, mvset))
                for i5, version in enumerate(versions): # For each version of a master voice set
                    version_results = net_results[mvset][version] = {}
                    version_dir = os.path.join(dp, mvset, version)
                    for mvsam in os.listdir(version_dir): # For each master voice file in the current set
                        # The sample number is the last '_'-separated token before the first '.' of the filename
                        if int(mvsam.split('.')[0].split('_')[-1]) <= max_no_samples_per_mvset: # We get only max_no_samples_per_mvset at maximum
                            # Impersonation is always tested at the threshold tuned on raw 1-1 trials
                            version_results[mvsam] = computeImpersonation(os.path.join(version_dir, mvsam), thrs[None][tar][net], score_group_size)
                            print('>\r', pol, '(' + str(i1+1) + '/' + str(len(pols)) + ')', tar, '(' + str(i2+1) + '/' + str(len(tars)) + ')',
                                         net, '(' + str(i3+1) + '/' + str(len(nets)) + ')',  mvset, '('+str(i4+1)+'/'+str(len(mvsets)) +')',
                                         version, '('+str(i5+1)+'/'+str(len(versions)) +')', end='')

The following function computes the percentage of male and female users impersonated by a given set of master voice sets within a specific speaker model.   

In [None]:
def arrangeData(data, ress, no_verification_trials=1):
    """Append male/female impersonation percentages to the rows of `ress`.

    Args:
        data: nested mapping {mvset: {version: {mv_file: result}}}, where each
            result is indexable as (imp_m, imp_f, user_ids_m, user_ids_f,
            tot_m, tot_f).
        ress: mapping from '<mvset>-<version>' to a list of percentages; it is
            extended in place with two values (male, female) per row.
        no_verification_trials: with 1 or fewer, every master voice file is
            scored on its own and the rates are averaged; with more, the first
            files in sorted filename order are pooled and a user counts once
            if any of them impersonated that user.

    Returns:
        The same `ress` mapping that was passed in.
    """
    for mvset, mvversions in data.items():
        for mvversion, mvsamps in mvversions.items():
            if no_verification_trials <= 1:
                # Independent files: one impersonation rate per master voice file.
                rates_m = [res[0] / res[4] for res in mvsamps.values()]
                rates_f = [res[1] / res[5] for res in mvsamps.values()]
            else:
                # Pooled trials: union of the user ids impersonated by the first files.
                tried_files = sorted(mvsamps)[:no_verification_trials]
                hit_m, hit_f = set(), set()
                tot_m, tot_f = 0, 0
                for mv_file in tried_files:
                    res = mvsamps[mv_file]
                    hit_m.update(res[2])
                    hit_f.update(res[3])
                    # Totals are taken from the last file tried.
                    tot_m, tot_f = res[4], res[5]
                rates_m = [len(hit_m) / tot_m]
                rates_f = [len(hit_f) / tot_f]
            row_key = mvset + '-' + mvversion
            new_cells = [round(np.mean(rates_m)*100,2), round(np.mean(rates_f)*100,2)]
            # A row is created on first sight and grows by two cells on each later call.
            ress.setdefault(row_key, []).extend(new_cells)
    return ress

For each policy and false acceptance level, we show the impersonation results of all the master voice sets against all the considered speaker models. 

In [None]:
# Passed to arrangeData as no_verification_trials: with 1, each master voice file is scored on its own.
no_allowed_verification_trials = 1

In [None]:
# One table per (policy, false acceptance level); rows are master voice sets and
# columns are the male/female impersonation rates against each speaker model.
for pol in pols: # For all the verification policies
    for tar in tars: # For all the false acceptance levels
        tar_label = pol.upper() + '  ' + ('FAR@'+str(tar) if tar is not None else 'EER') # We define the table title
        rows = {}
        cols = []
        header = []
        for net in nets: # For all the speaker models
            rows = arrangeData(mv_test_results[pol][tar][net], rows, no_allowed_verification_trials) # We add imp rates of the current mv set against the current speaker model
            cols += [net + '-m', net + '-f'] # We add two new columns associated to the male/female impersonation rates for the current speaker model
            # The header is built from the net id itself rather than by splitting the
            # column name on '-', which would mislabel any model whose id contains a hyphen.
            header += [(tar_label, net, 'm'), (tar_label, net, 'f')]
        df = pd.DataFrame.from_dict(rows, orient='index', columns=cols) # We create a dataframe starting from the rows and cols we initialized
        df = df.mask(df==0).fillna('-') # To improve readability, we replace 0% impersonation rates with '-'
        df.sort_index(inplace=True) # We sort the master voice sets alphabetically
        df.columns = pd.MultiIndex.from_tuples(header)
        # NOTE(review): the Styler returned here is discarded, and the HTML below comes
        # from df.to_html(), so these properties do not reach the rendered table.
        df.style.set_properties(**{'width':'10em', 'text-align':'center'})
        display(HTML(df.to_html()))
        print()