FR_style_acc
------------------------

FR(Face Recognition)에선 validation set을 가지고 threshold를 결정한 다음에 

ACC를 계산한다.

example (https://github.com/wy1iu/sphereface/blob/master/test/code/evaluation.m)

```
function bestThreshold = getThreshold(scores, flags, thrNum)
    accuracys  = zeros(2*thrNum+1, 1);
    thresholds = (-thrNum:thrNum) / thrNum;
    for i = 1:2*thrNum+1
        accuracys(i) = getAccuracy(scores, flags, thresholds(i));
    end
    bestThreshold = mean(thresholds(accuracys==max(accuracys)));
end

function accuracy = getAccuracy(scores, flags, threshold)
    accuracy = (length(find(scores(flags==1)>threshold)) + ...
                length(find(scores(flags~=1)<threshold))) / length(scores);
end
```

>  threshold = getThreshold(scores(valFold), flags(valFold), 10000);

>  ACCs(i)   = getAccuracy(scores(testFold), flags(testFold), threshold);

### Environment

In [1]:
%load_ext autoreload
%autoreload 2
%pylab
%matplotlib inline

import pandas as pd
import pickle
import numpy as np
import sys
import os

Using matplotlib backend: TkAgg
Populating the interactive namespace from numpy and matplotlib


In [2]:
sys.path.append('/host/projects/sv_experiments/')
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"]="3"

In [76]:
trial = pd.read_pickle("/dataset/SV_sets/dataframes/voxc1/voxc_trial.pkl")

In [4]:
si_keys = pickle.load(open("../../embeddings/voxc12/xvectors/xvectors_tdnn6b/train_feat/key.pkl", "rb"))
si_embeds = np.load("../../embeddings/voxc12/xvectors/xvectors_tdnn6b/train_feat/feat.npy")

sv_keys = pickle.load(open("../../embeddings/voxc12/xvectors/xvectors_tdnn6b/test_feat/key.pkl", "rb"))
sv_embeds = np.load("../../embeddings/voxc12/xvectors/xvectors_tdnn6b/test_feat/feat.npy")

In [6]:
def key2df(keys):
    key_df = pd.DataFrame(keys, columns=['key'])
    key_df['spk'] = key_df.key.apply(lambda x: x.split("-")[0])
    key_df['label'] = key_df.groupby('spk').ngroup()
    key_df['origin'] = key_df.spk.apply(lambda x: 'voxc2' if x.startswith('id') else 'voxc1')
    
    return key_df

In [7]:
si_key_df = key2df(si_keys)
sv_key_df = key2df(sv_keys)

### New trial

In [8]:
sv_key_df['session'] = sv_key_df.key.apply(lambda x: x.split('-')[1])

In [9]:
new_trial_ids = []
new_trial_keys = []
for idx, row in sv_key_df.iterrows():
    pos_pair = sv_key_df[(sv_key_df.spk == row.spk) & (sv_key_df.session != row.session)].sample(n=4)
    neg_pair = sv_key_df[(sv_key_df.spk != row.spk)].sample(n=4)
    
    for pos_idx, neg_idx in zip(pos_pair.index.tolist(), neg_pair.index.tolist()):
        new_trial_ids += [(idx, pos_idx, 1), (idx, neg_idx, 0)]
    
    for pos_key, neg_key in zip(pos_pair.key, neg_pair.key):
        new_trial_keys += [(row.key, pos_key, 'target'), (row.key, neg_key, 'nontarget')]

In [10]:
new_trial = pd.DataFrame(new_trial_ids, columns=['enrolment_id', 'test_id', 'label'])

In [11]:
new_trial_key = pd.DataFrame(new_trial_keys)

In [12]:
new_trial_key.to_csv("voxc12_trial_sv", sep=' ', index=False, header=False)

### Test EER for new trials

In [14]:
import torch
import torch.nn.functional as F

sv_embed_tensor = torch.from_numpy(sv_embeds)
sim_matrix = F.cosine_similarity(                                                                                                                                                                                                                                    
     sv_embed_tensor.unsqueeze(1), sv_embed_tensor.unsqueeze(0), dim=2)       

In [15]:
from sklearn.metrics import roc_curve

cord = [new_trial.enrolment_id.tolist(), new_trial.test_id.tolist()]                                                                                                                                                                                                         
score_vector = sim_matrix[cord].detach().numpy()                                                                                                                                                                                                                              
label_vector = np.array(new_trial.label)                                                                                                                                                                                                                                 
fpr, tpr, thres = roc_curve(                                                                                                                                                                                                                                         
     label_vector, score_vector, pos_label=1)                                                                                                                                                                                                                     
eer = fpr[np.nanargmin(np.abs(fpr - (1 - tpr)))]

In [16]:
eer

0.09483996717275339

### Compute the best ACC at the optimal threshold

In [90]:
n_pos = sum(label_vector)
n_neg = len(label_vector) - n_pos
n_correct = tpr * n_pos + tnr * n_neg
acc = n_correct / len(label_vector)

ValueError: operands could not be broadcast together with shapes (2767,) (5974,) 

In [91]:
max(acc)

0.9078016003282725

### Read plda scores

In [115]:
plda_score = pd.read_csv("../../embeddings/voxc12/dda_xvector1/plda_scores/plda_scores", sep=' ', header=None)
plda_score.columns = ['enroll', 'test', 'score']

In [116]:
voxceleb1_trials_sv = pd.read_pickle("/dataset/SV_sets/dataframes/voxc1/voxc_trial_modified.pkl")

In [117]:
score_vector = plda_score.score.tolist()

In [118]:
from sklearn.metrics import roc_curve

cord = [trial.enrolment_id.tolist(), trial.test_id.tolist()]                                                                                                                                                                                                         
label_vector = np.array(voxceleb1_trials_sv.label)                                                                                                                                                                                                                                 
fpr, tpr, thres = roc_curve(                                                                                                                                                                                                                                         
     label_vector, score_vector, pos_label=1)     

eer = round(fpr[np.nanargmin(np.abs(fpr - (1 - tpr)))] * 100, 2)

In [119]:
n_pos = sum(label_vector)
n_neg = len(label_vector) - n_pos
tnr = 1 - fpr
n_correct = tpr * n_pos + tnr * n_neg
acc = n_correct / len(label_vector)

In [120]:
print(f"plda acc at the optimal threshold: {max(acc)}")

plda acc at the optimal threshold: 0.8770148462354189
