score_investigation-validation_eer
------------------------

validation set을 가지고 threshold를 정했을 때의 성능



## Environment

In [1]:
%load_ext autoreload
%autoreload 2
%pylab
%matplotlib inline

import pandas as pd
import pickle
import numpy as np
import sys
import os


Using matplotlib backend: TkAgg
Populating the interactive namespace from numpy and matplotlib


In [3]:
# Put the project checkout on the import path (presumably where the
# `eval.score_utils` module imported further down lives -- confirm).
# The membership test keeps this cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path.
sv_system_dir = '/host/projects/sv_experiments/sv_system/'
if sv_system_dir not in sys.path:
    sys.path.append(sv_system_dir)

# Order CUDA devices by PCI bus id and expose only device "1" to this kernel.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [4]:
def key2df(keys, delimeter="-"):
    """Turn a collection of utterance keys into a per-key lookup table.

    Args:
        keys: key strings, one per row of the result.
        delimeter: separator; the text before its first occurrence in a key
            is taken as the speaker id.

    Returns:
        A DataFrame with the columns
        ``key``    -- the original key,
        ``spk``    -- the speaker id extracted from the key,
        ``label``  -- the integer group number of the speaker, as assigned by
                      ``groupby('spk').ngroup()``,
        ``origin`` -- ``'voxc2'`` when the speaker id starts with ``'id'``,
                      otherwise ``'voxc1'``.
    """
    def speaker_of(key):
        return key.split(delimeter)[0]

    def corpus_of(speaker):
        if speaker.startswith('id'):
            return 'voxc2'
        return 'voxc1'

    table = pd.DataFrame(keys, columns=['key'])
    table['spk'] = table['key'].map(speaker_of)
    table['label'] = table.groupby('spk').ngroup()
    table['origin'] = table['spk'].map(corpus_of)

    return table

## Scores

In [5]:
import torch
from torch.nn.functional import cosine_similarity

In [6]:
def read_score(score_path):
    """Load a header-less, space-separated score file.

    Args:
        score_path: path (or file-like object) accepted by ``pd.read_csv``.

    Returns:
        A DataFrame whose three columns are named ``enroll``, ``test`` and
        ``score``. Assigning the names raises if the file does not have
        exactly three columns.
    """
    column_names = ['enroll', 'test', 'score']

    table = pd.read_csv(score_path, header=None, sep=' ')
    table.columns = column_names

    return table

In [13]:
# Cosine / LDA / PLDA score files for the "shared" trial list all sit in one
# directory, so name that directory once and derive the three paths from it.
shared_score_dir = "/host/projects/sv_experiments/sv_system/embeddings/voxc2/voxc2_mfcc30_best/shared_scores"
shared_cosine_score = read_score(f"{shared_score_dir}/cosine_scores")
shared_lda_score = read_score(f"{shared_score_dir}/lda_scores")
shared_plda_score = read_score(f"{shared_score_dir}/plda_scores")

In [14]:
# Cosine / LDA / PLDA score files for the "all" trial list all sit in one
# directory, so name that directory once and derive the three paths from it.
all_score_dir = "/host/projects/sv_experiments/sv_system/embeddings/voxc2/voxc2_mfcc30_best/all_scores"
all_cosine_score = read_score(f"{all_score_dir}/cosine_scores")
all_lda_score = read_score(f"{all_score_dir}/lda_scores")
all_plda_score = read_score(f"{all_score_dir}/plda_scores")

In [15]:
# Cosine / LDA / PLDA score files for the "hard" trial list all sit in one
# directory, so name that directory once and derive the three paths from it.
hard_score_dir = "/host/projects/sv_experiments/sv_system/embeddings/voxc2/voxc2_mfcc30_best/hard_scores"
hard_cosine_score = read_score(f"{hard_score_dir}/cosine_scores")
hard_lda_score = read_score(f"{hard_score_dir}/lda_scores")
hard_plda_score = read_score(f"{hard_score_dir}/plda_scores")

In [16]:
# Trial lists are stored as pickled DataFrames under a common directory.
# NOTE(review): unpickling executes arbitrary code -- only load files that
# come from a trusted source.
trial_df_dir = "/dataset/SV_sets/voxceleb12/dataframes"
voxc2_shared_trial = pd.read_pickle(f"{trial_df_dir}/voxc12_test_trial.pkl")
voxc2_all_trial = pd.read_pickle(f"{trial_df_dir}/voxc2_trials/voxc2_all_trials.pkl")
voxc2_hard_trial = pd.read_pickle(f"{trial_df_dir}/voxc2_trials/voxc2_hard_trials.pkl")

In [18]:
# Attach the three back-end scores to the shared trial list by row position.
# A plain `frame[col] = series` aligns on index labels, which would silently
# misplace scores (or yield NaNs) if the pickled trial frame does not carry a
# default 0..n-1 index; `.values` plus the length check makes the pairing explicit.
# NOTE(review): assumes each score file lists the trials in the same row order
# as the trial frame -- confirm against the scoring script.
assert (
    len(shared_cosine_score) == len(shared_lda_score)
    == len(shared_plda_score) == len(voxc2_shared_trial)
), "shared score files and voxc2_shared_trial must have the same number of rows"
voxc2_shared_trial['cosine_score'] = shared_cosine_score.score.values
voxc2_shared_trial['lda_score'] = shared_lda_score.score.values
voxc2_shared_trial['plda_score'] = shared_plda_score.score.values

In [19]:
# Attach the three back-end scores to the hard trial list by row position.
# A plain `frame[col] = series` aligns on index labels, which would silently
# misplace scores (or yield NaNs) if the pickled trial frame does not carry a
# default 0..n-1 index; `.values` plus the length check makes the pairing explicit.
# NOTE(review): assumes each score file lists the trials in the same row order
# as the trial frame -- confirm against the scoring script.
assert (
    len(hard_cosine_score) == len(hard_lda_score)
    == len(hard_plda_score) == len(voxc2_hard_trial)
), "hard score files and voxc2_hard_trial must have the same number of rows"
voxc2_hard_trial['cosine_score'] = hard_cosine_score.score.values
voxc2_hard_trial['lda_score'] = hard_lda_score.score.values
voxc2_hard_trial['plda_score'] = hard_plda_score.score.values

In [20]:
# Attach the three back-end scores to the full trial list by row position.
# A plain `frame[col] = series` aligns on index labels, which would silently
# misplace scores (or yield NaNs) if the pickled trial frame does not carry a
# default 0..n-1 index; `.values` plus the length check makes the pairing explicit.
# NOTE(review): assumes each score file lists the trials in the same row order
# as the trial frame -- confirm against the scoring script.
assert (
    len(all_cosine_score) == len(all_lda_score)
    == len(all_plda_score) == len(voxc2_all_trial)
), "all score files and voxc2_all_trial must have the same number of rows"
voxc2_all_trial['cosine_score'] = all_cosine_score.score.values
voxc2_all_trial['lda_score'] = all_lda_score.score.values
voxc2_all_trial['plda_score'] = all_plda_score.score.values

### voxc2_trial EERs

In [28]:
from eval.score_utils import compute_eer

# Split the PLDA scores of the shared trials by ground-truth label
# (label == 1 -> positive trials, label == 0 -> negative trials).
voxc2_shared_pos_score = voxc2_shared_trial.loc[voxc2_shared_trial.label == 1, 'plda_score']
voxc2_shared_neg_score = voxc2_shared_trial.loc[voxc2_shared_trial.label == 0, 'plda_score']

# compute_eer's two return values are unpacked as (eer, threshold); the
# threshold is kept as `shared_thres` for the accuracy cells further down.
eer, shared_thres = compute_eer(voxc2_shared_pos_score, voxc2_shared_neg_score)

eer:4.311% at threshold -1.3987


In [27]:
from eval.score_utils import compute_eer

# Split the PLDA scores of the hard trials by ground-truth label
# (label == 1 -> positive trials, label == 0 -> negative trials).
voxc2_hard_pos_score = voxc2_hard_trial.loc[voxc2_hard_trial.label == 1, 'plda_score']
voxc2_hard_neg_score = voxc2_hard_trial.loc[voxc2_hard_trial.label == 0, 'plda_score']

# compute_eer's two return values are unpacked as (eer, threshold); the
# threshold is kept as `hard_thres` for the accuracy cells further down.
eer, hard_thres = compute_eer(voxc2_hard_pos_score, voxc2_hard_neg_score)

eer:6.697% at threshold 0.1983


In [26]:
from eval.score_utils import compute_eer

# Split the PLDA scores of the full trial list by ground-truth label
# (label == 1 -> positive trials, label == 0 -> negative trials).
voxc2_all_pos_score = voxc2_all_trial.loc[voxc2_all_trial.label == 1, 'plda_score']
voxc2_all_neg_score = voxc2_all_trial.loc[voxc2_all_trial.label == 0, 'plda_score']

# compute_eer's two return values are unpacked as (eer, threshold); the
# threshold is kept as `all_thres` for the accuracy cells further down.
eer, all_thres = compute_eer(voxc2_all_pos_score, voxc2_all_neg_score)

eer:4.206% at threshold -1.7356


### Accuracy with custom threshold (evaluated on the shared trials)

In [76]:
# Accept a shared trial (1) when its PLDA score is strictly above the threshold
# found on the shared trials, reject it (0) otherwise.
clf_result = (voxc2_shared_trial.plda_score > shared_thres).astype(int)
# Accuracy = fraction of shared trials whose decision equals the label.
n_correct = np.count_nonzero(clf_result == voxc2_shared_trial.label)
shared_acc = n_correct / len(voxc2_shared_trial)

In [77]:
# Accept a shared trial (1) when its PLDA score is strictly above the threshold
# found on the *hard* trials, reject it (0) otherwise. The decisions are still
# made on, and scored against, the shared trial list.
clf_result = (voxc2_shared_trial.plda_score > hard_thres).astype(int)
# Accuracy = fraction of shared trials whose decision equals the label.
n_correct = np.count_nonzero(clf_result == voxc2_shared_trial.label)
hard_acc = n_correct / len(voxc2_shared_trial)

In [78]:
# Accept a shared trial (1) when its PLDA score is strictly above the threshold
# found on the *all* trial list, reject it (0) otherwise. The decisions are
# still made on, and scored against, the shared trial list.
clf_result = (voxc2_shared_trial.plda_score > all_thres).astype(int)
# Accuracy = fraction of shared trials whose decision equals the label.
n_correct = np.count_nonzero(clf_result == voxc2_shared_trial.label)
all_acc = n_correct / len(voxc2_shared_trial)

In [79]:
# One line per threshold source; every value is an accuracy on the shared trials.
print(
    f"shared_thresh {shared_acc}",
    f"hard_thresh {hard_acc}",
    f"all_thresh {all_acc}",
    sep="\n",
)

shared_thresh 0.9569194061505832
hard_thresh 0.9530487804878048
all_thresh 0.9570254506892895


### incorrect trials

In [32]:
# NOTE(review): `voxc1_trial` and `voxc1_thres` are not defined in any cell
# visible in this notebook, so this cell raises NameError on a fresh kernel
# unless they are created elsewhere -- confirm where they come from.

# Positive trials (label == 1) whose PLDA score falls below the threshold.
voxc1_bad_pos_trial = voxc1_trial.loc[
    (voxc1_trial.plda_score < voxc1_thres) & (voxc1_trial.label == 1)
]
# Negative trials (label == 0) whose PLDA score exceeds the threshold.
voxc1_bad_neg_trial = voxc1_trial.loc[
    (voxc1_trial.plda_score > voxc1_thres) & (voxc1_trial.label == 0)
]
# Both kinds of misclassified trial in one frame, positives first.
voxc1_bad_trial = pd.concat([voxc1_bad_pos_trial, voxc1_bad_neg_trial])

In [63]:
# Start by marking every trial as correct (result = 1) ...
voxc1_trial['result'] = 1
# ... then flag positive trials (label == 1) scoring below the threshold as wrong (0).
# NOTE(review): negative trials scoring above the threshold are not flagged by the
# lines visible here, and `voxc1_trial` / `voxc1_thres` are not defined in any
# visible cell -- confirm both.
voxc1_trial.loc[(voxc1_trial.label==1) & (voxc1_trial.plda_score < voxc1_thres), 'result'] = 0