# py_plda_baseline

py_plda model을 사용하는 baseline 실험들

In [197]:
%load_ext autoreload
%autoreload 2
%pylab
%matplotlib inline

import os
import sys
import pandas as pd
import pickle
import numpy as np
from tqdm import tqdm

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Using matplotlib backend: TkAgg
Populating the interactive namespace from numpy and matplotlib


In [202]:
from batch_sv_system import get_embeds, cosine_sim
from sklearn.metrics import roc_auc_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_recall_fscore_support

In [199]:
from utils import key2df, df2dict, compute_eer, get_id2idx

# Directory holding the pre-computed embeddings and their utterance keys.
embed_dir = "embeddings/voxc2_fbank64_voxc2untied_xvector/"
# os.path.join avoids the doubled "/" that plain concatenation produced for
# the key file (embed_dir already ends with a separator).
sv_embeds = np.load(os.path.join(embed_dir, "ln_lda_sv_embeds.npy"))
# Use a context manager so the pickle file handle is closed right after loading.
with open(os.path.join(embed_dir, "sv_keys.pkl"), "rb") as keys_file:
    sv_keys = pickle.load(keys_file)
# presumably maps each key to its row in sv_embeds -- confirm in utils.get_id2idx
sv_id2idx = get_id2idx(sv_keys)

# Embeddings of the cohort utterances, looked up by id without normalisation.
cohort_ids = np.load("trials/dev940_eval311/dev_cohort_ids.npy")
cohort_embeds = get_embeds(cohort_ids, sv_embeds, sv_id2idx, norm=False)

In [205]:
from ioffe_plda.verifier import Verifier
# Build the verifier directly from the pickled model. The previous throwaway
# `Verifier()` instance was overwritten on the very next line, so it is dropped.
# The context manager closes the model file as soon as it has been read.
with open("py_plda_model_ln_lda.pkl", "rb") as model_file:
    py_plda_model = Verifier(pickle.load(model_file))

In [208]:
def score_trials(enr_embeds, test_embeds, score_method):
    """Score test embeddings against enrollment embeddings with ``py_plda_model``.

    Args:
        enr_embeds: enrollment embeddings, forwarded unchanged to the verifier.
        test_embeds: test embeddings, forwarded unchanged to the verifier.
        score_method: name of the scoring strategy; one of ``"scoreAvg"``,
            ``"vectorAvg"``, ``"multiSessScale"`` or ``"multiSessAdapt"``.

    Returns:
        The verifier output reduced with ``.mean(0)`` (mean over the first axis).

    Raises:
        NotImplementedError: if ``score_method`` is not one of the names above.
    """
    if score_method == "scoreAvg":
        score = py_plda_model.score_avg(enr_embeds, test_embeds).mean(0)
    elif score_method == "vectorAvg":
        score = py_plda_model.vector_avg(enr_embeds, test_embeds).mean(0)
    elif score_method == "multiSessScale":
        score = py_plda_model.multi_sess(enr_embeds, test_embeds, cov_scaling=True).mean(0)
    elif score_method == "multiSessAdapt":
        score = py_plda_model.multi_sess(enr_embeds, test_embeds, cov_adapt=True).mean(0)
    else:
        # The original raised the misspelled name `NotImplemtedError`, which
        # surfaced as a NameError instead of the intended exception.
        raise NotImplementedError("unknown score_method: {!r}".format(score_method))

    return score

## Split Trials Type 1

In [209]:
# Load the adaptation trials and the test trials for this split. Context
# managers make sure both pickle files are closed once they have been read.
with open("trials/dev940_eval311/split_trials/adapt_enr1_hard_trials.pkl", "rb") as adapt_file:
    adapt_trials = pickle.load(adapt_file)
with open("trials/dev940_eval311/split_trials/test_semihard_trials.pkl", "rb") as test_file:
    test_trials = pickle.load(test_file)

In [None]:
# Score thresholds swept when deciding which adaptation utterances to enroll.
eT_list = [-4.26017, 3.1, 10.317]

# Test scores per trial: initial enrollment only, and enrollment extended with
# every adaptation utterance whose label is 1.
init, optimal = [], []

# Test scores per threshold for each scoring method (only multiSessScale is
# filled in by the loop below).
scoreAvg = {thr: [] for thr in eT_list}
vectorAvg = {thr: [] for thr in eT_list}
multiSessScale = {thr: [] for thr in eT_list}
multiSessAdapt = {thr: [] for thr in eT_list}

# Accept/reject decisions on the adaptation utterances, per threshold.
adapt_preds = {thr: [] for thr in eT_list}

labels, adapt_labels = [], []

n_adapt_trials = len(adapt_trials)
for t_i in tqdm(range(n_adapt_trials), total=n_adapt_trials):
    trial_info, enr_ids, adapt_trial = adapt_trials[t_i]
    test_trial = test_trials[trial_info['spk']]

    # Reduce both trial tables to (ids, labels) array pairs.
    adapt_trial = (np.array(adapt_trial.id), np.array(adapt_trial.label))
    test_trial = (np.array(test_trial.id), np.array(test_trial.label))
    adapt_ids, adapt_targets = adapt_trial
    test_ids, test_targets = test_trial

    init_enr_embeds = get_embeds(enr_ids, sv_embeds, sv_id2idx, norm=False)
    adapt_embeds = get_embeds(adapt_ids, sv_embeds, sv_id2idx, norm=False)
    test_embeds = get_embeds(test_ids, sv_embeds, sv_id2idx, norm=False)

    # Reference points: no adaptation vs. adaptation with the true labels.
    optimal_enr_embeds = np.concatenate(
        [init_enr_embeds, adapt_embeds[adapt_targets == 1]])
    init.append(score_trials(init_enr_embeds, test_embeds, "multiSessScale"))
    optimal.append(score_trials(optimal_enr_embeds, test_embeds, "multiSessScale"))

    # Adaptation utterances are scored against the initial enrollment only.
    adapt_scores = score_trials(init_enr_embeds, adapt_embeds, "multiSessScale")

    for eT in eT_list:
        accepted = adapt_scores > eT
        adapted_embeds = adapt_embeds[accepted]
        adapt_preds[eT].append(accepted)
        total_enr_embeds = np.concatenate([init_enr_embeds, adapted_embeds])

        multiSessScale[eT].append(
            score_trials(total_enr_embeds, test_embeds, "multiSessScale"))
        # Other scoring methods, currently disabled:
        # scoreAvg[eT].append(score_trials(total_enr_embeds, test_embeds, "scoreAvg"))
        # vectorAvg[eT].append(score_trials(total_enr_embeds, test_embeds, "vectorAvg"))
        # multiSessAdapt[eT].append(score_trials(total_enr_embeds, test_embeds, "multiSessAdapt"))

    adapt_labels.append(adapt_targets)
    labels.append(test_targets)

In [218]:
# Pooled EER over all trials for the two reference enrollments. compute_eer
# prints its own summary line when called without verbose=False.
for tag, per_trial_scores in (("init", init), ("optimal", optimal)):
    print(tag)
    compute_eer(np.concatenate(per_trial_scores), np.concatenate(labels))
print()

# Pooled EER after threshold-based adaptation, together with how well the
# accept/reject decisions match the adaptation labels.
for eT in eT_list:
    print("eT: {}".format(eT))
    print("multiSessScale")
    compute_eer(np.concatenate(multiSessScale[eT]), np.concatenate(labels))
    # Other scoring methods, currently disabled:
    # print("scoreAvg")
    # compute_eer(np.concatenate(scoreAvg[eT]), np.concatenate(labels))
    # print("vectorAvg")
    # compute_eer(np.concatenate(vectorAvg[eT]), np.concatenate(labels));
    # print("multiSessAdapt")
    # compute_eer(np.concatenate(multiSessAdapt[eT]), np.concatenate(labels))

    pooled_adapt_labels = np.concatenate(adapt_labels)
    pooled_adapt_preds = np.concatenate(adapt_preds[eT])
    precision, recall, _, _ = precision_recall_fscore_support(
        pooled_adapt_labels, pooled_adapt_preds, average='binary')
    print("precision:{:.3f}, recall:{:.3f}".format(precision, recall))
    print()

init
eer: 1.23%, fpr: 1.23%, fnr: 1.23%
optimal
eer: 0.67%, fpr: 0.67%, fnr: 0.67%

eT: -4.26017
multiSessScale
eer: 2.77%, fpr: 2.72%, fnr: 2.77%
precision:0.223, recall:1.000

eT: 3.1
multiSessScale
eer: 1.23%, fpr: 1.13%, fnr: 1.23%
precision:0.615, recall:0.980

eT: 10.317
multiSessScale
eer: 1.13%, fpr: 1.13%, fnr: 1.13%
precision:0.972, recall:0.940



In [219]:
# Per-trial EERs keyed by threshold. The init/optimal values do not depend on
# the threshold but are recomputed for each one, so all three dicts line up.
init_eers = {thr: [] for thr in eT_list}
opt_eers = {thr: [] for thr in eT_list}
multiSessScale_eers = {thr: [] for thr in eT_list}

for eT in eT_list:
    print("eT: {}".format(eT))
    per_trial = zip(init, optimal, multiSessScale[eT], labels)
    for init_scores, oracle_scores, adapted_scores, trial_labels in per_trial:
        # compute_eer returns a sequence; its first element is stored as the EER.
        init_eers[eT].append(
            compute_eer(init_scores, trial_labels, verbose=False)[0])
        opt_eers[eT].append(
            compute_eer(oracle_scores, trial_labels, verbose=False)[0])
        multiSessScale_eers[eT].append(
            compute_eer(adapted_scores, trial_labels, verbose=False)[0])
    # NOTE(review): compare_value is neither defined nor imported in any cell
    # visible in this notebook -- confirm where it comes from, otherwise this
    # cell fails on a fresh kernel.
    compare_value(init_eers[eT], multiSessScale_eers[eT])

eT: -4.26017
inc:0.67, equal:0.00, dec:0.33
eT: 3.1
inc:0.33, equal:0.00, dec:0.67
eT: 10.317
inc:0.33, equal:0.33, dec:0.33


In [220]:
# Per-trial precision and recall of the adaptation decisions.
# Pick the threshold explicitly (the last one in the sweep) instead of relying
# on whatever value an earlier loop happened to leave behind in `eT`.
eT = eT_list[-1]
# The loop variables must not be called `labels`: that name is the
# notebook-level list of test labels used by the evaluation cells, and
# rebinding it here would silently break them when they are re-run.
for trial_adapt_labels, trial_adapt_preds in zip(adapt_labels, adapt_preds[eT]):
    precision, recall, _, _ = precision_recall_fscore_support(
        trial_adapt_labels, trial_adapt_preds, average='binary')
    print("eT:{}, precision:{:.3f}, recall:{:.3f}".format(eT, precision, recall))

eT:10.317, precision:0.933, recall:0.840
eT:10.317, precision:0.980, recall:0.980
eT:10.317, precision:1.000, recall:1.000


## Split Trials Type 2

In [223]:
# Load the trial list for this configuration; the context manager closes the
# pickle file as soon as it has been read.
with open("trials/dev940_eval311/hard_enr3xsess_ntar9/trials.pkl", "rb") as trials_file:
    trials = pickle.load(trials_file)

In [227]:
# Fraction of each trial list, taken from the front, that is set aside for
# enrollment adaptation; the remainder is used for testing.
adapt_ratio = 0.2

# Score thresholds for accepting an adaptation utterance into the enrollment.
eT_list = [-4.26017]
init = []
optimal = []
labels = []
# Test scores per threshold, one dict per scoring method.
scoreAvg = {k: [] for k in eT_list}
vectorAvg = {k: [] for k in eT_list}
multiSessScale = {k: [] for k in eT_list}
multiSessAdapt = {k: [] for k in eT_list}

for t_i in tqdm(range(len(trials)), total=len(trials)):
    enr_spk, enr_ids, test_trial = trials[t_i]
    test_trial = (np.array(test_trial.id), np.array(test_trial.label))

    # Split the trial list into an adaptation prefix and a test remainder.
    adapt_len = int(len(test_trial[0]) * adapt_ratio)
    adapt_trial = (test_trial[0][:adapt_len], test_trial[1][:adapt_len])
    test_trial = (test_trial[0][adapt_len:], test_trial[1][adapt_len:])

    init_enr_embeds = get_embeds(enr_ids, sv_embeds, sv_id2idx, norm=False)
    adapt_embeds = get_embeds(adapt_trial[0], sv_embeds, sv_id2idx, norm=False)
    test_embeds = get_embeds(test_trial[0], sv_embeds, sv_id2idx, norm=False)

    ## optimal, init scores
    # "optimal" enrolls every adaptation utterance whose label is 1.
    optimal_enr_embeds = np.concatenate([init_enr_embeds,
                                         adapt_embeds[adapt_trial[1] == 1]])
    init.append(score_trials(init_enr_embeds, test_embeds, "multiSessScale"))
    optimal.append(score_trials(optimal_enr_embeds, test_embeds, "multiSessScale"))

    ## adapt scores
    # Score the adaptation utterances against the initial enrollment. The
    # previous code passed `total_enr_embeds`, which is only assigned further
    # down in this loop, so it used a stale value from an earlier cell (or
    # raised NameError on a fresh kernel).
    adapt_scores = score_trials(init_enr_embeds, adapt_embeds, "scoreAvg")

    for eT in eT_list:
        adapted_embeds = adapt_embeds[adapt_scores > eT]
        total_enr_embeds = np.concatenate([init_enr_embeds, adapted_embeds])

        scoreAvg[eT].append(score_trials(total_enr_embeds, test_embeds, "scoreAvg"))
        vectorAvg[eT].append(score_trials(total_enr_embeds, test_embeds, "vectorAvg"))
        multiSessScale[eT].append(score_trials(total_enr_embeds, test_embeds, "multiSessScale"))
        multiSessAdapt[eT].append(score_trials(total_enr_embeds, test_embeds, "multiSessAdapt"))

    labels.append(test_trial[1])
    # NOTE(review): this break stops after the first trial, so every result
    # list holds a single trial's scores. It looks like a debugging leftover --
    # remove it to evaluate the full trial list.
    break

  0%|          | 0/7362 [00:00<?, ?it/s]


In [None]:
# Pooled EER for the reference enrollments and for each scoring method, once
# per threshold. compute_eer prints its own summary when verbose is left on.
for eT in eT_list:
    reports = [
        ("optimal", optimal),
        ("init", init),
        ("scoreAvg", scoreAvg[eT]),
        ("vectorAvg", vectorAvg[eT]),
        ("multiSessScale", multiSessScale[eT]),
        ("multiSessAdapt", multiSessAdapt[eT]),
    ]
    for tag, per_trial_scores in reports:
        print(tag)
        compute_eer(np.concatenate(per_trial_scores), np.concatenate(labels))