In [1]:
%load_ext autoreload
%autoreload 2
%pylab
%matplotlib inline

import os
import sys
import pandas as pd
import pickle
import numpy as np

Using matplotlib backend: TkAgg
Populating the interactive namespace from numpy and matplotlib


In [2]:
from batch_sv_system_utils import get_embeds, cosine_sim, compute_error
from batch_sv_system_utils import compute_eer
from utils import key2df, df2dict

In [3]:
def get_id2idx(keys):
    key_df = key2df(keys)
    id2idx, idx2id = df2dict(key_df) 
    return id2idx

In [4]:
embed_dir = "embeddings/voxc2_fbank64_voxc2untied_embeds/"
# embed_dir = "embeddings/voxc2_fbank64_voxc2untied_300f_embeds/"
sv_embeds = np.load(embed_dir + "/sv_embeds.npy")
keys = pickle.load(open(embed_dir + "/sv_keys.pkl", "rb"))
id2idx = get_id2idx(keys)

plda_embed_dir = "embeddings/voxc2_fbank64_voxc2untied_xvector/"
# plda_embed_dir = "embeddings/voxc2_fbank64_voxc2untied_300f_xvector/"
plda_sv_embeds = np.load(plda_embed_dir + "/sv_embeds.npy")
plda_model_dir = plda_embed_dir + "plda_train/"
plda_keys = pickle.load(open(plda_embed_dir + "/sv_keys.pkl", "rb"))
plda_id2idx = get_id2idx(plda_keys)

In [5]:
def get_features(x, n_feat=5):
    min_ = x.min(0)
    max_ = x.max(0)
    median_ = np.median(x, axis=0)
    avg_ = x.mean(0)
    std_ = x.std(0)
   
    if n_feat==2:
        return np.stack([avg_, std_], axis=0).T
    elif n_feat==3:
        return np.stack([max_, avg_, std_], axis=0).T
    elif n_feat==4:
        return np.stack([min_, max_, avg_, std_], axis=0).T
    elif n_feat==5:
        return np.stack([min_, max_, median_, avg_, std_], axis=0).T

## Supervised Adaptation 

###  Cosine + Normalization

In [86]:
# dev normalized cosine scores
from batch_sv_system_utils import run_trial, plot_score 

cohort_embeds = np.load("trials/enr306/cohort_embeds.npy")
trial = pickle.load(open("trials/enr306/dev_random_full_enr_spk10/trials.pkl", "rb"))

dev_score_list = []
dev_norm_score_list = []
dev_labels = []
for t_idx in range(len(trial)):
    enr_spk, imposters, enr_ids, test_trial = trial[t_idx]

    ### get embeds
    enr_embeds = get_embeds(enr_ids, sv_embeds, id2idx, norm=True)
    test_embeds = get_embeds(test_trial[0], sv_embeds, id2idx, norm=True)


    adapt_fusion, adapt_scores = run_trial(enr_embeds, test_embeds, test_trial[1],
                                           plda_dir=None, neg_embeds=None,
                                           plot=False, title="score_fusion(adapt)",
                                           verbose=False)
    enr_cohort_scores = cosine_sim(enr_embeds, cohort_embeds)
    enr_mu, enr_std = enr_cohort_scores.mean(1, keepdims=True), enr_cohort_scores.std(1, keepdims=True)
    test_cohort_scores = cosine_sim(test_embeds, cohort_embeds)
    test_mu, test_std = test_cohort_scores.mean(1, keepdims=True).T, test_cohort_scores.std(1, keepdims=True).T
    norm_adapt_scores = ((adapt_scores - enr_mu)/enr_std + (adapt_scores - test_mu)/test_std)/2
    dev_score_list.append(adapt_scores)
    dev_norm_score_list.append(norm_adapt_scores)
    dev_labels.append(test_trial[1])

### LR Model Training 

In [None]:
# LR train
from sklearn.linear_model import LogisticRegressionCV

train_feat = np.concatenate([get_features(x, n_feat=3) for x in dev_norm_score_list], axis=0)
train_labels = np.concatenate(dev_labels)[:len(train_feat)]

clf = LogisticRegressionCV(Cs=10, fit_intercept=True, class_weight='balanced')
clf.fit(train_feat, train_labels)

In [91]:
pickle.dump(clf, open("trials/enr306/dev_random_full_enr_spk10/norm_nf3_bal_clf.pkl", "wb"))

### PLDA

In [36]:
# dev PLDA
from batch_sv_system_utils import run_trial, plot_score 

trial = pickle.load(open("trials/dev317_eval934/dev_random_enr20_spk10_gender/trials.pkl", "rb"))

dev_plda_score_list = []
dev_labels = []
for t_idx in range(len(trial)):
    enr_spk, enr_ids, test_trial = trial[t_idx]

    ### get embeds
    enr_embeds = get_embeds(enr_ids, plda_sv_embeds, id2idx, norm=False)
    test_embeds = get_embeds(test_trial[0], plda_sv_embeds, id2idx, norm=False)


    adapt_fusion, adapt_scores = run_trial(enr_embeds, test_embeds, test_trial[1],
                                           plda_dir=plda_model_dir, neg_embeds=None,
                                           plot=False, title="score_fusion(adapt)",
                                           verbose=False)
    dev_plda_score_list.append(adapt_scores)
    dev_labels.append(test_trial[1])

### LR Model Training 

In [40]:
# LR train
from sklearn.linear_model import LogisticRegressionCV

train_feat = np.concatenate([get_features(x, n_feat=5) for x in dev_plda_score_list], axis=0)
train_labels = np.concatenate(dev_labels)

lr_clf = LogisticRegressionCV(Cs=10, fit_intercept=True, class_weight='balanced')
lr_clf.fit(train_feat, train_labels)



LogisticRegressionCV(Cs=10, class_weight='balanced', cv='warn', dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=100,
           multi_class='warn', n_jobs=None, penalty='l2',
           random_state=None, refit=True, scoring=None, solver='lbfgs',
           tol=0.0001, verbose=0)

In [41]:
from sklearn.svm import SVR, SVC, LinearSVC
svm_clf = LinearSVC(class_weight='balanced')
svm_clf.fit(train_feat, train_labels) 



LinearSVC(C=1.0, class_weight='balanced', dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [44]:
lr_clf.coef_

array([[ 0.03717907,  0.25214947, -0.13143247,  0.31050746,  0.20215204]])

In [46]:
svm_clf.coef_

array([[ 0.01989696,  0.02000047, -0.02425224,  0.11643041,  0.07652798]])

In [39]:
pickle.dump(lr_clf, open("trials/dev317_eval934/dev_random_enr20_spk10_gender/plda_nf4_bal_lr_clf.pkl", "wb"))
pickle.dump(svm_clf, open("trials/dev317_eval934/dev_random_enr20_spk10_gender/plda_nf4_bal_svm_clf.pkl", "wb"))

## Unsupervised Adaptation 

### LR Model Training 

In [32]:
# LR train
from sklearn.linear_model import LogisticRegressionCV

train_feat = np.concatenate([get_features(x) for x in dev_norm_score_list], axis=0)
train_labels = np.concatenate(dev_labels)[:len(train_feat)]

clf = LogisticRegressionCV(Cs=10, fit_intercept=True)
clf.fit(train_feat, train_labels)

LogisticRegressionCV(Cs=10, class_weight=None, cv=None, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=100,
           multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
           refit=True, scoring=None, solver='lbfgs', tol=0.0001, verbose=0)

In [33]:
pickle.dump(clf, open("trials/enr306/dev_random_n2000/norm_unsup_clf.pkl", "wb"))