In [1]:
%load_ext autoreload
%autoreload 2
%pylab inline
import pandas as pd
import os
import sys
import pickle

Populating the interactive namespace from numpy and matplotlib


In [9]:
import sys
sys.path.append("/home/muncok/DL/projects/")

In [3]:
import torch.nn.functional as F

In [5]:
from sklearn.metrics import roc_curve
def compute_eer(pos_scores, neg_scores):
    score_vector = np.concatenate([pos_scores, neg_scores])
    label_vector = np.concatenate([np.ones(len(pos_scores)), np.zeros(len(neg_scores))])
    fpr, tpr, thres = roc_curve(label_vector, score_vector, pos_label=1)
    eer = np.min([fpr[np.nanargmin(np.abs(fpr - (1 - tpr)))], 
                 1-tpr[np.nanargmin(np.abs(fpr - (1 - tpr)))]])
    thres = thres[np.nanargmin(np.abs(fpr - (1 - tpr)))]
    print("eer:{:.3f}, thres:{:.4f}".format(eer*100, thres))

In [11]:
from sv_system.utils.parser import test_config
from sv_system.utils import secToSample, secToFrames
si_config = test_config('res15')
si_config['input_clip'] = True
si_config['input_length'] = secToSample(1)
si_config['input_frames'] = secToFrames(1)
si_config['splice_frames'] = secToFrames(0.1)
si_config['stride_frames'] = secToFrames(0.1)
si_config['input_format'] = 'fbank'

si_config['n_epochs'] = 50
si_config['print_step'] = 100
si_config['lr'] = [0.001, 0.0001]
si_config['schedule'] = [np.inf]
si_config['s_epoch'] = 0

si_config['batch_size'] = 64
si_config['num_workers'] = 48

### SI_Model

In [41]:
from sv_system.model.SpeechModel import SpeechResModel
si_model = SpeechResModel('res15', 1881)
si_model.cuda()

SpeechResModel(
  (conv0): Conv2d(1, 45, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
  (bn1): BatchNorm2d(45, eps=1e-05, momentum=0.1, affine=False, track_running_stats=True)
  (conv1): Conv2d(45, 45, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
  (bn2): BatchNorm2d(45, eps=1e-05, momentum=0.1, affine=False, track_running_stats=True)
  (conv2): Conv2d(45, 45, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
  (bn3): BatchNorm2d(45, eps=1e-05, momentum=0.1, affine=False, track_running_stats=True)
  (conv3): Conv2d(45, 45, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
  (bn4): BatchNorm2d(45, eps=1e-05, momentum=0.1, affine=False, track_running_stats=True)
  (conv4): Conv2d(45, 45, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2), dilation=(2, 2), bias=False)
  (bn5): BatchNorm2d(45, eps=1e-05, momentum=0.1, affine=False, track_running_stats=True)
  (conv5): Conv2d(45, 45, kernel_size=(3, 3), stride=(1, 1), padding=(2, 

In [42]:
si_model.load("si_commands_res15.pt")

loaded from si_commands_res15.pt


## Command Trial

In [37]:
import torch
from torch.autograd import Variable

def embeds(opt, val_dataloader, model, lda=None):
    val_iter = iter(val_dataloader)
    model.eval()
    splice_dim = opt['splice_frames']
    embeddings = []
    labels = []
    if lda is not None:
        print("LDA is loaded")
    for batch in (val_iter):
        x, y = batch
        time_dim = x.size(2)
        split_points = range(0, time_dim-splice_dim+1, splice_dim)
        model_outputs = []
        for point in split_points:
            x_in = Variable(x.narrow(2, point, splice_dim))
            if not opt['no_cuda']:
                x_in = x_in.cuda()
            model_outputs.append(model.embed(x_in).cpu().data)
        model_output = torch.stack(model_outputs, dim=0)
        model_output = model_output.mean(0)
        if lda is not None:
            model_output = torch.from_numpy(lda.transform(model_output.numpy()).astype(np.float32))
        embeddings.append(model_output)
        labels.append(y.numpy())
    embeddings = torch.cat(embeddings)
    labels = np.hstack(labels)
    return embeddings, labels

In [20]:
si_config['data_folder'] = "/home/muncok/DL/dataset/SV_sets/speech_commands/"
enroll_df = pd.read_pickle('../trials/commands/final/equal_num_102spk_enroll.pkl')
pos_test_df = pd.read_pickle('../trials/commands/final/equal_num_102spk_pos_test.pkl')
neg_test_df = pd.read_pickle("../trials/commands/final/equal_num_102spk_neg_test.pkl")

In [21]:
common_words = enroll_df.sent.unique().tolist()
common_words

['down',
 'eight',
 'five',
 'four',
 'go',
 'left',
 'nine',
 'no',
 'off',
 'on',
 'one',
 'right',
 'seven',
 'six',
 'stop',
 'three',
 'two',
 'up',
 'yes',
 'zero']

In [22]:
aux_words = list(set(pos_test_df.sent.unique().tolist()) - set(common_words))
aux_words

['tree',
 'marvin',
 'house',
 'cat',
 'sheila',
 'dog',
 'wow',
 'bed',
 'happy',
 'bird']

In [23]:
enroll_spks = enroll_df.spk.unique().tolist()

In [24]:
enroll_df = enroll_df.reset_index(drop=True)

### Embeddings

In [43]:
from sv_system.data.dataloader import init_default_loader
from sv_system.data.dataset import SpeechDataset

dataset = SpeechDataset.read_df(si_config, enroll_df, "test")
loader = init_default_loader(si_config, dataset, False)
enroll_embeddings, _ = embeds(si_config, loader, si_model, None)

In [44]:
dataset = SpeechDataset.read_df(si_config, pos_test_df, "test")
loader = init_default_loader(si_config, dataset, False)
pos_embedding, _ = embeds(si_config, loader, si_model, None)

In [45]:
dataset = SpeechDataset.read_df(si_config, neg_test_df, "test")
loader = init_default_loader(si_config, dataset, False)
imposter_embeddings, _ = embeds(si_config, loader, si_model, None)

### Word Not Seperated

In [46]:
n_average = 1 # number of average means the number of words in a sentence
n_enroll_uttrs = 40 # number of enroll uttrs

In [47]:
spk_models = dict()
for spk in enroll_spks:
    enroll_idx = np.nonzero(enroll_df.spk == spk)
    spk_models[spk] = enroll_embeddings[enroll_idx][:n_enroll_uttrs].mean(0, True)

In [49]:
# average embedding --> scoring

pos_scores = dict()
neg_scores = dict()

for spk in enroll_spks:
    pos_test_idx = np.nonzero((pos_test_df.spk == spk))
    pos_embeds = pos_embedding[pos_test_idx]
#     np.random.shuffle(pos_embeds)
    pos_embeds = pos_embeds.split(n_average, dim=0)
    pos_embeds = torch.stack([torch.mean(x, dim=0) for x in pos_embeds])
    pos_scores[spk] = F.cosine_similarity(pos_embeds,
                                          spk_models[spk])
   
    # negative is identical to all spks
#     np.random.shuffle(imposter_embeddings) 
    neg_embeds = imposter_embeddings.split(n_average, dim=0)
    neg_embeds = torch.stack([torch.mean(x, dim=0) for x in neg_embeds])
    
    neg_scores[spk] = F.cosine_similarity(neg_embeds, spk_models[spk])

# for spk in enroll_spks:
#     print(spk)
#     compute_eer(pos_scores[spk], neg_scores[spk])
print("\nuni")
uni_pos_scores = np.concatenate([v for v in pos_scores.values()])
uni_neg_scores = np.concatenate([v for v in neg_scores.values()])
compute_eer(uni_pos_scores, uni_neg_scores)           


uni
eer:16.143, thres:0.9505


### Word Separated

In [50]:
# for common words
n_average = 3
spk_models = dict()
pos_scores = dict()
neg_scores = dict()

for spk in enroll_spks:
    pos_scores[spk] = []
    neg_scores[spk] = []
    
for spk in enroll_spks:
    for word in common_words:
        enroll_idx = np.nonzero((enroll_df.spk == spk) & (enroll_df.sent == word))
        spk_models[spk] = enroll_embeddings[enroll_idx].mean(0, keepdim=True)

        pos_test_idx = np.nonzero((pos_test_df.spk == spk) & (pos_test_df.sent == word))
        pos_scores[spk].append(F.cosine_similarity(pos_embedding[pos_test_idx],
                                              spk_models[spk]))
         
        neg_test_idx = np.nonzero(neg_test_df.sent == word)
        neg_scores[spk].append(F.cosine_similarity(imposter_embeddings[neg_test_idx],
                                                   spk_models[spk]))    

# for aux words
spk_models = dict()
for spk in enroll_spks:
    enroll_idx = np.nonzero((enroll_df.spk == spk))
    spk_models[spk] = enroll_embeddings[enroll_idx].mean(0, keepdim=True)
    
    pos_test_idx = np.nonzero((pos_test_df.spk == spk) & (pos_test_df.sent.isin(aux_words)))
    pos_scores[spk].append(F.cosine_similarity(pos_embedding[pos_test_idx],
                                              spk_models[spk]))
    
    neg_test_idx = np.nonzero(neg_test_df.sent.isin(aux_words))
    neg_scores[spk].append(F.cosine_similarity(imposter_embeddings[neg_test_idx],
                                                   spk_models[spk]))    
    
    pos_scores[spk] = torch.cat(pos_scores[spk])
    neg_scores[spk] = torch.cat(neg_scores[spk])

# for spk in enroll_spks:
#     print(spk)
#     compute_eer(pos_scores[spk], neg_scores[spk])
print("\nuni")
uni_pos_scores = np.concatenate([v for v in pos_scores.values()])
uni_neg_scores = np.concatenate([v for v in neg_scores.values()])
compute_eer(uni_pos_scores, uni_neg_scores)      


uni
eer:16.619, thres:0.9451
