# DDA (Deep Discriminant Analysis)

기존의 DDA 학습이 classification 기반이었다면, 이번에는 metric learning을 이용하여 DDA를 수행해 보자.

----

Various lengths
-----

입력 프레임을 100f, 200f, 400f, 800f 길이로 잘라 놓고,

긴 것을 앵커로, 짧은 것을 positive/negative로 놓았을 때 어떻게 되는지 보려고 한다.

### Environment

In [1]:
%load_ext autoreload
%autoreload 2
%pylab
%matplotlib inline

import pandas as pd
import pickle
import numpy as np
import sys
import os

Using matplotlib backend: TkAgg
Populating the interactive namespace from numpy and matplotlib


In [2]:
# Add the sv_system directory to the import search path
# (presumably where the `utils` / `train` packages imported below live — confirm).
sys.path.append('../../sv_system/')
# Enumerate CUDA devices in PCI bus order so the index chosen below is stable.
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
# Expose only the device with index 1 to this process.
os.environ["CUDA_VISIBLE_DEVICES"]="1"

### Configuration

In [3]:
from utils.parser import set_train_config
import easydict

# Dataset names that have been used with this notebook (value for `dataset`):
#   voxc1_fbank_xvector
#   gcommand_fbank_xvector

# Options handed to set_train_config to build `config`.
# NOTE(review): the metric-learning cell later overwrites config['lrs'], creates
# its own criterion, and the loader uses a separately defined batch_size, so
# `lrs`, `loss` and `batch_size` here may not reflect what is actually trained.
# NOTE(review): later cells read config['no_cuda'] and config['n_epochs'];
# 'no_cuda' is presumably derived from `cuda` by set_train_config — confirm.
args = easydict.EasyDict(dict(dataset="voxc1_fbank_xvector",
                              input_frames=100, splice_frames=[50, 100], stride_frames=1, input_format='fbank',
                              cuda=True,
                              lrs=[0.1, 0.01], lr_schedule=[20], seed=1337,
                              no_eer=False,
                              batch_size=128,
                              arch="tdnn_conv", loss="softmax",
                              n_epochs=50
                             ))
config = set_train_config(args)

### Dataset

In [4]:
import torch.utils.data as data

class embedDataset(data.Dataset):
    """Map-style dataset that pairs each embedding row with its label.

    Args:
        embeds: indexable container exposing ``shape``; its first axis is
            treated as the sample axis.
        labels: indexable container of labels, addressed with the same index
            as ``embeds``.
    """

    def __init__(self, embeds, labels):
        super().__init__()
        self.embeds, self.labels = embeds, labels

    def __len__(self):
        # One sample per entry along the first axis of the embeddings.
        n_samples = self.embeds.shape[0]
        return n_samples

    def __getitem__(self, index):
        sample = self.embeds[index]
        target = self.labels[index]
        return sample, target

In [5]:
def embedToDataset(embeds, key_df):
    """Wrap embeddings and the labels from ``key_df`` in an embedDataset.

    Args:
        embeds: array with a ``shape`` attribute; ``shape[1]`` is reported as
            the embedding dimension.
        key_df: DataFrame with a ``label`` column.

    Returns:
        Tuple ``(dataset, embedding_dim, n_distinct_labels)``.
    """
    label_column = key_df.label
    dataset = embedDataset(embeds, label_column.tolist())
    embed_dim = embeds.shape[1]
    n_classes = len(label_column.unique())

    return dataset, embed_dim, n_classes

In [6]:
def key2df(keys):
    """Build a key table with speaker, integer label and corpus-of-origin columns.

    Args:
        keys: iterable of string keys; the text before the first "-" is taken
            as the speaker id.

    Returns:
        DataFrame with columns ``key``, ``spk``, ``label`` (group number of the
        speaker) and ``origin`` ('voxc2' when the speaker id starts with 'id',
        otherwise 'voxc1').
    """
    def speaker_of(key):
        return key.split("-")[0]

    def corpus_of(spk):
        if spk.startswith('id'):
            return 'voxc2'
        return 'voxc1'

    frame = pd.DataFrame(keys, columns=['key'])
    frame['spk'] = frame['key'].apply(speaker_of)
    frame['label'] = frame.groupby('spk').ngroup()
    frame['origin'] = frame['spk'].apply(corpus_of)

    return frame

In [7]:
def refeat_df(key_df, n_repeat, suffix):
    """Repeat every row ``n_repeat`` times and give each copy a unique key.

    The result is indexed by ``"<key>-<suffix>-<k>"`` where ``k`` counts the
    copies of the same key starting from 0; the ``key`` column itself is
    consumed by the index.

    Args:
        key_df: DataFrame with a ``key`` column.
        n_repeat: number of copies to make of each row.
        suffix: string tag inserted between the key and the copy counter.

    Returns:
        The expanded DataFrame.
    """
    expanded = key_df.loc[key_df.index.repeat(n_repeat)].set_index('key')
    # Running counter of copies within each original key.
    occurrence = expanded.groupby(level=0).cumcount().astype(str)
    prefix = expanded.index + '-' + suffix + '-'
    expanded.index = prefix + occurrence

    return expanded

In [8]:
trial = pd.read_pickle("/dataset/SV_sets/voxceleb12/dataframes/voxc12_test_trial.pkl")

In [9]:
# Load the key lists stored next to the 100-frame embeddings. The context
# managers close each file as soon as it has been read instead of leaving the
# handle open until garbage collection.
# NOTE: pickle can execute arbitrary code while loading; only use trusted files.
with open("./embeddings_for_dda/voxc1_mfcc30/100f_embeds/si_keys.pkl", "rb") as si_keys_file:
    si_keys = pickle.load(si_keys_file)
with open("./embeddings_for_dda/voxc1_mfcc30/100f_embeds/sv_keys.pkl", "rb") as sv_keys_file:
    sv_keys = pickle.load(sv_keys_file)

In [10]:
si_100f_embeds = np.load("./embeddings_for_dda/voxc1_mfcc30/100f_embeds/si_embeds.npy")
sv_100f_embeds = np.load("./embeddings_for_dda/voxc1_mfcc30/100f_embeds/sv_embeds.npy")

In [11]:
si_200f_embeds = np.load("./embeddings_for_dda/voxc1_mfcc30/200f_embeds/si_embeds.npy")
sv_200f_embeds = np.load("./embeddings_for_dda/voxc1_mfcc30/200f_embeds/sv_embeds.npy")

In [12]:
si_400f_embeds = np.load("./embeddings_for_dda/voxc1_mfcc30/400f_embeds/si_embeds.npy")
sv_400f_embeds = np.load("./embeddings_for_dda/voxc1_mfcc30/400f_embeds/sv_embeds.npy")

In [13]:
si_800f_embeds = np.load("./embeddings_for_dda/voxc1_mfcc30/800f_embeds/si_embeds.npy")
sv_800f_embeds = np.load("./embeddings_for_dda/voxc1_mfcc30/800f_embeds/sv_embeds.npy")

In [14]:
si_key_df = key2df(si_keys)
sv_key_df = key2df(sv_keys)

In [15]:
# Expand the key table once per segment length: every key is repeated
# 8 / 4 / 2 / 1 times and tagged with the matching length suffix.
# NOTE(review): presumably this mirrors 8x100f, 4x200f, 2x400f and 1x800f
# segments per utterance in the corresponding *_embeds arrays — confirm the
# row order matches.
si_key_100f_df = refeat_df(si_key_df, 8, '100f')
si_key_200f_df = refeat_df(si_key_df, 4, '200f')
si_key_400f_df = refeat_df(si_key_df, 2, '400f')
si_key_800f_df = refeat_df(si_key_df, 1, '800f')

### Choose an embedding

In [44]:
si_key_df = si_key_800f_df
si_embeds = si_800f_embeds
sv_embeds = sv_800f_embeds

### LDA on embedding

In [17]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

# Centre the embeddings on their mean over axis 0 before fitting LDA; the same
# mean is subtracted again when the embeddings are transformed later.
global_mean = si_embeds.mean(0)
# NOTE(review): n_components=200 has to agree with the reshape(-1,200) applied
# when the training dataset is built from the transformed embeddings.
clf = LDA(solver='svd', n_components=200)
clf.fit(si_embeds - global_mean, si_key_df.label)

LinearDiscriminantAnalysis(n_components=200, priors=None, shrinkage=None,
              solver='svd', store_covariance=False, tol=0.0001)

In [18]:
si_embeds = clf.transform(si_embeds - global_mean).astype(np.float32)

In [19]:
sv_embeds = clf.transform(sv_embeds - global_mean).astype(np.float32)

In [20]:
si_dataset, embed_dim, n_labels = embedToDataset(si_embeds.reshape(-1,200), si_key_df)
sv_dataset, _, _ = embedToDataset(sv_embeds, sv_key_df)

### Batch Sampler

In [45]:
import math
import random
import itertools

def index_dataset(dataset):
    """Group example indices by class label in a single pass.

    Args:
        dataset: object exposing ``embeds`` (anything with ``len``) and
            ``labels`` (iterable of hashable labels).

    Returns:
        dict mapping each label to the ascending list of example indices that
        carry it.
    """
    indices_by_class = {}
    # Positions are bounded by len(embeds) so no index is produced for a label
    # without a matching embedding row; the embedding rows themselves are never
    # touched.
    for example_idx, class_label_ind in zip(range(len(dataset.embeds)), dataset.labels):
        indices_by_class.setdefault(class_label_ind, []).append(example_idx)
    return indices_by_class

def sample_from_class(images_by_class, class_label_ind):
    """Return one uniformly drawn example index belonging to the given class.

    Args:
        images_by_class: mapping from class label to an indexable collection
            of example indices.
        class_label_ind: label of the class to draw from.
    """
    members = images_by_class[class_label_ind]
    pick = random.randrange(len(members))
    return members[pick]

def simple(batch_size, dataset, prob_other = 0.5):
    '''Yield batches of example indices laid out as consecutive pairs.

    Lazy pair sampling, unlike lifted_struct, which pools all positive
    combinations, computes the average number of positive pairs per image and
    then samples the same number of negative pairs for every image.

    The first pair of every batch always comes from a single class; each later
    pair comes from two different classes with probability ``prob_other`` and
    from a single class otherwise. Members of a pair are drawn independently,
    so a same-class pair may contain the same example twice.

    Args:
        batch_size: number of indices per yielded batch.
        dataset: sized object accepted by ``index_dataset``.
        prob_other: probability that a pair (other than the first) is negative.

    Yields:
        Lists of ``batch_size`` example indices, ``ceil(len(dataset) / batch_size)``
        of them.

    Raises:
        ValueError: if the dataset has fewer than two classes.
    '''
    images_by_class = index_dataset(dataset)
    # random.sample requires a real sequence; a dict view is rejected on
    # Python >= 3.11. Materialise the labels once instead of once per draw.
    class_labels = list(images_by_class)
    n_batches = int(math.ceil(len(dataset) * 1.0 / batch_size))
    for batch_idx in range(n_batches):
        example_indices = []
        for i in range(0, batch_size, 2):
            perm = random.sample(class_labels, 2)
            first = sample_from_class(images_by_class, perm[0])
            same_class = i == 0 or random.random() > prob_other
            second = sample_from_class(images_by_class, perm[0] if same_class else perm[1])
            example_indices += [first, second]
        # An odd batch_size leaves one surplus index from the last pair.
        yield example_indices[:batch_size]

def triplet(batch_size, dataset, class2img=None):
    """Yield batches of example indices laid out as (anchor, positive, negative).

    For every triplet two distinct classes are drawn; the anchor and positive
    are sampled independently from the first class (so they may coincide) and
    the negative from the second.

    Args:
        batch_size: number of indices per yielded batch; use a multiple of 3 so
            that no triplet is cut by the final truncation.
        dataset: sized object; only indexed via ``index_dataset`` when
            ``class2img`` is not given.
        class2img: optional precomputed mapping from class label to an
            indexable collection of example indices.

    Yields:
        Lists of ``batch_size`` example indices, ``ceil(len(dataset) / batch_size)``
        of them.

    Raises:
        ValueError: if fewer than two classes are available.
    """
    if class2img is not None:
        images_by_class = class2img
    else:
        images_by_class = index_dataset(dataset)

    # random.sample requires a real sequence; a dict view is rejected on
    # Python >= 3.11. Materialise the labels once instead of once per draw.
    class_labels = list(images_by_class)
    n_batches = int(math.ceil(len(dataset) * 1.0 / batch_size))
    for batch_idx in range(n_batches):
        example_indices = []
        for i in range(0, batch_size, 3):
            pos_class, neg_class = random.sample(class_labels, 2)
            example_indices += [
                sample_from_class(images_by_class, pos_class),
                sample_from_class(images_by_class, pos_class),
                sample_from_class(images_by_class, neg_class),
            ]
        yield example_indices[:batch_size]

def npairs(batch_size, dataset, K = 4):
    """Yield batches made of groups of ``K`` examples drawn from one class each.

    Each group picks a class uniformly at random (classes may repeat across
    groups) and samples ``K`` examples from it with replacement.

    Args:
        batch_size: number of indices per yielded batch.
        dataset: sized object accepted by ``index_dataset``.
        K: number of consecutive examples taken from the same class.

    Yields:
        Lists of ``batch_size`` example indices, ``ceil(len(dataset) / batch_size)``
        of them.
    """
    images_by_class = index_dataset(dataset)
    # random.choice indexes its argument, which a dict view does not support on
    # Python 3; draw from a list of the labels instead.
    class_labels = list(images_by_class)
    n_batches = int(math.ceil(len(dataset) * 1.0 / batch_size))
    n_groups = int(math.ceil(batch_size * 1.0 / K))
    for batch_idx in range(n_batches):
        example_indices = []
        for _ in range(n_groups):
            class_label_ind = random.choice(class_labels)
            example_indices += [sample_from_class(images_by_class, class_label_ind) for _ in range(K)]
        # The last group is truncated when batch_size is not a multiple of K.
        yield example_indices[:batch_size]

### Dataloader

In [46]:
n_pairs_per_batch = 23
batch_size = n_pairs_per_batch * 3

In [47]:
def adapt_sampler(batch, dataset, sampler, **kwargs):
    """Expose a batch-generating function as a ``torch.utils.data`` Sampler.

    Args:
        batch: batch size forwarded to ``sampler``.
        dataset: dataset forwarded to ``sampler``; its length is reported as
            the sampler's length.
        sampler: callable ``sampler(batch, dataset, **kwargs)`` returning an
            iterable of index lists (one list per batch).
        **kwargs: extra keyword arguments forwarded to ``sampler``.

    Returns:
        A Sampler instance whose iteration yields the indices of all batches
        flattened into one stream. ``sampler`` is called afresh on every
        iteration, so each epoch gets newly generated batches.
    """
    class _ChainedBatchSampler(torch.utils.data.sampler.Sampler):
        def __init__(self):
            # Sampler.__init__ is intentionally not called: its `data_source`
            # argument has differed between torch releases (NOTE(review):
            # required in older ones, deprecated in newer ones — confirm) and
            # this class keeps no state of its own.
            pass

        def __len__(self):
            return len(dataset)

        def __iter__(self):
            return itertools.chain.from_iterable(sampler(batch, dataset, **kwargs))

    return _ChainedBatchSampler()

In [48]:
# Number the rows 0..N-1 so every example can be addressed by position
# (presumably the row position in the embedding matrix — confirm alignment).
si_key_df['num_id'] = range(len(si_key_df))
# Map each label to the array of its row numbers; `.values` is used so that
# only the plain numbers are kept and the pandas index is dropped.
class2idx = si_key_df.groupby('label').apply(lambda x: x.num_id.values).to_dict()

In [49]:
class2idx[0]

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
       68, 69, 70, 71, 72, 73, 74, 75, 76])

In [50]:
import torch
from torch.utils.data.dataloader import DataLoader

# Training loader: indices come from the `triplet` generator, flattened into a
# single stream by adapt_sampler. Because the loader's batch_size equals the
# generator's batch size, each loaded batch is one generated batch, i.e.
# consecutive (anchor, positive, negative) rows (assuming batch_size is a
# multiple of 3, as set above).
si_loader = torch.utils.data.DataLoader(
    si_dataset, 
    sampler = adapt_sampler(batch_size, 
                           si_dataset, 
                           triplet, 
                           class2img=class2idx), 
    num_workers = 8, batch_size = batch_size, 
    drop_last = True, pin_memory = True
)

# Verification loader: fixed order (shuffle=False), presumably so that the
# concatenated outputs can be indexed by the ids stored in `trial` — confirm.
sv_loader = DataLoader(sv_dataset, batch_size=128, num_workers=4, shuffle=False)

### Model Definition

In [51]:
import torch.nn as nn

class dda_model(nn.Module):
    """Two-block MLP (Linear -> BatchNorm1d -> PReLU, twice) over input embeddings.

    Args:
        in_dims: dimensionality of the input embeddings.
        n_labels: accepted for interface compatibility; not used by the model.

    The hidden width is ``2 * in_dims`` and both ``embed`` and ``forward``
    return the hidden representation of that width.
    """

    def __init__(self, in_dims, n_labels):
        super().__init__()

        hid_dims, out_dims = in_dims * 2, in_dims

        # Same layer order as a literal Sequential(Linear, BN, PReLU, Linear, BN, PReLU).
        blocks = []
        for fan_in in (in_dims, hid_dims):
            blocks.extend([
                nn.Linear(fan_in, hid_dims),
                nn.BatchNorm1d(hid_dims),
                nn.PReLU(),
            ])
        self.hidden_layer = nn.Sequential(*blocks)

        # NOTE(review): this projection is registered (and therefore saved in
        # the state dict) but is never applied in embed()/forward().
        self.embedding_layer = nn.Linear(hid_dims, out_dims)

    def embed(self, x):
        """Return the hidden-layer representation of ``x``."""
        return self.hidden_layer(x)

    def forward(self, x):
        """Identical to ``embed``."""
        return self.embed(x)

### Metric Learning

In [52]:
def hard_mining(anchor, pos_egs, neg_egs, margin=1.0):
    """Triplet hinge loss on the hardest positive and hardest negative of a batch.

    Row-wise Euclidean distances are computed between ``anchor`` and
    ``pos_egs`` / ``neg_egs``; the largest positive distance and the smallest
    negative distance across the batch form a single hinge term.

    Args:
        anchor, pos_egs, neg_egs: 2-D tensors of equal shape (rows are examples).
        margin: required gap between the negative and positive distance.

    Returns:
        Scalar tensor ``max(0, max(d_pos) - min(d_neg) + margin)``.
    """
    def _row_distance(lhs, rhs):
        squared = (lhs - rhs).pow(2).sum(1)
        # Clamp before sqrt so the gradient stays finite at zero distance.
        return torch.clamp(squared, min=1e-16).sqrt()

    hardest_positive = _row_distance(anchor, pos_egs).max()
    hardest_negative = _row_distance(anchor, neg_egs).min()

    hinge = torch.clamp(hardest_positive - hardest_negative + margin, min=0)
    return torch.sum(hinge)

In [53]:
model = dda_model(embed_dim, n_labels) 

if not config['no_cuda']:
    model = model.cuda()

In [57]:
from train.train_utils import set_seed, find_optimizer
from torch.optim.lr_scheduler import ReduceLROnPlateau, MultiStepLR
from sklearn.metrics import roc_curve
import torch.nn.functional as F
from tensorboardX import SummaryWriter

# Metric-learning training loop: one pass over the triplet loader per epoch,
# followed by a speaker-verification EER evaluation on the trial list.
config['lrs'] = [0.01]
_, optimizer = find_optimizer(config, model)
criterion = nn.TripletMarginLoss(margin=10, p=2)
# Alternative criteria tried during experimentation:
# criterion = hard_mining
# criterion = nn.CosineEmbeddingLoss(margin=0.5)
plateau_scheduler = ReduceLROnPlateau(optimizer, 'min', factor=0.1, patience=5)
# NOTE(review): step_scheduler is created but never stepped below; only the
# plateau scheduler changes the learning rate.
step_scheduler = MultiStepLR(optimizer, [30], 0.1)

# NOTE(review): the run name says "m0.4" while the criterion uses margin=10 — confirm.
writer = SummaryWriter("logs/xvector_eucl_semi_hard_m0.4_lr0.01.tf.log")

best_eer = 1.0
for epoch_idx in range(0, config['n_epochs']):
    print("-"*30)
    curr_lr = optimizer.state_dict()['param_groups'][0]['lr']
    print("curr_lr: {}".format(curr_lr))

    # =============== train ===============
    model.train()
    loss_sum = 0
    n_corrects = 0
    total = 0
    for batch_idx, (X, y) in enumerate(si_loader):
        if not config['no_cuda']:
            X = X.cuda()
            y = y.cuda()

        optimizer.zero_grad()
        n_sub_utter = X.size(1)
        embeds = model(X)
#         embeds = embeds / embeds.norm(dim=1,keepdim=True)
        # Every batch is laid out as consecutive (anchor, positive, negative)
        # rows, so stride-3 slices separate the three roles.
        anchor = embeds[0:batch_size:3]
        pos_egs = embeds[1:batch_size:3]
        neg_egs = embeds[2:batch_size:3]
        loss = criterion(anchor, pos_egs, neg_egs)

#         loss_pos = criterion(anchor, pos_egs, torch.ones(len(anchor)).cuda())
#         loss_neg = criterion(anchor, neg_egs, torch.zeros(len(anchor)).cuda())
#         loss = loss_pos + loss_neg
        loss.backward()
        optimizer.step()

        loss_sum += loss.item()
        # NOTE(review): `total` counts samples while `loss` is one value per
        # batch, so the reported loss is scaled down by the batch size.
        total += y.size(0)
        if (batch_idx+1) % 1000 == 0:
            print("Batch {}/{}\t Loss {:.6f}" \
                  .format(batch_idx+1, len(si_loader), loss_sum / total))
    train_loss = loss_sum / total
    plateau_scheduler.step(train_loss)

    print("epoch #{}, train loss: {}".format(epoch_idx, train_loss))
    writer.add_scalar("train/loss", train_loss, epoch_idx+1)

    # =============== test ===============
    embeddings = []
    model.eval()

    with torch.no_grad():
        for batch in sv_loader:
            X, _ = batch
            if not config['no_cuda']:
                X = X.cuda()

            model_output = model.embed(X).cpu().detach()
            embeddings.append(model_output)
        embeddings = torch.cat(embeddings)

        # Score every trial by cosine similarity between its two embeddings.
        score_vector = F.cosine_similarity(embeddings[trial.enrolment_id],
                                      embeddings[trial.test_id], dim=1)
    label_vector = np.array(trial.label)
    fpr, tpr, thres = roc_curve(
            label_vector, score_vector, pos_label=1)
    # EER: operating point where the false-positive rate is closest to the
    # false-negative rate (1 - tpr).
    eer = fpr[np.nanargmin(np.abs(fpr - (1 - tpr)))]

    if eer < best_eer:
        best_eer = eer
        print("best eer!")
        # Pass the path so torch.save opens and closes the file itself rather
        # than leaving an unclosed handle behind on every new best epoch.
        torch.save(model.state_dict(), "best_model_metric.pt")

    print("epoch #{}, sv eer: {}".format(epoch_idx, eer))
    writer.add_scalar("sv_test/eer", eer, epoch_idx+1)

------------------------------
curr_lr: 0.01
Batch 1000/2154	 Loss 0.002616
Batch 2000/2154	 Loss 0.002812
epoch #0, train loss: 0.002796055533546273
epoch #0, sv eer: 0.10053022269353128
------------------------------
curr_lr: 0.01
Batch 1000/2154	 Loss 0.002881
Batch 2000/2154	 Loss 0.002982
epoch #1, train loss: 0.0030098189624430745
epoch #1, sv eer: 0.09379639448568398
------------------------------
curr_lr: 0.01
Batch 1000/2154	 Loss 0.003185
Batch 2000/2154	 Loss 0.003285
epoch #2, train loss: 0.003264187659026811
epoch #2, sv eer: 0.0988865323435843
------------------------------
curr_lr: 0.01
Batch 1000/2154	 Loss 0.003650
Batch 2000/2154	 Loss 0.003431
epoch #3, train loss: 0.0033681340840181597
epoch #3, sv eer: 0.09056203605514317
------------------------------
curr_lr: 0.01
Batch 1000/2154	 Loss 0.003021
Batch 2000/2154	 Loss 0.003302
epoch #4, train loss: 0.0032842543512307536
epoch #4, sv eer: 0.09056203605514317
------------------------------
curr_lr: 0.01
Batch 1000/21

Process Process-1961:
Process Process-1960:
Process Process-1958:
Traceback (most recent call last):
Process Process-1957:
Process Process-1959:
Traceback (most recent call last):
Traceback (most recent call last):
  File "/opt/conda/envs/pytorch-py3.6/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
Traceback (most recent call last):
Process Process-1964:
Process Process-1963:
  File "/opt/conda/envs/pytorch-py3.6/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/opt/conda/envs/pytorch-py3.6/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/opt/conda/envs/pytorch-py3.6/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
Process Process-1962:
  File "/opt/conda/envs/pytorch-py3.6/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/conda/envs/py

  File "/opt/conda/envs/pytorch-py3.6/lib/python3.6/multiprocessing/connection.py", line 257, in poll
    return self._poll(timeout)
  File "/opt/conda/envs/pytorch-py3.6/lib/python3.6/multiprocessing/connection.py", line 414, in _poll
    r = wait([self], timeout)
  File "/opt/conda/envs/pytorch-py3.6/lib/python3.6/multiprocessing/connection.py", line 911, in wait
    ready = selector.select(timeout)
  File "/opt/conda/envs/pytorch-py3.6/lib/python3.6/selectors.py", line 376, in select
    fd_event_list = self._poll.poll(timeout)
KeyboardInterrupt
  File "/opt/conda/envs/pytorch-py3.6/lib/python3.6/multiprocessing/queues.py", line 104, in get
    if not self._poll(timeout):
  File "/opt/conda/envs/pytorch-py3.6/lib/python3.6/multiprocessing/connection.py", line 257, in poll
    return self._poll(timeout)
  File "/opt/conda/envs/pytorch-py3.6/lib/python3.6/multiprocessing/connection.py", line 414, in _poll
    r = wait([self], timeout)
  File "/opt/conda/envs/pytorch-py3.6/lib/python3.

KeyboardInterrupt: 

In [None]:
torch.norm((anchor - pos_egs), dim=1).sqrt() - torch.norm((anchor - neg_egs), dim=1).sqrt() + 0.4

In [None]:
torch.norm((anchor_input - pos_input), dim=1) - torch.norm((anchor_input - neg_input), dim=1)

In [None]:
sv_embeds = torch.from_numpy(sv_dataset.embeds)

In [None]:
score_vector = F.cosine_similarity(sv_embeds[trial.enrolment_id],
                                  sv_embeds[trial.test_id], dim=1)
label_vector = np.array(trial.label)
fpr, tpr, thres = roc_curve(
        label_vector, score_vector, pos_label=1)
eer = fpr[np.nanargmin(np.abs(fpr - (1 - tpr)))]
print("epoch #{}, sv eer: {}".format(epoch_idx, eer))

In [58]:
y

tensor([ 392,  392,  260, 1159, 1159,  937,  990,  990, 1082,  213,  213,  771,
         650,  650,  381, 1075, 1075, 1141,  780,  780,  879,  568,  568,  355,
         432,  432,  270,  132,  132, 1035,  199,  199,  650,  549,  549, 1156,
         421,  421,  129,  621,  621,  964,  140,  140, 1153,  998,  998,  848,
        1161, 1161, 1209,  769,  769,  495,   95,   95,  391,   77,   77,  753,
          47,   47,  415, 1117, 1117,  752,  763,  763,  522],
       device='cuda:0')

### Softmax Learning