Auto-Encoder
---------------------------------

경미의 [논문](https://drive.google.com/file/d/1RArk7z4NqY5HkwkUWx4cR2ApZNnAQxdF/view?usp=sharing)에 따르면 AE가 좀더 generalize한 feature를 뽑아준다고한다. 물론 image에 대해서 실험했고 (28x28, 32x32 의 작은...) task 간의 generalization에 대해 언급해서 조금 context가 다르다.

그래서 일단 xvector의 feature를 가지고 간단한게 AE를 구현해보려고한다.

-------------------------

그런데 xvector가 PLDA만 적용해서 성능이 월등히 높다.  
다른 evaluation metric을 생각해야하지 않을까?

> 지금 dataset간의 mismatch가 문제인데 해결해 볼 수 있지 않을까?

Continual하게 speaker를 늘리는 것도 가능?  
**이를 위해서는 AE자체를 sv_set으로 학습시켰을 때 성능이 좋아지는 것을 확인해야한다.**

### Environment

In [6]:
%load_ext autoreload
%autoreload 2
%pylab
%matplotlib inline

import pandas as pd
import pickle
import numpy as np
import sys
import os

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Using matplotlib backend: TkAgg
Populating the interactive namespace from numpy and matplotlib


In [7]:
sys.path.append('../')
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"]="1"

### AE Model

In [8]:
import torch
import torch.nn as nn

class autoencoder(nn.Module):
    def __init__(self):
        super(autoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(512, 400),
            nn.ReLU(True),
            nn.Linear(400, 300),
            nn.ReLU(True), nn.Linear(300, 256), nn.ReLU(True), nn.Linear(256, 128))
        self.decoder = nn.Sequential(
            nn.Linear(128, 256),
            nn.ReLU(True),
            nn.Linear(256, 300),
            nn.ReLU(True),
            nn.Linear(300, 400),
            nn.ReLU(True), nn.Linear(400, 512), nn.Tanh())
        
        self.latent_dim = 128

    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x
    
    def embed(self, x):
        x = self.encoder(x)
        return x

### Dataset

In [9]:
import torch.utils.data as data

class embedDataset(data.Dataset):
    def __init__(self, embeds, labels):
        super().__init__()
        self.embeds = embeds
        self.labels = labels
        
    def __getitem__(self, index):
        
        return self.embeds[index], self.labels[index]
    
    def __len__(self):
        
        return self.embeds.shape[0]

def embedToDataset(embeds, key_df):
    labels = key_df.label.tolist()
    dataset = embedDataset(embeds, labels)
    
    return dataset, embeds.shape[1], len(key_df.label.unique())

def key2df(keys):
    key_df = pd.DataFrame(keys, columns=['key'])
    key_df['spk'] = key_df.key.apply(lambda x: x.split("-")[0])
    key_df['label'] = key_df.groupby('spk').ngroup()
    key_df['origin'] = key_df.spk.apply(lambda x: 'voxc2' if x.startswith('id') else 'voxc1')
    
    return key_df

In [11]:
trial = pd.read_pickle("../dataset/dataframes/voxc1/voxc_trial.pkl")

# si_set
si_keys = pickle.load(open("../embeddings/voxc12/xvectors/xvectors_tdnn6b/train_feat/key.pkl", "rb"))
si_embeds = np.load("../embeddings/voxc12/xvectors/xvectors_tdnn6b/train_feat/feat.npy")
si_key_df = key2df(si_keys)

# sv_set
sv_keys = pickle.load(open("../embeddings/voxc12/xvectors/xvectors_tdnn6b/test_feat/key.pkl", "rb"))
sv_embeds = np.load("../embeddings/voxc12/xvectors/xvectors_tdnn6b/test_feat/feat.npy")
sv_key_df = key2df(sv_keys)

In [12]:
si_dataset, embed_dim, n_labels = embedToDataset(si_embeds, si_key_df)
sv_dataset, _, _ = embedToDataset(sv_embeds, sv_key_df)

In [None]:
np.save(dd)

### Training

In [13]:
import torch.nn.functional as F
from sklearn.metrics import roc_curve

def embeds_utterance(val_dataloader, model):
    embeddings = []
    labels = []
    if torch.cuda.is_available():
            model = model.cuda()
    model.eval()

    with torch.no_grad():
        for batch in val_dataloader:
            X, y = batch
            if not no_cuda:
                X = X.cuda()
                model = model.cuda()
                
            model_output = model.embed(X).cpu().detach()
            embeddings.append(model_output)
            labels.append(y.numpy())
        embeddings = torch.cat(embeddings)
        labels = np.hstack(labels)
    return embeddings, labels 

def sv_test(sv_loader, model, trial):
    embeddings, _ = embeds_utterance(sv_loader, model)
    trial_enroll = embeddings[trial.enrolment_id.tolist()]
    trial_test = embeddings[trial.test_id.tolist()]

    score_vector = F.cosine_similarity(trial_enroll, trial_test, dim=1)
    label_vector = np.array(trial.label)
    fpr, tpr, thres = roc_curve(
            label_vector, score_vector, pos_label=1)
    eer = fpr[np.nanargmin(np.abs(fpr - (1 - tpr)))]

    return eer

In [27]:
num_epochs = 100
batch_size = 32
learning_rate = 1e-3
no_cuda = False

In [28]:
model = autoencoder().cuda()
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(
    model.parameters(), lr=learning_rate, weight_decay=1e-5)

In [29]:
from torch.utils.data.dataloader import DataLoader

si_loader = DataLoader(si_dataset, num_workers = 0, batch_size = batch_size, 
                           drop_last = True, pin_memory = True)

sv_loader = DataLoader(sv_dataset, batch_size=128, num_workers=0, shuffle=False)

In [30]:
if not no_cuda:
    model = model.cuda()
    
for epoch in range(num_epochs):
    loss_sum = 0
    total = 0
    for batch_idx, (X, _)  in enumerate(sv_loader):
        if not no_cuda:
            X = X.cuda()
        # ===================forward=====================
        output = model(X)
        loss = criterion(output, X)
        # ===================backward====================
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        loss_sum += loss.item()
        total += X.size(0)
#         if batch_idx % 1000 == 0:
#             print(f"train loss: {loss_sum/total}")
 
    # ===================log========================
    print('epoch [{}/{}], loss:{:.4f}'
          .format(epoch + 1, num_epochs, loss.item()))
    
    # =================sv_loss======================
    for batch_idx, (X, _)  in enumerate(sv_loader):
        if not no_cuda:
                X = X.cuda()
        output = model(X)
        loss = criterion(output, X)
    eer = sv_test(sv_loader, model, trial)
    print("sv loss: {:.4f}, sv eer: {:.4f}".format(loss.item(), eer))

epoch [1/100], loss:0.8605
sv loss: 0.8542, sv eer: 0.1851
epoch [2/100], loss:0.6815
sv loss: 0.6606, sv eer: 0.1815
epoch [3/100], loss:0.6327
sv loss: 0.5946, sv eer: 0.2147
epoch [4/100], loss:0.6078
sv loss: 0.5879, sv eer: 0.2803
epoch [5/100], loss:0.6087
sv loss: 0.5995, sv eer: 0.2362
epoch [6/100], loss:0.6007
sv loss: 0.5927, sv eer: 0.2738
epoch [7/100], loss:0.6011
sv loss: 0.5941, sv eer: 0.2712
epoch [8/100], loss:0.6015
sv loss: 0.5950, sv eer: 0.2755
epoch [9/100], loss:0.6004
sv loss: 0.5945, sv eer: 0.2664
epoch [10/100], loss:0.5894
sv loss: 0.5787, sv eer: 0.2706
epoch [11/100], loss:0.5962
sv loss: 0.5806, sv eer: 0.2635
epoch [12/100], loss:0.5954
sv loss: 0.5888, sv eer: 0.2482
epoch [13/100], loss:0.6062
sv loss: 0.5991, sv eer: 0.2227
epoch [14/100], loss:0.6297
sv loss: 0.6130, sv eer: 0.2334
epoch [15/100], loss:0.6115
sv loss: 0.6041, sv eer: 0.2362
epoch [16/100], loss:0.6052
sv loss: 0.5947, sv eer: 0.2452
epoch [17/100], loss:0.6020
sv loss: 0.5962, sv e

In [None]:
torch.save(model.state_dict(), open("saved_models/simple_ae_test.pt", "wb"))

### SV Test

In [None]:
# model = autoencoder()
# model.load_state_dict(torch.load(open("saved_models/simple_ae_test.pt", "rb")))