만약 sent label로 같이 MTL을 하면

같은 sent일 때는 좀더 성능이 좋아질지?

혹은 positive도 강화되지만 negative도 강화될지?

In [11]:
%load_ext autoreload
%autoreload 2
%pylab
%matplotlib inline

import pandas as pd
import pickle
import numpy as np
import sys
import os

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Using matplotlib backend: TkAgg
Populating the interactive namespace from numpy and matplotlib


In [12]:
sys.path.append('../')
sys.path.append('../sv_system/')
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"]="2"

### Configuration

In [13]:
from sv_system.utils.parser import set_train_config
import easydict
args = easydict.EasyDict(dict(dataset="gcommand_equal30_wav",
                              input_frames=100, splice_frames=[20, 100], stride_frames=1, input_format='fbank',
                              cuda=True,
                              lrs=[0.1, 0.01], lr_schedule=[20], seed=1337,
                              no_eer=False,
                              batch_size=128,
                              arch="ResNet34", loss="softmax",
                              n_epochs=50,
                              lamb=0.3
                             ))
config = set_train_config(args)

### Dataset and Dataloader

In [14]:
gc_si_df = pd.read_pickle("../dataset/dataframes/gcommand/equal_num_30spk/equal_num_30spk_si.pkl")
gc_sv_df = pd.read_pickle("../dataset/dataframes/gcommand/equal_num_30spk/equal_num_30spk_sv.pkl")

unique_spks = gc_si_df.spk.unique().tolist()
unique_sents = gc_si_df.sent.unique().tolist()

gc_si_df['sent_label'] = gc_si_df.sent.apply(lambda x: unique_sents.index(x))
gc_sv_df['sent_label'] = gc_sv_df.sent.apply(lambda x: unique_sents.index(x))

In [15]:
from sv_system.data.data_utils import find_dataset, find_trial


_, datasets = find_dataset(config, basedir='../')
trial = find_trial(config, basedir='../')

In [16]:
from sv_system.data.dataset import mtlSpeechDataset

train_dataset = mtlSpeechDataset.read_df(config, gc_si_df, "train")
datasets[0] = train_dataset

In [17]:
from sv_system.data.dataloader import init_loaders

dataloaders = init_loaders(config, datasets)

### Define Model

In [18]:
from sv_system.model.ResNet34 import ResNet34
import torch.nn as nn
import torch.nn.functional as F

class ResNet34_v1(ResNet34):
    """
        additional fc layer before output layer
    """
    def __init__(self, config, inplanes=16, n_labels1=1000, n_labels2=1000, fc_dims=None):
        super().__init__(config, inplanes, 10)

        extractor_output_dim = 8*inplanes
        if not fc_dims:
            fc_dims = extractor_output_dim

        fc = [nn.Linear(extractor_output_dim,fc_dims),
                  nn.ReLU(inplace=True)]

        self.fc = nn.Sequential(*fc)
        
        self.classifier_1 = nn.Linear(fc_dims, n_labels1) # for spks
        self.classifier_2 = nn.Linear(fc_dims, n_labels2) # for sents
    
    def extract(self, x):
        x = self.extractor(x)
        feat = self.fc(x)
        
        return feat
        
    def forward(self, x):
        feat = self.extract(x)
        
        x = F.avg_pool2d(feat,feat.shape[-2:])
        x = x.view(x.size(0), -1)
        out1 = self.classifier_1(x)
        
        x = feat.view(feat.size(0), -1)
        out2 = self.classifier_2(x)
        
        return out1, out2
        
    def spk_out(self, x):
        feat = self.extract(x)
        
        x = F.avg_pool2d(feat,feat.shape[-2:])
        x = x.view(x.size(0), -1)
        out1 = self.classifier_1(x)
        
        return out1
    
    def sent_out(self, x):
        feat = self.extract(x)
        
        x = feat.view(feat.size(0), -1)
        out2 = self.classifier_2(x)
        
        return out2


In [19]:
from sv_system.model.tdnnModel import gTDNN, st_pool_layer

class tdnn_xvector(gTDNN):
    """xvector architecture"""
    def __init__(self, config, n_labels_spk, n_labels_sent):
        super(tdnn_xvector, self).__init__(config, n_labels_spk)
        inDim = config['input_dim']
        self.extractor = nn.Sequential(
            nn.Conv1d(inDim, 512, stride=1, dilation=1, kernel_size=5),
            nn.BatchNorm1d(512),
            nn.ReLU(True),
            nn.Conv1d(512, 512, stride=1, dilation=3, kernel_size=3),
            nn.BatchNorm1d(512),
            nn.ReLU(True),
            nn.Conv1d(512, 512, stride=1, dilation=4, kernel_size=3),
            nn.BatchNorm1d(512),
            nn.ReLU(True),
            nn.Conv1d(512, 512, stride=1, dilation=1, kernel_size=1),
            nn.BatchNorm1d(512),
            nn.ReLU(True)
        )
        
        self.spk_seg = nn.Sequential(
            nn.Conv1d(512, 1500, stride=1, dilation=1, kernel_size=1),
            nn.BatchNorm1d(1500),
            nn.ReLU(True),
            st_pool_layer(),
            nn.Linear(3000, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(True),
            ####################
        )
        
        self.classifier = nn.Sequential(
            nn.Linear(512, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(True),
            nn.Linear(512, n_labels_spk),
        )
        
        self.sent_seg = nn.Sequential(
            nn.Conv1d(512, 1500, stride=1, dilation=1, kernel_size=1),
            nn.BatchNorm1d(1500),
            nn.ReLU(True),
            st_pool_layer(),
            nn.Linear(3000, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(True),
            ##################3333333##
            nn.Linear(512, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(True),
            nn.Linear(512, n_labels_sent),
        )

        self._initialize_weights()

    def embed(self, x):
        x = x.squeeze(1)
        # (batch, time, freq) -> (batch, freq, time)
        x = x.permute(0,2,1)
        x = self.extractor(x)
        x = self.spk_seg(x)

        return x
    
    def sent_out(self, x):
        x = x.squeeze(1)
        x = x.permute(0,2,1)
        feat = self.extractor(x)
        out_sent = self.sent_seg(feat)
        
        return out_sent
    
    def spk_out(self, x):
        feat = self.embed(x)
        out_spk = self.classifier(feat)

        return out_spk

    def forward(self, x):
        x = self.spk_out(x)

        return x


In [40]:
model = tdnn_xvector(config,  len(unique_spks), len(unique_sents))
# model = ResNet34_v1(config, n_labels1=len(unique_spks), n_labels2=len(unique_sents), fc_dims=256)

In [41]:
if not config["no_cuda"]:
    model.cuda()
else:
    model = model.cpu()

### Model Train

In [42]:
import torch
from sv_system.train.train_utils import set_seed, find_optimizer
from torch.optim.lr_scheduler import ReduceLROnPlateau

criterion, joint_optimizer = find_optimizer(config, model)

sent_optimizer = torch.optim.SGD([{'params':model.extractor.parameters()}, 
                                  {'params':model.sent_seg.parameters()}], 
                                lr=0.01,
                                momentum=config['momentum'],
                                weight_decay=config['weight_decay'],
                                nesterov=config['use_nesterov'])

spk_optimizer = torch.optim.SGD([{'params':model.extractor.parameters()}, 
                                 {'params':model.spk_seg.parameters()},
                                 {'params':model.classifier.parameters()}],
                                lr=0.1,
                                momentum=config['momentum'],
                                weight_decay=config['weight_decay'],
                                nesterov=config['use_nesterov'])


scheduler = ReduceLROnPlateau(spk_optimizer, 'min', factor=0.1, patience=5)

In [43]:
set_seed(config)

In [44]:
if not config['no_eer']:
    train_loader, val_loader, test_loader, sv_loader = dataloaders
else:
    train_loader, val_loader, test_loader = dataloaders

In [45]:
# mtl trai
from tqdm import tqdm_notebook
from sv_system.train.train_utils import print_eval

def sent_train(config, train_loader, model, optimizer, criterion):
    model.train()
    loss_sent_sum = 0
    corrects = 0
    total = 0
    lamb = config['lamb']
    print_steps = (np.array([0.25, 0.5, 0.75, 1.0]) \
                    * len(train_loader)).astype(np.int64)

    splice_frames = config['splice_frames']
    if len(splice_frames) > 1:
        splice_frames_ = np.random.randint(splice_frames[0], splice_frames[1])
    else:
        splice_frames_ = splice_frames[-1]

    for batch_idx, (X, y_spk, y_sent) in enumerate(train_loader):
        # X.shape is (batch, channel, time, bank)
        X = X.narrow(2, 0, splice_frames_)
        if not config["no_cuda"]:
            X = X.cuda()
            y_sent = y_sent.cuda()
        optimizer.zero_grad()
        logit = model.sent_out(X)
        loss_sent = criterion(logit, y_sent)
        loss_sent_sum += loss_sent.item()
        loss_sent.backward()
        optimizer.step()
        predicted = torch.argmax(logit, dim=1)
        corrects += predicted.eq(y_sent).cpu().sum().float()
        total += y_sent.size(0)
        if batch_idx in print_steps:
            print(" sent_loss: {:.4f}, acc: {:.5f} " \
                  .format(loss_sent_sum/total, corrects/total))
        
    return loss_sent_sum, corrects/total

In [46]:
def spk_train(config, train_loader, model, optimizer, criterion):
    model.train()
    loss_spk_sum = 0
    corrects = 0
    total = 0
    lamb = config['lamb']
    print_steps = (np.array([0.25, 0.5, 0.75, 1.0]) \
                    * len(train_loader)).astype(np.int64)

    splice_frames = config['splice_frames']
    if len(splice_frames) > 1:
        splice_frames_ = np.random.randint(splice_frames[0], splice_frames[1])
    else:
        splice_frames_ = splice_frames[-1]

    for batch_idx, (X, y_spk, y_sent) in enumerate(train_loader):
        # X.shape is (batch, channel, time, bank)
        X = X.narrow(2, 0, splice_frames_)
        if not config["no_cuda"]:
            X = X.cuda()
            y_spk = y_spk.cuda()
        optimizer.zero_grad()
        logit = model.spk_out(X)
        loss_spk = criterion(logit, y_spk)
        loss_spk_sum += loss_spk.item()
        loss_spk.backward()
        optimizer.step()
        predicted = torch.argmax(logit, dim=1)
        corrects += predicted.eq(y_spk).cpu().sum().float()
        total += y_spk.size(0)
        if batch_idx in print_steps:
            print(" spk_loss: {:.4f}, acc: {:.5f} " \
                  .format(loss_spk_sum/total, corrects/total))
            
    return loss_spk_sum, corrects/total

In [47]:
# mtl trai
from tqdm import tqdm_notebook
from sv_system.train.train_utils import print_eval

def joint_train(config, train_loader, model, optimizer, criterion):
    model.train()
    loss_sum = 0
    loss_spk_sum = 0
    loss_sent_sum = 0
    corrects = 0
    total = 0
    lamb = config['lamb']
    print_steps = (np.array([0.25, 0.5, 0.75, 1.0]) \
                    * len(train_loader)).astype(np.int64)

    splice_frames = config['splice_frames']
    if len(splice_frames) > 1:
        splice_frames_ = np.random.randint(splice_frames[0], splice_frames[1])
    else:
        splice_frames_ = splice_frames[-1]

    for batch_idx, (X, y_spk, y_sent) in enumerate(train_loader):
        # X.shape is (batch, channel, time, bank)
        X = X.narrow(2, 0, splice_frames_)
        if not config["no_cuda"]:
            X = X.cuda()
            y_spk = y_spk.cuda()
            y_sent = y_sent.cuda()
        optimizer.zero_grad()
        logit1, logit2 = model.multi_out(X)
        loss_spk = criterion(logit1, y_spk) * (1-lamb)
        loss_sent = criterion(logit2, y_sent) * (lamb)
        loss = loss_spk + loss_sent 
        loss_sum += loss.item()
        loss_spk_sum += loss_spk.item()
        loss_sent_sum += loss_sent.item()
        loss.backward()
        optimizer.step()
        predicted = torch.argmax(logit1, dim=1)
        corrects += predicted.eq(y_spk).cpu().sum().float()
        total += y_spk.size(0)
        if batch_idx in print_steps:
            print("train loss: {:.4f}, spk_loss: {:.4f}, sent_loss: {:.4f}, acc: {:.5f} " \
                  .format(corrects/total, loss_sum/total,
                          loss_spk_sum/total, loss_sent_sum/total))
            
    return loss_sum, corrects/total

In [None]:
import torch
from sv_system.train.si_train import val, sv_test

print("lamb: {}".format(config['lamb']))
for epoch_idx in range(0, config['n_epochs']):
    print("-"*30)
#     curr_lr = optimizer.state_dict()['param_groups'][0]['lr']
    
#     #lr_scheduling
#     idx = 0
#     while(epoch_idx >= config['lr_schedule'][idx]):
#     # use new lr from schedule epoch not a next epoch
#         idx += 1
#         if idx == len(config['lr_schedule']):
#             break
#     curr_lr = config['lrs'][idx]
#     optimizer.state_dict()['param_groups'][0]['lr'] = curr_lr
#     print("curr_lr: {}".format(curr_lr))

#     train code
    if epoch_idx < 0:
        train_loss, train_acc = sent_train(config, train_loader, model, sent_optimizer, criterion)
    else:
        if epoch_idx % 2 != 0:
            train_loss, train_acc = spk_train(config, train_loader, model, spk_optimizer, criterion)
        else:
            train_loss, train_acc = sent_train(config, train_loader, model, sent_optimizer, criterion)
            
    

#     validation code
    val_loss, val_acc = val(config, val_loader, model, criterion)
    
    print("epoch #{}, train accuracy: {}".format(epoch_idx, train_acc))
    print("epoch #{}, val accuracy: {}".format(epoch_idx, val_acc))

#     evaluate best_metric
    if not config['no_eer']:
        # eer validation code
        eer, label, score = sv_test(config, sv_loader, model, trial)
        print("epoch #{}, sv eer: {}".format(epoch_idx, eer))
    
    scheduler.step(train_loss)

lamb: 0.3
------------------------------
 sent_loss: 0.0177, acc: 0.33496 
 sent_loss: 0.0151, acc: 0.42685 
 sent_loss: 0.0138, acc: 0.47150 
epoch #0, train accuracy: 0.4984186291694641
epoch #0, val accuracy: 0.00022321428696159273
epoch #0, sv eer: 0.3740777777777778
------------------------------
 spk_loss: 0.0525, acc: 0.02885 
 spk_loss: 0.0479, acc: 0.05642 
 spk_loss: 0.0446, acc: 0.07700 
epoch #1, train accuracy: 0.09969263523817062
epoch #1, val accuracy: 0.1490459442138672
epoch #1, sv eer: 0.20202222222222221
------------------------------
 sent_loss: 0.0099, acc: 0.62580 
 sent_loss: 0.0082, acc: 0.68723 
 sent_loss: 0.0075, acc: 0.71301 
epoch #2, train accuracy: 0.7301216125488281
epoch #2, val accuracy: 0.07721054553985596
epoch #2, sv eer: 0.26166666666666666
------------------------------
 spk_loss: 0.0336, acc: 0.15936 
 spk_loss: 0.0323, acc: 0.17915 
 spk_loss: 0.0313, acc: 0.19484 
epoch #3, train accuracy: 0.20798254013061523
epoch #3, val accuracy: 0.143296360

Process Process-1975:
Process Process-1978:
Process Process-1972:
Process Process-1982:
Process Process-1984:
Process Process-1970:
Process Process-1971:
Traceback (most recent call last):
Process Process-1969:
  File "/opt/conda/envs/pytorch-py3.6/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/opt/conda/envs/pytorch-py3.6/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
Process Process-1983:
  File "/opt/conda/envs/pytorch-py3.6/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/opt/conda/envs/pytorch-py3.6/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
Traceback (most recent call last):
  File "/opt/conda/envs/pytorch-py3.6/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._arg

Process Process-1974:
Process Process-1979:
  File "../sv_system/data/dataset.py", line 382, in preprocess
    input_feature = preprocess_audio(data, self.n_mels, self.filters, self.input_format)
KeyboardInterrupt
  File "/opt/conda/envs/pytorch-py3.6/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "../sv_system/data/dataset.py", line 400, in __getitem__
    return self.preprocess(os.path.join(self.data_folder, self.audio_files[index])), \
  File "../sv_system/data/manage_audio.py", line 22, in preprocess_audio
    data = librosa.feature.melspectrogram(data, sr=16000, n_mels=n_mels, hop_length=160, n_fft=480, fmin=20, fmax=4000)
  File "/opt/conda/envs/pytorch-py3.6/lib/python3.6/site-packages/librosa/feature/spectral.py", line 1491, in melspectrogram
    power=power)
  File "/opt/conda/envs/pytorch-py3.6/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/opt/conda/envs/pytorch-py3.6

  File "../sv_system/data/dataset.py", line 400, in __getitem__
    return self.preprocess(os.path.join(self.data_folder, self.audio_files[index])), \
  File "/opt/conda/envs/pytorch-py3.6/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/conda/envs/pytorch-py3.6/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 57, in _worker_loop
    samples = collate_fn([dataset[i] for i in batch_indices])
  File "/opt/conda/envs/pytorch-py3.6/lib/python3.6/site-packages/librosa/core/spectrum.py", line 183, in stft
    y_frames[:, bl_s:bl_t],
  File "/opt/conda/envs/pytorch-py3.6/lib/python3.6/site-packages/librosa/core/audio.py", line 112, in load
    with audioread.audio_open(os.path.realpath(path)) as input_file:
  File "../sv_system/data/dataset.py", line 382, in preprocess
    input_feature = preprocess_audio(data, self.n_mels, self.filters, self.input_format)
  File "/opt/conda/envs/pytorch-py3.6/lib/python3.6/

KeyboardInterrupt: 

  File "/opt/conda/envs/pytorch-py3.6/lib/python3.6/site-packages/librosa/core/audio.py", line 126, in load
    for frame in input_file:
  File "../sv_system/data/manage_audio.py", line 22, in preprocess_audio
    data = librosa.feature.melspectrogram(data, sr=16000, n_mels=n_mels, hop_length=160, n_fft=480, fmin=20, fmax=4000)
  File "../sv_system/data/manage_audio.py", line 22, in preprocess_audio
    data = librosa.feature.melspectrogram(data, sr=16000, n_mels=n_mels, hop_length=160, n_fft=480, fmin=20, fmax=4000)
  File "../sv_system/data/dataset.py", line 368, in preprocess
    data = librosa.core.load(example, sr=16000)[0]
KeyboardInterrupt
  File "/opt/conda/envs/pytorch-py3.6/lib/python3.6/site-packages/audioread/__init__.py", line 80, in audio_open
    return rawread.RawAudioFile(path)
KeyboardInterrupt
  File "/opt/conda/envs/pytorch-py3.6/lib/python3.6/site-packages/librosa/feature/spectral.py", line 1494, in melspectrogram
    mel_basis = filters.mel(sr, n_fft, **kwargs)
  

In [None]:
torch.save(model.cpu().state_dict(), open("gcommand_ResNet34_v1_mtl_lamb0.3.pt", "wb"))

### SV_Test

equal_sent and diff_sent

In [None]:
from sv_system.sv_score.score_utils import embeds_utterance

def sv_test(config, sv_loader, model, trial):
    embeddings, _ = embeds_utterance(config, sv_loader, model, lda=None)
    sim_matrix = F.cosine_similarity(
            embeddings.unsqueeze(1), embeddings.unsqueeze(0), dim=2)
    cord = [trial.enrolment_id.tolist(), trial.test_id.tolist()]
    score_vector = sim_matrix[cord].numpy()
    label_vector = np.array(trial.label)
    fpr, tpr, thres = roc_curve(
            label_vector, score_vector, pos_label=1)
    eer = fpr[np.nanargmin(np.abs(fpr - (1 - tpr)))]

    return eer, label_vector, score_vector

In [None]:
equal_sent_trial = trial[trial.equal_command]
diff_sent_trial = trial[~trial.equal_command]

In [None]:
model.cuda()
equal_sent_eer, _, _ = sv_test(config, sv_loader, model, equal_sent_trial)
diff_sent_eer, _, _ = sv_test(config, sv_loader, model, diff_sent_trial)

In [None]:
# gcommand_ResNet34_v1_mtl_lamb0.1.pt
print(f"equal: {equal_sent_eer}\ndiff: {diff_sent_eer}")

In [None]:
config['lamb']