# SV Live Demo

In [1]:
# Enable the autoreload extension in mode 2, so that imported modules are
# re-imported automatically before each cell executes (handy while editing
# the local model / audio helper modules).
%load_ext autoreload
%autoreload 2
# Populate the interactive namespace from numpy and matplotlib and render
# figures inline in the notebook.
%pylab inline

Populating the interactive namespace from numpy and matplotlib


## Speaker Verification

### Model Load

In [2]:
from resNet34Models import ResNet34, ResNet34_v1

# Settings handed to the ResNet34_v1 constructor.
config = {
    "loss": "softmax",
    "gpu_no": [0],
    "no_cuda": True,
}

# Build the network, then load the extractor from the checkpoint file.
model = ResNet34_v1(config, inplanes=32)
model.load_extractor("test.pth")

extractor loaded from test.pth


### Speaker Verification System


In [18]:
import librosa
from manage_audio import preprocess_audio
import numpy as np
import torch
import torch.nn.functional as F

class sv_system():
    """Small speaker-verification demo built around an embedding model.

    Each speaker is represented by a list of d-vectors (one per enrolled
    utterance). Verification averages those d-vectors per speaker and
    compares them to the test utterance's d-vector with cosine similarity.

    Args:
        model: object exposing ``embed(feat)`` (returns a torch tensor) and
            an ``embed_dim`` attribute.
        n_dims: number of feature dimensions requested from the audio
            front-end (also the size of the DCT filter bank).
        feat_type: feature type forwarded to ``preprocess_audio`` as
            ``in_feature``.
    """

    def __init__(self, model, n_dims=40, feat_type='fbank'):
        # NOTE(review): librosa.filters.dct appears to be unavailable in newer
        # librosa releases -- confirm the pinned librosa version.
        self.dct_filters = librosa.filters.dct(n_filters=n_dims, n_input=n_dims)
        self.model = model
        self.n_dims = n_dims
        self.feat_type = feat_type

        # for test: start with four placeholder speakers backed by random
        # d-vectors (requires self.model to be set first).
        self.speaker_models = self._random_speaker_model()

    def enrol(self, wav, spk_name):
        """Add the d-vector of one utterance to a speaker's model.

        Args:
            wav: path of the audio file to enrol.
            spk_name: speaker label; created on first use, extended with one
                more d-vector on every later call.
        """
        feat = self._wav2feat(wav)
        dvector = self._extract_dvector(feat).squeeze()
        # append() is required here: `list += ndarray` would extend the list
        # with the vector's individual scalars instead of adding one vector.
        self.speaker_models.setdefault(spk_name, []).append(dvector)

    def _extract_dvector(self, feat):
        """Run the embedding model on a feature tensor.

        A 2-D ``feat`` gets two leading singleton dimensions added before it
        is passed to ``self.model.embed``.

        Returns:
            dvector: ndarray (the embedding moved to CPU).
        """
        if feat.dim() == 2:
            feat = feat.unsqueeze(0).unsqueeze(0)

        # Inference only: no autograd graph is needed for the embedding.
        with torch.no_grad():
            dvector = self.model.embed(feat).detach().to('cpu').numpy()

        return dvector

    def _wav2feat(self, wav):
        """Load ``wav`` at 16 kHz and convert it to features via ``preprocess_audio``."""
        wav_data = librosa.core.load(wav, sr=16000)[0]
        feat = preprocess_audio(wav_data, n_mels=self.n_dims,
                    dct_filters=self.dct_filters, in_feature=self.feat_type)

        return feat

    def _random_speaker_model(self):
        """Build placeholder models for speakers "a".."d" from random d-vectors.

        Each speaker gets a list holding the same random vector twice.
        """
        random_speakers = ["a", "b", "c", "d"]
        random_dvector = np.random.rand(len(random_speakers), self.model.embed_dim)

        return {
            name: [random_dvector[i]] * 2
            for i, name in enumerate(random_speakers)
        }

    def verify(self, wav, threshold=0.5):
        """Score an utterance against every enrolled speaker and print the decision.

        Prints ``Accepted as <speaker>`` when the best cosine similarity
        exceeds ``threshold``, otherwise ``Reject``.

        Args:
            wav: path of the audio file to verify.
            threshold: minimum cosine similarity required for acceptance.
        """
        if not self.speaker_models:
            # Nothing to compare against (np.stack would raise on an empty list).
            print("Reject")
            return

        feat = self._wav2feat(wav)
        test_dvector = self._extract_dvector(feat)

        # Take one snapshot of the names and use it for both the averaged
        # models and the final lookup, so scores and names stay aligned.
        speaker_names = list(self.speaker_models.keys())
        # averaging all dvectors for each speaker
        avg_speaker_models = np.stack(
            [np.mean(self.speaker_models[name], axis=0) for name in speaker_names],
            axis=0)

        score = F.cosine_similarity(torch.from_numpy(avg_speaker_models).float(),
                                    torch.from_numpy(test_dvector).float(), dim=1)
        best_idx = int(torch.argmax(score))
        if score[best_idx].item() > threshold:
            print("Accepted as {}".format(speaker_names[best_idx]))
        else:
            print("Reject")
            
        

In [19]:
# Wrap the loaded model in the demo verification system (default features).
test_sv_system = sv_system(model=model)

### Enrollment

In [20]:
# Enrol a single utterance under the speaker label 'new2'.
test_sv_system.enrol(
    wav="../dataset/gcommand/gcommand_wav/bird/00f0204f_nohash_0.wav",
    spk_name='new2',
)

### Verification & Identification

In [21]:
# Score a different utterance against all enrolled speakers and print the decision.
test_sv_system.verify(
    wav="../dataset/gcommand/gcommand_wav/bird/02e85b60_nohash_2.wav",
)

Accepted as new2
