In [13]:
%load_ext autoreload
%autoreload 2
%pylab inline
import numpy as np
import pandas as pd
import os

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Populating the interactive namespace from numpy and matplotlib


In [14]:
import librosa
import sys
import pickle
# sys.setrecursionlimit(10000) # 10000 is an example, try with different values
sys.path.append("/home/muncok/DL/projects/sv_system/")

In [15]:
from dnn.si_train import set_seed
from pydub import AudioSegment

In [16]:
from dnn.train.model import init_protonet, init_resnet
from dnn.data.dataloader import init_sv_loaders
from dnn.sv_score import similarities
from sklearn.metrics import roc_curve

In [17]:
def draw_in_feature(config, sound_files, in_feature='mfcc'):
    t = len(sound_files)
    plt.figure(figsize=(12, 4*t))
    for i, sound_file in enumerate(sound_files):
        feature = preprocess_from_path(config, sound_file, in_feature)
        plt.subplot(t,1,i+1)
        librosa.display.specshow(feature.numpy().T, 
                                 x_axis='time', y_axis='mel',
                                 sr=16000, hop_length=160, 
                                 fmin=20, fmax=4000)
        plt.colorbar()
        word = sound_file.split("/")[-2]
        file = sound_file.split("/")[-1]
        plt.title("{}, {}, {}".format(in_feature, word, file))
        plt.tight_layout()

In [18]:
def secToSample(sec):
    return int(16000 * sec)

In [19]:
def compute_eer(pos_scores, neg_scores):
    score_vector = np.concatenate([pos_scores, neg_scores])
    label_vector = np.concatenate([np.ones(len(pos_scores)), np.zeros(len(neg_scores))])
    fpr, tpr, thres = roc_curve(label_vector, score_vector, pos_label=1)
    eer = np.min([fpr[np.nanargmin(np.abs(fpr - (1 - tpr)))], 
                 1-tpr[np.nanargmin(np.abs(fpr - (1 - tpr)))]])
    thres = thres[np.nanargmin(np.abs(fpr - (1 - tpr)))]
    print("eer:{:.4f}, thres:{:.4f}".format(eer, thres))

In [20]:
dataframe_dir = '/home/muncok/DL/dataset/SV_sets/dataframes/'
data_dir = '/home/muncok/DL/dataset/SV_sets/speech_commands/vad/'
data_df = pd.read_pickle('/home/muncok/DL/dataset/SV_sets/dataframes/Command_Dataframe.pkl')

In [21]:
from dnn.parser import get_sv_parser
options = get_sv_parser().parse_args(args=[])
options.n_dct_filters = 40
options.n_mels = 40
options.timeshift_ms = 100
options.data_folder = "/home/muncok/DL/dataset/SV_sets"
options.window_size= 0.025
options.window_stride= 0.010
options.cache_size = 32768
options.input_format = "mfcc"

In [22]:
options.input_format = 'mfcc'
# options.input_length = secToSample(1)
options.splice_frame = secToSample(0.1)//160+1

In [25]:
import torch.nn.functional as F
from dnn.data.dataloader import init_embed_loaders
options.data_folder= "/home/muncok/DL/dataset/SV_sets/speech_commands/vad"

### SI_Model

In [30]:
options.input = "models/voxc/si_train/full_train/si_voxc_res15_0.1s_full.pt"
options.model = "res15"
model = init_resnet(options)
lda = pickle.load(open("models/lda/si_voxc_res15_0.1s_full_lda_1.pkl", "rb"))

{'use_dilation': True, 'n_layers': 13, 'n_feature_maps': 45}
models/voxc/si_train/full_train/si_voxc_res15_0.1s_full.pt is loaded


In [12]:
options.input = "models/commands/up_from_scratch.pt"
model = init_protonet(options, small=True)
lda = pickle.load(open("models/lda/up_from_scratch_lda.pkl", "rb"))

models/commands/up_from_scratch.pt is loaded


## Similarities From Random Sampling

In [13]:
def decision_on_thres(pos_scores, neg_scores, thres):
    fa_count = np.count_nonzero(neg_scores > thres)
    fr_count = np.count_nonzero(pos_scores < thres)
    print(fa_count, fr_count)
    fpr = fa_count / len(neg_scores)
    fnr = fr_count / len(pos_scores)
    return fpr, fnr

### Word Separated

In [16]:
options.num_query_val = 3
options.num_support_val = 3
options.classes_per_it_val = 1
options.iterations = 100

In [17]:
words = list(data_df.sent.value_counts().index[:10])
words

['stop', 'seven', 'yes', 'zero', 'up', 'no', 'two', 'four', 'go', 'one']

In [20]:
pos_scores_list = []
neg_scores_list = []
for word in ['stop', 'seven', 'yes', 'zero', 'up', 'no', 'two', 'four', 'go', 'one']:
    options.val_manifest = "manifests/commands/words/sv/sv_{}_manifest.csv".format(word)
    val_dataloader = init_sv_loaders(options)
    pos_scores, neg_scores = similarities(options, val_dataloader, model, lda) # embeddings: sample x emb_size
    pos_scores_list.append(pos_scores)
    neg_scores_list.append(neg_scores)
    print(word)
    compute_eer(pos_scores, neg_scores)
    
pos_scores = np.array(pos_scores_list).flatten()
neg_scores = np.array(neg_scores_list).flatten()
print("uni")
compute_eer(pos_scores, neg_scores)    

stop
eer:0.0833, thres:0.5139
seven
eer:0.0333, thres:0.4702
yes
eer:0.0600, thres:0.4643
zero
eer:0.0567, thres:0.4706
up
eer:0.1833, thres:0.2466
no
eer:0.0467, thres:0.5501
two
eer:0.0133, thres:0.4405
four
eer:0.0233, thres:0.5538
go
eer:0.0333, thres:0.4877
one
eer:0.0467, thres:0.4074
uni
eer:0.0570, thres:0.4544


In [1]:
word = "stop"
options.val_manifest = "manifests/commands/words/sv/sv_{}_manifest.csv".format(word)
val_dataloader = init_sv_loaders(options)
pos_scores, neg_scores = similarities(options, val_dataloader, model) # embeddings: sample x emb_size

decision_on_thres(pos_scores, neg_scores, 0.4497)

NameError: name 'options' is not defined

### Word Not Separated

In [93]:
options.num_query_val = 2
options.num_support_val = 4
options.classes_per_it_val = 1

In [94]:
options.val_manifest = "manifests/commands/words/sv/sv_uni_manifest.csv"
val_dataloader = init_sv_loaders(options)

pos_scores, neg_scores = similarities(options, val_dataloader, model) # embeddings: sample x emb_size
score_vector = np.concatenate([pos_scores, neg_scores])
label_vector = np.concatenate([np.ones(len(pos_scores)), np.zeros(len(neg_scores))])
fpr, tpr, thres = roc_curve(label_vector, score_vector, pos_label=1)
eer = 1-tpr[np.nanargmin(np.abs(fpr - (1 - tpr)))]
thres = thres[np.nanargmin(np.abs(fpr - (1 - tpr)))]
print("[uni] eer:{:.4f}, thres:{:.4f}".format(eer, thres))

[uni] eer:0.1350, thres:0.5927


In [85]:
len(score_vector)

200

## Similarities From Defined Set

In [57]:
import torch
from torch.autograd import Variable

def embeds(opt, val_dataloader, model, lda=None):
    val_iter = iter(val_dataloader)
    model.eval()
    splice_dim = opt.splice_frame
    embeddings = []
    labels = []
    for batch in (val_iter):
        x, y = batch
        time_dim = x.size(2)
        split_points = range(0, time_dim-splice_dim+1, splice_dim//2)
        model_outputs = []
        for point in split_points:
            x_in = Variable(x.narrow(2, point, splice_dim))
            if opt.cuda:
                x_in = x_in.cuda()
            model_outputs.append(model.embed(x_in).cpu().data)
        model_output = torch.stack(model_outputs, dim=0)
        model_output = model_output.mean(0)
        if lda is not None:
            model_output = torch.from_numpy(lda.transform(model_output.numpy()).astype(np.float32))
        else:
            model_output = model_output.cpu().data
        embeddings.append(model_output)
        labels.append(y.numpy())
    embeddings = torch.cat(embeddings)
    labels = np.hstack(labels)
    return embeddings, labels

### Word Separated

In [167]:
word = "up"
sv_df = data_df[data_df.sent == word]
spk_counts = sv_df.spk.value_counts()
enroll_candidate = list(spk_counts[spk_counts > 5].index)
enroll_spks = np.random.choice(enroll_candidate,size=(5,), replace=False)

In [170]:
spk_models = dict()
pos_scores = dict()
neg_scores = dict()
n_enroll = 3
n_query = 10
for spk in enroll_spks:
    enroll_df = sv_df[sv_df.spk == spk][:n_enroll+n_query]
    val_dataloader = init_embed_loaders(options, enroll_df)
    embeddings, _ = embeds(options, val_dataloader, model, lda)
    spk_models[spk] = embeddings.mean(0, keepdim=True)
       
    val_dataloader = init_embed_loaders(options, enroll_df[n_enroll:])
    embeddings, _ = embeds(options, val_dataloader, model, lda)
    pos_scores[spk] = F.cosine_similarity(embeddings, spk_models[spk])
    
    neg_test_df = sv_df[~sv_df.spk.isin(enroll_spks)].sample(n=n_enroll+n_query)
    val_dataloader = init_embed_loaders(options, neg_test_df)
    embeddings, _ = embeds(options, val_dataloader, model, lda)
    neg_scores[spk] = F.cosine_similarity(embeddings, spk_models[spk])

for spk in enroll_spks:
    print(spk)
    compute_eer(pos_scores[spk], neg_scores[spk])
print("uni")
compute_eer(torch.cat([v for v in pos_scores.values()]), torch.cat([v for v in neg_scores.values()]))        

c120e80e
eer:0.0000, thres:0.8331
c1d39ce8
eer:0.0769, thres:0.6132
9a7c1f83
eer:0.0000, thres:0.5213
cb2929ce
eer:0.0000, thres:0.7194
28ce0c58
eer:0.0000, thres:0.4437
uni
eer:0.0462, thres:0.4479


In [146]:
enroll_df

Unnamed: 0,spk,sent,file,set,label
up4c8417710,4c841771,up,4c841771_nohash_0.wav,test,116
up4c8417711,4c841771,up,4c841771_nohash_1.wav,test,116
up4c8417712,4c841771,up,4c841771_nohash_2.wav,test,116
up4c8417714,4c841771,up,4c841771_nohash_4.wav,test,116
up4c8417713,4c841771,up,4c841771_nohash_3.wav,test,116
up4c8417715,4c841771,up,4c841771_nohash_5.wav,test,116


### Word Not Separated

In [164]:
uttrs_counts = data_df.spk.value_counts()
valid_spks = list(uttrs_counts[uttrs_counts > 20].index)
enroll_spks = np.random.choice(valid_spks, size=(5,), replace=False)

In [166]:
spk_models = dict()
pos_scores = dict()
neg_scores = dict()
n_enroll = 10
n_query = 10
for spk in enroll_spks:
    enroll_df = data_df[data_df.spk == spk].sample(n=n_enroll+n_query)
    val_dataloader = init_embed_loaders(options, enroll_df[:n_enroll])
    embeddings, _ = embeds(options, val_dataloader, model, lda)
    spk_models[spk] = embeddings.mean(0, keepdim=True)
       
    val_dataloader = init_embed_loaders(options, enroll_df[n_enroll:])
    embeddings, _ = embeds(options, val_dataloader, model, lda)
    pos_scores[spk] = F.cosine_similarity(embeddings, spk_models[spk])
    
    neg_test_df = data_df[~data_df.spk.isin(enroll_spks)].sample(n=n_enroll+n_query)
    val_dataloader = init_embed_loaders(options, neg_test_df)
    embeddings, _ = embeds(options, val_dataloader, model, lda)
    neg_scores[spk] = F.cosine_similarity(embeddings, spk_models[spk])

for spk in enroll_spks:
    print(spk)
    compute_eer(pos_scores[spk], neg_scores[spk])
print("uni")
compute_eer(torch.cat([v for v in pos_scores.values()]), torch.cat([v for v in neg_scores.values()])) 

0e17f595
eer:0.0000, thres:0.4552
65c73b55
eer:0.0000, thres:0.4584
364f979f
eer:0.0000, thres:0.5204
50ed8a7b
eer:0.0000, thres:0.6277
cab100c9
eer:0.0000, thres:0.7718
uni
eer:0.0600, thres:0.4778


### Word Recognization Fail

#### speaker model

In [53]:
word = "stop"
sv_df = data_df[data_df.sent == word]
spk_counts = sv_df.spk.value_counts()
enroll_candidate = list(spk_counts[spk_counts > 3].index)
enroll_spks = np.random.choice(enroll_candidate,size=(10,), replace=False)
test_spks = np.random.choice(list(set(enroll_candidate)-set(enroll_spks)),size=(10,), replace=False)

In [54]:
spk_models = dict()
for spk in enroll_spks:
    enroll_df = sv_df[sv_df.spk == spk][:2]
    val_dataloader = init_embed_loaders(options, enroll_df)
    embeddings, _ = embeds(options, val_dataloader, model, lda)
    spk_models[spk] = embeddings.mean(0, keepdim=True)

#### test with other word

In [55]:
word = "right"
test_df = data_df[data_df.sent == word]
test_spks = np.random.choice(list(set(test_df.spk.unique())-set(enroll_spks)),size=(10,), replace=False)

In [56]:
pos_scores = dict()
neg_scores = dict()
for spk in enroll_spks:
    pos_test_df = test_df[test_df.spk == spk]    
    val_dataloader = init_embed_loaders(options, pos_test_df)
    embeddings, _ = embeds(options, val_dataloader, model, lda)
    pos_scores[spk] = F.cosine_similarity(embeddings, spk_models[spk])
    
    neg_test_df = test_df[test_df.spk.isin(test_spks)].sample(n=2*len(pos_test_df))
    val_dataloader = init_embed_loaders(options, neg_test_df)
    embeddings, _ = embeds(options, val_dataloader, model, lda)
    neg_scores[spk] = F.cosine_similarity(embeddings, spk_models[spk])

for spk in enroll_spks:
    print(spk)
    compute_eer(pos_scores[spk], neg_scores[spk])
print("uni")
compute_eer(torch.cat([v for v in pos_scores.values()]), torch.cat([v for v in neg_scores.values()]))        

RuntimeError: seq can't be empty

### Audio Check

In [128]:
from pydub import AudioSegment

In [130]:
AudioSegment.from_wav("/home/muncok/DL/dataset/SV_sets/speech_commands/vad/up/c1d39ce8_nohash_2.wav")

In [114]:
AudioSegment.from_wav("/home/muncok/DL/dataset/SV_sets/speech_commands/vad/up/c1d39ce8_nohash_0.wav")