In [1]:
%load_ext autoreload
%autoreload 2
import numpy as np
import pandas as pd
import os

In [2]:
import librosa
import librosa.display
import IPython.display as ipd

In [3]:
import sys
sys.path.append("../")

In [6]:
from dnn.parser import ConfigBuilder
import dnn.data.dataset as dset
import dnn.train.model as mod
from dnn.si_train import set_seed
from pydub import AudioSegment

### SI_Model

In [5]:
model = "SimpleCNN"
dataset = "reddots"

global_config = dict(model=model, dataset=dataset,
                     no_cuda=False,  gpu_no=0,
                     n_epochs=100, batch_size=64,
                     lr=[0.01], schedule=[np.inf], dev_every=1, seed=0, use_nesterov=False,
                     cache_size=32768, momentum=0.9, weight_decay=0.00001,
                     num_workers=16, print_step=100,
                     )

builder = ConfigBuilder(
                dset.SpeechDataset.default_config(),
                global_config)
parser = builder.build_argparse()
si_config = builder.config_from_argparse(parser)
si_config['model_class'] = mod.SimpleCNN
set_seed(si_config)

In [6]:
si_model = si_config['model_class']()

In [7]:
si_model.load_partial("models/commands/right_adapted.pt")

models/commands/right_adapted.pt is loaded


In [8]:
from dnn.data.manage_audio import preprocess_from_path
from torch.autograd import Variable
import torch

def embed(config, model, audio_path, splice_sec):                           
    data = preprocess_from_path(config, audio_path)
    if not config["no_cuda"]:                                   
        torch.cuda.set_device(config["gpu_no"])                 
        model.cuda()                                            
    model.eval()                                                

    splice_len = int(splice_sec*100)
    data = torch.split(data, splice_len , dim=1)
    data = torch.stack(data[:-1], dim=0)
    data_in = Variable(data, requires_grad=False)
    if not config["no_cuda"]:                                   
        data_in = data_in.cuda()                                
    feature = model.embed(data_in).cpu().data           
    return feature                                              

### LDA Transformation

In [9]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
import pickle
def lda_tf(emb, lda_path="models/"):
    lda_model = pickle.load(open("models/lda/si_reddots_0.2s_random_2_lda.pkl", "rb"))
    return torch.from_numpy(lda_model.transform(emb).astype(np.float32))

In [10]:
def cos_sim(a, b, lda=False):
    if lda:
        print("lda")
        print(F.cosine_similarity(lda_tf(a), lda_tf(b)))
    else:
        print(F.cosine_similarity(a, b))


### Voice 

Positive Samples (Same Phrase)

In [71]:
sound1_file = "/home/muncok/DL/dataset/SV_sets/reddots_r2015q4_v1/wav/m0001/20150130084154554_m0001_31.wav"
sound2_file = "/home/muncok/DL/dataset/SV_sets/reddots_r2015q4_v1/wav/m0001/20150130084155412_m0001_31.wav"
sound3_file = "/home/muncok/DL/dataset/SV_sets/reddots_r2015q4_v1/wav/m0001/20150130084156114_m0001_31.wav"
sound1 = AudioSegment.from_file(sound1_file)
sound2 = AudioSegment.from_file(sound2_file)
sound3 = AudioSegment.from_file(sound3_file)

In [72]:
si_config['input_length'] = int(16000*1)
splice_sec = 0.2
sound1_embed = embed(si_config, si_model, sound1_file, splice_sec)
sound2_embed = embed(si_config, si_model, sound2_file, splice_sec)
sound3_embed = embed(si_config, si_model, sound3_file, splice_sec)

In [74]:
isLda = False

In [133]:
cos_sim(sound2_embed, sound1_embed, isLda)
cos_sim(sound2_embed.mean(0, True), sound1_embed, isLda)
cos_sim(sound2_embed.mean(0, keepdim=True), sound1_embed.mean(0, keepdim=True), isLda)


 0.5764
 0.7022
 0.8319
 0.8105
 0.6234
[torch.FloatTensor of size 5]


 0.6371
 0.8398
 0.9137
 0.8662
 0.8483
[torch.FloatTensor of size 5]


 0.9415
[torch.FloatTensor of size 1]



Positive Samples (Different Phrase)

In [87]:
sound4_file = "/home/muncok/DL/dataset/SV_sets/reddots_r2015q4_v1/wav/m0001/20150129213255824_m0001_48.wav"
sound4 = AudioSegment.from_file(sound4_file)
sound4_embed = embed(si_config, si_model, sound4_file, splice_sec)

Positive Samples (Different Phrase)

In [130]:
isLda = False

In [131]:
from scipy.spatial.distance import cosine
import torch.nn.functional as F
import itertools

# for spk1, spk2 in itertools.combinations(enroll_spks,2):
#     score = 1-cosine(spk_models[spk1], spk_models[spk2])
#     print("{}, {}: {:.2f}".format(spk1, spk2, score))

cos_sim(sound2_embed, sound4_embed, isLda)
cos_sim(sound2_embed.mean(0, True), sound4_embed, isLda)
cos_sim(sound2_embed.mean(0, keepdim=True), sound4_embed.mean(0, keepdim=True), isLda)


 0.4508
 0.3643
 0.2171
 0.4376
 0.4484
[torch.FloatTensor of size 5]


 0.4803
 0.4186
 0.3396
 0.4651
 0.5204
[torch.FloatTensor of size 5]


 0.5052
[torch.FloatTensor of size 1]



Negative Samples (Same Phrase)

In [78]:
sound5_file = "/home/muncok/DL/dataset/SV_sets/reddots_r2015q4_v1/wav/m0002/20150129105431142_m0002_31.wav"
sound6_file = "/home/muncok/DL/dataset/SV_sets/reddots_r2015q4_v1/wav/m0002/20150129105601404_m0002_31.wav"
sound5 = AudioSegment.from_file(sound4_file)
sound6 = AudioSegment.from_file(sound5_file)

sound5_embed = embed(si_config, si_model, sound4_file, splice_sec)
sound6_embed = embed(si_config, si_model, sound5_file, splice_sec)

In [81]:
# cos_sim(sound2_embed, sound4_embed))
cos_sim(sound2_embed, sound5_embed, isLda)
cos_sim(sound2_embed.mean(0, True), sound5_embed, isLda)
cos_sim(sound2_embed.mean(0, keepdim=True), sound5_embed.mean(0, keepdim=True), isLda)


 0.4091
 0.3727
 0.5050
 0.4403
 0.4528
[torch.FloatTensor of size 5]


 0.5068
 0.4053
 0.5231
 0.4455
 0.4581
[torch.FloatTensor of size 5]


 0.5715
[torch.FloatTensor of size 1]



Negative Samples (Difference Phrase)

In [22]:
sound7_file = "/home/muncok/DL/dataset/SV_sets/reddots_r2015q4_v1/wav/m0002/20150129105428584_m0002_39.wav"
sound8_file = "/home/muncok/DL/dataset/SV_sets/reddots_r2015q4_v1/wav/m0002/20150129105431766_m0002_61.wav"
sound7 = AudioSegment.from_file(sound6_file)
sound8 = AudioSegment.from_file(sound7_file)
sound7_embed = embed(si_config, si_model, sound6_file, splice_sec)
sound8_embed = embed(si_config, si_model, sound7_file, splice_sec)

In [139]:
cos_sim(sound2_embed, sound7_embed, True)
cos_sim(sound2_embed.mean(0, True), sound7_embed, True)
cos_sim(sound2_embed.mean(0, keepdim=True), sound7_embed.mean(0, keepdim=True), True)

lda

-0.0923
-0.2041
-0.0719
-0.1298
 0.0513
[torch.FloatTensor of size 5]

lda

-0.0961
-0.1669
-0.0554
-0.1500
 0.0149
[torch.FloatTensor of size 5]

lda

-0.1079
[torch.FloatTensor of size 1]



In [140]:
np.mean([-0.0923,
-0.2041,
-0.0719,
-0.1298,
 0.0513])

-0.08936

## SpeechCommand

In [4]:
def audioToembed(audio):
    seg = AudioSegment.from_file(audio)
    emb = embed(si_config, si_model, audio, splice_sec)
    return seg, emb

In [149]:
si_config['input_length'] = int(16000*1)
splice_sec = 0.2

In [161]:
seg1, command1 = audioToembed("/home/muncok/DL/dataset/SV_sets/speech_commands/eight/0132a06d_nohash_1.wav")
seg2, command2 = audioToembed("/home/muncok/DL/dataset/SV_sets/speech_commands/eight/0132a06d_nohash_2.wav")

In [146]:
audioToembed(yes_command1)

In [162]:
seg1

In [160]:
seg2

In [163]:
cos_sim(command1, command2, False)
cos_sim(command1.mean(0,True), command2, False)


 0.5855
 0.6131
 0.5157
 0.4796
 0.5134
[torch.FloatTensor of size 5]


 0.5283
 0.7741
 0.6913
 0.6655
 0.6659
[torch.FloatTensor of size 5]

