# Demo: Creating an Embedding without WebRTC VAD

* Make sure you've finished the training and have a pretrained model.

In [1]:
import os
import torch
import librosa
from IPython.display import Audio, display
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [12]:
# Feature-extraction settings, copied from config/config.yaml.
sr = 16000        # sample rate handed to librosa when loading audio and building the mel filterbank
nfft = 512        # n_fft for the STFT and the mel filterbank
nmels = 40        # number of mel bands
window = 0.025    # multiplied by sr to obtain the STFT win_length
hop = 0.01        # multiplied by sr to obtain the STFT hop_length
tisv_frame = 180  # number of spectrogram frames kept per slice

# Location of the pretrained checkpoint -- edit this to point at your own model.
model_path = "speech_id_checkpoint/final_epoch_950_batch_id_141.model"

We use `p225_004.wav` from the VCTK dataset as an example to show how to generate an `Embedding` from an utterance using only `librosa` and the pretrained model.

In [13]:
# Path of the example utterance; the feature-extraction cell loads the same file.
content_audio = 'res/p225_004.wav'

# Announce and render an inline audio player for the sample.
print("Sample audio:")
display(Audio(content_audio))

Sample audio:


In [14]:
# Load the example utterance at the configured sample rate. `sr` is passed by
# keyword because librosa >= 0.10 made it keyword-only; the positional form
# raises a TypeError there.
c, sr = librosa.load(content_audio, sr=sr)

# Non-silent (start, end) sample intervals, using a 30 dB threshold.
intervals = librosa.effects.split(c, top_db=30)
print(len(intervals))

# Minimum interval length in samples: tisv_frame hops plus one analysis window.
utter_min_len = (tisv_frame * hop + window) * sr
print('utter_min_len={}'.format(utter_min_len))

# The mel filterbank depends only on the configuration, so build it once
# instead of once per interval.
mel_basis = librosa.filters.mel(sr=sr, n_fft=nfft, n_mels=nmels)

utterances_spec = []
for interval in intervals:
    # Only intervals longer than utter_min_len are used; shorter ones are skipped.
    if (interval[1] - interval[0]) > utter_min_len:
        utter_part = c[interval[0]:interval[1]]
        S = librosa.stft(y=utter_part, n_fft=nfft,
                         win_length=int(window * sr), hop_length=int(hop * sr))
        S = np.abs(S) ** 2                            # power spectrogram
        S = np.log10(np.dot(mel_basis, S) + 1e-6)     # log-mel spectrogram; 1e-6 avoids log10(0)
        utterances_spec.append(S[:, :tisv_frame])     # first tisv_frame frames of the interval
        utterances_spec.append(S[:, -tisv_frame:])    # last tisv_frame frames of the interval

# Fail here with a clear message rather than saving an empty array and hitting
# an obscure stacking error in the next cell.
if not utterances_spec:
    raise ValueError(
        'No non-silent interval in {} is longer than {} samples; '
        'cannot build spectrogram slices.'.format(content_audio, utter_min_len))

utterances_spec = np.array(utterances_spec)
print(utterances_spec.shape)
np.save('res/content.npy', utterances_spec)

4
utter_min_len=29200.0
(2, 40, 180)


In [15]:
# Stack the per-slice spectrograms along a new trailing axis, then reverse the
# axis order so the slice axis comes first before converting to a torch tensor.
stacked_spec = np.stack(utterances_spec, axis=2)
print(stacked_spec.shape)

utterances_spec = torch.tensor(stacked_spec.transpose(2, 1, 0))
print(utterances_spec.shape)

(40, 180, 2)
torch.Size([2, 180, 40])


In [16]:
from speech_embedder_net import SpeechEmbedder

# Build the network and restore the pretrained weights.
embedder_net = SpeechEmbedder()
# map_location="cpu" lets a checkpoint that was saved from a GPU load on a
# CPU-only machine. The rest of this notebook runs on CPU: the input tensor is
# built from a numpy array and the output is converted back with .numpy().
embedder_net.load_state_dict(torch.load(model_path, map_location="cpu"))
# Switch to evaluation mode; as the last expression of the cell this also
# displays the module summary.
embedder_net.eval()

SpeechEmbedder(
  (LSTM_stack): LSTM(40, 768, num_layers=3, batch_first=True)
  (projection): Linear(in_features=768, out_features=256, bias=True)
)

In [17]:
def align_embeddings(embeddings, frame_hop=.12, frame_len=.24, segment_len=.401):
    """Average runs of consecutive embeddings into one vector per segment.

    Embedding ``i`` is assigned the value ``i * frame_hop + frame_len``. A new
    segment starts whenever that value reaches ``j * segment_len``, where ``j``
    is the 1-based index of the current segment (``j`` advances by at most one
    per embedding). All embeddings of a segment are averaged along axis 0.

    NOTE(review): the defaults look like 240 ms windows with a 120 ms hop
    grouped into roughly 400 ms segments (presumably seconds) -- confirm
    against the pipeline that produces the embeddings.

    Parameters
    ----------
    embeddings : array-like, shape (n, ...)
        Sequence of embedding vectors; the first axis is the sequence axis.
    frame_hop, frame_len, segment_len : float, optional
        Timing constants of the grouping rule described above. The defaults
        reproduce the previously hard-coded values.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(n_segments,) + embeddings.shape[1:]``. The
        embedding width is taken from the input instead of being fixed at 256.
        Empty input yields an array with zero rows.
    """
    embeddings = np.asarray(embeddings)
    n_embeddings = len(embeddings)

    # Collect half-open [start, stop) index ranges, one per segment.
    partitions = []
    start = 0
    j = 1
    for i in range(n_embeddings):
        if i * frame_hop + frame_len >= j * segment_len:
            # Embedding i opens the next segment; close the current one.
            # The guard skips an empty leading range, which can only occur
            # with non-default timing where frame_len >= segment_len.
            if i > start:
                partitions.append((start, i))
            start = i
            j += 1
    # Close the trailing segment (absent only when the input is empty).
    if n_embeddings > start:
        partitions.append((start, n_embeddings))

    avg_embeddings = np.zeros((len(partitions),) + embeddings.shape[1:])
    for row, (lo, hi) in enumerate(partitions):
        avg_embeddings[row] = np.average(embeddings[lo:hi], axis=0)
    return avg_embeddings

### Generate Embedding

In [19]:
# Inference only: disable autograd so no graph is recorded for the forward pass.
with torch.no_grad():
    embeddings = embedder_net(utterances_spec)
print(embeddings.shape)

# Tensors produced under no_grad carry no gradient history, so they can be
# converted to numpy directly.
aligned_embeddings = align_embeddings(embeddings.cpu().numpy())
print(aligned_embeddings.shape)
print(aligned_embeddings)

torch.Size([2, 256])
(1, 256)
[[ 8.71910900e-02 -3.13968770e-02 -6.36109635e-02  7.27164745e-02
   5.67372516e-02  1.19385287e-01  1.04235753e-01 -8.48884229e-03
  -5.61310351e-02 -2.03626920e-02  9.87719186e-03 -5.06463498e-02
   2.91137435e-02 -3.59785035e-02  5.62160909e-02 -6.47973418e-02
  -7.57161528e-02 -2.71101985e-02  5.27440310e-02  2.99151205e-02
   1.30088761e-01  5.88106625e-02  1.43618155e-02  4.34024483e-02
   5.35420887e-02  7.33713508e-02  7.07995221e-02  6.85613900e-02
  -1.37788197e-02 -4.11791317e-02 -4.92333397e-02  3.46364602e-02
   1.46176685e-02  6.82284236e-02 -4.19394486e-02  5.57754785e-02
  -1.72375795e-02 -7.00006783e-02  3.28584313e-02 -5.05386591e-02
  -1.14821335e-02 -6.87019108e-03  5.49964532e-02  6.11773171e-02
  -6.53837025e-02  5.91993779e-02  7.21149147e-02 -9.64607894e-02
  -3.08523960e-02 -5.49082551e-03  4.50872295e-02  3.47407609e-02
  -5.43130375e-02 -7.17208236e-02  1.94872133e-02 -4.14520465e-02
  -3.05316169e-02 -5.92305623e-02 -9.45103168e