`Nearest Neighbour`
* Given a library of recorded voice samples, how closely is each one related to a query recording?

In [8]:
# import the necessary libs
from matplotlib.patches import ConnectionPatch
import matplotlib.pyplot as plt
import numpy as np 
import scipy.spatial.distance as dist

In [9]:
from scipy.io import wavfile
import IPython.display as ipyd
import librosa
import librosa.display

In [10]:
# Define the DTW function
def dp(dist_matrix):
    """Align two sequences with dynamic time warping (DTW).

    Parameters
    ----------
    dist_matrix : array-like, shape (N, M)
        Pairwise distances; entry ``(i, j)`` is the distance between
        element ``i`` of the first sequence and element ``j`` of the second.

    Returns
    -------
    path : list of (int, int)
        Index pairs of the cheapest warping path, ordered from ``(0, 0)``
        to ``(N - 1, M - 1)``.
    cost_matrix : numpy.ndarray, shape (N, M)
        Accumulated alignment cost; ``cost_matrix[-1, -1]`` is the total
        cost of the returned path.

    Raises
    ------
    ValueError
        If ``dist_matrix`` is not two-dimensional or has a zero-length axis.
    """
    dist_matrix = np.asarray(dist_matrix, dtype=float)
    if dist_matrix.ndim != 2 or dist_matrix.size == 0:
        raise ValueError(
            "dist_matrix must be a non-empty 2-D array, got shape {}".format(
                dist_matrix.shape
            )
        )
    N, M = dist_matrix.shape

    # The cost matrix carries one extra leading row/column. Those borders are
    # infinite (except the origin) so a path can never leave the grid.
    cost_matrix = np.zeros((N + 1, M + 1))
    cost_matrix[1:, 0] = np.inf
    cost_matrix[0, 1:] = np.inf

    # Fill the cost matrix while recording which predecessor was chosen.
    traceback_matrix = np.zeros((N, M), dtype=np.int8)
    for i in range(N):
        for j in range(M):
            penalty = (
                cost_matrix[i, j],      # 0: match     (came from i-1, j-1)
                cost_matrix[i, j + 1],  # 1: insertion (came from i-1, j)
                cost_matrix[i + 1, j],  # 2: deletion  (came from i,   j-1)
            )

            # argmin returns the first minimum, so ties prefer match, then
            # insertion, then deletion.
            i_penalty = int(np.argmin(penalty))

            cost_matrix[i + 1, j + 1] = dist_matrix[i, j] + penalty[i_penalty]
            traceback_matrix[i, j] = i_penalty

    # Index offsets that undo each move type, indexed by the traceback code.
    steps = ((-1, -1), (-1, 0), (0, -1))

    # Walk back from the bottom-right cell to the origin.
    i, j = N - 1, M - 1
    path = [(i, j)]
    while i > 0 or j > 0:
        di, dj = steps[traceback_matrix[i, j]]
        i += di
        j += dj
        path.append((i, j))

    # Strip the infinite borders before returning.
    return (path[::-1], cost_matrix[1:, 1:])


In [11]:
# Load the query recording that every library clip will be compared against.

# Alternative query files, kept for quick experiments:
# rec_audio = "../Lecture/audio/hello2.wav"
# rec_audio = "./Gen/gen1.wav"
# rec_audio = "../Lecture/audio/harvard.wav"
rec_audio = "../Lecture/audio/machio.wav"
# NOTE(review): the code below assumes a mono recording (1-D sample array) - confirm.
sampling_freq_rec, metadata_rec = wavfile.read(rec_audio)

# Analysis window of 0.025 s advanced in 0.01 s hops, expressed in samples.
n_fft_rec = int(0.025*sampling_freq_rec)
hop_length_rec = int(0.01*sampling_freq_rec)

# Extract features: 40-band mel spectrogram.
mel_spec_rec = librosa.feature.melspectrogram(

    y=metadata_rec/1.0,  # `/1.0` promotes integer samples to floating point
    sr=sampling_freq_rec,
    n_fft=n_fft_rec,
    hop_length=hop_length_rec,
    n_mels = 40
)

# Floor at the smallest positive normal float before taking the log:
# a zero-power bin (digital silence) would otherwise give log(0) = -inf,
# which turns the cosine distances computed later into NaN. Every other
# value is left untouched.
log_mel_spec_rec = np.log(
    np.maximum(mel_spec_rec, np.finfo(mel_spec_rec.dtype).tiny)
)

# Transpose so that each row is one time frame.
rec_signal = log_mel_spec_rec.T

In [12]:
# Play the query recording inline to confirm the right file was loaded.
ipyd.Audio(data=metadata_rec, rate=sampling_freq_rec)

* Create our custom library of recordings, which we will search for the closest match to the query.

In [13]:
# Paths of the reference recordings that make up the search library.
# Every clip lives in the same lecture audio folder, so only the clip
# name varies; the order below is the order in which clips are scored.
custom_lib = [
    "../Lecture/audio/{}.wav".format(clip_name)
    for clip_name in (
        "bye",
        "cat",
        "goodbye",
        "hello1",
        "hello2",
        "hello3",
        "harvard",
        "machio",
    )
]

In [14]:
# Extract features for every library clip and align it with the query.
# Goal: find the lowest alignment score (lower cost = closer match).

for file in custom_lib:
# for file in generator_lib:    

    # Mel-scale spectrogram of the library clip, computed with the same
    # window/hop durations and band count as the query features.
    print('Reading:', file)
    sampling_freq, metadata = wavfile.read(file)

    hop_length = int(0.01*sampling_freq)
    n_fft = int(0.025*sampling_freq)

    mel_spec = librosa.feature.melspectrogram(
        y = metadata/1.0,  # `/1.0` promotes integer samples to floating point
        sr = sampling_freq,
        hop_length = hop_length,
        n_fft = n_fft,
        n_mels = 40
    )

    # Floor at the smallest positive normal float before taking the log:
    # a zero-power bin (digital silence) would otherwise give log(0) = -inf,
    # which makes the cosine distances below NaN and corrupts the alignment.
    log_mel_spec = np.log(np.maximum(mel_spec, np.finfo(mel_spec.dtype).tiny))
    compare_signal = log_mel_spec.T 

    # Frame-by-frame cosine distance between query and clip, then DTW.
    dist_matrix = dist.cdist(rec_signal, compare_signal, "cosine")
    path, cost_matrix = dp(dist_matrix)

    # The bottom-right cell holds the total cost of the best alignment.
    print("Alignment Cost: {:.4f}".format(cost_matrix[-1,-1]))

    N = rec_signal.shape[0]
    M = compare_signal.shape[0]

    # Divide by the combined number of frames so clips of different
    # lengths can be compared.
    print(
        "Normalized Alignment Cost: {:.4f}".format(
            cost_matrix[-1,-1]/(M+N)
        )
    )

    print()

Reading: ../Lecture/audio/bye.wav
Alignment Cost: 525.3690
Normalized Alignment Cost: 0.5850

Reading: ../Lecture/audio/cat.wav
Alignment Cost: 520.7194
Normalized Alignment Cost: 0.6090

Reading: ../Lecture/audio/goodbye.wav
Alignment Cost: 499.3626
Normalized Alignment Cost: 0.5475

Reading: ../Lecture/audio/hello1.wav
Alignment Cost: 521.8954
Normalized Alignment Cost: 0.5864

Reading: ../Lecture/audio/hello2.wav
Alignment Cost: 516.3074
Normalized Alignment Cost: 0.5570

Reading: ../Lecture/audio/hello3.wav
Alignment Cost: 476.9235
Normalized Alignment Cost: 0.4973

Reading: ../Lecture/audio/harvard.wav
Alignment Cost: 533.2376
Normalized Alignment Cost: 0.1061

Reading: ../Lecture/audio/machio.wav
Alignment Cost: 0.0000
Normalized Alignment Cost: 0.0000

