# <center> PROCESAMIENTO DIGITAL DE SEÑALES DE AUDIO</center>
## <center> Dynamic Time Warping (DTW)</center>
### <center> Music alignment example</center>

Code based on: [librosa_gallery](https://librosa.github.io/librosa/auto_examples/plot_music_sync.html#sphx-glr-auto-examples-plot-music-sync-py)

In [None]:
%matplotlib inline

import numpy as np
import scipy, scipy.spatial 
import matplotlib
import matplotlib.pyplot as plt

import librosa
import librosa.display

import IPython.display as ipd

**NOTA:** *Las siguientes dos celdas solo son necesarias para descargar el archivo de ejemplo. Ignórelas si va a trabajar con sus propios archivos de audio.*

In [None]:
!pip install wget

In [None]:
import wget

### Descripción

En este ejemplo se estudia el alineamiento mediante Dynamic Time Warping (DTW) de dos señales de música usando como representación el cromagrama. 

### Cómo correr el notebook
Se puede bajar y correr el notebook de forma local en una computadora.

O también se puede correr en Google Colab usando el siguiente enlace. 

<table align="center">
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/mrocamora/audio-dsp/blob/main/notebooks/audioDSP-dtw_music_example.ipynb"><img src="https://www.tensorflow.org/images/colab_logo_32px.png" />Run in Google Colab</a>
  </td>
</table>

### Señales de audio

In [None]:
# Download the two example recordings (a slow and a fast rendition, judging by
# the file names). Later cells load them by bare file name.
wget.download('https://github.com/mrocamora/audio-dsp/blob/main/audio/sir_duke_slow.mp3?raw=true')
wget.download('https://github.com/mrocamora/audio-dsp/blob/main/audio/sir_duke_fast.mp3?raw=true')

In [None]:
# Load the slower recording together with its sampling rate.
x_1, fs = librosa.load('sir_duke_slow.mp3')

plt.figure(figsize=(16, 4))
# librosa.display.waveplot was removed in librosa 0.10; prefer its replacement
# (waveshow) and fall back to waveplot only on older installations.
plot_waveform = getattr(librosa.display, 'waveshow', None) or librosa.display.waveplot
plot_waveform(x_1, sr=fs)
plt.title('Slower Version $X_1$')
plt.tight_layout()

In [None]:
# Audio player for the slower recording.
ipd.Audio(x_1, rate=fs)

In [None]:
# Load the faster recording together with its sampling rate.
x_2, fs = librosa.load('sir_duke_fast.mp3')

plt.figure(figsize=(16, 4))
# librosa.display.waveplot was removed in librosa 0.10; prefer its replacement
# (waveshow) and fall back to waveplot only on older installations.
plot_waveform = getattr(librosa.display, 'waveshow', None) or librosa.display.waveplot
plot_waveform(x_2, sr=fs)
plt.title('Faster Version $X_2$')
plt.tight_layout()

In [None]:
# Audio player for the faster recording.
ipd.Audio(x_2, rate=fs)

Listen to the two recordings together to verify that they are not synchronized.

In [None]:
# Trim both recordings to the length of the shorter one so they can be stacked
# as the two channels of a single signal.
ml = min(len(x_1), len(x_2))
audio_1_s, audio_2_s = x_1[:ml], x_2[:ml]

# One column per channel: x_2 goes in the first column, x_1 in the second.
audio_stereo = np.column_stack((audio_2_s, audio_1_s))

print('Not synchronized versions', flush=True)

In [None]:
# Play both versions at once; the transpose turns the (samples, 2) array into
# one row per channel.
ipd.Audio(audio_stereo.T, rate=fs)

### Chroma features

In [None]:
# Hop (in samples) between consecutive chroma frames.
hop_size = 256

# Both recordings are analysed with exactly the same settings, so the shared
# keyword arguments are kept in one place.
chroma_params = dict(sr=fs, tuning=0, norm=2, hop_length=hop_size)
x_1_chroma = librosa.feature.chroma_stft(y=x_1, **chroma_params)
x_2_chroma = librosa.feature.chroma_stft(y=x_2, **chroma_params)

In [None]:
# Draw the two chromagrams one above the other in a single figure.
plt.figure(figsize=(16, 8))
panels = (
    (x_1_chroma, 'Chroma Representation of $X_1$'),
    (x_2_chroma, 'Chroma Representation of $X_2$'),
)
for row, (chroma, title) in enumerate(panels, start=1):
    plt.subplot(2, 1, row)
    plt.title(title)
    librosa.display.specshow(chroma, x_axis='time', y_axis='chroma',
                             cmap='coolwarm', hop_length=hop_size)
    plt.colorbar()
plt.tight_layout()


## DTW functions



In [None]:
def dtw_table(x, y, distance=None):
    """Build the accumulated-cost table used by dynamic time warping.

    Parameters
    ----------
    x, y : sequence
        The two sequences to compare; elements are handed to ``distance``
        one pair at a time.
    distance : callable, optional
        ``distance(a, b)`` returns the local cost between one element of
        ``x`` and one element of ``y``. Defaults to
        ``scipy.spatial.distance.euclidean``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(x) + 1, len(y) + 1)``. Entry ``[0, 0]`` is 0,
        the rest of row 0 and column 0 is ``inf``, and for ``i, j >= 1``
        entry ``[i, j]`` is the local cost of ``x[i-1]`` vs ``y[j-1]`` plus
        the smallest of the upper, left and upper-left neighbours.
    """
    metric = scipy.spatial.distance.euclidean if distance is None else distance

    # Everything on the border except the origin is unreachable (infinite
    # cost); the interior is overwritten below.
    acc = np.full((len(x) + 1, len(y) + 1), np.inf)
    acc[0, 0] = 0.0

    for i, x_i in enumerate(x, start=1):
        for j, y_j in enumerate(y, start=1):
            local_cost = metric(x_i, y_j)
            up = acc[i - 1, j]
            left = acc[i, j - 1]
            diag = acc[i - 1, j - 1]
            acc[i, j] = local_cost + min(up, left, diag)
    return acc

In [None]:
def dtw(x, y, table):
    """Backtrack through an accumulated-cost table to recover the warping path.

    Parameters
    ----------
    x, y : sequence
        The aligned sequences; only their lengths are used.
    table : numpy.ndarray
        Accumulated-cost table of shape ``(len(x) + 1, len(y) + 1)``, e.g.
        as returned by ``dtw_table``.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(path_length, 2)`` holding ``(i, j)`` table
        indices, starting at ``(0, 0)`` and ending at ``(len(x), len(y))``.
    """
    i, j = len(x), len(y)
    path = [(i, j)]
    while i > 0 or j > 0:
        if i == 0:
            # On the top border the only legal move is to the left; without
            # this guard ``i - 1`` would wrap around to the last row.
            step = (0, j - 1)
        elif j == 0:
            # Likewise on the left border the only legal move is upwards.
            step = (i - 1, 0)
        else:
            # Prefer the diagonal, then up, then left; a later candidate
            # only wins when it is strictly cheaper. The diagonal is also
            # the fallback when no comparison succeeds (e.g. NaN costs), so
            # the walk always makes progress.
            step = (i - 1, j - 1)
            best = np.inf
            for candidate in ((i - 1, j - 1), (i - 1, j), (i, j - 1)):
                if table[candidate] < best:
                    best = table[candidate]
                    step = candidate
        # Appending and reversing once at the end keeps the backtracking
        # linear in the path length (front insertion is quadratic).
        path.append(step)
        i, j = step
    path.reverse()
    return np.array(path)


## Align chroma sequences


In [None]:
# Accumulated-cost table between the two chroma sequences. The chromagrams are
# transposed so that each element handed to the distance is one frame, and
# frames are compared with the city-block distance.
D = dtw_table(x_1_chroma.T, x_2_chroma.T, distance=scipy.spatial.distance.cityblock)

# Backtrack through the table to obtain the warping path as (i, j) index pairs.
path = dtw(x_1_chroma.T, x_2_chroma.T, D)

In [None]:
# Plot the warping path on top of the accumulated-cost table D, with the two
# chromagrams drawn along the corresponding axes.
plt.figure(figsize=(9, 8))

# bottom right plot: chromagram of X_1 along the horizontal axis
ax1 = plt.axes([0.2, 0, 0.8, 0.20])
ax1.imshow(x_1_chroma, origin='lower', aspect='auto', cmap='coolwarm')
ax1.set_xlabel('Chroma Representation of $X_1$')
ax1.set_xticks([])
ax1.set_yticks([])

# top left plot: chromagram of X_2, transposed so that time runs vertically
ax2 = plt.axes([0, 0.2, 0.20, 0.8])
ax2.imshow(x_2_chroma.T[:,::-1], origin='lower', aspect='auto', cmap='coolwarm')
ax2.set_ylabel('Chroma Representation of $X_2$')
ax2.set_xticks([])
ax2.set_yticks([])

# top right plot: D transposed so that X_1 indices run along x and X_2 along y
ax3 = plt.axes([0.2, 0.2, 0.8, 0.8])
ax3.imshow(D.T, aspect='auto', origin='lower', interpolation='nearest', cmap='gray')
ax3.set_xticks([])
ax3.set_yticks([])

# warping path: column 0 holds the X_1 index, column 1 the X_2 index
ax3.plot(path[:,0], path[:,1], 'r')

## Alternative visualization of the alignment

We can also visualize the warping path directly on the chroma representation of the signals.
Black lines connect corresponding time positions in the chroma representation of the signals.

In [None]:
plt.figure(figsize=(11, 5))

# top plot: chromagram of signal 1
ax1 = plt.axes([0, 0.60, 1, 0.40])
ax1.imshow(x_1_chroma, origin='lower', aspect='auto', cmap='coolwarm')
ax1.set_ylabel('Signal 1')
ax1.set_xticks([])
ax1.set_yticks([])
ax1.set_xlim(0, x_1_chroma.shape[1])

# bottom plot: chromagram of signal 2
ax2 = plt.axes([0, 0, 1, 0.40])
ax2.imshow(x_2_chroma, origin='lower', aspect='auto', cmap='coolwarm')
ax2.set_ylabel('Signal 2')
ax2.set_xticks([])
ax2.set_yticks([])
ax2.set_xlim(0, x_2_chroma.shape[1])

# middle plot: every `step`-th path point is drawn as a line joining its
# position in signal 1 (top edge) to its position in signal 2 (bottom edge);
# positions are normalised by the number of frames so both span [0, 1]
line_color = 'k'
step = 30
n1 = float(x_1_chroma.shape[1])
n2 = float(x_2_chroma.shape[1])
ax3 = plt.axes([0, 0.40, 1, 0.20])
for t in path[::step]:
    ax3.plot((t[0]/n1, t[1]/n2), (1, -1), color=line_color)
# the limits only need to be fixed once, after the lines are drawn
ax3.set_xlim(0, 1)
ax3.set_ylim(-1, 1)
ax3.set_xticks([])
ax3.set_yticks([])

# reference marker: one path point highlighted in a different colour; the
# index is clamped so that short recordings do not raise an IndexError
ref_mark = min(300, len(path) - 1)
marker_color = 'r'
t = path[ref_mark]
ax3.plot((t[0]/n1, t[1]/n2), (1, -1), color=marker_color)

# path markers on top and bottom plot
y1_min, y1_max = ax1.get_ylim()
y2_min, y2_max = ax2.get_ylim()
ax1.vlines(path[::step, 0], y1_min, y1_max, color=line_color)
ax2.vlines(path[::step, 1], y2_min, y2_max, color=line_color)
ax1.vlines(t[0], y1_min, y1_max, color=marker_color)
ax2.vlines(t[1], y2_min, y2_max, color=marker_color)

## Listen to the alignment

Listen to both recordings starting at the same alignment marker:




In [None]:
# shape of the alignment path array
print(path.shape)

# convert the reference marker's pair of frame indices to sample positions,
# one for each recording
i1, i2 = librosa.frames_to_samples(path[ref_mark], hop_length=hop_size)
print(i1, i2)

In [None]:
# Play the slower recording starting at sample i1.
ipd.Audio(x_1[i1:], rate=fs)

In [None]:
# Play the faster recording starting at sample i2.
ipd.Audio(x_2[i2:], rate=fs)

In [None]:
plt.figure(figsize=(16, 8))

# top: slower recording with the reference marker drawn at sample i1
ax1 = plt.subplot(211)
plt.plot(x_1)
ax1.vlines(i1, -0.5, 0.5, color=marker_color)
plt.title('Slower Version $X_1$')

# bottom: faster recording with the marker drawn at sample i2
ax2 = plt.subplot(212)
plt.plot(x_2)
ax2.vlines(i2, -0.5, 0.5, color=marker_color)
# this panel shows x_2, so it is titled as the faster version
plt.title('Faster Version $X_2$')
plt.tight_layout()

## Listen to aligned audio

In [None]:
# Download the recording that the next cell loads as 'stereo_matched_sir_duke.wav'.
wget.download('https://github.com/mrocamora/audio-dsp/blob/main/audio/stereo_matched_sir_duke.wav?raw=true')

In [None]:
# Load the stereo aligned audio at its native sampling rate (sr=None).
# mono=False keeps the channels separate; librosa.load downmixes to mono by
# default, which would merge the two versions into a single channel.
d_align, sr_align = librosa.load('stereo_matched_sir_duke.wav', sr=None, mono=False)

In [None]:
# Audio player for the aligned recording at the rate it was loaded with.
ipd.Audio(d_align, rate=sr_align)