In [None]:
rcParams['figure.figsize'] = (16, 4) #wide graphs by default

# Segmentation

## Structural segmentation

Tzanetakis, G., & Cook, P. (1999). Multifeature audio segmentation for browsing and annotation. IEEE Workshop on Applications of Signal Processing to Audio and Acoustics, 1–4. Retrieved from http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=810860

In [None]:
from essentia.streaming import *

In [None]:
# Instantiate the streaming algorithms used to compute MFCCs from one audio file.
sr = 44100  # sample rate handed to the loader
loader = MonoLoader(filename = 'sources/Dire Straits - Walk of life.mp3', sampleRate=sr)
# 1024-sample frames advanced by 512 samples, i.e. consecutive frames overlap by half
frameCutter = FrameCutter(frameSize = 1024, hopSize = 512)
w = Windowing(type = 'hann')
spec = Spectrum()
mfcc = MFCC()

In [None]:
# Wire the chain: audio -> frames -> windowed frames -> spectrum -> MFCC input.
loader.audio >> frameCutter.signal
frameCutter.frame >> w.frame >> spec.frame
spec.spectrum >> mfcc.spectrum

In [None]:
# Collect both MFCC outputs in a pool, each under its own descriptor name.
pool = essentia.Pool()

mfcc.bands >> (pool, 'lowlevel.mfcc_bands')
mfcc.mfcc >> (pool, 'lowlevel.mfcc')

In [None]:
essentia.run(loader)

In [None]:
imshow(pool['lowlevel.mfcc'].T[1:,:], aspect = 'auto', interpolation='nearest')

In [None]:
essentia.reset(loader)

In [None]:
loader.inputNames(), loader.outputNames()

In [None]:
frameCutter.inputNames(), frameCutter.outputNames()

In [None]:
frameCutter.connections

In [None]:
loader

In [None]:
loader.audio

In [None]:
frameCutter.signal

In [None]:
w

In [None]:
w.frame

In [None]:
frameCutter.frame

In [None]:
frameCutter.connections[frameCutter.frame]

In [None]:
w.frame in frameCutter.connections[frameCutter.frame]

We can change parameters for any *algorithm* in the processing chain:

In [None]:
loader.configure(filename='sources/Bob Marley - Buffalo Soldier.mp3')

In [None]:
essentia.run(loader)

In [None]:
imshow(pool['lowlevel.mfcc'].T[1:,:], aspect = 'auto', interpolation='nearest')

If we hadn't adjusted the loader, we would have had to call:

    essentia.reset(loader)

Because the file reader would be at the end of the file.

## Using essentia to calculate texture windows

In [None]:
# Analysis parameters, now kept in named variables so later cells can reuse them.
sr = 22050
frameSize = 1024
hopSize = 512

# Fresh algorithm instances: loader, framing, windowing, spectrum, MFCC and centroid.
loader = MonoLoader(filename = 'sources/Dire Straits - Walk of life.mp3', sampleRate=sr)
frameCutter = FrameCutter(frameSize = frameSize, hopSize = hopSize)
w = Windowing(type = 'hann')
spec = Spectrum()
mfcc = MFCC()
centroid = Centroid()

# New, empty pool for the descriptors of this section.
pool = essentia.Pool()

In [None]:
# Frame -> window -> spectrum, then fan the spectrum out to both MFCC and Centroid.
loader.audio >> frameCutter.signal
frameCutter.frame >> w.frame >> spec.frame
spec.spectrum >> mfcc.spectrum
spec.spectrum >> centroid.array

# Store both descriptors in the pool.
mfcc.mfcc >> (pool, 'lowlevel.mfcc')
centroid.centroid >> (pool, 'lowlevel.centroid')

Common error: If I try this again:

In [None]:
# Intentional repeat of the connections made in the previous cell: the surrounding
# text presents this as a common error, so this cell is expected to fail.
loader.audio >> frameCutter.signal
frameCutter.frame >> w.frame >> spec.frame
spec.spectrum >> mfcc.spectrum
spec.spectrum >> centroid.array

mfcc.mfcc >> (pool, 'lowlevel.mfcc')
centroid.centroid >> (pool, 'lowlevel.centroid')

Oops... Need to clear the connections first. The easiest way is just to recreate the object (Python will do the garbage collection for you)

In [None]:
# Recreate every algorithm so that none of them carries connections from earlier cells.
sr = 22050
frameSize = 1024
hopSize = 512

loader = MonoLoader(filename = 'sources/Dire Straits - Walk of life.mp3', sampleRate=sr)
frameCutter = FrameCutter(frameSize = frameSize, hopSize = hopSize)
w = Windowing(type = 'hann')
spec = Spectrum()
# Frame-level features: centroid, rolloff and flux are later fed from the spectrum,
# zero-crossing rate and RMS from the un-windowed frame.
centroid = Centroid()
rolloff = RollOff()
flux = Flux()
zcr = ZeroCrossingRate()
rms = RMS()

In [None]:
# Texture windows
textureTime = 1.0 # seconds
# Number of analysis hops that fit in one texture window, truncated to an integer.
textureSize = int(textureTime * sr/float(hopSize))
# Non-overlapping windows (hop equals window length).
# NOTE(review): textureCutter is created here but not connected in the cells of this section.
textureCutter = FrameCutter(frameSize = textureSize, hopSize = textureSize)

pool = essentia.Pool()

In [None]:
# Spectral branch: frame -> window -> spectrum -> centroid / rolloff / flux.
loader.audio >> frameCutter.signal
frameCutter.frame >> w.frame >> spec.frame
spec.spectrum >> centroid.array
spec.spectrum >> rolloff.spectrum
spec.spectrum >> flux.spectrum

# Time-domain branch: the un-windowed frame feeds zero-crossing rate and RMS.
frameCutter.frame >> zcr.signal
frameCutter.frame >> rms.array

# Store every frame-level feature in the pool under 'lowlevel.*'.
centroid.centroid >> (pool, 'lowlevel.centroid')
rolloff.rollOff >> (pool, 'lowlevel.rolloff')
flux.flux >> (pool, 'lowlevel.flux')
zcr.zeroCrossingRate >> (pool, 'lowlevel.zcr')
rms.rms >> (pool, 'lowlevel.rms')

In [None]:
essentia.run(loader)

In [None]:
plot(pool['lowlevel.centroid'])

In [None]:
plot(pool['lowlevel.rms'])

## Texture windows

In [None]:
# Analysis parameters for this section (44.1 kHz, 1024-sample frames, 512-sample hop).
sr = 44100
frameSize = 1024
hopSize = 512

# Fresh algorithm instances for a different track, with the same five frame-level features.
loader = MonoLoader(filename = 'sources/Stevie Wonder - Superstition.mp3', sampleRate=sr)
frameCutter = FrameCutter(frameSize = frameSize, hopSize = hopSize)
w = Windowing(type = 'hann')
spec = Spectrum()
centroid = Centroid()
rolloff = RollOff()
flux = Flux()
zcr = ZeroCrossingRate()
rms = RMS()

In [None]:
# Texture windows: one cutter, one Mean and one Variance per frame-level feature.
textureTime = 1.0 # seconds
# Number of analysis hops per texture window, truncated to an integer.
textureSize = int(textureTime * sr/float(hopSize))

# Five independent instances of each algorithm, one for every feature that is summarised.
textureWindowCutters = [FrameCutter(frameSize = textureSize, hopSize = textureSize) for _ in range(5)]
textureWindowMeans = [Mean() for _ in range(5)]
textureWindowVars = [Variance() for _ in range(5)]

pool = essentia.Pool()

In [None]:
# Spectral branch: frame -> window -> spectrum -> centroid / rolloff / flux.
loader.audio >> frameCutter.signal
frameCutter.frame >> w.frame >> spec.frame
spec.spectrum >> centroid.array
spec.spectrum >> rolloff.spectrum
spec.spectrum >> flux.spectrum

# Time-domain branch: the un-windowed frame feeds zero-crossing rate and RMS.
frameCutter.frame >> zcr.signal
frameCutter.frame >> rms.array

# Keep the frame-level values in the pool as well as feeding them to the texture windows later.
centroid.centroid >> (pool, 'lowlevel.centroid')
rolloff.rollOff >> (pool, 'lowlevel.rolloff')
flux.flux >> (pool, 'lowlevel.flux')
zcr.zeroCrossingRate >> (pool, 'lowlevel.zcr')
rms.rms >> (pool, 'lowlevel.rms')


Now the texture windows:

In [None]:
# Pool key prefix of each frame-level feature, in the same order as the outputs below.
features = ['lowlevel.centroid', 'lowlevel.rolloff', 'lowlevel.flux', 'lowlevel.zcr', 'lowlevel.rms']

# Feed each frame-level output into its own texture-window cutter.
frameOutputs = [centroid.centroid, rolloff.rollOff, flux.flux, zcr.zeroCrossingRate, rms.rms]
for frameOutput, cutter in zip(frameOutputs, textureWindowCutters):
    frameOutput >> cutter.signal

# Summarise every texture window by its mean and variance and store both in the pool.
for name, cutter, meanAlgo, varAlgo in zip(features, textureWindowCutters, textureWindowMeans, textureWindowVars):
    cutter.frame >> meanAlgo.array
    cutter.frame >> varAlgo.array
    meanAlgo.mean >> (pool, '%s_mean'%name)
    varAlgo.variance >> (pool, '%s_var'%name)

In [None]:
essentia.run(loader)

In [None]:
plot(pool['lowlevel.rms'])
plot(pool['lowlevel.rms_mean'])

In [None]:
# Plot the frame-wise RMS and its texture-window mean on a shared time axis in seconds.
rms = pool['lowlevel.rms']
rms_mean = pool['lowlevel.rms_mean']
# Real duration instead of the former placeholder value of 1:
# one RMS value is produced per hop of hopSize samples.
dur = (hopSize*len(rms))/float(sr)
plot(linspace(0, dur, len(rms)), rms)
plot(linspace(0, dur, len(rms_mean)), rms_mean, lw=3)

In [None]:
# Plot frame-wise RMS and texture-window mean, with the texture-window variance on a second y-axis.
rms = pool['lowlevel.rms']
rms_mean = pool['lowlevel.rms_mean']
rms_var = pool['lowlevel.rms_var']
# Real duration in seconds (one RMS value per hop of hopSize samples) rather than a placeholder of 1.
dur = (hopSize*len(rms))/float(sr)
plot(linspace(0, dur, len(rms)), rms)
plot(linspace(0, dur, len(rms_mean)), rms_mean, lw=3)
twinx()  # the variance lives on a different scale, so give it its own y-axis
plot(linspace(0, dur, len(rms_var)), rms_var, lw=3, color='r')

In [None]:
# Pool keys of the texture-window statistics: '<feature>_mean' followed by '<feature>_var' for each feature.
all_features = [ft + suffix for ft in features for suffix in ('_mean', '_var')]

In [None]:
# One row per statistic named in all_features; columns follow the order of the pooled values.
stat_rows = [pool[stat_name] for stat_name in all_features]
feat_vectors = array(stat_rows, dtype=float)
feat_vectors.shape

## Euclidean distance

In [None]:
from scipy.spatial.distance import euclidean

In [None]:
feat_vect_frame = feat_vectors[:,0]
feat_vect_frame

In [None]:
euclidean(feat_vectors[:,0], feat_vectors[:,1])

In [None]:
euclidean(feat_vectors[:,0], feat_vectors[:,0])

In [None]:
# Euclidean distance between every column of feat_vectors and the column that follows it.
euc_distances = [
    euclidean(feat_vectors[:,k], feat_vectors[:,k+1])
    for k in range(feat_vectors.shape[1] - 1)
]

In [None]:
plot(euc_distances)

In [None]:
plot(diff(euc_distances))

In [None]:
# Candidate boundaries: positions where the distance curve rises by more than a fixed threshold.
diff_euc = diff(euc_distances)
euc_peaks = argwhere(diff_euc>0.2e7)  # NOTE(review): threshold looks tuned by eye for this track — confirm

# Show the difference curve and mark the positions that exceeded the threshold.
plot(diff_euc)
plot(euc_peaks, diff_euc[euc_peaks], 'o')

In [None]:
# Draw the Euclidean boundary candidates over the RMS envelope (x-axis in seconds).
rms = pool['lowlevel.rms']
dur = (hopSize*len(rms))/float(sr)
plot(linspace(0, dur, len(rms)), rms)
# NOTE(review): peak values are indices into diff_euc (texture-window steps) drawn directly on the
# seconds axis; this presumably relies on textureTime being 1.0 s — confirm the intended alignment.
vlines(euc_peaks[:,0], -0.05, 0.3)

# Label every line with its position, just above the top of the line.
for peak in euc_peaks[:,0]:
    text(peak, 0.31, '%.1f'%peak)

## Cosine distance 

http://en.wikipedia.org/wiki/Cosine_distance

Measures similarity in orientation (multidimensional) but not in magnitude

In [None]:
from scipy.spatial.distance import cosine

In [None]:
cosine(feat_vectors[:,0], feat_vectors[:,1])

In [None]:
cosine(feat_vectors[:,0], feat_vectors[:,0])

In [None]:
# Cosine distance between every column of feat_vectors and the column that follows it.
num_columns = feat_vectors.shape[1]
cos_distances = [cosine(feat_vectors[:,k], feat_vectors[:,k+1]) for k in range(num_columns - 1)]
    

In [None]:
plot(cos_distances)

In [None]:
plot(diff(cos_distances))

In [None]:
# Candidate boundaries: positions where the cosine-distance curve rises by more than a fixed threshold.
diff_cos = diff(cos_distances)
cos_peaks = argwhere(diff_cos>0.000008)  # NOTE(review): threshold looks tuned by eye for this track — confirm

# Show the difference curve and mark the positions that exceeded the threshold.
plot(diff_cos)
plot(cos_peaks, diff_cos[cos_peaks], 'o')

In [None]:
cos_peaks

In [None]:
# Draw the cosine-distance boundary candidates over the RMS envelope (x-axis in seconds).
rms = pool['lowlevel.rms']
dur = (hopSize*len(rms))/float(sr)
plot(linspace(0, dur, len(rms)), rms)
# NOTE(review): peak values are indices into diff_cos (texture-window steps) drawn directly on the
# seconds axis; this presumably relies on textureTime being 1.0 s — confirm the intended alignment.
vlines(cos_peaks[:,0], -0.05, 0.3)

# Label every line with its position, just above the top of the line.
for peak in cos_peaks[:,0]:
    text(peak, 0.31, '%.1f'%peak)

In [None]:
dur

## Mahalanobis distance

http://en.wikipedia.org/wiki/Mahalanobis_distance

In [None]:
from scipy.spatial.distance import mahalanobis

Each texture window is described by one 10-dimensional feature vector (the mean and variance of each of the 5 features):

In [None]:
feat_vectors[:,1].reshape(10,1)

In [None]:
covmat = cov(feat_vectors)
covmat

In [None]:
invcov = inv(covmat)
invcov

In [None]:
# Distance between the first two columns of feat_vectors (column slicing instead of transposing).
mahalanobis(feat_vectors[:,0], feat_vectors[:,1], invcov)

In [None]:
# Distance of the first column of feat_vectors to itself (column slicing instead of transposing).
mahalanobis(feat_vectors[:,0], feat_vectors[:,0], invcov)

In [None]:
# Mahalanobis distance between every column of feat_vectors and the column that follows it,
# using the inverse covariance matrix computed earlier.
mah_distances = [
    mahalanobis(current, following, invcov)
    for current, following in zip(feat_vectors.T[:-1], feat_vectors.T[1:])
]
    

In [None]:
plot(mah_distances)

In [None]:
# Candidate boundaries: positions where the Mahalanobis-distance curve rises by more than a fixed threshold.
diff_mah = diff(mah_distances)
mah_peaks = argwhere(diff_mah>2.5)  # NOTE(review): threshold looks tuned by eye for this track — confirm

# Show the difference curve and mark the positions that exceeded the threshold.
plot(diff_mah)
plot(mah_peaks, diff_mah[mah_peaks], 'o')

In [None]:
# Draw the Mahalanobis boundary candidates over the RMS envelope (x-axis in seconds).
rms = pool['lowlevel.rms']
dur = (hopSize*len(rms))/float(sr)
plot(linspace(0, dur, len(rms)), rms)
# NOTE(review): peak values are indices into diff_mah (texture-window steps) drawn directly on the
# seconds axis; this presumably relies on textureTime being 1.0 s — confirm the intended alignment.
vlines(mah_peaks[:,0], -0.05, 0.3)

# Label every line with its position, just above the top of the line.
for peak in mah_peaks[:,0]:
    text(peak, 0.31, '%.1f'%peak)

Now all results:

In [None]:
# Overlay the boundary candidates of all three distance measures on a faded RMS envelope.
rms = pool['lowlevel.rms']
dur = (hopSize*len(rms))/float(sr)
plot(linspace(0, dur, len(rms)), rms, alpha=0.2)

# (peaks, top of the line, label height, colour) for each distance measure.
# Labels use the same colour as their lines; the Euclidean labels were previously
# drawn in green although their lines are blue.
# Peak values are indices into the differenced distance curves (texture-window steps),
# drawn directly on the seconds axis as in the individual plots.
boundary_styles = [
    (mah_peaks, 0.25, 0.26, 'r'),
    (cos_peaks, 0.3, 0.31, 'g'),
    (euc_peaks, 0.3, 0.35, 'b'),
]

for peaks, line_top, label_y, colour in boundary_styles:
    vlines(peaks[:,0], -0.05, line_top, colour, lw=3)
    for peak in peaks[:,0]:
        text(peak, label_y, '%.1f'%peak, color=colour)

There are many other ways of calculating vector distance:

http://docs.scipy.org/doc/scipy/reference/spatial.distance.html

http://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics.pairwise

How can this segmentation metric be improved?

*Hint: How does this relate to the self-similarity matrix?*

## Event segmentation

In [None]:
sr = 44100
loader = MonoLoader(filename = 'sources/superstition.wav', sampleRate=sr)

In [None]:
loader.audio

In [None]:
pool = essentia.Pool()

In [None]:
loader.audio >> (pool, "samples")

In [None]:
essentia.run(loader)

In [None]:
plot(pool['samples']);

In [None]:
# Attach a RhythmExtractor2013 to the loader's audio output and store each of its outputs in the pool.
rhythmext = RhythmExtractor2013()
loader.audio >> rhythmext.signal
rhythmext.ticks >> (pool, 'rhythm.ticks')
rhythmext.bpm >> (pool, 'rhythm.bpm')
rhythmext.confidence >> (pool, 'rhythm.confidence')
rhythmext.estimates >> (pool, 'rhythm.estimates')
rhythmext.bpmIntervals >> (pool, 'rhythm.bpmIntervals')

In [None]:
essentia.reset(loader)
pool.clear()

In [None]:
essentia.run(loader)

In [None]:
pool['rhythm.ticks']

In [None]:
pool['rhythm.bpm']

In [None]:
# Waveform on a time axis in seconds: number of stored samples divided by the sample rate.
dur = len(pool['samples'].flat)/float(sr)
plot(linspace(0, dur, len(pool['samples'].flat)), pool['samples'].flat);

# Mark the detected ticks along y = 0.
# NOTE(review): presumably the tick positions are in seconds, matching the x-axis — confirm.
plot(pool['rhythm.ticks'], zeros_like(pool['rhythm.ticks']), 'o')

In [None]:
# Algorithms for onset detection; the hop is shorter here (256 samples) than in the earlier sections (512).
frameSize = 1024
hopSize = 256
spec = Spectrum()
onsetdetect = OnsetDetection(method='flux')
frameCutter = FrameCutter(frameSize = frameSize, hopSize = hopSize)
w = Windowing(type = 'hann')

In [None]:
# Frame -> window -> spectrum -> onset detection function, stored in the pool.
loader.audio >> frameCutter.signal
frameCutter.frame >> w.frame >> spec.frame
spec.spectrum >> onsetdetect.spectrum
# NOTE(review): the magnitude spectrum is also fed to the phase input; presumably a placeholder
# because the 'flux' method does not use phase — confirm against the OnsetDetection documentation.
spec.spectrum >> onsetdetect.phase
onsetdetect.onsetDetection >> (pool, 'onsetDetection')

In [None]:
# The loader has already been run, so reset it, clear the pool, and run the extended network.
essentia.reset(loader)
pool.clear()
essentia.run(loader)

In [None]:
plot(pool['onsetDetection'])

In [None]:
diff_onsets = diff(pool['onsetDetection'])
plot(diff_onsets)

In [None]:
# Onset candidates: positions where the detection function rises by more than 0.1 between consecutive values.
onsets = argwhere(diff_onsets > 0.1)
plot(diff_onsets)
# Mark the candidates along y = 0.
plot(onsets, zeros_like(onsets), 'o')

TODO:

* Filter out onsets that are too close
* Then segment and find similarity between each slice

More todo:

* Use checkerboard kernel with self-similarity matrix

Foote, J. (2000). Automatic audio segmentation using a measure of audio novelty. Multimedia and Expo, 2000. ICME 2000. 2000 IEEE …, 1, 452–455. Retrieved from http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=869637

In [None]:
# Build a 65x65 Gaussian bump by smoothing a unit impulse placed at the centre.
# `scipy.ndimage.filters` is a deprecated namespace; import from `scipy.ndimage` directly.
from scipy.ndimage import gaussian_filter
kernel = zeros((65, 65))
kernel[32,32] = 1  # unit impulse at the centre sample
kernel = gaussian_filter(kernel, 16)  # standard deviation of 16 samples along both axes

In [None]:
from mpl_toolkits.mplot3d import Axes3D  # importing this registers the '3d' projection on older Matplotlib

fig = figure()
# Request the 3D axes through the figure: on current Matplotlib `Axes3D(fig)` no longer
# attaches the axes to the figure, which leaves the plot empty.
ax = fig.add_subplot(111, projection='3d')
# Grid of sample indices matching the 65x65 kernel.
X = arange(65)
Y = arange(65)
X, Y = meshgrid(X, Y)

ax.plot_surface(X, Y, kernel, rstride=1, cstride=1, cmap=cm.hot)

In [None]:
# 65x65 sign pattern for the checkerboard kernel, assembled column by column in one call.
# The first 33 columns are +1 in rows 0-32 and -1 below; the remaining 32 columns
# are -1 in rows 0-31 and +1 from row 32 down.
left_column = r_[ones(33), -ones(32)]
right_column = r_[-ones(32), ones(33)]
checkerboard = column_stack([left_column] * 33 + [right_column] * 32)

In [None]:
kernel*checkerboard

In [None]:
fig = figure()
# Request the 3D axes through the figure: on current Matplotlib `Axes3D(fig)` no longer
# attaches the axes to the figure, which leaves the plot empty.
ax = fig.add_subplot(111, projection='3d')
# Grid of sample indices matching the 65x65 kernel.
X = arange(65)
Y = arange(65)
X, Y = meshgrid(X, Y)

# Gaussian bump multiplied element-wise by the +1/-1 checkerboard pattern.
ax.plot_surface(X, Y, kernel*checkerboard, rstride=1, cstride=1, cmap=cm.hot)

By: Andrés Cabrera mantaraya36@gmail.com

For Course MAT 240E at UCSB

This ipython notebook is licensed under the CC-BY-NC-SA license: http://creativecommons.org/licenses/by-nc-sa/4.0/

![http://i.creativecommons.org/l/by-nc-sa/3.0/88x31.png](http://i.creativecommons.org/l/by-nc-sa/3.0/88x31.png)