In [3]:
import librosa
import librosa.display
import numpy as np
import matplotlib.pyplot as plt
import scipy.signal
import math
%matplotlib inline

import sys
sys.path.insert(0, '../../scripts')

import stft_zoom, display, detect_musical_regions
from util import *
import mappings
import pickle
import PIL
import IPython.display
from classes import SingleResSpectrogram, MultiResSpectrogram

import timeit
import csv

Import requested from: 'numba.decorators', please update to use 'numba.core.decorators' or pin to Numba version 0.48.0. This alias will not be present in Numba version 0.50.0.
  from numba.decorators import jit as optional_jit
Import of 'jit' requested from: 'numba.decorators', please update to use 'numba.core.decorators' or pin to Numba version 0.48.0. This alias will not be present in Numba version 0.50.0.
  from numba.decorators import jit as optional_jit


In [4]:
# Mauricio's solutions (Lukin-Todd, SWGM, local sparsity)
sys.path.insert(0, '../../scripts/mauricio_solutions/')
import lukin_todd, swgm, local_sparsity, util_m

In [5]:
from IPython.display import Audio
# Audio clip that the last cell of the notebook plays with autoplay enabled.
# NOTE(review): absolute path on the author's machine — point this at a local file before running elsewhere.
sound_file = '/Users/nicolas/Downloads/camera.mp3'

### A comparison of computational cost between different TFP representations

#### STFT, CQT, Lukin & Todd, SWGM, SLS vs. our multiresolution spectrogram

The objective is to evaluate the computational cost (execution CPU time and memory used) for a given max resolution of the representation, and to obtain a cost curve for these different representations.

Parameters of this experiment are:

    - max freq. resolution: the cost will be evaluated at 86.13, 43.06, 21.53, 10.77, 5.38, 2.69 and 1.34 Hz per bin (equivalent to window sizes of [512, 1024, 2048, 4096, 8192, 16384, 32768] on a signal sampled at 44100 Hz)
    - files per data-point: 10 files?
    - CQT parameters: fmin = 20 Hz, 10 octaves, bins per octave chosen to match the Hz resolution within the 100-200 Hz octave
    - Lukin & Todd kernel size same as Mauricio kernel size 
    - Our subregion size: test several
    - Normalization simplified

In [6]:
def hz_to_cents(hz_resolution):
    """Express a resolution given in Hz as a resolution in cents.

    The conversion is linearised over the 100-200 Hz octave: that octave is
    100 Hz wide and holds 1200 cents, so one cent is taken to be 100/1200 Hz.

    Parameters
    ----------
    hz_resolution : number
        Frequency resolution in Hz.

    Returns
    -------
    float
        The same resolution expressed in cents of the 100-200 Hz octave.
    """
    octave_span_hz = 100
    hz_per_cent = octave_span_hz / 1200
    cents = hz_resolution / hz_per_cent
    return cents

def hz_to_binperoct(hz_resolution):
    """Return how many bins per octave give at least the requested Hz resolution.

    The Hz resolution is first converted to cents with ``hz_to_cents``; the
    1200 cents of an octave are then divided by that value and rounded up to
    the next whole number of bins.

    Parameters
    ----------
    hz_resolution : number
        Frequency resolution in Hz.

    Returns
    -------
    int
        Number of bins per octave.
    """
    cents_per_bin = hz_to_cents(hz_resolution)
    bins_needed = np.ceil(1200 / cents_per_bin)
    return int(bins_needed)

In [7]:
def calc_kernel_size(window_lengths, energy=False):
    """Derive a two-element kernel size from a list of window lengths.

    The first element is ``P * window_lengths[-1] / window_lengths[0]`` with
    ``P = 2`` when ``energy`` is true and ``P = 5`` otherwise, truncated to an
    integer and then bumped to the next odd number if it is even. The second
    element is always 5.

    The value is truncated *before* the parity check so that a non-integer
    ratio (e.g. 6.25) can no longer slip through as an even size (6).

    Parameters
    ----------
    window_lengths : sequence of numbers
        Window lengths; only the first and last entries are used.
    energy : bool, optional
        Selects the multiplier ``P`` (2 if true, 5 if false).

    Returns
    -------
    list of int
        ``[f_size, 5]`` where ``f_size`` is odd.
    """
    P = 2 if energy else 5
    # Truncate first: checking parity on the float and truncating afterwards
    # could yield an even size for non-integer ratios.
    f_size = int(P * window_lengths[-1] / window_lengths[0])
    if f_size % 2 == 0:
        f_size += 1

    return [f_size, 5]

def LT(y, window_lengths, kernel):
    """Run the full Lukin & Todd pipeline on a signal (used as a timing target).

    Steps, in order: ``util_m.get_spectrograms`` on ``y`` with the given
    windows, ``util_m.interpol_and_normalize`` on the result, then
    ``lukin_todd.lukin_todd`` with ``kernel``.

    Parameters
    ----------
    y : audio signal passed to ``util_m.get_spectrograms``.
    window_lengths : window lengths forwarded as ``windows=``.
    kernel : kernel argument forwarded to ``lukin_todd.lukin_todd``.

    Returns
    -------
    Whatever ``lukin_todd.lukin_todd`` returns.
    """
    raw_specs = util_m.get_spectrograms(y, windows=window_lengths)
    aligned_specs = util_m.interpol_and_normalize(raw_specs)
    combined = lukin_todd.lukin_todd(aligned_specs, kernel)
    return combined

def swgm_time(y, window_lengths):
    """Run the full SWGM pipeline on a signal (used as a timing target).

    Computes spectrograms with ``util_m.get_spectrograms``, passes them
    through ``util_m.interpol_and_normalize`` and combines them with
    ``swgm.SWGM``.

    Parameters
    ----------
    y : audio signal passed to ``util_m.get_spectrograms``.
    window_lengths : window lengths forwarded as ``windows=``.

    Returns
    -------
    Whatever ``swgm.SWGM`` returns.
    """
    prepared = util_m.interpol_and_normalize(
        util_m.get_spectrograms(y, windows=window_lengths)
    )
    return swgm.SWGM(prepared)

def SLS_time(y, window_lengths, kernel_anal, kernel_energy):
    """Run the full smoothed-local-sparsity pipeline (used as a timing target).

    Computes spectrograms with ``util_m.get_spectrograms``, passes them
    through ``util_m.interpol_and_normalize`` and combines them with
    ``local_sparsity.smoothed_local_sparsity``.

    Parameters
    ----------
    y : audio signal passed to ``util_m.get_spectrograms``.
    window_lengths : window lengths forwarded as ``windows=``.
    kernel_anal : second positional argument of ``smoothed_local_sparsity``.
    kernel_energy : third positional argument of ``smoothed_local_sparsity``.

    Returns
    -------
    Whatever ``local_sparsity.smoothed_local_sparsity`` returns.
    """
    spectrogram_stack = util_m.get_spectrograms(y, windows=window_lengths)
    spectrogram_stack = util_m.interpol_and_normalize(spectrogram_stack)
    sls_result = local_sparsity.smoothed_local_sparsity(
        spectrogram_stack, kernel_anal, kernel_energy
    )
    return sls_result

def our_solution(y, res, sr=44100):
    """Build the multiresolution spectrogram for ``y`` and return it.

    A base STFT magnitude spectrogram (``n_fft=2048``) is computed, regions
    to refine are selected with ``detect_musical_regions`` (threshold mode,
    0.8), and each selected region is re-analysed with ``stft_zoom.stft_zoom``
    and inserted into the multiresolution spectrogram at zoom level 1.

    Relies on the notebook-level global ``model`` (loaded from a pickle in a
    separate cell), which must be defined before this function is called.

    Parameters
    ----------
    y : audio signal (1-D array-like accepted by ``librosa.stft``).
    res : value forwarded to ``stft_zoom.stft_zoom`` as ``k``.
    sr : int, optional
        Sampling rate of ``y``; defaults to 44100.

    Returns
    -------
    MultiResSpectrogram
        The base spectrogram with all refined sub-regions inserted.
        (Previously the result was built and then discarded.)
    """
    n_fft = 2048
    kernel = [500, 500]
    spec = np.abs(librosa.stft(y, n_fft=n_fft))
    time_span = [0, len(y) / sr]
    x_axis, y_axis = stft_zoom.get_axes_values(sr, 0, time_span, spec.shape)
    base_spec = SingleResSpectrogram(spec, x_axis, y_axis)
    multires_spec = MultiResSpectrogram(base_spec)

    indices, original_shape = detect_musical_regions.detect_musical_regions(model, spec, mode='threshold', pct_or_threshold=0.8)
    to_be_refined = detect_musical_regions.musical_regions_to_ranges(indices, original_shape, x_axis, y_axis, kernel)

    stft_zoom.set_signal_bank(y, kernel)

    for subregion in to_be_refined:
        freq_range = subregion[0]
        time_range = subregion[1]
        # Only the zoomed spectrogram and its axes are needed; the remaining
        # return values are unused here. Separate names keep the base axes intact.
        spec_zoom, zoom_x_axis, zoom_y_axis, _new_sr, _window_size, _hop_size = stft_zoom.stft_zoom(
            y, freq_range, time_range, sr=sr, original_window_size=n_fft, k=res
        )
        refined_subspec = SingleResSpectrogram(spec_zoom, zoom_x_axis, zoom_y_axis)
        multires_spec.insert_zoom(multires_spec.base_spec, refined_subspec, zoom_level=1)

    return multires_spec

In [8]:
# Load the example clip at 44.1 kHz and keep only its first two seconds.
file_name = '../../data/example.wav'
y, sr = librosa.load(file_name, sr=44100)
y = y[:2 * 44100]

In [9]:
# Frequency resolutions under test (Hz per bin) and the STFT window sizes that
# correspond to them at 44100 Hz; the two lists are index-aligned.
res_hz = [86.13, 43.06, 21.53, 10.77, 5.38, 2.69, 1.34]
res_window = [512, 1024, 2048, 4096, 8192, 16384, 32768]
# Model passed to detect_musical_regions inside our_solution.
# NOTE(review): unpickling can execute arbitrary code — only load a model file you trust.
# The context manager closes the file handle, which the bare open() call leaked.
with open('../renyi_shannon_prollharm_model.sav', 'rb') as model_file:
    model = pickle.load(model_file)



In [13]:
# Time every representation at each target resolution and append the timings
# to one CSV file per window size (results_<window>.csv).
# %timeit flags: -n = loops per run, -r = number of runs, -o = return the result object.
for i in range(len(res_window)):
    # STFT with the window size under test
    res = res_window[i]
    print(res)
    result_stft = %timeit -n 3 -r 3 -o librosa.stft(y, n_fft=res)
    # CQT: bins per octave derived from the matching Hz resolution; 10 octaves starting at fmin=20
    bpo = hz_to_binperoct(res_hz[i])
    result_cqt = %timeit -n 3 -r 3 -o librosa.cqt(y, sr=sr, bins_per_octave=bpo, fmin=20, n_bins=10*bpo)
    # Lukin-Todd, SLS, SWGM: three windows (max/8, max/2, max) with the window under test as the largest
    max_window = res_window[i]
    window_lengths = [int(max_window/8), int(max_window/2), max_window]
    kernel_anal = calc_kernel_size(window_lengths)
    kernel_energy = calc_kernel_size(window_lengths, energy=True)
    # LT and SLS are timed with a single loop and a single run; SWGM with 3 x 3
    result_LT = %timeit -n 1 -r 1 -o LT(y, window_lengths, kernel_anal)
    result_SWGM = %timeit -n 3 -r 3 -o swgm_time(y, window_lengths)
    result_SLS = %timeit -n 1 -r 1 -o SLS_time(y, window_lengths, kernel_anal, kernel_energy)
    # Our solution: `res` is rebound to the window size divided by 2048 (floored, at least 1)
    res = np.max([int(res_window[i] // 2048), 1])
    result_OURS = %timeit -n 3 -r 3 -o our_solution(y, res)
    
    result_file = 'results_' + str(res_window[i]) + '.csv'

    # Append mode: re-running this cell adds rows to any existing file instead of replacing it
    with open(result_file, 'a') as csvfile:
        writer = csv.writer(csvfile, delimiter=';')
        writer.writerow(['STFT', str(result_stft)])
        writer.writerow(['CQT', str(result_cqt)])
        writer.writerow(['LT', str(result_LT)])
        writer.writerow(['SWGM', str(result_SWGM)])
        writer.writerow(['SLS', str(result_SLS)])
        writer.writerow(['OURS', str(result_OURS)])
    

512
2.97 ms ± 523 µs per loop (mean ± std. dev. of 3 runs, 3 loops each)
50.5 ms ± 1.99 ms per loop (mean ± std. dev. of 3 runs, 3 loops each)
5.93 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
8.1 ms ± 311 µs per loop (mean ± std. dev. of 3 runs, 3 loops each)
10.2 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
77.5 ms ± 1.69 ms per loop (mean ± std. dev. of 3 runs, 3 loops each)
1024
3.15 ms ± 548 µs per loop (mean ± std. dev. of 3 runs, 3 loops each)
51.4 ms ± 1.26 ms per loop (mean ± std. dev. of 3 runs, 3 loops each)
11.8 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
13.8 ms ± 292 µs per loop (mean ± std. dev. of 3 runs, 3 loops each)
21.3 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
76.6 ms ± 1.66 ms per loop (mean ± std. dev. of 3 runs, 3 loops each)
2048
3.13 ms ± 294 µs per loop (mean ± std. dev. of 3 runs, 3 loops each)
52.9 ms ± 2.34 ms per loop (mean ± std. dev. of 3 runs, 3 loops each)
24.6 s ± 0 ns per loop (mean ± std. 

In [20]:
# Load the piano recording at 44.1 kHz and keep a 30-second excerpt that
# starts at the midpoint of the file.
file_name = '../../data/MIDI-Unprocessed_R1_D1-1-8_mid--AUDIO-from_mp3_03_R1_2015_wav--2.wav'
y, sr = librosa.load(file_name, sr=44100)
y = y[len(y) // 2:][:30 * sr]

In [21]:
# Second benchmark pass on the 30-second excerpt: only LT and SLS are timed
# (single loop, single run each); the other methods are left commented out.
# Results are appended to results_costly<window>.csv, one row per method,
# with the source file name as the first column.
for i in range(len(res_window)):
#     #STFT
#     res = res_window[i]
#     print(res)
#     result_stft = %timeit -n 3 -r 3 -o librosa.stft(y, n_fft=res)
#     #CQT
#     bpo = hz_to_binperoct(res_hz[i])
#     result_cqt = %timeit -n 3 -r 3 -o librosa.cqt(y, sr=sr, bins_per_octave=bpo, fmin=20, n_bins=10*bpo)
    # Lukin-Todd and SLS: three windows (max/8, max/2, max) with the window under test as the largest
    max_window = res_window[i]
    window_lengths = [int(max_window/8), int(max_window/2), max_window]
    kernel_anal = calc_kernel_size(window_lengths)
    kernel_energy = calc_kernel_size(window_lengths, energy=True)
    result_LT = %timeit -n 1 -r 1 -o LT(y, window_lengths, kernel_anal)
#     result_SWGM = %timeit -n 3 -r 3 -o swgm_time(y, window_lengths)
    result_SLS = %timeit -n 1 -r 1 -o SLS_time(y, window_lengths, kernel_anal, kernel_energy)
#     # OUR SOLUTION
#     res = np.max([int(res_window[i] // 2048), 1])
#     result_OURS = %timeit -n 3 -r 3 -o our_solution(y, res)
    
    result_file = 'results_costly' + str(res_window[i]) + '.csv'

    # Append mode: re-running this cell adds rows to any existing file instead of replacing it
    with open(result_file, 'a') as csvfile:
        writer = csv.writer(csvfile, delimiter=';')
#         writer.writerow([file_name, 'STFT', str(result_stft)])
#         writer.writerow([file_name, 'CQT', str(result_cqt)])
        writer.writerow([file_name, 'LT', str(result_LT)])
#         writer.writerow([file_name, 'SWGM', str(result_SWGM)])
        writer.writerow([file_name, 'SLS', str(result_SLS)])
#         writer.writerow([file_name, 'OURS', str(result_OURS)])

1min 22s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
2min 33s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
2min 59s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
5min 25s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
6min 15s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
11min 10s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
12min 46s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
22min 41s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
25min 40s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
45min 43s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
51min 13s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
1h 32min 47s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


KeyboardInterrupt: 

In [None]:
# Autoplay the clip at `sound_file`; presumably an audible cue that the long-running cells above have finished.
Audio(sound_file, autoplay=True)