# Input features
TODO
+ Add padding at the start to avoid errors when an onset falls in the first frame.


In [None]:
import sys
sys.path.append('../')

from hvo_sequence.hvo_seq import HVO_Sequence

import os
import numpy as np
import pickle
import pandas as pd


### Load processed dataset and select random example

In [None]:
# Load the pickled training split and pick one example at random for the
# experiments in the cells below.
source_path = "../processed_dataset/Processed_On_27_04_2021_at_19_04_hrs"

# Build the path once so the printed path is exactly the file that gets opened.
train_path = os.path.join(source_path, "GrooveMIDI_processed_train", "hvo_sequence_data.obj")
print(train_path)

# NOTE(security): pickle.load can execute arbitrary code stored in the file;
# only load dataset files that come from a trusted source.
# The context manager closes the handle even if unpickling fails.
with open(train_path, 'rb') as train_file:
    train_set = pickle.load(train_file)

dataset_size = len(train_set)
# Uniformly random index in [0, dataset_size).
ix = int(np.random.randint(dataset_size))
example = train_set[ix]
example.to_html_plot(show_figure=True)

## Librosa onset

In [None]:
# Analysis parameters shared with the following cells.
sr = 44100
sf_path = "../hvo_sequence/soundfonts/Standard_Drum_Kit.sf2"
n_fft = 2048
hop_length = 128
win_length = 1024
window = 'hamming'
plot = True

#mel_spec = example.mel_spectrogram(sr=sr, sf_path=sf_path, n_fft=n_fft, hop_length=hop_length, win_length=win_length, window = window, plot=plot)
#mel_spec.shape
stft = example.stft()
print(stft.shape)

#example.stft(plot=True)
import librosa.display
import matplotlib.pyplot as plt

# Frequency (Hz) assigned to each STFT row, derived from the array itself so
# this cell does not depend on `binFreq` being defined by a later cell.
# NOTE(review): assumes example.stft() returns rows spanning 0 Hz..sr/2 for
# audio rendered at `sr` - confirm against HVO_Sequence.stft defaults.
binFreq = np.linspace(0, sr / 2, stft.shape[0])

fig, ax = plt.subplots()
spec = librosa.display.specshow(librosa.amplitude_to_db(stft, ref=np.max), y_coords=binFreq,  y_axis='linear', x_axis='time', ax=ax)
# Set the frequency range after plotting: specshow resets the axis limits,
# so a limit set before the call would be overwritten.
ax.set_ylim(0, 20000)


In [None]:
import librosa.onset as onset

# `mel_spec` is not assigned by any active line before this cell, so compute
# it here instead of relying on leftover kernel state.
# NOTE(review): keyword arguments mirror the disabled mel_spectrogram call in
# the previous cell - confirm they still match HVO_Sequence.mel_spectrogram.
mel_spec = example.mel_spectrogram(sr=sr, sf_path=sf_path, n_fft=n_fft, hop_length=hop_length,
                                   win_length=win_length, window=window)

y = example.save_audio()
# Onset strength computed from the first 200 rows of the mel spectrogram.
o_strength = onset.onset_strength(S=mel_spec[0:200, :])
o_detect = onset.onset_detect(y=y)  # detects the onsets across all bands
print(o_strength[0:100])


## Multi-band onset strength

In [None]:
# https://github.com/mcartwright/dafx2018_adt

import librosa
import scipy.signal
import numpy as np
import resampy
import soundfile as psf


# Time between consecutive analysis frames, in seconds. extract_features
# multiplies it by the sample rate to obtain the STFT hop size in samples.
FRAME_INTERVAL = 0.01  # s

def read_audio(filepath, sr=None, mono=True, peak_norm=False):
    """
    Read an audio file, optionally resampling, downmixing and normalizing it.

    Parameters
    ----------
    filepath
        Path of the audio file to read.
    sr
        Target sample rate. If None (or equal to the file's rate) the
        file's native rate is kept.
    mono
        If True, downmix the signal with ``librosa.to_mono``.
    peak_norm
        If True, scale the signal so its maximum absolute value is 1.
        Silent or empty signals are returned unscaled.

    Returns
    -------
    y, sr
        The audio samples and the sample rate they are expressed in.
    """
    try:
        y, _sr = psf.read(filepath)
        # transpose so the sample axis is last, as resampy/librosa expect
        y = y.T
    except RuntimeError:
        # fall back to librosa when soundfile cannot decode the file
        y, _sr = librosa.load(filepath, mono=False, sr=None)

    if sr is not None and sr != _sr:
        y = resampy.resample(y, _sr, sr, filter='kaiser_fast')
    else:
        sr = _sr

    if mono:
        y = librosa.to_mono(y)

    if peak_norm and y.size:
        peak = np.max(np.abs(y))
        # Skip the division for silent audio: dividing by zero would fill
        # the signal with NaNs.
        if peak > 0:
            # out-of-place division also works for integer sample arrays
            y = y / peak

    return y, sr


def cq_matrix(bins_per_octave, num_bins, f_min, fft_len, sr):
    """
    Build a log-spaced triangular filterbank matrix for FFT magnitudes.

    Each row holds one triangular filter, normalized to unit sum, that peaks
    at the FFT bin closest to its log-spaced center frequency and reaches
    down/up to the neighbouring centers.

    Parameters
    ----------
    bins_per_octave : int
        Number of filters per octave.
    num_bins : int
        Total number of filters (rows of the result).
    f_min : float
        Center frequency of the first filter, in the same unit as ``sr``.
    fft_len : int
        FFT length the filterbank will be applied to.
    sr : float
        Sample rate.

    Returns
    -------
    c_mat
        Array of shape ``(num_bins, round(fft_len / 2))``.
    """
    # note range goes from -1 to bpo*num_oct for boundary issues
    f_cq = f_min * 2 ** ((np.arange(-1, num_bins + 1)) / bins_per_octave)
    # centers in bins
    kc = np.round(f_cq * (fft_len / sr)).astype(int)
    c_mat = np.zeros([num_bins, int(np.round(fft_len / 2))])

    # Window functions live in scipy.signal.windows; the scipy.signal.triang
    # alias is not available in current SciPy releases.
    triang = scipy.signal.windows.triang

    for k in range(1, kc.shape[0] - 1):
        # rising half: from the previous center up to this one
        l1 = kc[k] - kc[k - 1]
        w1 = triang((l1 * 2) + 1)
        # falling half: from this center down to the next one
        l2 = kc[k + 1] - kc[k]
        w2 = triang((l2 * 2) + 1)
        wk = np.hstack([w1[0:l1], w2[l2:]])  # concatenate two halves
        c_mat[k - 1, kc[k - 1]:(kc[k + 1] + 1)] = wk / np.sum(wk)  # normalized to unit sum;
    return c_mat


def onset_detection_fn(x, f_win_size, f_hop_size, f_bins_per_octave, f_octaves, f_fmin, sr, mean_filter_size):
    """
    Compute a multi-band onset detection function from an audio signal.

    The signal's STFT magnitude is projected onto the log-frequency
    filterbank from ``cq_matrix``; a moving mean of the preceding frames is
    subtracted per band, the result is half-wave rectified and log-compressed.

    Parameters
    ----------
    x
        Audio samples passed to ``librosa.stft``.
    f_win_size
        STFT window length, also used as the FFT size.
    f_hop_size
        STFT hop size in samples.
    f_bins_per_octave
        Filterbank bands per octave.
    f_octaves
        Number of octaves covered; total bands = f_octaves * f_bins_per_octave.
    f_fmin
        Center frequency of the lowest band.
    sr
        Sample rate of ``x``.
    mean_filter_size
        Number of preceding frames averaged for the moving mean.

    Returns
    -------
    od_fun, x_cq_spec
        The onset detection function and the log-frequency magnitude
        spectrogram it was derived from (bands along axis 0, frames along
        axis 1).
    """
    # calculate frequency constant-q transform
    # scipy.signal.windows.hann replaces the scipy.signal.hanning alias, which
    # is no longer available in current SciPy releases.
    f_win = scipy.signal.windows.hann(f_win_size)
    x_spec = librosa.stft(x,
                          n_fft=f_win_size,
                          hop_length=f_hop_size,
                          win_length=f_win_size,
                          window=f_win)
    x_spec = np.abs(x_spec) / (2 * np.sum(f_win))

    f_cq_mat = cq_matrix(f_bins_per_octave, f_octaves * f_bins_per_octave, f_fmin, f_win_size, sr)
    # drop the last STFT row so the bin count matches the filterbank columns
    x_cq_spec = np.dot(f_cq_mat, x_spec[:-1, :])

    # subtract moving mean: current frame minus the mean of the previous
    # mean_filter_size frames, applied along the time axis
    b = np.concatenate([[1], np.ones(mean_filter_size, dtype=float) / -mean_filter_size])
    od_fun = scipy.signal.lfilter(b, 1, x_cq_spec, axis=1)

    # half-wave rectify
    od_fun = np.maximum(0, od_fun)

    # post-process OPs
    od_fun = np.log10(1 + 1000*od_fun)
    return od_fun, x_cq_spec


def extract_features(audio_file_path, sr=22050, channel=1):
    """
    Extract log-frequency spectrogram and multi-band onset strength features.

    Parameters
    ----------
    audio_file_path
        Path of the audio file, read as mono through ``read_audio``.
    sr
        Sample rate the audio is resampled to before analysis.
    channel
        Unused; kept so existing callers keep working.

    Returns
    -------
    ms_input_array, os_input_array, sr
        ``ms_input_array`` is the dB-scaled log-frequency spectrogram and
        ``os_input_array`` the onset strength scaled by 1/2.25 and clipped to
        [0, 1]; both are float32 with shape (frames, bands, 1). ``sr`` is the
        sample rate returned by ``read_audio``.
    """
    x, sr = read_audio(audio_file_path, mono=True, sr=sr)

    f_win_size = 1024
    f_hop_size = int(round(FRAME_INTERVAL * sr))
    f_bins_per_octave = 8
    f_octaves = 8
    f_fmin = 40
    mean_filter_size = 22

    # normalize to unit peak; silent audio is left untouched because dividing
    # by a zero peak would turn every sample into NaN
    peak = np.max(np.abs(x))
    if peak > 0:
        x = x / peak

    od_fun, x_cq_spec = onset_detection_fn(x,
                                           f_win_size,
                                           f_hop_size,
                                           f_bins_per_octave,
                                           f_octaves,
                                           f_fmin,
                                           sr,
                                           mean_filter_size)

    logf_stft = librosa.power_to_db(x_cq_spec).astype('float32')
    od_fun = np.abs(od_fun).astype('float32')

    # reshape for model: (bands, frames) -> (frames, bands, 1)
    ms_input_array = np.moveaxis(logf_stft, 1, 0)
    ms_input_array = np.expand_dims(ms_input_array, axis=2)
    os_input_array = np.moveaxis(od_fun, 1, 0)
    os_input_array = np.clip(os_input_array / 2.25, 0, 1)
    os_input_array = np.expand_dims(os_input_array, axis=2)

    return ms_input_array, os_input_array, sr

In [None]:
ms_input_array, os_input_array, sr = extract_features("./misc/temp.wav", sr=sr)
# Drop the trailing singleton axis added for the model. Squeezing instead of
# reshaping to a fixed (584, 64) keeps the cell working for clips of any length.
ms_input_array = np.squeeze(ms_input_array, axis=2)
os_input_array = np.squeeze(os_input_array, axis=2)

# Side-by-side view: onset strength (left) and log-frequency spectrogram (right),
# transposed so time runs along the x axis.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(10,5))
ax[0].pcolormesh(os_input_array.T)
ax[0].set_xlabel('Time')
ax[0].set_ylabel('f bin')
ax[0].set_title('Multi-band onset strength')
ax[1].set_xlabel('Time')
ax[1].set_ylabel('f bin')
ax[1].set_title('Logf–STFT')
ax[1].pcolormesh(ms_input_array.T)


In [None]:
# Reference magnitude spectrogram, for comparison with the multi-band features.
sr = 44100
sf_path = "../hvo_sequence/soundfonts/Standard_Drum_Kit.sf2"
n_fft, hop_length, win_length = 2048, 221, 1024   # hop of 221 samples was taken from a paper
window = 'hamming'

# Render the example to audio and keep the linear STFT magnitude
# (no dB conversion, because dB values are negative).
y = example.save_audio(sr=sr, sf_path=sf_path)
sy = librosa.stft(
    y,
    n_fft=n_fft,
    hop_length=hop_length,
    win_length=win_length,
    window=window,
)
stft = np.abs(sy)
# Optional peak normalization, currently disabled:
#stft = stft/np.max(stft)

# Optional quick look at the spectrogram in dB:
#plt.pcolormesh(librosa.amplitude_to_db(stft, ref=np.max))


## NMF
Features are frequency bins

In [None]:
import librosa
# librosa.display is a submodule and must be imported explicitly before use.
import librosa.display
# Import the submodule explicitly: a bare `import sklearn` does not guarantee
# that `sklearn.decomposition` is loaded.
import sklearn.decomposition
import matplotlib.pyplot as plt
import warnings 

warnings.filterwarnings("ignore")       # ignore librosa warnings

sr=44100
sf_path="../hvo_sequence/soundfonts/Standard_Drum_Kit.sf2"
n_fft=2048
hop_length=221              # this was from a paper
win_length=1024
window='hamming'
plot=True

# Get STFT
y = example.save_audio(sr=sr,sf_path=sf_path)
sy = librosa.stft(y, n_fft=n_fft, hop_length=hop_length, win_length=win_length, window=window)
stft = np.abs(sy)           # do not transform to db because db are negative
stft = stft/np.max(stft)    # normalize

# decomposition types (transformer types)
#T = sklearn.decomposition.MiniBatchDictionaryLearning(n_components=9)
#T = sklearn.decomposition.PCA(n_components=9)
T = sklearn.decomposition.NMF(n_components=7)
# comps: one spectral template per component; acts: their activations over time
comps, acts = librosa.decompose.decompose(stft, sort=True, transformer=T)

# plot
binFreq = np.arange(n_fft/2+1)*float(sr)/n_fft   # frequency of each STFT bin
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15,10))
librosa.display.specshow(librosa.amplitude_to_db(comps,ref=np.max), y_axis='log', ax=ax[0],y_coords=binFreq)
ax[0].set(title='Components')
librosa.display.specshow(acts, x_axis='time', ax=ax[1])
ax[1].set(ylabel='Components', title='Activations')

### * Helpers

In [None]:
print(stft.shape)

# Number of STFT frames, taken from the length of the first row.
numFrames = int(stft[0].size)

# Time of each frame (hop index * hop length / sample rate) and frequency of
# each FFT bin (bin index * sample rate / FFT size).
frmTime = np.arange(numFrames) * hop_length / float(sr)
binFreq = np.arange(n_fft / 2 + 1) * float(sr) / n_fft

# Optional spectrogram on explicit time/frequency axes:
#plt.pcolormesh(frmTime, binFreq, librosa.amplitude_to_db(stft, ref=np.max))