In [1]:
import pretty_midi
import numpy as np
import mir_eval.display
import librosa.display
import matplotlib.pyplot as plt
import numba
import scipy
import vamp
import tempfile
import glob
import os
import sox
import csv
import jams

from IPython.display import Audio
%matplotlib inline

In [2]:
def save_small_wav(out_path, y, fs):
    """Write the signal ``y`` to ``out_path`` through a 16-bit sox conversion.

    The signal is first dumped to a temporary WAV file with librosa and that
    file is then converted by sox with ``bitdepth=16`` into ``out_path``.
    The temporary file is always removed, even when writing or converting
    fails.

    Parameters
    ----------
    out_path : str
        Destination path handed to ``sox.Transformer.build``.
    y : np.ndarray
        Audio samples, passed unchanged to ``librosa.output.write_wav``.
    fs : int
        Sampling rate of ``y``.
    """
    fhandle, tmp_file = tempfile.mkstemp(suffix='.wav')
    # Only the path is needed; release the descriptor immediately so it can
    # never leak and does not keep the file locked while it is rewritten.
    os.close(fhandle)
    try:
        librosa.output.write_wav(tmp_file, y, fs)

        tfm = sox.Transformer()
        tfm.convert(bitdepth=16)
        tfm.build(tmp_file, out_path)
    finally:
        # Clean up the intermediate file whether or not the conversion worked.
        os.remove(tmp_file)
    
def note_anal(y, fs, seg_start_time, outname):
    """Analyse one audio segment and append the results to two CSV files.

    ``segment_offset`` (defined elsewhere) is called on the segment and its
    three return values are used as the offset time, the pitch track and the
    time step between pitch-track values.

    Two files are opened in append mode:

    * ``outname + '_pt.csv'`` receives one ``[time, value]`` row per
      pitch-track entry, where ``time = seg_start_time + i * t_step``.
    * ``outname + '_onoff.csv'`` receives a single
      ``[seg_start_time, offset_time]`` row.

    Parameters
    ----------
    y : np.ndarray
        Audio samples of the segment (forwarded to ``segment_offset``).
    fs : int
        Sampling rate (forwarded to ``segment_offset``).
    seg_start_time : float
        Start time of the segment; used as the time origin of the rows.
    outname : str
        Path prefix of the two output CSV files.

    Returns
    -------
    int
        Always 0.
    """
    offset_time, pitch_track, t_step = segment_offset(y, fs, seg_start_time)

    # newline='' is what the csv module expects for files it writes to;
    # without it extra blank rows appear on platforms that translate '\n'.
    with open(outname+'_pt.csv', 'a', newline='') as pt:
        writer = csv.writer(pt, delimiter=',')
        writer.writerows(
            [seg_start_time + i*float(t_step), f]
            for i, f in enumerate(pitch_track)
        )

    with open(outname+'_onoff.csv', 'a', newline='') as onoff:
        writer = csv.writer(onoff, delimiter=',')
        writer.writerow([seg_start_time, offset_time])
    return 0

def estimate_onset(audiopath, midipath=None, offset=0):
    """Refine the onsets stored in a MIDI file against the matching audio.

    The audio is loaded at its native sampling rate, the MIDI onsets are
    shifted by ``offset`` seconds, converted to sample positions, printed,
    and then passed to ``adjust_onsets``.

    Parameters
    ----------
    audiopath : str
        Path of the audio file to load.
    midipath : str, optional
        Path of the MIDI file holding the annotated onsets.  When omitted it
        defaults to ``./onset_post_logic/<name>.mid`` where ``<name>`` is the
        audio file's base name up to its first dot.
    offset : float, optional
        Value added to every MIDI onset time before adjustment.

    Returns
    -------
    list
        The value returned by ``adjust_onsets``.
    """
    if midipath is None:
        # basename() copes with every separator the platform accepts, unlike
        # a plain split on "/".
        fname = os.path.basename(audiopath).split(".")[0]
        midipath = './onset_post_logic/'+fname+'.mid'

    y, sr = librosa.load(audiopath, sr=None)
    pm = pretty_midi.PrettyMIDI(midipath)
    human_onsets = pm.get_onsets() + offset
    human_onsets_samples = librosa.time_to_samples(human_onsets, sr=sr)
    print(human_onsets_samples)

    return adjust_onsets(human_onsets_samples, y, sr)

def adjust_onsets(human_onsets_samples, y, sr, span_time=0.15, verbose=False, hop_length=32, n_fft=64, onset_channels=None):
    """Move each annotated onset to the strongest nearby onset in the audio.

    For every annotated onset a segment of ``y`` reaching ``span_time``
    seconds to either side is analysed.  The segment is clipped to the signal
    bounds and never starts earlier than 0.05 s after the previously adjusted
    onset.  A multi-band onset-strength envelope is averaged over its bands,
    multiplied by the frame-wise RMS of the segment and by the matching part
    of a Hann window centred on the annotated onset.  The frame holding the
    maximum of that product is taken as the adjusted onset.

    Parameters
    ----------
    human_onsets_samples : sequence of int
        Annotated onset positions, expressed in samples of ``y``.
    y : np.ndarray
        Audio signal.
    sr : int
        Sampling rate of ``y``.
    span_time : float, optional
        Half-width, in seconds, of the search region around each onset.
    verbose : bool, optional
        When true, plot a spectrogram and the weighted envelope of every
        segment together with the detected and the selected onsets.
    hop_length : int, optional
        Hop length, in samples, used for all frame-based features.
    n_fft : int, optional
        FFT size of the diagnostic spectrogram (only used when ``verbose``).
    onset_channels : list of int, optional
        ``channels`` argument of ``librosa.onset.onset_strength_multi``.
        Defaults to ``[16, 50, 80, 100, 128]``.

    Returns
    -------
    list
        One adjusted onset time in seconds per input onset.  When the first
        annotated onset lies within the first 100 samples, the first entry is
        forced to 0.
    """
    if onset_channels is None:
        # Built per call so that no list object is shared between calls.
        onset_channels = [16, 50, 80, 100, 128]

    adjusted_onsets = []
    span = int(span_time * sr)
    n_onsets = len(human_onsets_samples)
    # Hann window covering the full, unclipped search region; each onset
    # takes the slice that corresponds to its own clipped segment.
    full_window = np.hanning(librosa.samples_to_frames(span*2, hop_length=hop_length) + 1)

    for i, hos in enumerate(human_onsets_samples):
        if i % 50 == 0:
            print(i, n_onsets)

        # Keep at least 0.05 s between consecutive adjusted onsets.
        if i != 0:
            last_onset = int((adjusted_onsets[-1]+0.05) * sr)
        else:
            last_onset = 0
        low_idx = max(0, hos-span)
        low_idx = max(low_idx, last_onset)
        high_idx = min(hos+span, len(y))

        # Frame positions of the clipped segment inside the full window.
        win_start_idx = librosa.samples_to_frames(low_idx - (hos-span), hop_length=hop_length)
        win_end_idx = librosa.samples_to_frames(high_idx - (hos-span), hop_length=hop_length) + 1
        window = full_window[win_start_idx:win_end_idx].reshape(1, -1)

        seg_start_time = librosa.samples_to_time(low_idx, sr=sr)
        y_seg = y[low_idx:high_idx]

        onsets_str = librosa.onset.onset_strength_multi(y=y_seg, hop_length=hop_length, channels=onset_channels, sr=sr)
        rms = librosa.feature.rmse(y=y_seg, hop_length=hop_length)
        onsets_str_mean = np.mean(onsets_str, axis=0)
        onsets_str_mean_weighted = rms * onsets_str_mean

        # Frame counts may differ slightly; trim both to the shorter one.
        if window.shape[1] != onsets_str_mean_weighted.shape[1]:
            min_len = min([window.shape[1], onsets_str_mean_weighted.shape[1]])
            window = window[:, 0:min_len]
            onsets_str_mean_weighted = onsets_str_mean_weighted[:, 0:min_len]

        windowed_str = window * onsets_str_mean_weighted
        peaks_onset = librosa.frames_to_time(np.argmax(windowed_str), hop_length=hop_length, sr=sr)
        adjusted_onsets.append(seg_start_time + peaks_onset)

        if i == 0 and hos < 100:
            # An annotation at the very start of the file is pinned to 0.
            # Done right after the first estimate so that the minimum-gap
            # rule of the following onset already sees the pinned value.
            adjusted_onsets[0] = 0

        if verbose:
            # The time axis and librosa's own peak picking are needed for the
            # diagnostic plot only, so they are not computed otherwise.
            onsets_str_t = librosa.frames_to_time(np.arange(windowed_str.shape[1]), hop_length=hop_length, sr=sr) + seg_start_time
            onsets = librosa.onset.onset_detect(onset_envelope=windowed_str.flatten(), hop_length=hop_length, units='time', sr=sr)
            plt.figure()
            S = librosa.stft(y_seg, n_fft=n_fft, hop_length=hop_length)
            ax = plt.subplot(2, 1, 1)
            power_spec = librosa.amplitude_to_db(S, ref=np.max)
            librosa.display.specshow(power_spec, y_axis='log', x_axis='time', hop_length=hop_length, x_coords=onsets_str_t.flatten(), sr=sr)
            plt.subplot(2, 1, 2, sharex=ax)
            plt.vlines(onsets+seg_start_time, 0, 0.1, colors='g')
            plt.vlines(peaks_onset+seg_start_time, 0, 0.1, colors='r')
            plt.plot(onsets_str_t.flatten(), windowed_str.flatten())
            plt.show()

    return adjusted_onsets

def old_note_anal(y, fs, seg_start_time, outname, max_retries=None):
    """Run the external ``note_anal.py`` script on a segment until it succeeds.

    The segment is written to a temporary WAV file and
    ``python3 note_anal.py <wav> <seg_start_time> <outname>_pt.csv
    <outname>_onoff.csv`` is executed.  A non-zero exit status triggers
    another attempt.  The temporary file is removed in every case.

    Parameters
    ----------
    y : np.ndarray
        Audio samples of the segment.
    fs : int
        Sampling rate of ``y``.
    seg_start_time : float
        Start time of the segment, forwarded to the script.
    outname : str
        Path prefix of the two CSV files named on the command line.
    max_retries : int, optional
        Number of retries allowed after the first failed run.  ``None``
        (the default) retries until the script succeeds.

    Returns
    -------
    int
        Always 0.

    Raises
    ------
    RuntimeError
        If ``max_retries`` is given and the script keeps failing.
    """
    import subprocess  # local import: keeps this block self-contained

    fhandle, tmp_file = tempfile.mkstemp(suffix='.wav')
    # Only the path is needed; close the descriptor so it cannot leak.
    os.close(fhandle)
    try:
        librosa.output.write_wav(tmp_file, y, fs)
        # An argument list avoids shell parsing, so paths containing spaces
        # or shell metacharacters reach the script intact.
        cmd = ['python3', 'note_anal.py', tmp_file, str(seg_start_time),
               outname+'_pt.csv', outname+'_onoff.csv']
        failures = 0
        while subprocess.call(cmd):
            failures += 1
            if max_retries is not None and failures > max_retries:
                raise RuntimeError(
                    'note_anal.py failed {} time(s)'.format(failures))
            print('vamp.collect errored, trying again...')
    finally:
        os.remove(tmp_file)
    return 0

def stem_anal(outname, audiopath, max_retries=None):
    """Run the external ``stem_anal.py`` script until it exits successfully.

    Executes ``python3 stem_anal.py <audiopath> <outname>
    <outname>_onset.jams`` and starts it again whenever it returns a non-zero
    exit status.

    Parameters
    ----------
    outname : str
        Output path prefix; also used to name the ``_onset.jams`` file passed
        to the script.
    audiopath : str
        Path of the audio file passed to the script.
    max_retries : int, optional
        Number of retries allowed after the first failed run.  ``None``
        (the default) retries until the script succeeds.

    Returns
    -------
    int
        Always 0.

    Raises
    ------
    RuntimeError
        If ``max_retries`` is given and the script keeps failing.
    """
    import subprocess  # local import: keeps this block self-contained

    # An argument list avoids shell parsing, so paths containing spaces or
    # shell metacharacters reach the script intact.
    cmd = ['python3', 'stem_anal.py', audiopath, outname,
           outname+'_onset.jams']
    failures = 0
    while subprocess.call(cmd):
        failures += 1
        if max_retries is not None and failures > max_retries:
            raise RuntimeError(
                'stem_anal.py failed {} time(s)'.format(failures))
        print('vamp.collect errored, trying again...')
    return 0

def save_as_jams(outname, audiopath, adjusted_onsets):
    """Store onset times as a JAMS ``onset`` annotation and save it to disk.

    The file duration is read from ``audiopath`` with sox and used both as
    the file-level duration and as the duration of the annotation.  The
    annotation's data source is the part of ``outname`` after its last
    underscore.  Every onset becomes a zero-duration observation.  The result
    is written to ``outname + '_onset.jams'``.

    Parameters
    ----------
    outname : str
        Output path prefix.
    audiopath : str
        Audio file whose duration is recorded.
    adjusted_onsets : iterable of float
        Onset times to store.

    Returns
    -------
    jams.JAMS
        The JAMS object that was saved.
    """
    jam = jams.JAMS()
    jam.file_metadata.duration = sox.file_info.duration(audiopath)

    onset_ann = jams.Annotation(namespace='onset',
                                time=0,
                                duration=jam.file_metadata.duration)
    source_tag = outname.rsplit('_', 1)[-1]
    onset_ann.annotation_metadata.data_source = str(source_tag)

    for t in adjusted_onsets:
        onset_ann.append(time=t, duration=0)

    jam.annotations.append(onset_ann)
    target_path = outname+'_onset.jams'
    jam.save(target_path)
    return jam

In [3]:
# Gather every bounced WAV file, in sorted order so runs are reproducible.
audiopaths = sorted(glob.glob('./Bounces/*.wav'))
print(audiopaths)

['./Bounces/0_c_1.wav', './Bounces/1_c_1.wav', './Bounces/2_c_1.wav', './Bounces/3_c_1.wav', './Bounces/4_c_1.wav', './Bounces/5_c_1.wav']


In [4]:
# For each bounce: refine the annotated onsets against the audio, save them
# as a JAMS file under ./output, then run the external stem analysis.
for audiopath in audiopaths:
    print("new audio file!")
    # Base name of the audio file, up to its first dot.
    fname = audiopath.rsplit('/', 1)[-1].split('.', 1)[0]
    outname = os.path.join('./output', fname)
    adjusted_onsets = estimate_onset(audiopath)
    jam = save_as_jams(outname, audiopath, adjusted_onsets)
    stem_anal(outname, audiopath)

new audio file!
[  326382   335838   348177   358396   367344   378369   387570   398341
   407076   418313   429296   439048   448505   458512   468731   478314
   489212   498965   510456   520464   530683   540945   551164   561426
   571391   581101   591109   601328   611081   620791   631053   814577
   825093   835312   845828   855793   876529   886536   897010   907271
   917491   926099   937208   946707   956926  1129637  1141001  1150712
  1171447  1181200  1191674  1201426  1211646  1221229  1232127  1242389
  1252353  1262615  1272580  1282545  1292807  2156375  2170892  2186755
  2200843  2215665  2230549  2244637  2258173  2287389  2302702  2317340
  2333204  2347782  2360583  2376447  2641414  2657033  2671856  2685698
  2701562  2714118  2728451  2742538  2757606  2771999  2787863  2802441
  2816773  2831902  2847214  2963712  2979086  3497233  3515129  3589626
  3619323  3626583  3635900  3646226  3663850  3682484  3691801  3711755
  3738386  3767307  3860243  388761

50 916
100 916
150 916
200 916
250 916
300 916
350 916
400 916
450 916
500 916
550 916
600 916
650 916
700 916
750 916
800 916
850 916
900 916
new audio file!
[  327696   348686   368149 ... 51128602 51164433 51199280]
0 1062
50 1062
100 1062
150 1062
200 1062
250 1062
300 1062
350 1062
400 1062
450 1062
500 1062
550 1062
600 1062
650 1062
700 1062
750 1062
800 1062
850 1062
900 1062
950 1062
1000 1062
1050 1062
new audio file!
[  651025   672269   681853 ... 51247662 51259179 51271189]
0 1674
50 1674
100 1674
150 1674
200 1674
250 1674
300 1674
350 1674
400 1674
450 1674
500 1674
550 1674
600 1674
650 1674
700 1674
750 1674
800 1674
850 1674
900 1674
950 1674
1000 1674
1050 1674
1100 1674
1150 1674
1200 1674
1250 1674
1300 1674
1350 1674
1400 1674
1450 1674
1500 1674
1550 1674
1600 1674
1650 1674
new audio file!
[  654841   693768   726292 ... 51248154 51259672 51271730]
0 1833
50 1833
100 1833
150 1833
200 1833
250 1833
300 1833
350 1833
400 1833
450 1833
500 1833
550 1833
600 1833
6