In [None]:
! pip install mir_eval

Collecting mir_eval
  Downloading mir_eval-0.7.tar.gz (90 kB)
[?25l[K     |███▋                            | 10 kB 21.8 MB/s eta 0:00:01[K     |███████▎                        | 20 kB 25.4 MB/s eta 0:00:01[K     |██████████▉                     | 30 kB 9.1 MB/s eta 0:00:01[K     |██████████████▌                 | 40 kB 3.8 MB/s eta 0:00:01[K     |██████████████████              | 51 kB 3.7 MB/s eta 0:00:01[K     |█████████████████████▊          | 61 kB 4.4 MB/s eta 0:00:01[K     |█████████████████████████▎      | 71 kB 4.6 MB/s eta 0:00:01[K     |█████████████████████████████   | 81 kB 5.2 MB/s eta 0:00:01[K     |████████████████████████████████| 90 kB 3.4 MB/s 
Building wheels for collected packages: mir-eval
  Building wheel for mir-eval (setup.py) ... done
  Created wheel for mir-eval: filename=mir_eval-0.7-py3-none-any.whl size=100721 sha256=23a75b6b85476659dc379e73790ebb14f9c39a79deb8c85ff00b62e969dfb973
  Stored in directory: /root/.cache/pip/wheels

In [None]:
# Beat tracking example
import librosa
import librosa.display

import numpy as np
import matplotlib.pyplot as plt
import mir_eval.sonify


from IPython.display import Audio


In [None]:
# 1. Get the local file path of librosa's bundled 'nutcracker' example recording
#    (the output below shows the file being downloaded into librosa's cache directory)
filename = librosa.example('nutcracker')

Downloading file 'Kevin_MacLeod_-_P_I_Tchaikovsky_Dance_of_the_Sugar_Plum_Fairy.ogg' from 'https://librosa.org/data/audio/Kevin_MacLeod_-_P_I_Tchaikovsky_Dance_of_the_Sugar_Plum_Fairy.ogg' to '/root/.cache/librosa'.


In [None]:

# 2. Load the audio as a waveform `y`, a one-dimensional NumPy floating-point array,
#    and store the sampling rate as `sr` (the number of audio samples per second).
#    By default, all audio is mixed to mono and resampled to 22050 Hz at load time;
#    this behavior can be overridden by supplying additional arguments to librosa.load.
y, sr = librosa.load(filename)

In [None]:
# Total number of samples in the loaded waveform
len(y)

2643264

In [None]:
# Sampling rate (samples per second) returned by librosa.load
sr

22050

In [None]:
# 3. Run the default beat tracker.
#    `tempo` is the estimated tempo in beats per minute; `beat_frames` holds
#    the frame indices of the detected beat events.
#
#    Frames here correspond to short windows of the signal (y), each separated
#    by hop_length = 512 samples. librosa uses centered frames, so that the
#    kth frame is centered around sample k * hop_length.
tempo, beat_frames = librosa.beat.beat_track(y=y, sr=sr)

# Depending on the librosa version, `tempo` is either a plain float or a
# length-1 array. '{:.2f}' cannot be applied to an ndarray, so extract a
# Python float before formatting.
print('Estimated tempo: {:.2f} beats per minute'.format(float(np.atleast_1d(tempo)[0])))


Estimated tempo: 107.67 beats per minute


In [None]:
# Number of beat events found by the tracker
len(beat_frames)

212

In [None]:
# 4. Convert the frame indices of beat events into timestamps (in seconds)
beat_times = librosa.frames_to_time(beat_frames, sr=sr)

In [None]:
# Inspect the beat timestamps
beat_times

array([  1.18421769,   1.71827664,   2.32199546,   2.87927438,
         3.45977324,   4.01705215,   4.59755102,   5.13160998,
         5.7353288 ,   6.29260771,   6.84988662,   7.40716553,
         7.9876644 ,   8.54494331,   9.12544218,   9.65950113,
        10.21678005,  10.72761905,  11.28489796,  11.79573696,
        12.32979592,  12.86385488,  13.42113379,  13.95519274,
        14.4892517 ,  15.02331066,  15.55736961,  16.09142857,
        16.62548753,  17.15954649,  17.69360544,  18.25088435,
        18.80816327,  19.31900227,  19.87628118,  20.38712018,
        20.92117914,  21.4552381 ,  21.98929705,  22.52335601,
        23.05741497,  23.59147392,  24.12553288,  24.65959184,
        25.19365079,  25.72770975,  26.26176871,  26.81904762,
        27.35310658,  27.88716553,  28.44444444,  29.00172336,
        29.55900227,  30.11628118,  30.67356009,  31.20761905,
        31.78811791,  32.34539683,  32.85623583,  33.36707483,
        33.90113379,  34.43519274,  34.94603175,  35.45

In [None]:
# Load the bundled 'nutcracker' example clip again as waveform `y` and sampling rate `sr`
y, sr = librosa.load(librosa.ex('nutcracker'))

# Set the hop length (in samples) between successive analysis frames;
# at 22050 Hz, 512 samples ~= 23ms
hop_length = 512

In [None]:
# Harmonic-percussive source separation: split `y` into a harmonic waveform
# and a percussive waveform
y_harmonic, y_percussive = librosa.effects.hpss(y)

In [None]:
# Number of samples in the harmonic component
len(y_harmonic)

2643264

In [None]:
# Beat track on the percussive signal.
# hop_length is passed explicitly so the returned beat frame indices live on
# the same frame grid as the features computed with `hop_length`; relying on
# the tracker's default would silently break if `hop_length` were changed.
tempo, beat_frames = librosa.beat.beat_track(y=y_percussive,
                                             sr=sr,
                                             hop_length=hop_length)



In [None]:
# 13 MFCCs per analysis frame, computed on the raw (unseparated) signal
mfcc = librosa.feature.mfcc(
    y=y,
    sr=sr,
    n_mfcc=13,
    hop_length=hop_length,
)

# First-order differences (delta features) of the MFCCs
mfcc_delta = librosa.feature.delta(mfcc)

# Put the MFCC rows on top of the delta rows, then aggregate the frames that
# fall between consecutive beat events. No `aggregate` is given, so the
# default (mean) is used here rather than the median.
beat_mfcc_delta = librosa.util.sync(
    np.concatenate((mfcc, mfcc_delta), axis=0),
    beat_frames,
)

In [None]:
# Rows: the stacked MFCC and delta features; columns: beat-synchronous segments
beat_mfcc_delta.shape




(26, 212)

In [None]:

# Compute chroma features from the harmonic signal.
# hop_length is passed explicitly so the chroma frames share the frame grid
# implied by `hop_length` instead of depending on the function's default.
chromagram = librosa.feature.chroma_cqt(y=y_harmonic,
                                        sr=sr,
                                        hop_length=hop_length)




In [None]:
# Rows: chroma bins; columns: analysis frames
chromagram.shape


(12, 5163)

In [None]:
# Collapse the frame-level chroma to one column per beat interval,
# taking the median of each chroma bin across the frames in that interval
beat_chroma = librosa.util.sync(chromagram, beat_frames, aggregate=np.median)


In [None]:
# Rows: chroma bins; columns: beat-synchronous segments
beat_chroma.shape

(12, 212)

In [None]:

# Finally, join the beat-synchronous chroma and MFCC/delta matrices row-wise
# into one feature matrix (chroma rows first)
beat_features = np.concatenate((beat_chroma, beat_mfcc_delta), axis=0)

In [None]:
# Rows: chroma rows followed by MFCC/delta rows; columns: beat-synchronous segments
beat_features.shape

(38, 212)

In [None]:
# Inspect the combined beat-synchronous feature matrix
beat_features

array([[ 0.30254456,  0.0334522 ,  0.1058374 , ...,  0.15276003,
         0.07555082,  0.0814149 ],
       [ 0.25782844,  0.04074808,  0.08428392, ...,  0.09633808,
         0.12807782,  0.0932549 ],
       [ 0.3997867 ,  0.0575659 ,  0.09429981, ...,  0.44611084,
         0.186589  ,  0.13245766],
       ...,
       [-0.06548901, -0.14644985,  0.05339273, ..., -0.16999006,
         0.62217766,  0.12908453],
       [-0.02321963, -0.13737966, -0.01143883, ..., -0.24983512,
         0.2491345 ,  0.00830472],
       [ 0.02864804, -0.20102443, -0.05542237, ...,  0.07502007,
        -0.61561984,  0.07628296]], dtype=float32)

In [None]:
# Synthesize a one-second sine sweep from C3 to C5, sampled at 22050 Hz
sr = 22050

y_sweep = librosa.chirp(
    sr=sr,
    duration=1,
    fmin=librosa.note_to_hz('C3'),
    fmax=librosa.note_to_hz('C5'),
)

# Show an inline audio player for the sweep
Audio(y_sweep, rate=sr)

In [None]:
# Estimate the fundamental frequency (f0) of `y` with pYIN, searching the
# range C2..C7. Alongside f0 it returns a per-frame voiced/unvoiced flag and
# a per-frame voicing probability.
# Using fill_na=None retains the best-guess f0 at unvoiced frames
f0, voiced_flag, voiced_probs = librosa.pyin(y,
                                             sr=sr,
                                             fmin=librosa.note_to_hz('C2'),
                                             fmax=librosa.note_to_hz('C7'),
                                             fill_na=None)

# To synthesize the f0, we'll need the time of each f0 frame.
# sr is passed explicitly so the timestamps are computed with the same
# sampling rate that was given to pyin rather than times_like's default.
times = librosa.times_like(f0, sr=sr)

In [None]:
# Per-frame sign: +1 where the frame is flagged voiced, -1 where it is not
# ((-1)**0 == 1 for voiced frames, (-1)**1 == -1 for unvoiced ones).
vneg = (-1) ** (~voiced_flag)

# Sonify the signed f0 contour using mir_eval; unvoiced frames are handed
# over as negative frequencies.
y_f0 = mir_eval.sonify.pitch_contour(
    times,
    f0 * vneg,
    sr,
)

# Show an inline audio player for the synthesized contour
Audio(y_f0, rate=sr)

In [None]:
# Onset strength envelope; max_size=5 applies a max filter over 5 frequency
# bins to cut down on false positives
onset_env = librosa.onset.onset_strength(
    y=y,
    sr=sr,
    max_size=5,
)

# Pick onsets from the envelope, reported as times rather than frame indices
onset_times = librosa.onset.onset_detect(
    onset_envelope=onset_env,
    units='time',
    sr=sr,
)

# Render a click at each onset time in a signal exactly as long as `y`
y_clicks = librosa.clicks(
    times=onset_times,
    sr=sr,
    length=len(y),
)

# Play the original signal with the clicks mixed in
Audio(y + y_clicks, rate=sr)