In [1]:
import essentia
import essentia.standard as ess
import os
import json
import numpy as np
import madmom.features.chords as mdm
from IPython.display import clear_output
import mir_eval
import re
import time
import warnings
warnings.filterwarnings('ignore')

from platform import python_version
py_version = float(python_version()[:3])

# Automatic Chord Estimation (ACE) Task Overview

Description from MIREX(https://www.music-ir.org/mirex/wiki/2018:Audio_Chord_Estimation):

>This task requires participants to extract or transcribe a sequence of chords from an audio music recording. For many applications in music information retrieval, extracting the harmonic structure of an audio track is very desirable, for example for segmenting pieces into characteristic segments, for finding similar pieces, or for semantic analysis of music. The extraction of the harmonic structure requires the estimation of a sequence of chords that is as precise as possible. This includes the full characterisation of chords – root, quality, and bass note – as well as their chronological order, including specific onset times and durations.


## Dataset: Jazz Audio-Aligned Harmony (JAAH) Dataset

Documentation: https://mtg.github.io/JAAH/

## Feature Extraction

In [2]:
def computeTuning(filename, frameSize=4096, hopSize=2048):
    audio=ess.MonoLoader(filename=filename, sampleRate=44100)()
    
    frameGenerator = ess.FrameGenerator(audio, 
                                        frameSize=frameSize, 
                                        hopSize=hopSize)
    window = ess.Windowing(type='blackmanharris62')
    spectrum = ess.Spectrum()
    spectralPeaks = ess.SpectralPeaks(magnitudeThreshold=1e-05,
                                        maxFrequency=5000,
                                        minFrequency=40,
                                        maxPeaks=1000,
                                        orderBy="frequency",
                                        sampleRate=44100)
    tuning = ess.TuningFrequency()

    pool = essentia.Pool()
    
    for frame in ess.FrameGenerator(audio, frameSize=frameSize, hopSize=hopSize, startFromZero=True):
        spectrum_mag = spectrum(window(frame))
        frequencies, magnitudes = spectralPeaks(spectrum_mag)
        tuneFrame, _ = tuning(frequencies, magnitudes)
        pool.add('tuning', tuneFrame)
    
    return np.average(pool['tuning'])

def computeHPCP(filename, frameSize=4096, hopSize=2048, tuningFrequency=440.0):
    audio = ess.MonoLoader(filename=filename, sampleRate=44100)()
    
    frameGenerator = ess.FrameGenerator(audio, 
                                        frameSize=frameSize,
                                        hopSize=hopSize,
                                        startFromZero=True)
    window = ess.Windowing(type='blackmanharris62')
    spectrum = ess.Spectrum()
    spectralPeaks = ess.SpectralPeaks(magnitudeThreshold=1e-05,
                                      maxFrequency=5000,
                                      minFrequency=40,
                                      maxPeaks=1000,
                                      orderBy="frequency",
                                      sampleRate=44100)
    spectralWhitening = ess.SpectralWhitening(maxFrequency= 5000,
                                              sampleRate=44100)
    
    hpcp = ess.HPCP(sampleRate=44100,
                    maxFrequency=5000,
                    minFrequency=40,
                    referenceFrequency=tuningFrequency,
                    nonLinear=False,
                    harmonics=8,
                    size=12)
    
    key = ess.Key(profileType="tonictriad", usePolyphony=False)
    
    pool = essentia.Pool()
    
    for frame in frameGenerator:
        spectrum_mag = spectrum(window(frame))
        frequencies, magnitudes = spectralPeaks(spectrum_mag)
        w_magnitudes = spectralWhitening(spectrum_mag, frequencies, magnitudes)
        hpcp_vector = hpcp(frequencies, w_magnitudes)
        pool.add('hpcp',hpcp_vector)
        
    return pool['hpcp']

def computeBeats(filename):
    audio = ess.MonoLoader(filename=filename, sampleRate=44100)()
    
    bt = ess.BeatTrackerMultiFeature()
    
    beats, confidence = bt(audio)
    beats = essentia.array([round(beat,2) for beat in beats])
    
    duration = len(audio) / 44100.0
    frameOnsets = np.arange(0, duration-(2048/44100), float(2048/44100.0))
    frameOnsets = [round(onset,2) for onset in frameOnsets] #2 decimals
    
    return beats, duration, frameOnsets


### Utility functions for creating '.lab' files

In [3]:
class ChordSegment:
    startTime = 0.0
    endTime = 0.0
    chord = ''
    def __init__(self, startTime, endTime, chord):
        self.startTime = startTime
        self.endTime = endTime
        self.chord = chord
    def __repr__(self):
        return '{:.2f}\t{:.2f}\t{}'.format(self.startTime, self.endTime, self.chord)

def createLabFileEssentia(duration, onsets, chords, strengths, save=False, outputName=''):
    if (len(onsets) == len(chords)):
        onsets.append(duration)
    
    allSegments = []
    
    if (0.0 < onsets[0]):
        allSegments.append(ChordSegment(0.0, onsets[0], 'N'))
    
    for i in range(len(chords)):
        chord = chords[i] if strengths[i] > 0 else 'N'
        allSegments.append(ChordSegment(onsets[i], onsets[i+1], chord))
        
    if (allSegments[-1].endTime < duration):
        allSegments.append(ChordSegment(allSegments[-1].endTime, duration, 'N'))
    
    mergedSegments = []
    currentSegment = allSegments[0]
    for segment in allSegments[1:]:
        if (segment.chord == currentSegment.chord):
            currentSegment.endTime = segment.endTime
        else:
            mergedSegments.append(currentSegment)
            currentSegment = segment
    mergedSegments.append(currentSegment)
    
    for segment in mergedSegments:
        segment.chord = re.sub('m$', ':min', segment.chord)
        
    if save:
        with open(outputName, 'w') as f:
            for s in mergedSegments:
                f.write('{}\n'.format(s))
        
    return mergedSegments

def createLabFileMadmom(chords, save=False, outputName=''):
    labSegments = []
    for segment in chords:
        labSegments.append('{:.2f}\t{:.2f}\t{}'.format(segment[0], segment[1], segment[2]))
        
    if save:
        with open(outputName, 'w') as f:
            for s in labSegments:
                f.write('{}\n'.format(s))
    
    return labSegments

if py_version == 3.5:
    def createLabFileCREMA(intervals, chords, save=False, outputName=''):
        labSegments = []

        for i in range(len(chords)):
            labSegments.append('{:.2f}\t{:.2f}\t{}'.format(intervals[i][0], intervals[i][1], chords[i]))

        if save:
            with open(outputName, 'w') as f:
                for s in labSegments:
                    f.write('{}\n'.format(s))

        return labSegments    

## ACE algorithms
### 1 - Essentia Chord Estimation by Frames

This function uses pre-defined chord templates to match the corresponding PCP to a chord estimate.

The chord vocabulary of this function consists of major and minor chords. 

You can easily give a PCP tensor to the algorithm to get the corresponding chords and confidence values with feeding them to 'ChordsDetection' function.

In [4]:
def essentiaChordsByFrames(hpcp):    
    chord_extractor = ess.ChordsDetection()
    chords, strengths = chord_extractor(hpcp)
    
    return chords, strengths

The above function does not provide many parameters to modify. What 'ChordsDetection' function does can be seen below.

In [5]:
def estimateChordsByFrames(hpcp, onsets, hopSize=2048, profileType='tonictriad', usePolyphony=False):
    a, b = hpcp.shape
    
    key = ess.Key(profileType=profileType, usePolyphony=usePolyphony)
    
    estimatedChords = []
    estimatedStrengths = []
    
    numFramesWindow = int(2 * 44100 / hopSize) - 1
    
    for i in range(a):
        begin = max(0, i - (numFramesWindow/2) - 1)
        end = min(i + (numFramesWindow/2) - 1, a)
        
        meanFrame = np.mean(hpcp[begin:end], axis=0)
        maxVal = np.max(meanFrame)
        meanFrame /= maxVal
        
        estimatedKey, estimatedScale, estimatedStrength, _ = key(meanFrame)
        if estimatedScale == 'minor':
            estimatedChords.append('{}m'.format(estimatedKey))
        else:
            estimatedChords.append('{}'.format(estimatedKey))
        estimatedStrengths.append(estimatedStrength)
            
    return estimatedChords, estimatedStrengths

In the above function, we see that the algorithms calls another algorithm called 'Key', and processes the results accordingly. The documentation for 'Key' algorithm can be found in https://essentia.upf.edu/documentation/reference/std_Key.html. 

The default key profile is 'TonicTriad' but feel free to experiment with other profile types shown in the documentation.

### 2 - Essentia Chord Estimation by Beats

This algorithm aggregatates the PCP tensors between beats to obtain one PCP vector per beat.

The chord vocabulary of this function consists of major and minor chords.

Like the previous algorithm, you can simply feed PCP tensors to the function but this time another vector that gives the beat onsets should be provided as well.

In [6]:
def essentiaChordsByBeats(hpcp, beats):    
    chord_extractor = ess.ChordsDetectionBeats()    
    chords, strengths = chord_extractor(hpcp, beats)
    return chords, strengths

As in the previous case, 'ChordsDetectionBeats' function aggregates the PCP vectors and calls 'Key' algorithm to obtain a final chord estimate. Decomposed version of 'ChordsDetectionBeats' can be found below.

In [7]:
def computeChordsByBeats(hpcp, beats, hopSize=2048, profileType='tonictriad', usePolyphony=False):
    a, b = hpcp.shape
    
    key = ess.Key(profileType=profileType, usePolyphony=usePolyphony)
    
    estimatedChords = []
    estimatedStrengths = []
    
    for i in range(len(beats)-1):
        diffBeats = beats[i+1] - beats[i]
        
        numFramesWindow = int(diffBeats * 44100 / hopSize)
        
        begin = int(beats[i] * 44100 / hopSize)
        end = begin + numFramesWindow - 1
        if(begin>=end):
            end = begin+1
            
        medianFrame = np.median(hpcp[begin:end], axis=0)
        maxVal = np.max(medianFrame)
        medianFrame /= maxVal
        
        estimatedKey, estimatedScale, estimatedStrength, _ = key(medianFrame)
        if estimatedScale == 'minor':
            estimatedChords.append('{}m'.format(estimatedKey))
        else:
            estimatedChords.append('{}'.format(estimatedKey))
        estimatedStrengths.append(estimatedStrength)
    
    return estimatedChords, estimatedStrengths

For this function, the default key profile is again 'Tonic Triad', and the other key profiles can be found in the documentation of 'Key' algorithm: https://essentia.upf.edu/documentation/reference/std_Key.html

### 3 - Madmom Deep Chroma Chord Processor

This algorithm is described in the following paper: 

>Filip Korzeniowski and Gerhard Widmer, “Feature Learning for Chord Recognition: The Deep Chroma Extractor”, Proceedings of the 17th International Society for Music Information Retrieval Conference (ISMIR), 2016.

Korzeniowski and Widmer approaches the problem of Automatic Chord Estimation with emphasizing the importance of using a proper PCP for the chord estimation. They use a neural network that extracts 'Deep Chroma' features from a spectrogram with the following architecture:
<img src="figures/deepchroma.png" width="400">

After extracting the PCP tensors, in the original paper, they use a logistic regression classifier to map PCP vectors to chord labels, but in Madmom library, they use a post-processing step that uses Conditional Random Fields (CRF) to obtain the final chord label estimates.

The chord vocabulary of this method consists of major and minor chords.

In [8]:
from madmom.audio.chroma import DeepChromaProcessor
from madmom.features.chords import DeepChromaChordRecognitionProcessor

def madmomDeepChroma(filename):
    dcp = DeepChromaProcessor()
    decode = DeepChromaChordRecognitionProcessor()
    
    pcp = dcp(filename)
    estimatedChords = decode(pcp)
    
    return estimatedChords

### 4 - Madmom CRF Chord Processor

This algorithm is described in the following paper:

>Filip Korzeniowski and Gerhard Widmer, “A Fully Convolutional Deep Auditory Model for Musical Chord Recognition”, Proceedings of IEEE International Workshop on Machine Learning for Signal Processing (MLSP), 2016.

This algorithm uses a Convolutional Neural Network to obtain a latent representation to be used for chord estimation. The size of this latent representation is Tx128, T being the number of frames. The network architecture used in this feature extraction stage can be seen below:
<img src="figures/crf.png" width="400">

This latent representation is obtained before the last three layers. 

The obtained representation is then decoded with using a post-processing algorithm that uses Conditional Random Fields (CRF) to obtain final chord label estimations.

The chord vocabulary of this method consists of major and minor chords.

In [9]:
from madmom.features.chords import CNNChordFeatureProcessor
from madmom.features.chords import CRFChordRecognitionProcessor

def madmomCRF(filename):
    featproc = CNNChordFeatureProcessor()
    decode = CRFChordRecognitionProcessor()
    
    feats = featproc(filename)
    estimatedChords = decode(feats)
    
    return estimatedChords

### 5 - Convolutional and Recurrent Estimators for Music Analysis (CREMA) (Only for Python 3.5)

This algorithm is described in the following paper:
>Brian McFee, Juan Pablo Bello “Structured training for large-vocabulary chord recognition”, Proceedings of the 18th International Society for Music Information Retrieval Conference (ISMIR), 2017.

This method obtains a latent representation with a neural network that consists of convolutional and bi-directional gated recurrent unit (GRU) as can be seen in Figure 1. After this encoding, it uses the CR2+S architecture shown in Figure 3 to get Root, Pitch Classes and Bass representations to be used in chord label estimation. 
<img src="figures/encoder-crema.png" width="400"><img src="figures/network-crema.png" width="400">

An example of the Root, Pitch Classes and Bass representation can be seen in Figure 2.
<img src="figures/repr-crema.png" width="400">

The chord vocabulary of this method consists of maj, min, dim, aug, min6, maj6, min7, minmaj7, maj7, 7, dim7, hdim7, sus2 and sus4 chords. 


In [10]:
if py_version == 3.5:
    from crema.analyze import analyze

    def CREMA(filename):
        jam = analyze(filename=filename)
        intervals, chords = jam.annotations[0].to_interval_values()

        return intervals, chords

Using TensorFlow backend.


## Evaluation of the results

After feature extraction and automatic chord estimation, the final step is to evaluate the obtained results. For this, we can use 'mir_eval' library, and the documentation can be found in https://craffel.github.io/mir_eval/. 

In [11]:
def evaluateResults(queryFile, referenceFile):
    '''
    expects .lab files
    performs mir_eval on triads
    return an object called result '''
    
    refIntervals, refLabels = mir_eval.io.load_labeled_intervals(referenceFile)
    estIntervals, estLabels = mir_eval.io.load_labeled_intervals(queryFile)

    estIntervals, estLabels = mir_eval.util.adjust_intervals(estIntervals,
                                                             estLabels,
                                                             refIntervals.min(),
                                                             refIntervals.max(),
                                                             mir_eval.chord.NO_CHORD,
                                                             mir_eval.chord.NO_CHORD)

    intervals, refLabels, estLabels = mir_eval.util.merge_labeled_intervals(refIntervals,
                                                                            refLabels,
                                                                            estIntervals,
                                                                            estLabels)

    durations = mir_eval.util.intervals_to_durations(intervals)
    comparisons = mir_eval.chord.triads(refLabels, estLabels)
    score = mir_eval.chord.weighted_accuracy(comparisons, durations)

    return round(score, 2)

## Pipeline for benchmarking

In [12]:
def pipelineACE(filename, algos, labDir, labDirRef):
    songName = os.path.basename(filename).split('.')[0]
    
    # Feature extraction
    
    tuning = computeTuning(filename)
    pcp = computeHPCP(filename)
    beats, duration, onsets = computeBeats(filename)

    # Automatic chord estimation
    
    if 'frame' in algos:
        # 1 - Essentia chord estimation by frames
        chordsFrame, strengthsFrame = essentiaChordsByFrames(pcp)
        # Save lab files
        labFrameEssentia = createLabFileEssentia(duration, 
                                                 onsets, 
                                                 chordsFrame, 
                                                 strengthsFrame,
                                                 save=True,
                                                 outputName='{}{}_frame.lab'.format(labDir, songName))
    if 'beats' in algos:
        # 2 - Essentia chord estimation by beats
        chordsBeats, strengthsBeats = essentiaChordsByBeats(pcp, beats)
        # Save lab files
        labBeatsEssentia = createLabFileEssentia(duration,
                                                 beats,
                                                 chordsBeats,
                                                 strengthsBeats,
                                                 save=True,
                                                 outputName='{}{}_beats.lab'.format(labDir, songName))
    
    if 'deepchroma' in algos:
        # 3 - Madmom Deep Chroma chord processor
        chordsDeepChroma = madmomDeepChroma(filename)
        # Save lab files
        labDeepChromaMadmom = createLabFileMadmom(chordsDeepChroma, 
                                                  save=True,
                                                  outputName='{}{}_deepchroma.lab'.format(labDir, songName))

    if 'crf' in algos:
        # 4 - Madmom CRF chord processor
        chordsCRF = madmomCRF(filename)
        # Save lab files
        labCRFMadmom = createLabFileMadmom(chordsCRF, 
                                           save=True,
                                           outputName='{}{}_crf.lab'.format(labDir, songName))
    
    if 'crema' in algos:
        # 5 - CREMA (Only for Python 3.5)
        intervalsCREMA, chordsCREMA = CREMA(filename)
        # Save lab files
        labCREMA = createLabFileCREMA(intervalsCREMA, 
                                      chordsCREMA,
                                      save=True,
                                      outputName='{}{}_crema.lab'.format(labDir, songName))
        
    # Evaluation

    scores = {}

    for algo in algos:
        scores[algo] = evaluateResults('{}{}_{}.lab'.format(labDir, songName, algo), 
                                       '{}{}_ref.lab'.format(labDirRef, songName))

    return scores

In [13]:
def getAverageScores(scoreDict, algos):
    # Initializing lists for averaging
    scores = {}
    for algo in algos:
        scores[algo] = []
        
    for song in scoreDict.keys():
        for algo in algos:
            scores[algo].append(scoreDict[song][algo])
    
    averageResults = {}
    for key in scores.keys():
        averageResults[key] = round(np.mean(np.array(scores[key])), 2)
        
    return averageResults

## Processing all the files in the dataset

In [14]:
rootDir = 'songs_debug/'
labDir = 'labFiles/'
labDirRef = 'labFiles_ref/'

scoreDict = {}

algos = ['frame', 'beats', 'deepchroma', 'crf']
if py_version == 3.5:
    algos.append('crema')

for paths, subdirs, files in os.walk(rootDir):
    for file in files:
        songName = file.split('.')[0]

        startTime = time.time()
        
        print('Processing file: {}'.format(file))
        
        scoreDict[songName] = pipelineACE('{}{}'.format(rootDir, file), algos, labDir, labDirRef)
        
        print('Total processing time for {} is {:.2f} seconds'.format(file, time.time()-startTime))
        
averageScores = getAverageScores(scoreDict, algos)

clear_output()
for algo in averageScores.keys():
    print('Score for {} method is {}'.format(algo, averageScores[algo]))

Score for frame method is 0.4
Score for crema method is 0.56
Score for crf method is 0.55
Score for deepchroma method is 0.5
Score for beats method is 0.37
