<h1>Homework #2 - Template Based Chord Recognition</h1>

In [13]:
import os
import copy
import re
import numpy as np
from scipy import signal
import librosa
import pandas
import libfmp.b
from matplotlib import pyplot as plt

<h2>Configurations</h2>

In [14]:
sample_rate=44100
window_length=4096
hop_size=2048
norm_input='2'
norm_output='max'
norm_threshold=0.0001
smooth_filter_length=41
smooth_filter_window_type='hanning'
down_sampling_factor=10
feature_normalization=True
feature_compression=True
feature_smoothing=True
feature_downsample=True

<h2>Features processing functions</h2>

In [15]:
def compress_feature_sequence(feature_sequence, gamma=0.1):
    """ TODO

    Args:

    Returns:

    """

    compressed_feature_sequence = np.log(1 + gamma * np.abs(feature_sequence) ** 2)
    return compressed_feature_sequence

def normalize_feature_sequence(feature_sequence, norm='2', v=None):
    """Normalizes the columns of a feature sequence

    Args:
        feature_sequence: Feature sequence
        v: Used instead of normalization below `norm_threshold`. If None, uses unit vector for given norm

    Returns:
        feature_sequence_norm: Normalized feature sequence
    """

    K, N = feature_sequence.shape
    feature_sequence_norm = np.zeros((K, N))

    if v is None:
        v = np.ones(K, dtype=np.float64)

    if norm == '1':
        for n in range(N):
            s = np.sum(np.abs(feature_sequence[:, n]))
            if s > norm_threshold:
                feature_sequence_norm[:, n] = feature_sequence[:, n] / s
            else:
                feature_sequence_norm[:, n] = v / K

    if norm == '2':
        for n in range(N):
            s = np.sqrt(np.sum(feature_sequence[:, n] ** 2))
            if s > norm_threshold:
                feature_sequence_norm[:, n] = feature_sequence[:, n] / s
            else:
                feature_sequence_norm[:, n] = v / np.sqrt(K)

    if norm == 'max':
        for n in range(N):
            s = np.max(np.abs(feature_sequence[:, n]))
            if s > norm_threshold:
                feature_sequence_norm[:, n] = feature_sequence[:, n] / s
            else:
                feature_sequence_norm[:, n] = v

    return feature_sequence_norm

def smooth_feature_sequence(feature_sequence):
    """
    Args:
        X: Feature sequence
        Fs: Frame rate of `X`
        filt_len: Length of smoothing filter
        down_sampling: Downsampling factor
        w_type: Window type of smoothing filter

    Returns:
        X_smooth: Smoothed and downsampled feature sequence
        Fs_feature: Frame rate of `X_smooth`
    """

    filter_kernel = signal.get_window(smooth_filter_window_type, smooth_filter_length)
    # use expand dims to add one dimension to the window, from (L, ) to (1,L)
    expanded_filter_kernel = np.expand_dims(filter_kernel, axis=0)
    # mode='same' guarantees that the final length of the convolution is equal to the length of feature_sequence
    smoothed_feature_sequence = signal.convolve(feature_sequence, expanded_filter_kernel, mode='same') / smooth_filter_length
    return smoothed_feature_sequence

def downsample_feature_sequence(feature_sequence, feature_rate):
    """ TODO

    Args:

    Returns:

    """

    downsampled_feature_sequence = feature_sequence[:, ::down_sampling_factor]
    downsampled_fs = feature_rate / down_sampling_factor
    return downsampled_feature_sequence, downsampled_fs

<h2>Template-based chord recognition steps</h2>

In [16]:
def load_audio(wav_file_path: str):
    """ Load WAV audio file from a system path

    Args:
        wav_file_path (str): System path to a WAV file

    Returns:
        audio_file: (np.ndarray): Audio signal
        audio_duration (float): Duration in seconds of the audio signal
    """

    audio_file, Fs = librosa.load(wav_file_path, sr=sample_rate)
    audio_duration = audio_file.shape[0] / sample_rate

    return audio_file, audio_duration

def chroma_representation(audio_file):
    """ TODO

    Args:

    Returns:
        X (np.ndarray): Chromagram
        Fs_X (scalar): Feature reate of chromagram
        x (np.ndarray): Audio signal
        Fs (scalar): Sampling rate of audio signal
        x_dur (float): Duration (seconds) of audio signal
    """

    # Compute chroma features with STFT
    chroma_features = librosa.stft(audio_file, n_fft=window_length, hop_length=hop_size, pad_mode='constant', center=True)
    chroma_features = np.abs(chroma_features) ** 2

    if feature_compression:
        compress_feature_sequence(chroma_features)

    chroma_features = librosa.feature.chroma_stft(S=chroma_features, sr=sample_rate, tuning=0, norm=None, hop_length=hop_size, n_fft=window_length)
    chroma_feature_rate = sample_rate / hop_size

    return chroma_features, chroma_feature_rate

def generate_triads_templates():
    """Generate chord templates of major and minor triads

    Returns:
        chord_templates (np.ndarray): Matrix containing chord_templates as columns
    """

    template_cmaj = np.array([[1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0]]).T
    template_cmin = np.array([[1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0]]).T
    c_triads_templates = np.concatenate((template_cmaj, template_cmin), axis=1)
    num_chord = 12 * c_triads_templates.shape[1]
    triads_templates = np.ones((12, num_chord))
    for shift in range(12):
        shifted_templates = np.roll(c_triads_templates, shift, axis=0)
        triads_templates[:, shift::12] = shifted_templates

    return triads_templates

def generate_chord_labels():
    """Generate chord labels for major and minor triads

    Returns:
        chord_labels (list): List of chord labels
    """
    chroma_labels = ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B']
    chord_labels_maj = chroma_labels
    chord_labels_min = [s + 'm' for s in chroma_labels]
    chord_labels = chord_labels_maj + chord_labels_min
    return chord_labels

def pre_processing(chroma_features, chroma_feature_rate, triads_templates):
    """ TODO

    Args:

    Returns:

    """

    processed_chroma_feature = chroma_features
    processed_triads_templates = triads_templates

    if feature_normalization:
        processed_chroma_feature = normalize_feature_sequence(processed_chroma_feature, norm=norm_input)
        processed_triads_templates = normalize_feature_sequence(triads_templates, norm=norm_input)

    if feature_smoothing:
        processed_chroma_feature = smooth_feature_sequence(processed_chroma_feature)

    if feature_downsample:
        processed_chroma_feature, downsampled_rate = downsample_feature_sequence(processed_chroma_feature, chroma_feature_rate)
        return processed_chroma_feature, processed_triads_templates, downsampled_rate

    return processed_chroma_feature, processed_triads_templates

def pattern_matching(chroma_features, triads_template):
    """ TODO

    Args:

    Returns:

    """

    chord_similarity = np.matmul(triads_template.T, chroma_features)

    return chord_similarity

def post_processing(chord_similarity):
    """ TODO

    Args:

    Returns:

    """

    processed_chord_similarity = chord_similarity

    if feature_normalization:
        processed_chord_similarity = normalize_feature_sequence(processed_chord_similarity, norm=norm_output)

    return processed_chord_similarity

def recognition_result(chord_similarity, triads_templates):
    """ TODO

    Args:

    Returns:

    """
    chord_labels = generate_chord_labels()
    chord_max = (chord_similarity == chord_similarity.max(axis=0))
    chord_labels_indexes = np.arange(24)
    recognized_chord_labels = []
    for i in range(chord_max.shape[1]):
        recognized_chord_label_index = chord_labels_indexes[chord_max[:, i]][0]
        recognized_chord_labels = recognized_chord_labels + [chord_labels[recognized_chord_label_index]]

    return recognized_chord_labels, chord_max, chord_labels

<h2>Template-based chord recognition implementation</h2>

In [17]:
def compute_template_based_chord_recognition(audio_file_path):
    """ TODO

    Args:

    Returns:

    """

    # Load audio file
    audio_file, audio_duration = load_audio(audio_file_path)

    # Compute chromagram
    chroma_features, chroma_feature_rate = chroma_representation(audio_file)

    # Generate triads template
    triads_template = generate_triads_templates()

    # Pre-processing
    if feature_downsample:
        chroma_features, triads_template, chroma_feature_rate = pre_processing(chroma_features, chroma_feature_rate, triads_template)
    else:
        chroma_features, triads_template = pre_processing(chroma_features, chroma_feature_rate, triads_template)

    # Pattern matching
    chord_similarity = pattern_matching(chroma_features, triads_template)

    # Post-processing
    chord_similarity = post_processing(chord_similarity)

    # Chord recognition
    recognized_chord_labels, chord_max, chord_labels = recognition_result(chord_similarity, triads_template)

    return recognized_chord_labels, chord_max, chord_labels, chord_similarity, chroma_features, chroma_feature_rate, triads_template, audio_file, audio_duration

<h2>Perform template-based chord recognition and plot results</h2>

In [None]:
# Perform chords recognition
audio_file_path = os.path.join('data', 'wav', 'Beatles_LetItBe.wav')
recognized_chord_labels, chord_max, chord_labels, chord_similarity, chroma_features, chroma_feature_rate, triads_template, audio_file, audio_duration = compute_template_based_chord_recognition(audio_file_path)

# Annotations
time_frames_number = len(recognized_chord_labels)
time_axis = np.arange(time_frames_number) / chroma_feature_rate
chord_annotation_seconds = []
for i in range(1, time_frames_number):
    start = time_axis[i - 1]
    end = time_axis[i]
    chord_annotation_seconds = chord_annotation_seconds + [(start, end, recognized_chord_labels[i - 1])]

color_ann = {'C': [1, 0.5, 0, 1], 'G': [0, 1, 0, 1], 'Am': [1, 0, 0, 1], 'F': [0, 0, 1, 1], 'Em': [1, 0.5, 1, 0.5]}
# Figure configuration
cmap = libfmp.b.compressed_gray_cmap(alpha=1, reverse=False)
fig, ax = plt.subplots(4, 2, figsize=(20, 24), gridspec_kw={'width_ratios': [1, 0.03], 'height_ratios': [1.5, 3, 3, 3]})

# Plot audio signal
libfmp.b.plot_signal(audio_file, sample_rate, ax=ax[0,0], title='Audio signal')
libfmp.b.plot_segments_overlay(chord_annotation_seconds, ax=ax[0,0], time_max=audio_duration, print_labels=False, colors=color_ann, alpha=0.1)
ax[0,1].axis('off')

# PLot chromagram
title = 'STFT-based chromagram (feature rate = %0.1f Hz)' % chroma_feature_rate
libfmp.b.plot_chromagram(chroma_features, ax=[ax[1,0], ax[1,1]], Fs=chroma_feature_rate, clim=[0, 1], xlabel='', title=title)
libfmp.b.plot_segments_overlay(chord_annotation_seconds, ax=ax[1,0], time_max=audio_duration,  print_labels=False, colors=color_ann, alpha=0.1)

# Plot similarity matrix
title = 'Time–chord representation of chord similarity matrix'
libfmp.b.plot_matrix(chord_similarity, ax=[ax[2, 0], ax[2, 1]], Fs=chroma_feature_rate, title=title, ylabel='Chord', xlabel='')
ax[2, 0].set_yticks(np.arange(len(chord_labels)))
ax[2, 0].set_yticklabels(chord_labels)
libfmp.b.plot_segments_overlay(chord_annotation_seconds, ax=ax[2, 0], time_max=audio_duration, print_labels=False, colors=color_ann, alpha=0.1)

# Plot chord recognition results
title = 'Time–chord representation of chord recognition result'
libfmp.b.plot_matrix(chord_max, ax=[ax[3, 0], ax[3, 1]], Fs=chroma_feature_rate, title=title, ylabel='Chord', xlabel='')
ax[3, 0].set_yticks(np.arange(len(chord_labels)))
ax[3, 0].set_yticklabels(chord_labels)
ax[3, 0].grid()
libfmp.b.plot_segments_overlay(chord_annotation_seconds, ax=ax[3, 0], time_max=audio_duration, print_labels=False, time_label='Time (seconds)', colors=color_ann, alpha=0.1)

plt.tight_layout()

<h2>Ground truth processing functions</h2>

In [19]:
def read_csv(csv_file_path):

    ground_truth_csv = pandas.read_csv(csv_file_path, sep=',', keep_default_na=False, header=0)

    segment_annotation_indices = []
    for i, (start, end, label) in ground_truth_csv.iterrows():
        start_index = int(np.round(start * chroma_feature_rate))
        end_index = int(np.round(end * chroma_feature_rate))
        segment_annotation_indices = segment_annotation_indices + [(start_index, end_index, label)]

    return segment_annotation_indices

def convert_segment_annotations(segment_annotation_indices):

    frame_labels_sequence = []
    for segment in segment_annotation_indices:
        segment_indices_count = segment[1] - segment[0]
        for k in range(segment_indices_count):
            frame_labels_sequence.append(segment[2])

    # Pad frame label sequence to match number of frames if needed
    frame_labels_sequence_padding = chord_max.shape[1] - len(frame_labels_sequence)
    for i in range(frame_labels_sequence_padding):
        frame_labels_sequence.append(frame_labels_sequence[-1])

    return frame_labels_sequence

def get_binary_time_chord_matrix(labels_sequence):

    time_chord_matrix = np.zeros((len(chord_labels), len(labels_sequence)))
    for i in range(time_chord_matrix.shape[1]):
        chord_label = labels_sequence[i]
        if chord_label in chord_labels:
            label_index = chord_labels.index(chord_label)
            time_chord_matrix[label_index, i] = 1

    return time_chord_matrix

def normalize_chord_labels(chord_labels):
    """Replace for segment-based annotation in each chord label the string ':min' by 'm'
    and convert flat chords into sharp chords using enharmonic equivalence

    Args:
        ann (list): Segment-based annotation with chord labels

    Returns:
        ann_conv (list): Converted segment-based annotation with chord labels
    """

    normalized_chord_labels = copy.deepcopy(chord_labels)

    for i in range(len(chord_labels)):
        normalized_chord_labels[i] = normalized_chord_labels[i].replace('Db', 'C#')
        normalized_chord_labels[i] = normalized_chord_labels[i].replace('Eb', 'D#')
        normalized_chord_labels[i] = normalized_chord_labels[i].replace('Fb', 'E')
        normalized_chord_labels[i] = normalized_chord_labels[i].replace('Gb', 'F#')
        normalized_chord_labels[i] = normalized_chord_labels[i].replace('Ab', 'G#')
        normalized_chord_labels[i] = normalized_chord_labels[i].replace('Bb', 'A#')
        normalized_chord_labels[i] = normalized_chord_labels[i].replace('Cb', 'B')
        normalized_chord_labels[i] = re.sub(r':(min|hdim|dim)\d{0,2}\(?[b#]?\d{0,2}\)?', 'm', normalized_chord_labels[i])
        normalized_chord_labels[i] = re.sub(r':(maj|sus)?\d{0,2}\(?[b#]?\d{0,2}\)?', '', normalized_chord_labels[i])
        normalized_chord_labels[i] = re.sub(r':?\/[b#]?\d{0,2}', '', normalized_chord_labels[i])

    return normalized_chord_labels

<h2>Ground truth reading implementation</h2>

In [20]:
def read_ground_truth(csv_file_path):
    """Convert segment-based chord annotation into various formats

    Args:
        csv_file_path (str): Filename of segment-based chord annotation

    Returns:
        ann_matrix (np.ndarray): Encoding of label sequence in form of a binary time-chord representation
        ann_frame (list): Label sequence (specified on the frame level)
    """

    segment_annotation_indices = read_csv(csv_file_path)

    ground_truth_chord_labels = convert_segment_annotations(segment_annotation_indices)

    normalized_ground_truth_chord_labels = normalize_chord_labels(ground_truth_chord_labels)

    ground_truth_matrix = get_binary_time_chord_matrix(normalized_ground_truth_chord_labels)

    return normalized_ground_truth_chord_labels, ground_truth_matrix

<h2>Perform ground truth reading</h2>

In [None]:
# Perform ground truth reading
csv_file_path = os.path.join('data', 'csv', 'Beatles_LetItBe.csv')
ground_truth_chord_labels, ground_truth_matrix = read_ground_truth(csv_file_path)

# Annotations
time_frames_number = len(ground_truth_chord_labels)
time_axis = np.arange(time_frames_number) / chroma_feature_rate
chord_annotation_seconds = []
for i in range(1, time_frames_number):
    start = time_axis[i - 1]
    end = time_axis[i]
    chord_annotation_seconds = chord_annotation_seconds + [(start, end, ground_truth_chord_labels[i - 1])]

# Plot
cmap = libfmp.b.compressed_gray_cmap(alpha=1, reverse=False)
fig, ax = plt.subplots(1, 2, figsize=(20, 8), gridspec_kw={'width_ratios': [1, 0.03], 'height_ratios': [2]})

# Plot chord recognition results
title='Time–chord representation of reference annotations'
libfmp.b.plot_matrix(ground_truth_matrix, ax=[ax[0], ax[1]], Fs=chroma_feature_rate, title=title, ylabel='Chord', xlabel='')
ax[0].set_yticks(np.arange(len(chord_labels)))
ax[0].set_yticklabels(chord_labels)
libfmp.b.plot_segments_overlay(chord_annotation_seconds, ax=ax[0], time_max=audio_duration, print_labels=False, time_label='Time (seconds)', colors=color_ann, alpha=0.1)
ax[0].grid()

plt.tight_layout()

<h2>Metric evaluation implementation</h2>

In [22]:
def compute_eval_measures(chord_max, ground_truth_matrix):
    """Compute evaluation measures including precision, recall, and F-measure

    Notebook: C5/C5S2_ChordRec_Eval.ipynb

    Args:
        I_ref (np.ndarray): Reference set of items
        I_est (np.ndarray): Set of estimated items

    Returns:
        P (float): Precision
        R (float): Recall
        F (float): F-measure
        num_TP (int): Number of true positives
        num_FN (int): Number of false negatives
        num_FP (int): Number of false positives
    """
    assert ground_truth_matrix.shape == chord_max.shape, "Dimension of input matrices must agree"
    true_positive = np.sum(np.logical_and(ground_truth_matrix, chord_max))
    false_positive = np.sum(chord_max > 0, axis=None) - true_positive
    precision = 0
    if true_positive > 0:
        precision = true_positive / (true_positive + false_positive)
    return precision

<h2>Perform metric evaluation</h2>

In [23]:
recognition_precision = compute_eval_measures(chord_max, ground_truth_matrix)
print('Let It Be - Recognition precision: %s' % round(recognition_precision * 100, 2) + '%')

Let It Be - Recognition precision: 59.16%


<h2>Compute evaluation for other files</h2>

In [None]:
# Here Comes The Sun
audio_file_path = os.path.join('data', 'wav', 'Beatles_HereComesTheSun.wav')
recognized_chord_labels, chord_max, chord_labels, chord_similarity, chroma_features, chroma_feature_rate, triads_template, audio_file, audio_duration = compute_template_based_chord_recognition(audio_file_path)

csv_file_path = os.path.join('data', 'csv', 'Beatles_HereComesTheSun.csv')
ground_truth_chord_labels, ground_truth_matrix = read_ground_truth(csv_file_path)

recognition_precision = compute_eval_measures(chord_max, ground_truth_matrix)
print('Here Comes The Sun - Recognition precision: %s' % round(recognition_precision * 100, 2) + '%')

# ObLaDi ObLaDa
audio_file_path = os.path.join('data', 'wav', 'Beatles_ObLaDiObLaDa.wav')
recognized_chord_labels, chord_max, chord_labels, chord_similarity, chroma_features, chroma_feature_rate, triads_template, audio_file, audio_duration = compute_template_based_chord_recognition(audio_file_path)

csv_file_path = os.path.join('data', 'csv', 'Beatles_ObLaDiObLaDa.csv')
ground_truth_chord_labels, ground_truth_matrix = read_ground_truth(csv_file_path)

recognition_precision = compute_eval_measures(chord_max, ground_truth_matrix)
print('ObLaDi ObLaDa - Recognition precision: %s' % round(recognition_precision * 100, 2) + '%')

# Penny Lane
audio_file_path = os.path.join('data', 'wav', 'Beatles_PennyLane.wav')
recognized_chord_labels, chord_max, chord_labels, chord_similarity, chroma_features, chroma_feature_rate, triads_template, audio_file, audio_duration = compute_template_based_chord_recognition(audio_file_path)

csv_file_path = os.path.join('data', 'csv', 'Beatles_PennyLane.csv')
ground_truth_chord_labels, ground_truth_matrix = read_ground_truth(csv_file_path)

recognition_precision = compute_eval_measures(chord_max, ground_truth_matrix)
print('Penny Lane - Recognition precision: %s' % round(recognition_precision * 100, 2) + '%')