Install required dependencies

In [None]:
!pip install libfmp

import librosa
import libfmp.b
import libfmp.c3
import libfmp.c4
import numpy as np
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

Collecting libfmp
  Downloading libfmp-1.2.5-py3-none-any.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.0/110.0 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Collecting music21<6.0.0,>=5.7.0 (from libfmp)
  Downloading music21-5.7.2.tar.gz (18.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.5/18.5 MB[0m [31m35.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pretty-midi<1.0.0,>=0.2.0 (from libfmp)
  Downloading pretty_midi-0.2.10.tar.gz (5.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m71.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting jedi>=0.16 (from ipython<8.0.0,>=7.10.0->libfmp)
  Downloading jedi-0.19.1-py2.py3-none-any.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m68.8 MB/s[0m eta [36m0:00:00[0m
Collecting mido>=1.1.1

File upload

Input preprocessing using Librosa
1. Load the audio file
2. Estimate the tuning of an audio time series or spectrogram input
3. Pitch-shift the audio by each requested number of steps (one `n_steps` unit equals one semitone when `bins_per_octave = 12`)
4. Compute the constant-Q transform of an audio signal

In [None]:
def preprocess_librosa(audiopath, n_bins=84, bins_per_octave=12, mod_steps=(0,)):
    """Load an audio file and compute one magnitude CQT per pitch-shift step.

    The file is loaded as mono audio resampled to 11025 Hz. The tuning is
    estimated once from the unshifted signal and reused for every transform.

    Args:
        audiopath: Path of the audio file, handed to ``librosa.load``.
        n_bins: Number of frequency bins of the constant-Q transform.
        bins_per_octave: Number of constant-Q bins per octave.
        mod_steps: Iterable of pitch-shift amounts (the ``n_steps`` argument of
            ``librosa.effects.pitch_shift``); one spectrogram is produced per
            entry, in the same order.

    Returns:
        A list with one array per entry of ``mod_steps``: the transposed
        absolute value of the constant-Q transform of the shifted signal.
    """
    signal, rate = librosa.load(audiopath, sr=11025, mono=True)
    estimated_tuning = librosa.estimate_tuning(y=signal, sr=rate)

    def _magnitude_cqt(n_steps):
        # Shift first, then analyse the shifted signal with the shared tuning.
        shifted = librosa.effects.pitch_shift(signal, sr=rate, n_steps=n_steps)
        spectrum = librosa.core.cqt(
            shifted,
            sr=rate,
            n_bins=n_bins,
            bins_per_octave=bins_per_octave,
            tuning=estimated_tuning,
            window='hamming',
            norm=2,
        )
        return np.abs(spectrum).T

    return [_magnitude_cqt(step) for step in mod_steps]

Our model is a bidirectional LSTM, trained with the following settings.

Training dataset:

Number of epochs: 150

Learning rate: 0.01

Batch size: 64

Scheduler step size: 100

Scheduler's gamma: 0.1

Hidden dimension: 200

Number of layers: 2

Optimizer: SGD

SGD momentum: 0.8

Source of model: https://github.com/krist311/chords-recognition/blob/master/README.md



In [None]:
class LSTMClassifier(nn.Module):
    """Per-frame sequence classifier: dropout -> LSTM -> batch norm -> dropout -> linear.

    Args:
        input_size: Number of features per time step.
        hidden_dim: Hidden size of the LSTM (per direction).
        output_size: Number of output classes of the final linear layer.
        num_layers: Number of stacked LSTM layers.
        use_gpu: Stored on the instance; not read by any method of this class.
        bidirectional: Whether the LSTM runs in both directions.
        dropout: Three probabilities: input dropout, dropout between LSTM
            layers, and dropout applied before the output layer.
    """

    def __init__(self, input_size, hidden_dim, output_size, num_layers, use_gpu, bidirectional,
                 dropout=(0.4, 0.0, 0.0)):
        super(LSTMClassifier, self).__init__()
        self.use_gpu = use_gpu
        self.input_size = input_size
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.num_directions = 2 if bidirectional else 1
        self.dropout1 = nn.Dropout(p=dropout[0])
        self.lstm = nn.LSTM(input_size, hidden_dim, num_layers=self.num_layers, batch_first=True,
                            bidirectional=bidirectional, dropout=dropout[1])
        self.bn1 = nn.BatchNorm1d(hidden_dim * self.num_directions)
        self.dropout2 = nn.Dropout(p=dropout[2])
        self.hidden2out = nn.Linear(hidden_dim * self.num_directions, output_size)

    def disable_dropout(self):
        """Set all three dropout probabilities of the model to zero."""
        self.lstm.dropout = .0
        self.dropout1.p = .0
        self.dropout2.p = .0

    def init_hidden(self, batch_size, device=None, dtype=None):
        """Return zero-filled initial ``(h_0, c_0)`` states for the LSTM.

        Args:
            batch_size: Number of sequences in the batch.
            device: Device for the states; defaults to the device of the
                model's parameters.
            dtype: Dtype for the states; defaults to the dtype of the model's
                parameters.

        Returns:
            A tuple of two tensors, each shaped
            ``(num_layers * num_directions, batch_size, hidden_dim)``.
        """
        if device is None or dtype is None:
            # Follow the model itself rather than CUDA availability: a CPU model
            # on a CUDA-capable machine must still receive CPU states.
            reference = next(self.parameters())
            device = reference.device if device is None else device
            dtype = reference.dtype if dtype is None else dtype
        shape = (self.num_layers * self.num_directions, batch_size, self.hidden_dim)
        return (torch.zeros(shape, device=device, dtype=dtype),
                torch.zeros(shape, device=device, dtype=dtype))

    def forward(self, batch, lengths=None):
        """Compute class scores for every time step of every sequence.

        Args:
            batch: Input tensor with the batch on dimension 0 and time on
                dimension 1 (the LSTM is built with ``batch_first=True``).
            lengths: Optional sequence lengths (list or 1-D tensor) used to pack
                the padded batch. They are passed to ``pack_padded_sequence``
                with its default settings, so they must be sorted in
                decreasing order. ``None`` or an empty list disables packing.

        Returns:
            Tensor of scores with the batch on dimension 0, time on dimension 1
            and ``output_size`` scores on dimension 2. When ``lengths`` is
            used, the time dimension is whatever ``pad_packed_sequence``
            restores (the longest given length).
        """
        self.hidden = self.init_hidden(batch.size(0), device=batch.device, dtype=batch.dtype)
        batch = self.dropout1(batch)
        # Explicit None/empty check: truth-testing a multi-element tensor or
        # array raises, so `if lengths:` only worked for plain lists.
        use_lengths = lengths is not None and len(lengths) > 0
        # pack sequence if lengths available (during training)
        if use_lengths:
            if isinstance(lengths, torch.Tensor):
                # pack_padded_sequence requires the lengths to live on the CPU.
                lengths = lengths.detach().cpu()
            batch = pack_padded_sequence(batch, lengths, batch_first=True)
        output, self.hidden = self.lstm(batch, self.hidden)
        if use_lengths:
            output, _ = pad_packed_sequence(output, batch_first=True)
        # [batch_size, seq_len, hidden_dim * num_directions]; BatchNorm1d
        # normalises dimension 1, so features are moved there and back.
        output = self.bn1(output.permute(0, 2, 1)).permute(0, 2, 1)
        output = self.dropout2(output)
        output = self.hidden2out(output)
        return output

In [None]:
# Converting the tensor to the desired output format
def note2num(note):
    """Map a note name to its 1-based pitch-class number, given as a string.

    Sharp and flat spellings present in the table share a number (for example
    'C#' and 'Db' both map to '2'). A name that is not in the table yields the
    integer 0, not the string '0'.
    """
    return {'C': '1', 'C#': '2', 'Db': '2', 'D': '3', 'D#': '4', 'Eb': '4',
            'E': '5', 'Fb': '5', 'F': '6', 'F#': '7', 'Gb': '7', 'G': '8', 'G#': '9', 'Ab': '9', 'A': '10', 'A#': '11',
            'Bb': '11', 'B': '12',
            'Cb': '12'}.get(note, 0)


def create_chords_list():
    """Build the chord vocabulary: 'N' plus major and minor chords on 12 roots.

    Returns:
        A pair ``(num_to_ind, ind_to_name)``. ``ind_to_name`` is a list of 25
        labels starting with 'N', followed by '<root>:maj' and '<root>:min' for
        each root from 'C' to 'B'. ``num_to_ind`` maps '<number>:<type>' keys
        (root numbers from ``note2num``) to the matching list index, and also
        contains the fixed entries '-1' -> -1 and '0' -> 0.
    """
    # The interval tuples are carried along with each chord type but are not
    # used when building the labels.
    chord_types = [('maj', ('1', '3', '5')), ('min', ('1', 'b3', '5'))]
    num_to_ind = {'-1': -1, '0': 0}
    ind_to_name = ['N']

    for note in ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B']:
        for chord_type in chord_types:
            for bass in ('',):
                num_to_ind[f"{note2num(note)}:{chord_type[0]}{bass}"] = len(ind_to_name)
                ind_to_name.append(f"{note}:{chord_type[0]}{bass}")
    return num_to_ind, ind_to_name


def ind_to_chord_names(inds):
    """Translate an iterable of class indices into chord label strings."""
    _, ind_to_name = create_chords_list()
    return [ind_to_name[ind] for ind in inds]


def preds_to_output(y, hop_size=512, fs=11025):
    """Merge per-frame chord predictions into timed segments.

    Frame ``k`` (0-based) is treated as covering ``[k * tw, (k + 1) * tw)`` with
    ``tw = hop_size / fs`` seconds, so a chord that first appears at frame ``k``
    starts at ``k * tw`` and the last segment ends at ``len(y) * tw``.

    Args:
        y: Iterable of predicted class indices, one per frame.
        hop_size: Hop between consecutive frames, in samples.
        fs: Sampling rate in Hz.

    Returns:
        A list of ``(start_time, end_time, chord_name)`` tuples in
        chronological order; an empty list when ``y`` is empty.
    """
    chord_names = ind_to_chord_names(y)
    if not chord_names:
        return []

    tw = hop_size / fs  # duration of one frame in seconds
    total = len(chord_names)
    results = []
    seg_start = 0  # index of the first frame of the segment being built
    for frame in range(1, total + 1):
        # Keep extending the segment while the chord stays the same; the
        # frame == total pass closes the final segment.
        if frame < total and chord_names[frame] == chord_names[seg_start]:
            continue
        results.append((seg_start * tw, frame * tw, chord_names[seg_start]))
        seg_start = frame
    return results

In [3]:
def t(model, X):
    """Run the model on pre-computed features and return timed chord segments.

    Args:
        model: Module invoked as ``model(X)``; class scores are read from
            dimension 2 of its output.
        X: Input features as a tensor or anything ``numpy.asarray`` accepts
            (for example a list of equally shaped arrays). The input is moved
            to the device and dtype of the model's parameters.

    Returns:
        The result of ``preds_to_output`` for the flattened sequence of
        highest-scoring class indices.
    """
    reference = next(model.parameters(), None)
    with torch.no_grad():
        if isinstance(X, torch.Tensor):
            features = X
        else:
            # Go through one ndarray: building a tensor straight from a list
            # of arrays is slow.
            features = torch.as_tensor(np.asarray(X))
        if reference is not None:
            # Follow the model rather than CUDA availability, so a CPU model
            # also works on a CUDA-capable machine.
            features = features.to(device=reference.device, dtype=reference.dtype)
        elif torch.cuda.is_available():
            features = features.cuda()
        pred = model(features)
        # Plain Python ints avoid per-element device syncs further down.
        y = pred.argmax(dim=2).reshape(-1).tolist()
    return preds_to_output(y)

# Disabled usage example, kept as a bare string literal so that it is not
# executed (the notebook only echoes the string as the cell's output).
# NOTE(review): the snippet calls t(model, X), but no X is defined in the
# visible cells, and it loads weights from a local "LSTM.1_opt_SGD" file;
# both must exist before it can be re-enabled. `y_ind` is assigned in the
# snippet but not used by it.
"""
y_size, y_ind = 25, -8 # label size = 25,
model = LSTMClassifier(input_size=84, hidden_dim=200, output_size=y_size,
                        num_layers=2,
                        use_gpu=torch.cuda.is_available(), bidirectional=True, dropout=(0.4, 0.0, 0.0))
if torch.cuda.is_available():
    model = model.cuda()
    model.load_state_dict(torch.load("LSTM.1_opt_SGD"))
else:
    model.load_state_dict(torch.load("LSTM.1_opt_SGD", map_location='cpu'))
model.eval()
result = t(model, X)
print(result)
"""

'\ny_size, y_ind = 25, -8 # label size = 25,\nmodel = LSTMClassifier(input_size=84, hidden_dim=200, output_size=y_size,\n                        num_layers=2,\n                        use_gpu=torch.cuda.is_available(), bidirectional=True, dropout=(0.4, 0.0, 0.0))\nif torch.cuda.is_available():\n    model = model.cuda()\n    model.load_state_dict(torch.load("LSTM.1_opt_SGD"))\nelse:\n    model.load_state_dict(torch.load("LSTM.1_opt_SGD", map_location=\'cpu\'))\nmodel.eval()\nresult = t(model, X)\nprint(result)\n'