In [None]:
!pip install libfmp

Collecting libfmp
  Downloading libfmp-1.2.5-py3-none-any.whl (110 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/110.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.0/110.0 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
Collecting music21<6.0.0,>=5.7.0 (from libfmp)
  Downloading music21-5.7.2.tar.gz (18.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.5/18.5 MB[0m [31m44.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pretty-midi<1.0.0,>=0.2.0 (from libfmp)
  Downloading pretty_midi-0.2.10.tar.gz (5.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m94.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting jedi>=0.16 (from ipython<8.0.0,>=7.10.0->libfmp)
  Downloading jedi-0.19.1-py2.py3-none-any.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━

In [None]:
import os
import numpy as np
from matplotlib import pyplot as plt
import librosa

import libfmp.b
import libfmp.c3
import libfmp.c4
from scipy.stats import mode


In [4]:
def chroma(audio, hop = 4096, window = 4096, norm = 1, version = "CQT"):
  """Load an audio file and compute its chromagram.

  Parameters
  ----------
  audio : path (or other source) accepted by ``librosa.load``.
  hop : int
      Hop length in samples between successive chroma frames.
  window : int
      FFT size; only used when ``version == "STFT"``.
  norm : value forwarded to librosa's ``norm`` argument (column-wise
      normalization of the chromagram).
  version : {"CQT", "STFT"}
      "CQT" uses ``librosa.feature.chroma_cqt``; "STFT" computes a power
      spectrogram and feeds it to ``librosa.feature.chroma_stft``.

  Returns
  -------
  (X, x, sr) : the chromagram, the audio samples and the sampling rate, the
      latter two exactly as returned by ``librosa.load``.

  Raises
  ------
  ValueError
      If ``version`` is neither "CQT" nor "STFT".
  """
  # Reject unknown versions before the (slow) audio load; previously this fell
  # through both branches and died with UnboundLocalError on `return X`.
  if version not in ("CQT", "STFT"):
    raise ValueError(f"version must be 'CQT' or 'STFT', got {version!r}")

  # No explicit sr is passed, so librosa.load applies its own default rate.
  x, sr = librosa.load(audio)

  if version == "CQT":
    # chroma from the constant-Q transform, with librosa's built-in column-wise normalization
    X = librosa.feature.chroma_cqt(y=x, sr=sr, hop_length = hop, norm = norm)
  else:
    # chroma from the squared-magnitude (power) STFT
    spectrum = librosa.stft(x, n_fft=window, hop_length=hop, pad_mode='constant', center=True)
    power = np.abs(spectrum) ** 2
    X = librosa.feature.chroma_stft(S=power, sr = sr, hop_length = hop, n_fft = window, norm = norm)
  return X, x, sr

def plot_chroma(X, version = "STFT"):
  """Display chromagram ``X`` as a heat map.

  ``version`` is only used in the figure title. The function draws with the
  pyplot state machine, shows the figure and returns nothing.
  """
  # rows are drawn bottom-up (origin='lower') and stretched to fill the axes
  heatmap = plt.imshow(X, aspect='auto', origin='lower', cmap='hot')

  plt.title(f"Chromagram with {version}")
  plt.ylabel('Pitch Class')
  plt.xlabel('Time')

  # colour scale for the image drawn above
  plt.colorbar(heatmap)

  plt.show()



In [None]:
#X, x, sr = chroma(file_path, version = "CQT")
#plot_chroma(X)
#Y, y, sr = chroma(file_path, version = "STFT")
#plot_chroma(Y)

In [None]:
#laying out chord structures
# `chords` holds one 12-bin pitch-class template per row:
# rows 0-11 are the major chords rooted on C, C#, ..., B and
# rows 12-23 are the minor chords on the same roots.
_NOTE_NAMES = ["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"]

# weight of each semitone offset above the chord root; offsets not listed stay 0
_MAJOR_WEIGHTS = {0: 1.0, 1: 0.3, 4: 1.0, 7: 1.0, 10: 0.1, 11: 0.3}
_MINOR_WEIGHTS = {0: 1.0, 1: 0.3, 3: 1.0, 7: 1.0, 10: 0.3}

chords = np.zeros((24, 12))
for root in range(12):
  # major chord on this root
  for offset, weight in _MAJOR_WEIGHTS.items():
    chords[root, (root + offset) % 12] = weight

  # minor chord on this root
  for offset, weight in _MINOR_WEIGHTS.items():
    chords[root + 12, (root + offset) % 12] = weight

# row index -> chord label, and the inverse lookup
chord_map = dict(enumerate(_NOTE_NAMES + [name + "m" for name in _NOTE_NAMES]))
chord_map_reversed = {label: index for index, label in chord_map.items()}

In [None]:
def compute_sim(X, chords, key_i, key_scaling=False, sf = 1.3, key = "C"):
  """Score every chord template against every chroma frame.

  Parameters
  ----------
  X : array whose columns are chroma frames; its row count must equal the
      number of columns of ``chords`` (the matrix product requires it).
  chords : array with one chord template per row.
  key_i : int
      Key index, read only when ``key_scaling`` is true: values <= 11 are
      treated as major keys, larger values as minor keys, with tonic
      ``key_i % 12``.
  key_scaling : bool
      If true, multiply the rows of chords belonging to the key by ``sf``.
      This assumes the 24-row layout (major chords in rows 0-11, minor chords
      in rows 12-23).
  sf : float
      Boost factor applied to the in-key chord rows.
  key : unused; kept so existing callers do not break.

  Returns
  -------
  Array of shape (number of templates, number of frames): the template/frame
  products, L2-normalized per frame, then boosted for in-key chords.
  """
  # libfmp's normalize_feature_sequence normalizes each *column*. The templates
  # are stored as rows, so normalize the transpose and flip back; normalizing
  # `chords` directly would scale across chords instead of per chord.
  templates = np.asarray(chords).T.copy()
  chords_norm = libfmp.c3.normalize_feature_sequence(templates, norm='2').T
  X_norm = libfmp.c3.normalize_feature_sequence(X, norm='2')
  sim_norm = libfmp.c3.normalize_feature_sequence(np.matmul(chords_norm, X_norm), norm='2')

  if key_scaling:
    #increase weight on chords that are diatonic to the key
    tonic = key_i % 12
    if key_i <= 11:
      # major key: major chords on degrees 0/5/7, minor chords on 2/4/9/11
      major_steps, minor_steps = (0, 5, 7), (2, 4, 9, 11)
    else:
      # minor key: major chords on degrees 3/8/10, minor chords on 0/2/5/7
      major_steps, minor_steps = (3, 8, 10), (0, 2, 5, 7)

    boosted_rows = [(tonic + step) % 12 for step in major_steps]
    boosted_rows += [(tonic + step) % 12 + 12 for step in minor_steps]
    # the row indices are all distinct, so each row is scaled exactly once
    sim_norm[boosted_rows] *= sf

  return sim_norm
def decision(sim_mat):
  """Pick the best-scoring chord label for every frame.

  Parameters
  ----------
  sim_mat : array with one row per chord and one column per frame.

  Returns
  -------
  NumPy string array with one chord label per frame, looked up in the
  module-level ``chord_map``. Row indices missing from ``chord_map`` are kept
  as their number (rendered as text). A matrix with zero frames gives an empty
  array; the previous ``np.vectorize`` implementation raised on that input.
  """
  #computing the most probable chord
  best_rows = np.asarray(np.argmax(sim_mat, axis=0))

  #mapping it with our chord_map
  labels = [chord_map.get(row, row) for row in best_rows.ravel().tolist()]
  return np.array(labels, dtype=str).reshape(best_rows.shape)


In [5]:
def template_predict(file_path, key = "None", key_filtering = True, version = "STFT"):
  """Run template-based chord recognition on an audio file.

  Parameters
  ----------
  file_path : audio file handed to ``chroma``.
  key : str or None
      A chord/key label from ``chord_map_reversed`` (e.g. "C", "Am"). Pass
      ``None`` or the string "None" (the default) to estimate the key from the
      chromagram with ``predict_key``.
  key_filtering : bool
      If true, print the key and boost the chords belonging to it.
  version : {"CQT", "STFT"}
      Chroma variant to compute and plot.

  Returns
  -------
  (output, sr, n_samples, n_frames) : the formatted chord segments from
  ``output_formatting``, the sampling rate, the number of audio samples and
  the number of chroma frames.

  Raises
  ------
  KeyError
      If ``key`` is given but is not a label in ``chord_map_reversed``.
  """
  # `version` has to go by keyword: chroma()'s second positional parameter is
  # `hop`, so passing it positionally set hop="STFT" and left version at "CQT".
  Y, y, sr = chroma(file_path, version = version)

  plot_chroma(Y, version)

  if key is None or key == "None":
    key_i, key = predict_key(Y)
  else:
    key_i = chord_map_reversed[key]

  if key_filtering:
    print("The predicted key is:", key)
  sim_y = compute_sim(Y, chords, key_i, key_scaling = bool(key_filtering))

  decision_y = decision(sim_y)

  output, n_samples, n_frames = output_formatting(decision_y, y)

  return output, sr, n_samples, n_frames


In [None]:
def predict_key(Y):
  """Estimate the overall key of a chromagram by template matching.

  Each frame (column of ``Y``) votes for the key profile it matches best; the
  key with the most votes wins, ties going to the lowest index.

  Parameters
  ----------
  Y : array with 12 rows (pitch classes) and one column per frame.

  Returns
  -------
  (key_index, key_name) : ``key_index`` is a Python int, 0-11 for the major
  keys on C..B and 12-23 for the minor keys; ``key_name`` is its label, e.g.
  "C" or "Am".

  Raises
  ------
  ValueError
      If ``Y`` has no frames.
  """
  # weight of each scale degree, keyed by semitones above the tonic
  major_profile = {0: 0.3, 2: 0.1, 4: 0.1, 5: 0.1, 7: 0.2, 9: 0.1, 11: 0.05}
  minor_profile = {0: 0.3, 2: 0.1, 3: 0.1, 5: 0.1, 7: 0.2, 8: 0.1, 10: 0.05}

  keys = np.zeros((24, 12))
  for tonic in range(12):
    #compute the i-th major starting from C
    for degree, weight in major_profile.items():
      keys[tonic, (tonic + degree) % 12] = weight

    #compute the i-th minor start from C
    for degree, weight in minor_profile.items():
      keys[tonic + 12, (tonic + degree) % 12] = weight

  # libfmp normalizes columns; the profiles are rows, so normalize the transpose
  keys_norm = libfmp.c3.normalize_feature_sequence(keys.T.copy(), norm='2').T
  Y_norm = libfmp.c3.normalize_feature_sequence(Y, norm='2')
  sim_norm = libfmp.c3.normalize_feature_sequence(np.matmul(keys_norm, Y_norm), norm='2')
  frame_votes = np.argmax(sim_norm, axis=0).astype(int)

  if frame_votes.size == 0:
    raise ValueError("cannot predict a key from a chromagram with no frames")

  # Count votes explicitly instead of scipy.stats.mode, whose result is an
  # array or a scalar depending on the SciPy version. np.unique sorts the
  # candidates, so argmax resolves ties to the smallest index.
  candidates, counts = np.unique(frame_votes, return_counts=True)
  best_key = int(candidates[np.argmax(counts)])

  note_names = ["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"]
  key_map = dict(enumerate(note_names + [name + "m" for name in note_names]))
  return best_key, key_map[best_key]

In [None]:
#sim_x = compute_sim(X, chords)
#sim_y = compute_sim(Y, chords)

#decision_x = decision(sim_x)
#decision_y = decision(sim_y)

#print(decision_x)
#print(decision_y)

In [None]:
def output_formatting(decision, x, sr = 22050):
  """Turn per-frame chord labels into timed segments.

  Parameters
  ----------
  decision : 1-D array of chord labels, one per frame; every label must be a
      key of the module-level ``chord_map_reversed``.
  x : 1-D array of audio samples the frames were computed from.
  sr : sampling rate of ``x`` in Hz. Defaults to 22050, the value that used to
      be hard-coded here.

  Returns
  -------
  (segments, n_samples, n_frames) : ``segments`` is a list of
  ``[chord_index, start_seconds, end_seconds]`` with every frame given the
  same duration ``n_samples / sr / n_frames``.
  """
  n_samples = x.shape[0]
  n_frames = decision.shape[0]

  # no frames: nothing to time, and the duration below would divide by zero
  if n_frames == 0:
    return [], n_samples, 0

  avg_duration = n_samples / sr / n_frames

  # Boundaries are computed as multiples of the frame duration rather than by
  # repeated addition, so rounding error does not build up over long files.
  res = [
    [chord_map_reversed[label], idx * avg_duration, (idx + 1) * avg_duration]
    for idx, label in enumerate(decision)
  ]
  return res, n_samples, n_frames

In [None]:
def mode_filter(decision, size=5):
  """Smooth a chord sequence with a sliding-window majority vote.

  Parameters
  ----------
  decision : sequence of ``(label, start, end)`` entries.
  size : int
      Half-width of the window: each entry is replaced by the most frequent
      label among itself and up to ``size`` neighbours on each side (fewer at
      the ends of the sequence). Previously this argument was ignored and the
      half-width was always 5, which is still the default.

  Returns
  -------
  List of ``(label, start, end)`` tuples with the smoothed label and the
  original start/end of each entry. Ties go to the smallest label.

  Raises
  ------
  ValueError
      If ``size`` is negative.
  """
  if size < 0:
    raise ValueError("size must be non-negative")

  labels = [entry[0] for entry in decision]
  newdec = []
  for i, entry in enumerate(decision):
    # clamp the left edge at 0; slicing already clamps the right edge
    window = labels[max(0, i - size):i + size + 1]

    # np.unique returns the candidates sorted, so argmax picks the smallest
    # label among equally frequent ones.
    candidates, counts = np.unique(window, return_counts=True)
    winner = candidates[np.argmax(counts)].item()

    newdec.append((winner, entry[1], entry[2]))
  return newdec



In [None]:
#Y, y, sr = chroma("/content/perfect.wav", version = "STFT")

#template_predict("/content/perfect.wav", key_filtering = True)