## Load our own descriptors from the Lakh Dataset and perform Classification with scikit-learn




In [1]:
# Imports
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import pretty_midi
# import librosa
import mir_eval
import mir_eval.display
import tables
import IPython.display
import os
import json
from tqdm import tqdm
import pickle

# Local path constants
DATA_PATH = 'data'
RESULTS_PATH = 'results'
# Path to the file match_scores.json distributed with the LMD
SCORE_FILE = os.path.join(RESULTS_PATH, 'match_scores.json')
# The Tonnetze used for graph generation
TONNETZEDICT = {'T129': [1, 2, 9], 'T147': [1, 4, 7], 'T237': [2, 3, 7], 'T345': [3, 4, 5]}
# Define the labels we want to find (we can add or remove)
LABELS = ["classic", "rock", "pop", "folk", "metal", "jazz", "disco"]

# Utility functions for retrieving paths
def msd_id_to_dirs(msd_id):
    """Given an MSD ID, generate the path prefix.
    E.g. TRABCD12345678 -> A/B/C/TRABCD12345678"""
    return os.path.join(msd_id[2], msd_id[3], msd_id[4], msd_id)

def msd_id_to_mp3(msd_id):
    """Given an MSD ID, return the path to the corresponding mp3"""
    return os.path.join(DATA_PATH, 'msd', 'mp3',
                        msd_id_to_dirs(msd_id) + '.mp3')

def msd_id_to_h5(h5):
    """Given an MSD ID, return the path to the corresponding h5"""
    return os.path.join(RESULTS_PATH, 'lmd_matched_h5',
                        msd_id_to_dirs(msd_id) + '.h5')

def get_midi_path(msd_id, midi_md5, kind):
    """Given an MSD ID and MIDI MD5, return path to a MIDI file.
    kind should be one of 'matched' or 'aligned'. """
    return os.path.join(RESULTS_PATH, 'lmd_{}'.format(kind),
                        msd_id_to_dirs(msd_id), midi_md5 + '.mid')

### Trial Classification

##### We try a graph kernel method by first reading the Data

In [None]:
# Debugger
from IPython.core.debugger import set_trace

# Set the desirable Tonnetz
Tonnetz = "T147"


with open(SCORE_FILE) as f:
    # The json SCORE_FILE is a dict of ids from Million Dollar
    scores = json.load(f)
y = list()
G = list()
for msd_id in tqdm(scores.keys()):
    # open every directory with midi files and save file dir to var msd_dir
    msd_dir = msd_id_to_dirs(msd_id)
    # add the directory where graphs where saved
    graph_directory =  os.path.join(RESULTS_PATH, "lmd_graphs", msd_dir)
    # first check if the directory isn't empty
    if os.listdir(graph_directory):
        for file in os.listdir(graph_directory):
            if file.endswith(Tonnetz+".p"):
                Gi = pickle.load( open( os.path.join(graph_directory, file), "rb" ) )
                yi = pickle.load( open( os.path.join(graph_directory, file.split("_graph_" + Tonnetz + ".p")[0]+ "_label.p"), "rb" ) )
                if yi != None:
                    set_trace()
                    y.append(yi)
                    G.append(Gi)

  0%|                                                                                        | 0/31034 [00:00<?, ?it/s]

> [1;32m<ipython-input-3-1eaae97066b4>[0m(26)[0;36m<module>[1;34m()[0m
[1;32m     23 [1;33m                [0myi[0m [1;33m=[0m [0mpickle[0m[1;33m.[0m[0mload[0m[1;33m([0m [0mopen[0m[1;33m([0m [0mos[0m[1;33m.[0m[0mpath[0m[1;33m.[0m[0mjoin[0m[1;33m([0m[0mgraph_directory[0m[1;33m,[0m [0mfile[0m[1;33m.[0m[0msplit[0m[1;33m([0m[1;34m"_graph_"[0m [1;33m+[0m [0mTonnetz[0m [1;33m+[0m [1;34m".p"[0m[1;33m)[0m[1;33m[[0m[1;36m0[0m[1;33m][0m[1;33m+[0m [1;34m"_label.p"[0m[1;33m)[0m[1;33m,[0m [1;34m"rb"[0m [1;33m)[0m [1;33m)[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m     24 [1;33m                [1;32mif[0m [0myi[0m [1;33m!=[0m [1;32mNone[0m[1;33m:[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m     25 [1;33m                    [0mset_trace[0m[1;33m([0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m---> 26 [1;33m                    [0my[0m[1;33m.[0m[0mappend[0m[1;33m([0m[0myi[0m[1;33m)[0m[1;33m[0m[1

  0%|                                                                            | 1/31034 [00:20<174:57:21, 20.30s/it]

> [1;32m<ipython-input-3-1eaae97066b4>[0m(25)[0;36m<module>[1;34m()[0m
[1;32m     23 [1;33m                [0myi[0m [1;33m=[0m [0mpickle[0m[1;33m.[0m[0mload[0m[1;33m([0m [0mopen[0m[1;33m([0m [0mos[0m[1;33m.[0m[0mpath[0m[1;33m.[0m[0mjoin[0m[1;33m([0m[0mgraph_directory[0m[1;33m,[0m [0mfile[0m[1;33m.[0m[0msplit[0m[1;33m([0m[1;34m"_graph_"[0m [1;33m+[0m [0mTonnetz[0m [1;33m+[0m [1;34m".p"[0m[1;33m)[0m[1;33m[[0m[1;36m0[0m[1;33m][0m[1;33m+[0m [1;34m"_label.p"[0m[1;33m)[0m[1;33m,[0m [1;34m"rb"[0m [1;33m)[0m [1;33m)[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m     24 [1;33m                [1;32mif[0m [0myi[0m [1;33m!=[0m [1;32mNone[0m[1;33m:[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m---> 25 [1;33m                    [0mset_trace[0m[1;33m([0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m     26 [1;33m                    [0my[0m[1;33m.[0m[0mappend[0m[1;33m([0m[0myi[0m[1;33m)[0m[1;33m[0m[1

In [11]:
print("We have", len(y), "labels")
print("and", len(G), "graphs")

for label in LABELS+["other"]:
    print('{} has occurred {} times'.format(label, y.count(label)) )

We have 17490 labels
and 17490 graphs
classic has occurred 1111 times
rock has occurred 6469 times
pop has occurred 4714 times
folk has occurred 413 times
metal has occurred 524 times
jazz has occurred 1917 times
disco has occurred 1118 times
other has occurred 1224 times


In [12]:
from sklearn.model_selection import train_test_split
from grakel.utils import graph_from_networkx

# Transform networkx graphs to grakel representations
G = list(graph_from_networkx(G, node_labels_tag='note'))

# Splits the dataset into a training and a test set
G_train, G_test, y_train, y_test = train_test_split(G, y, test_size=0.3, shuffle=True, stratify=y)

In [13]:
from grakel.kernels import WeisfeilerLehman, GraphletSampling
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score


# Uses the WEisfeilerLehman kernel to generate the kernel matrices
gk = WeisfeilerLehman(n_iter=4, normalize=True)
K_train = gk.fit_transform(G_train)
K_test = gk.transform(G_test)

# Uses the SVM classifier to perform classification
clf = SVC(kernel="precomputed")
clf.fit(K_train, y_train)
y_pred = clf.predict(K_test)

# Computes and prints the classification accuracy
acc = accuracy_score(y_test, y_pred)
print("Accuracy:", str(round(acc*100, 2)) + "%")

Accuracy: 50.03%


### Compute Classification Metrics

In [None]:
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import roc_auc_score

clf_conf = plot_confusion_matrix(clf, K_test, y_test)

### Classification via merging disconnected subgraphs into a Big Graph.

Taking the disjoint union for all graphs per datapoint merging in a single big graph for classification via kernels.

In [None]:
# Debugger
from IPython.core.debugger import set_trace

import networkx as nx

with open(SCORE_FILE) as f:
    # The json SCORE_FILE is a dict of ids from Million Dollar
    scores = json.load(f)
y = list()
Gi = dict()
G = list()
for msd_id in tqdm(scores.keys()):
    # open every directory with midi files and save file dir to var msd_dir
    msd_dir = msd_id_to_dirs(msd_id)
    # add the directory where graphs where saved
    graph_directory =  os.path.join(RESULTS_PATH, "lmd_graphs", msd_dir)
    key = list(TONNETZEDICT.keys())
    # first check if the directory isn't empty
    if len(os.listdir(graph_directory))>1:
        try: 
            for file in os.listdir(graph_directory):
                IsGraphDir = False
                if file.endswith(key[0] +".p"):
                    Gi[key[0]] = pickle.load( open( os.path.join(graph_directory, file), "rb" ) )
                    IsGraphDir = True
                if file.endswith(key[1] +".p"):
                    Gi[key[1]] = pickle.load( open( os.path.join(graph_directory, file), "rb" ) )
                    IsGraphDir = True
                if file.endswith(key[2] +".p"):
                    Gi[key[2]] = pickle.load( open( os.path.join(graph_directory, file), "rb" ) )
                    IsGraphDir = True
                if file.endswith(key[3] +".p"):
                    Gi[key[3]] = pickle.load( open( os.path.join(graph_directory, file), "rb" ) )
                    IsGraphDir = True
                if file.endswith("label.p"):
                    yi = pickle.load( open( os.path.join(graph_directory, file), "rb" ) )
                    IsGraphDir = True
        except:
            IsGraphDir = False
        # the disjoint union of graphs (have to chek if a graph exists)
        if IsGraphDir:
            set_trace()
            U = nx.disjoint_union(Gi["T345"], nx.disjoint_union(Gi["T129"], nx.disjoint_union(Gi["T237"], Gi["T147"])))
            G.append(U)
            y.append(yi)
            IsGraphDir = False

  0%|                                                                                        | 0/31034 [00:00<?, ?it/s]

> [1;32m<ipython-input-2-777665a38689>[0m(43)[0;36m<module>[1;34m()[0m
[1;32m     41 [1;33m        [1;32mif[0m [0mIsGraphDir[0m[1;33m:[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m     42 [1;33m            [0mset_trace[0m[1;33m([0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m---> 43 [1;33m            [0mU[0m [1;33m=[0m [0mnx[0m[1;33m.[0m[0mdisjoint_union[0m[1;33m([0m[0mGi[0m[1;33m[[0m[1;34m"T345"[0m[1;33m][0m[1;33m,[0m [0mnx[0m[1;33m.[0m[0mdisjoint_union[0m[1;33m([0m[0mGi[0m[1;33m[[0m[1;34m"T129"[0m[1;33m][0m[1;33m,[0m [0mnx[0m[1;33m.[0m[0mdisjoint_union[0m[1;33m([0m[0mGi[0m[1;33m[[0m[1;34m"T237"[0m[1;33m][0m[1;33m,[0m [0mGi[0m[1;33m[[0m[1;34m"T147"[0m[1;33m][0m[1;33m)[0m[1;33m)[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m     44 [1;33m            [0mG[0m[1;33m.[0m[0mappend[0m[1;33m([0m[0mU[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m     45 [1;33m            [0my[0m[

  0%|                                                                            | 1/31034 [00:50<435:00:36, 50.46s/it]

> [1;32m<ipython-input-2-777665a38689>[0m(42)[0;36m<module>[1;34m()[0m
[1;32m     40 [1;33m        [1;31m# the disjoint union of graphs (have to chek if a graph exists)[0m[1;33m[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m     41 [1;33m        [1;32mif[0m [0mIsGraphDir[0m[1;33m:[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m---> 42 [1;33m            [0mset_trace[0m[1;33m([0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m     43 [1;33m            [0mU[0m [1;33m=[0m [0mnx[0m[1;33m.[0m[0mdisjoint_union[0m[1;33m([0m[0mGi[0m[1;33m[[0m[1;34m"T345"[0m[1;33m][0m[1;33m,[0m [0mnx[0m[1;33m.[0m[0mdisjoint_union[0m[1;33m([0m[0mGi[0m[1;33m[[0m[1;34m"T129"[0m[1;33m][0m[1;33m,[0m [0mnx[0m[1;33m.[0m[0mdisjoint_union[0m[1;33m([0m[0mGi[0m[1;33m[[0m[1;34m"T237"[0m[1;33m][0m[1;33m,[0m [0mGi[0m[1;33m[[0m[1;34m"T147"[0m[1;33m][0m[1;33m)[0m[1;33m)[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m     44 [1;33m          

  0%|                                                                            | 2/31034 [01:03<339:25:48, 39.38s/it]

> [1;32m<ipython-input-2-777665a38689>[0m(43)[0;36m<module>[1;34m()[0m
[1;32m     41 [1;33m        [1;32mif[0m [0mIsGraphDir[0m[1;33m:[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m     42 [1;33m            [0mset_trace[0m[1;33m([0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m---> 43 [1;33m            [0mU[0m [1;33m=[0m [0mnx[0m[1;33m.[0m[0mdisjoint_union[0m[1;33m([0m[0mGi[0m[1;33m[[0m[1;34m"T345"[0m[1;33m][0m[1;33m,[0m [0mnx[0m[1;33m.[0m[0mdisjoint_union[0m[1;33m([0m[0mGi[0m[1;33m[[0m[1;34m"T129"[0m[1;33m][0m[1;33m,[0m [0mnx[0m[1;33m.[0m[0mdisjoint_union[0m[1;33m([0m[0mGi[0m[1;33m[[0m[1;34m"T237"[0m[1;33m][0m[1;33m,[0m [0mGi[0m[1;33m[[0m[1;34m"T147"[0m[1;33m][0m[1;33m)[0m[1;33m)[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m     44 [1;33m            [0mG[0m[1;33m.[0m[0mappend[0m[1;33m([0m[0mU[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m     45 [1;33m            [0my[0m[

  0%|                                                                      | 3/31034 [1:17:02<12024:49:53, 1395.04s/it]

> [1;32m<ipython-input-2-777665a38689>[0m(42)[0;36m<module>[1;34m()[0m
[1;32m     40 [1;33m        [1;31m# the disjoint union of graphs (have to chek if a graph exists)[0m[1;33m[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m     41 [1;33m        [1;32mif[0m [0mIsGraphDir[0m[1;33m:[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m---> 42 [1;33m            [0mset_trace[0m[1;33m([0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m     43 [1;33m            [0mU[0m [1;33m=[0m [0mnx[0m[1;33m.[0m[0mdisjoint_union[0m[1;33m([0m[0mGi[0m[1;33m[[0m[1;34m"T345"[0m[1;33m][0m[1;33m,[0m [0mnx[0m[1;33m.[0m[0mdisjoint_union[0m[1;33m([0m[0mGi[0m[1;33m[[0m[1;34m"T129"[0m[1;33m][0m[1;33m,[0m [0mnx[0m[1;33m.[0m[0mdisjoint_union[0m[1;33m([0m[0mGi[0m[1;33m[[0m[1;34m"T237"[0m[1;33m][0m[1;33m,[0m [0mGi[0m[1;33m[[0m[1;34m"T147"[0m[1;33m][0m[1;33m)[0m[1;33m)[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m     44 [1;33m          

In [17]:
print("We have", len(y), "labels")
print("and", len(G), "graphs")

for label in LABELS+["other"]:
    print('{} has occurred {} times'.format(label, y.count(label)) )

We have 18796 labels
and 18796 graphs
classic has occurred 1392 times
rock has occurred 7402 times
pop has occurred 5140 times
folk has occurred 269 times
metal has occurred 701 times
jazz has occurred 1443 times
disco has occurred 1404 times
other has occurred 1045 times


In [18]:
from sklearn.model_selection import train_test_split
from grakel.utils import graph_from_networkx

# Transform networkx graphs to grakel representations
G = list(graph_from_networkx(G, node_labels_tag='note'))

# Splits the dataset into a training and a test set
G_train, G_test, y_train, y_test = train_test_split(G, y, test_size=0.3, shuffle=True, stratify=y)



In [19]:
from grakel.kernels import WeisfeilerLehman, GraphletSampling
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score


# Uses the WEisfeilerLehman kernel to generate the kernel matrices
gk = WeisfeilerLehman(n_iter=4, normalize=True)
K_train = gk.fit_transform(G_train)
K_test = gk.transform(G_test)

# Uses the SVM classifier to perform classification
clf = SVC(kernel="precomputed")
clf.fit(K_train, y_train)
y_pred = clf.predict(K_test)

# Computes and prints the classification accuracy
acc = accuracy_score(y_test, y_pred)
print("Accuracy:", str(round(acc*100, 2)) + "%")

Accuracy: 39.48%


### Classification via Boosting

In this section we apply the simple classification for every graph and the descriptors retrieved from the HMD5 files seperately and then we combine the results with a boosting learning algorithm.

In [None]:
with open(SCORE_FILE) as f:
    # The json SCORE_FILE is a dict of ids from Million Dollar
    scores = json.load(f)
y = { k : list() for k in TONNETZEDICT.keys()}
G = { k : list() for k in TONNETZEDICT.keys()}
for msd_id in tqdm(scores.keys()):
    # open every directory with midi files and save file dir to var msd_dir
    msd_dir = msd_id_to_dirs(msd_id)
    # add the directory where graphs where saved
    graph_directory =  os.path.join(RESULTS_PATH, "lmd_graphs", msd_dir)
    # first check if the directory isn't empty
    if os.listdir(graph_directory):
        for key, tonnetz in TONNETZEDICT.items():
            for file in os.listdir(graph_directory):
                if file.endswith(key +".p"):
                    Gi = pickle.load( open( os.path.join(graph_directory, file), "rb" ) )
                    yi = pickle.load( open( os.path.join(graph_directory, file.split("_graph_" +key+".p")[0]+ "_label.p"), "rb" ) )
                    y[key].append(yi)
                    G[key].append(Gi)

In [None]:
from sklearn.model_selection import train_test_split
from grakel.kernels import WeisfeilerLehman, GraphletSampling
from sklearn.svm import SVC

G_train = dict()
G_test = dict()
K_train = dict() 
K_test = dict()
clf = dict()
y_pred = dict()

for key in TONNETZEDICT.keys():  
    # Transform networkx graphs to grakel representations
    G[key] = list(graph_from_networkx(G[key], node_labels_tag='note'))
    # do not shuffle the split
    G_train[key], G_test[key], y_train, y_test = train_test_split(G[key], y[key], test_size=0.3, shuffle=False, stratify=y)
    gk = WeisfeilerLehman(n_iter=1, normalize=True)
    K_train[key] = gk.fit_transform(G_train[key])
    K_test[key] = gk.transform(G_test[key])
    y_pred[key] = clf[key].predict(K_test[key])