In [1]:
"""
    Article:
    http://cs229.stanford.edu/proj2011/HaggbladeHongKao-MusicGenreClassification.pdf
    Content:
    1. Downloading Marsyas (search for the link in Telegram)
    2. Getting the middle of the song(50%)
    3. Getting MFCC from the each song and cutting down the higher frequences.
    After the cuttin the MFCC.shape equals (15, N), where N is the number of fragments
    4. Getting np.means(axis = 1) and the covariation matrix
    5. We have features now!
    6. Metrics: Kullback-Lieber (KL) Divergence
    (TODO: understand it from the scientific point of view)
    7. I suggest keeping the data in the next form:
        -- the first 15 values is the vector of means
        -- the next 225 values is the ravel() of the cov_matrix

"""


import librosa
import numpy as np
from numpy.linalg import det
from numpy.linalg import inv
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import math

def getFeatureMFCC(song):
    """
    Gets features from the song and writes it in array
    
    """

    # MFCC cut
    MFCC = librosa.feature.mfcc(song)[:15, 1:]
    
    means = np.mean(MFCC, axis = 1)
    cov_matrix = np.cov(MFCC, rowvar = True)    

    return np.append(means, cov_matrix.ravel())

def QuadraticForm(A, x, y):
    return np.dot(np.dot(A, x), y)

def Divergence_KL(song_p, song_q):    
    
    # Getting features
    means_p = song_p[0 : 15]
    cov_p = song_p[15 :].reshape(15, 15)
    means_q = song_q[0 : 15]
    cov_q = song_q[15 :].reshape(15, 15)

    inv_cov_q = inv(cov_q)
    means_diff = means_p - means_q
    return 0.5 * (math.log(det(cov_q) / det(cov_p)) +\
                  np.trace(np.dot(inv_cov_q, cov_p)) +\
                  QuadraticForm(inv_cov_q, means_diff, means_diff) - 15)
                  

def Distance_KL(song_p, song_q):
    """
    Initializes the metrics
    
    """
    
    return Divergence_KL(song_p, song_q) + Divergence_KL(song_q, song_p)

def getData(where_to, genre, range_tuple):
    for i in range(range_tuple):
        if i < 10:
            path = "./" + genre + "/" + genre + ".0000" + str(i) + ".au"        
        else:
            path = "./" + genre + "/" + genre + ".000" + str(i) + ".au"

        song = librosa.load(path)[0]
        where_to.append(getFeatureMFCC(song))

In [19]:
data = list()
getData(data, "hiphop", 30)
getData(data, "rock", 30)
getData(data, "classical", 30)
data = np.array(data)
y = np.append([np.zeros(30), 1 + np.zeros(30)], 2 +  np.zeros(30))

In [20]:
X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=0.3, random_state=0)
knn = KNeighborsClassifier(n_neighbors = 5, metric=Divergence_KL)
knn.fit(X_train, y_train)
accuracy_score(knn.predict(X_test), y_test)

0.8148148148148148

In [21]:
from sklearn.metrics import classification_report

print(classification_report(y_train, knn.predict(X_train), target_names = ["hiphop", "rock", "classical"]))

             precision    recall  f1-score   support

     hiphop       0.66      1.00      0.79        21
       rock       1.00      0.60      0.75        20
  classical       1.00      0.86      0.93        22

avg / total       0.89      0.83      0.83        63



In [11]:
X_train.shape

(63, 240)