## CSYE7245 - Modern Music Genre Classification with large multi-class dataset
                                            Ashutosh Mahala, Xiaosui Zhang

In [15]:
import numpy as np
import pandas as pd
import theano
import theano.tensor as T
import keras
import urllib.request
import zipfile
import os.path

import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# -- Keras Import
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.preprocessing import image
from keras.datasets import imdb
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras.utils import np_utils
from keras.preprocessing import sequence
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM, GRU, SimpleRNN
from keras.layers import Activation, TimeDistributed, RepeatVector
from keras.callbacks import EarlyStopping, ModelCheckpoint

# -- sklearn Import
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import RandomForestClassifier 
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.cross_validation import KFold, cross_val_score, train_test_split
from sklearn.svm import SVC

In [2]:
# Global variables
url_loc = 'https://os.unil.cloud.switch.ch/fma/fma_metadata.zip'
file_loc = 'C:/work/csye7245/Assignment4/fma_metadata.zip'
dir_unzip_loc = 'C:/work/csye7245/Assignment4/'

# Number of songs used in model
data_length = 5000

## 1. Data Preparation
## 1.1 Download and Extraction

[Raw FMA](http://freemusicarchive.org/) - Original Free Music Archive dataset 

[EPFL LTS2](https://github.com/mdeff/fma) - Preprocessed FMA dataset by LTS2 lab.

In [3]:
# Download zipped fma meta data
if not os.path.isfile(file_loc):
    urllib.request.urlretrieve(url_loc, file_loc)

In [4]:
# Unzip fma meta data
if not os.path.isdir(dir_unzip_loc):
    with zipfile.ZipFile(file_loc, 'r') as zip_ref:
        zip_ref.extractall(dir_unzip_loc)

## 1.2 Download and Extraction

In [5]:
df_tracks = pd.read_csv(dir_unzip_loc + "fma_metadata/tracks.csv", skiprows=1, nrows=data_length);
df_features = pd.read_csv(dir_unzip_loc + "fma_metadata/features.csv", skiprows=1, nrows=data_length);
df_genres = pd.read_csv(dir_unzip_loc + "fma_metadata/genres.csv", skiprows=0, nrows=data_length);

df_tracks = df_tracks.rename(columns={ df_tracks.columns[0]: "track_id" })
df_features = df_features.rename(columns={ df_features.columns[0]: "track_id" })
1

  interactivity=interactivity, compiler=compiler, result=result)


1

In [6]:
df_tracks.head()

Unnamed: 0,track_id,comments,date_created,date_released,engineer,favorites,id,information,listens,producer,...,information.1,interest,language_code,license,listens.1,lyricist,number,publisher,tags.2,title.1
0,track_id,,,,,,,,,,...,,,,,,,,,,
1,2,0.0,2008-11-26 01:44:45,2009-01-05 00:00:00,,4.0,1.0,<p></p>,6073.0,,...,,4656.0,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,1293.0,,3.0,,[],Food
2,3,0.0,2008-11-26 01:44:45,2009-01-05 00:00:00,,4.0,1.0,<p></p>,6073.0,,...,,1470.0,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,514.0,,4.0,,[],Electric Ave
3,5,0.0,2008-11-26 01:44:45,2009-01-05 00:00:00,,4.0,1.0,<p></p>,6073.0,,...,,1933.0,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,1151.0,,6.0,,[],This World
4,10,0.0,2008-11-26 01:45:08,2008-02-06 00:00:00,,4.0,6.0,,47632.0,,...,,54881.0,en,Attribution-NonCommercial-NoDerivatives (aka M...,50135.0,,1.0,,[],Freeway


In [7]:
df_features.head()

Unnamed: 0,track_id,kurtosis,kurtosis.1,kurtosis.2,kurtosis.3,kurtosis.4,kurtosis.5,kurtosis.6,kurtosis.7,kurtosis.8,...,std.70,std.71,std.72,kurtosis.73,max.73,mean.73,median.73,min.73,skew.73,std.73
0,number,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,...,4.0,5.0,6.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,track_id,,,,,,,,,,...,,,,,,,,,,
2,2,7.180653,5.230309,0.249321,1.34762,1.482478,0.531371,1.481593,2.691455,0.866868,...,0.054125,0.012226,0.012111,5.75889,0.459473,0.085629,0.071289,0.0,2.089872,0.061448
3,3,1.888963,0.760539,0.345297,2.295201,1.654031,0.067592,1.366848,1.054094,0.108103,...,0.063831,0.014212,0.01774,2.824694,0.466309,0.084578,0.063965,0.0,1.716724,0.06933
4,5,0.527563,-0.077654,-0.27961,0.685883,1.93757,0.880839,-0.923192,-0.927232,0.666617,...,0.04073,0.012691,0.014759,6.808415,0.375,0.053114,0.041504,0.0,2.193303,0.044861


In [8]:
df_genres.head()

Unnamed: 0,genre_id,#tracks,parent,title,top_level
0,1,8693,38,Avant-Garde,38
1,2,5271,0,International,2
2,3,1752,0,Blues,3
3,4,4126,0,Jazz,4
4,5,4106,0,Classical,5


### Data Explanation

1. Track.csv holds the metadata of songs including author, language, title and also the label.
2. Features.csv holds the extracted acoustic features from the audio of the song.
3. Genres.csv holds the target variable genre statistics

We can relate rows from different table using track number.

## 1.3 Merge Tables

In [9]:
# Get only the genre and track_id from df_tracks_only_genre
df_tracks_only_genre = df_tracks[['genre_top','track_id']]
df_tracks_only_genre_with_id = pd.merge(df_tracks_only_genre, df_genres, left_on="genre_top", right_on="title", how='inner')
df_tracks_only_genre_with_id = df_tracks_only_genre_with_id[['genre_id','track_id']]
df_tracks_only_genre_with_id.head()

Unnamed: 0,genre_id,track_id
0,21,2
1,21,3
2,21,5
3,21,134
4,21,583


In [10]:
# Merge
df = pd.merge(df_tracks_only_genre_with_id, df_features, on="track_id", how='inner')
df.head()

Unnamed: 0,genre_id,track_id,kurtosis,kurtosis.1,kurtosis.2,kurtosis.3,kurtosis.4,kurtosis.5,kurtosis.6,kurtosis.7,...,std.70,std.71,std.72,kurtosis.73,max.73,mean.73,median.73,min.73,skew.73,std.73
0,21,2,7.180653,5.230309,0.249321,1.34762,1.482478,0.531371,1.481593,2.691455,...,0.054125,0.012226,0.012111,5.75889,0.459473,0.085629,0.071289,0.0,2.089872,0.061448
1,21,3,1.888963,0.760539,0.345297,2.295201,1.654031,0.067592,1.366848,1.054094,...,0.063831,0.014212,0.01774,2.824694,0.466309,0.084578,0.063965,0.0,1.716724,0.06933
2,21,5,0.527563,-0.077654,-0.27961,0.685883,1.93757,0.880839,-0.923192,-0.927232,...,0.04073,0.012691,0.014759,6.808415,0.375,0.053114,0.041504,0.0,2.193303,0.044861
3,21,134,0.918445,0.674147,0.577818,1.281117,0.933746,0.078177,1.199204,-0.175223,...,0.058766,0.016322,0.015819,4.731087,0.419434,0.06437,0.050781,0.0,1.806106,0.054623
4,21,583,-0.028032,0.509161,0.067235,0.465656,-0.448363,1.086958,-0.103174,-0.247502,...,0.058434,0.018185,0.017061,2.111479,0.270508,0.03676,0.025879,0.000977,1.296176,0.033785


### Prepare the train and test data with 10-fold cross validation

In [55]:
df = df[1:][np.isfinite(df['genre_id'])]
df["genre_id"]=pd.to_numeric(df["genre_id"], downcast='integer')

train, test = train_test_split(df, test_size=0.1)

X_train = train.iloc[:, range(2,df.shape[1])].values
Y_train = train.iloc[:, [0]].values
Y_train = np_utils.to_categorical(Y_train, 40)

X_test = test.iloc[:, range(2,df.shape[1])].values
Y_test = test.iloc[:, [0]].values

# Make the numeric genre id into categroical outputs 
Y_test = np_utils.to_categorical(Y_test, 40)
1

  """Entry point for launching an IPython kernel.


1

In [60]:
X_train.shape

(846, 40)

In [62]:
Y_train.shape

(846, 40)

## 3. Methods

We will apply random forest, svm and nerual network(RNN) to the data we have. A 10-fold cross validation will be used and the optimization will be based on grid search with multiple hyperparameters.

## 3.1 Random Forest

In [12]:
n_max = 0
msl_max = 0
mf_max = 0
acc_max = 0

print ("Processing FMA data upon Random Forest, hyperparameters are [n_estimators, min_samples_leaf, max_features]")

for n in [100, 500]:
    for msl in [5, 20]:
        for mf in [3, 10]:
            # Build the model
            rf = RandomForestClassifier(n_estimators = n,
                                        min_samples_leaf=msl,
                                        max_features=mf,                                        
                                        criterion='gini',)
            m = rf.fit(X_train, Y_train)
            Y_pred = m.predict(X_test)
            acc = metrics.accuracy_score(Y_test, Y_pred)
            print("Random forest with [%r, %r, %r] gets accuracy %r " % (
                n, msl, mf, acc))
            if acc_max < acc:
                  acc_max = acc
                  n_max = n
                  msl_max = msl
                  mf_max = mf
                

print("Best random forest with [%r, %r, %r] gets accuracy %r " % (
    n_max, msl_max, mf_max, acc_max))

Processing FMA data upon Random Forest, hyperparameters are [n_estimators, min_samples_leaf, max_features]
Random forest with [100, 5, 3] gets accuracy 0.25263157894736843 
Random forest with [100, 5, 10] gets accuracy 0.3157894736842105 
Random forest with [100, 20, 3] gets accuracy 0.2 
Random forest with [100, 20, 10] gets accuracy 0.21052631578947367 
Random forest with [500, 5, 3] gets accuracy 0.23157894736842105 
Random forest with [500, 5, 10] gets accuracy 0.3157894736842105 
Random forest with [500, 20, 3] gets accuracy 0.22105263157894736 
Random forest with [500, 20, 10] gets accuracy 0.22105263157894736 
Best random forest with [100, 5, 10] gets accuracy 0.3157894736842105 


## 3.2 Neural Network - RNN

In [13]:
# Reshape the data for RNN
X_train_rnn = np.reshape(X_train, (X_train.shape[0], 1, X_train.shape[1]))
X_test_rnn = np.reshape(X_test, (X_test.shape[0], 1, X_test.shape[1]))

Y_train_rnn = np.reshape(Y_train, (Y_train.shape[0], 1, Y_train.shape[1]))
Y_test_rnn = np.reshape(Y_test, (Y_test.shape[0], 1, Y_test.shape[1]))

print(X_train_rnn.shape)
print(X_test_rnn.shape)
print(Y_train_rnn.shape)
print(Y_test_rnn.shape)

(852, 1, 518)
(95, 1, 518)
(852, 1, 40)
(95, 1, 40)


In [14]:
do_max = 0
bias_max = ""
act_max = ""
acc_max = 0
batch_size = 50

print ("Processing FMA data upon RNN, hyperparameters are [dropout, bias_initializer, activation]")

for do in [0.3, 0.5, 0.7]:
        for bias in ["zeros", "Ones", "RandomNormal"]:
            for act in ["sigmoid", "tanh", "relu"]:
                model = Sequential()
                model.add(SimpleRNN(input_dim=518, output_dim=40, return_sequences=True))  
                model.add(Dropout(do))
                model.add(Dense(40, bias_initializer=bias))                
                model.add(Activation(act))
                model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
                model.summary()
                # train the data without std output
                hist = model.fit(X_train_rnn, Y_train_rnn, batch_size=batch_size, epochs=20, 
                          validation_data=(X_test_rnn, Y_test_rnn), verbose=0)
                print("RNN with [%r, %r, %r] gets accuracy %r " % (do, bias, act, acc))
                # get last accuracy
                acc = hist.history.get('acc')[-1]
                if acc_max < acc:
                    acc_max = acc
                    do_max = do
                    bias_max = bias
                    act_max = act

print("Best FMA with [%r, %r, %r] gets accuracy %r " % (
    do_max, bias_max, act_max, acc_max))

Processing FMA data upon RNN, hyperparameters are [dropout, bias_initializer, activation]


  del sys.path[0]
  del sys.path[0]


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
simple_rnn_1 (SimpleRNN)     (None, None, 40)          22360     
_________________________________________________________________
dropout_1 (Dropout)          (None, None, 40)          0         
_________________________________________________________________
dense_1 (Dense)              (None, None, 40)          1640      
_________________________________________________________________
activation_1 (Activation)    (None, None, 40)          0         
Total params: 24,000
Trainable params: 24,000
Non-trainable params: 0
_________________________________________________________________
RNN with [0.3, 'zeros', 'sigmoid'] gets accuracy 0.22105263157894736 
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
simple_rnn_2 (SimpleRNN)     (None, None, 40)          22360     
__

RNN with [0.3, 'RandomNormal', 'relu'] gets accuracy 0.011737088939533548 
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
simple_rnn_10 (SimpleRNN)    (None, None, 40)          22360     
_________________________________________________________________
dropout_10 (Dropout)         (None, None, 40)          0         
_________________________________________________________________
dense_10 (Dense)             (None, None, 40)          1640      
_________________________________________________________________
activation_10 (Activation)   (None, None, 40)          0         
Total params: 24,000
Trainable params: 24,000
Non-trainable params: 0
_________________________________________________________________
RNN with [0.5, 'zeros', 'sigmoid'] gets accuracy 0.34624413205004634 
_________________________________________________________________
Layer (type)                 Output Shape              Para

RNN with [0.5, 'RandomNormal', 'relu'] gets accuracy 0.011737088939533548 
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
simple_rnn_19 (SimpleRNN)    (None, None, 40)          22360     
_________________________________________________________________
dropout_19 (Dropout)         (None, None, 40)          0         
_________________________________________________________________
dense_19 (Dense)             (None, None, 40)          1640      
_________________________________________________________________
activation_19 (Activation)   (None, None, 40)          0         
Total params: 24,000
Trainable params: 24,000
Non-trainable params: 0
_________________________________________________________________
RNN with [0.7, 'zeros', 'sigmoid'] gets accuracy 0.31103286391972373 
_________________________________________________________________
Layer (type)                 Output Shape              Para

RNN with [0.7, 'RandomNormal', 'relu'] gets accuracy 0.01995305119720703 
Best FMA with [0.3, 'zeros', 'relu'] gets accuracy 0.42488263190632136 


## 3.3 SVM

In [84]:
X_train = train.iloc[:, range(2,df.shape[1])].values
Y_train = train.iloc[:, [0]].values

X_test = test.iloc[:, range(2,df.shape[1])].values
Y_test = test.iloc[:, [0]].values
1

  """Entry point for launching an IPython kernel.


1

In [115]:
kel_max = ""
C_max = 0
tol_max = 0
acc_max = 0

print ("Processing FMA data with SVM, hyperparameters are [kernel, C, tol]")

for kel in ["linear","rbf", "sigmoid"]:
    for c in [1, 3, 10]:
        for tol in [1e-2, 1e-3, 1e-4]:
            svm_model_linear = SVC(kernel = kel, C = c, tol = tol).fit(X_train, Y_train)
            acc = svm_model_linear.score(X_test, Y_test)
            print("SVM with [%r, %r, %r] gets accuracy %r " % (kel, c, tol, acc))
            if acc_max < acc:
                acc_max = acc
                kel_max = kel
                C_max = c
                tol_max = tol

print("Best SVM with [%r, %r, %r] gets accuracy %r " % (
    kel_max, C_max, tol_max, acc_max))

Processing FMA data with SVM, hyperparameters are [kernel, C, tol]


  y = column_or_1d(y, warn=True)


SVM with ['linear', 1, 0.01] gets accuracy 0.6914893617021277 
SVM with ['linear', 1, 0.001] gets accuracy 0.6914893617021277 
SVM with ['linear', 1, 0.0001] gets accuracy 0.6914893617021277 
SVM with ['linear', 3, 0.01] gets accuracy 0.6914893617021277 
SVM with ['linear', 3, 0.001] gets accuracy 0.6914893617021277 
SVM with ['linear', 3, 0.0001] gets accuracy 0.6914893617021277 
SVM with ['linear', 10, 0.01] gets accuracy 0.6914893617021277 
SVM with ['linear', 10, 0.001] gets accuracy 0.6914893617021277 
SVM with ['linear', 10, 0.0001] gets accuracy 0.6914893617021277 
SVM with ['rbf', 1, 0.01] gets accuracy 0.39361702127659576 
SVM with ['rbf', 1, 0.001] gets accuracy 0.39361702127659576 
SVM with ['rbf', 1, 0.0001] gets accuracy 0.39361702127659576 
SVM with ['rbf', 3, 0.01] gets accuracy 0.39361702127659576 
SVM with ['rbf', 3, 0.001] gets accuracy 0.39361702127659576 
SVM with ['rbf', 3, 0.0001] gets accuracy 0.39361702127659576 
SVM with ['rbf', 10, 0.01] gets accuracy 0.393617