## Springboard Data Science Track Capstone Project 2
### Music Genre Classification from Audio Samples
### by Morgan Fry
### Extended Modeling -- Time Distributed CNN

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt  
%matplotlib inline
import pickle

#import keras
#from keras.layers import Activation, Dense, Conv1D, Conv2D, MaxPooling1D, Flatten, Reshape, Dropout
#from keras.models import Sequential

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models
import tensorflow_io as tfio

from sklearn.utils import shuffle
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [2]:
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)

    except RuntimeError as e:
        print(e)
else:
    print('no gpu')


In [3]:
#features = pd.read_csv('data/features.csv', index_col=0, header=[0, 1, 2])
fma_single = pickle.load(open("saved/fma_single.p", "rb"))

In [4]:
fma_multi=pickle.load(open("saved/fma_multi.p","rb"))

In [5]:
fma_single

Unnamed: 0_level_0,split,subset,filepath,genre_top
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2,training,small,data/fma_large/000/000002.mp3,Hip-Hop
3,training,medium,data/fma_large/000/000003.mp3,Hip-Hop
5,training,small,data/fma_large/000/000005.mp3,Hip-Hop
10,training,small,data/fma_large/000/000010.mp3,Pop
134,training,medium,data/fma_large/000/000134.mp3,Hip-Hop
...,...,...,...,...
155315,training,large,data/fma_large/155/155315.mp3,Rock
155316,training,large,data/fma_large/155/155316.mp3,Rock
155317,training,large,data/fma_large/155/155317.mp3,Rock
155318,training,large,data/fma_large/155/155318.mp3,Rock


For the baseline model we are going to use the 'small' set of 8000 tracks in 8 classes, and train the model on the mfccs of the tracks which we extracted earlier.

In [6]:
#load mfccs
mfcc_df=pickle.load(open("saved/mfcc_small.p","rb"))

In [7]:
mfcc_df.shape

(7997, 23232)

In [8]:
mfcc_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,23222,23223,23224,23225,23226,23227,23228,23229,23230,23231
2,-299.542053,-202.513412,-159.976059,-115.456413,-80.415161,-67.523239,-57.689404,-65.46479,-84.703659,-96.179939,...,-5.861199,-4.460218,-6.887738,-7.665272,6.717929,12.76914,11.439809,9.167293,7.916427,5.665779
5,-324.059723,-227.421249,-186.883606,-158.489868,-94.902466,-95.517578,-106.146324,-70.498611,-45.091522,-41.156029,...,-6.403229,-4.675876,5.932048,19.163589,24.13835,12.146925,1.931949,-0.370714,1.137535,-0.625799
10,-71.376122,-32.29673,-29.356266,-38.037708,-44.881695,-45.910717,-31.081173,8.275231,28.077381,6.006516,...,9.814901,0.40905,-6.546686,-4.728213,-2.329597,4.121199,8.3372,8.429386,10.133602,11.033691
140,-561.32428,-523.545837,-449.814423,-342.445557,-306.727356,-308.809631,-307.733276,-317.345245,-362.774811,-419.306946,...,1.818927,3.912658,4.593844,2.139801,-4.394735,-5.267244,-6.697974,-5.78347,-5.886441,-3.70659
141,-464.980743,-371.434326,-330.917175,-342.813904,-362.01355,-368.364441,-361.755188,-358.438049,-365.66864,-375.68573,...,0.601643,-0.99506,0.14623,3.676285,-1.177926,-2.822582,4.884428,3.736303,2.701882,-2.805002


In [9]:
mfcc_df.replace([np.inf, -np.inf], np.nan)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,23222,23223,23224,23225,23226,23227,23228,23229,23230,23231
2,-299.542053,-202.513412,-159.976059,-115.456413,-80.415161,-67.523239,-57.689404,-65.464790,-84.703659,-96.179939,...,-5.861199,-4.460218,-6.887738,-7.665272,6.717929,12.769140,11.439809,9.167293,7.916427,5.665779
5,-324.059723,-227.421249,-186.883606,-158.489868,-94.902466,-95.517578,-106.146324,-70.498611,-45.091522,-41.156029,...,-6.403229,-4.675876,5.932048,19.163589,24.138350,12.146925,1.931949,-0.370714,1.137535,-0.625799
10,-71.376122,-32.296730,-29.356266,-38.037708,-44.881695,-45.910717,-31.081173,8.275231,28.077381,6.006516,...,9.814901,0.409050,-6.546686,-4.728213,-2.329597,4.121199,8.337200,8.429386,10.133602,11.033691
140,-561.324280,-523.545837,-449.814423,-342.445557,-306.727356,-308.809631,-307.733276,-317.345245,-362.774811,-419.306946,...,1.818927,3.912658,4.593844,2.139801,-4.394735,-5.267244,-6.697974,-5.783470,-5.886441,-3.706590
141,-464.980743,-371.434326,-330.917175,-342.813904,-362.013550,-368.364441,-361.755188,-358.438049,-365.668640,-375.685730,...,0.601643,-0.995060,0.146230,3.676285,-1.177926,-2.822582,4.884428,3.736303,2.701882,-2.805002
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
154308,-313.429077,-285.330048,-283.423401,-287.088348,-289.263672,-184.150192,-41.746201,-12.892635,-55.076210,-115.045479,...,10.231204,11.211655,0.058539,-11.264427,-6.596434,4.120525,6.856134,5.343898,2.773130,1.698187
154309,-486.301819,-482.713928,-496.109802,-501.279755,-502.606598,-506.784668,-509.464905,-310.994598,-145.997116,-127.237030,...,-3.249464,-10.734337,-11.473764,-8.660153,-7.335606,-11.601185,-16.454685,-15.517137,-11.830728,-11.240954
154413,-155.809250,-137.494919,-114.974205,-140.923645,-162.126450,-160.300812,-160.252792,-164.502640,-163.036118,-166.671722,...,-16.811752,-17.902248,-22.706905,-22.197693,-24.581553,-25.488754,-21.233810,-23.033970,-21.506649,-22.316118
154414,-157.556030,-162.392120,-172.492035,-185.008377,-195.222931,-197.421844,-194.316422,-151.761810,-53.744122,0.694292,...,-14.949778,-13.652007,-14.656978,-12.779702,-15.503021,-17.758909,-18.673512,-14.957970,-11.633366,-14.272188


In [10]:
mfcc_df.fillna(method='ffill',inplace=True)

In [11]:
#use the 8000 track balanced subset for baseline modeling

drop3=pickle.load(open("saved/drop3.p","rb"))
fma_single.drop(drop3, inplace=True)
subset = fma_single.index[fma_single['subset'] == 'small']

In [12]:
fma_small=fma_single.loc[subset]

In [13]:
mfcc_sub=mfcc_df.loc[subset]

In [14]:
#use the built in train/test/validation split in case we want to compare to other models over this dataset
train = fma_small.index[fma_small['split'] == 'training']
val = fma_small.index[fma_small['split'] == 'validation']
test = fma_small.index[fma_small['split'] == 'test']

In [15]:
print(len(train),len(val),len(test))

6310 793 794


In [16]:
#preprocessing single label

#enc=MultiLabelBinarizer()
enc=LabelEncoder()
labels=fma_small['genre_top']
#labels=y_df

# Split in training, validation and testing sets.

#single label
y_train = enc.fit_transform(labels[train])
y_val = enc.transform(labels[val])
y_test = enc.transform(labels[test])
#y_train = track_sub['label']

X_train = mfcc_sub.loc[train].values
X_val = mfcc_sub.loc[val].values
X_test = mfcc_sub.loc[test].values
    
X_train, y_train = shuffle(X_train, y_train, random_state=42)

In [17]:
# Standardize features
scaler = StandardScaler(copy=False)
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

In [18]:
X_train.shape

(6310, 23232)

In [19]:
keras.backend.clear_session()

#define CNN-LSTM
model = models.Sequential()

model.add(layers.Reshape((968,24,1),input_shape=(X_train.shape[1],)))
model.add(layers.TimeDistributed(layers.Conv1D(64, 3, activation='relu')))
model.add(layers.TimeDistributed(layers.MaxPooling1D(2)))
model.add(layers.TimeDistributed(layers.Conv1D(128, 3, activation='relu')))

model.add(layers.TimeDistributed(layers.Conv1D(128, 3, activation='relu')))
model.add(layers.TimeDistributed(layers.MaxPooling1D(2)))
model.add(layers.TimeDistributed(layers.Flatten()))

model.add(layers.LSTM(units=64, dropout=0.2, return_sequences=False))
model.add(layers.Flatten())
model.add(layers.Dense(128, activation='relu'))
model.add(layers.Dropout(.2))
model.add(layers.Dense(8, activation='softmax'))

loss=tf.keras.losses.SparseCategoricalCrossentropy()
optimizer = tf.keras.optimizers.SGD(learning_rate=0.005, momentum=0.001, nesterov=True)
#optimizer = tf.keras.optimizers.Adam(learning_rate=0.00001, beta_1=0.9, beta_2=0.999, epsilon=1e-07, amsgrad=True)
model.compile(optimizer=optimizer,
              loss=loss,
              metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
reshape (Reshape)            (None, 968, 24, 1)        0         
_________________________________________________________________
time_distributed (TimeDistri (None, 968, 22, 64)       256       
_________________________________________________________________
time_distributed_1 (TimeDist (None, 968, 11, 64)       0         
_________________________________________________________________
time_distributed_2 (TimeDist (None, 968, 9, 128)       24704     
_________________________________________________________________
time_distributed_3 (TimeDist (None, 968, 7, 128)       49280     
_________________________________________________________________
time_distributed_4 (TimeDist (None, 968, 3, 128)       0         
_________________________________________________________________
time_distributed_5 (TimeDist (None, 968, 384)          0

In [20]:
%%time
with tf.device('/GPU:0'):
    history=model.fit(X_train,
              y_train,
              epochs=500,
              batch_size=16,
              validation_data=(X_val, y_val))

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500


Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78/500
Epoch 79/500
Epoch 80/500
Epoch 81/500
Epoch 82/500
Epoch 83/500
Epoch 84/500
Epoch 85/500
Epoch 86/500
Epoch 87/500
Epoch 88/500
Epoch 89/500
Epoch 90/500
Epoch 91/500
Epoch 92/500
Epoch 93/500
Epoch 94/500
Epoch 95/500
Epoch 96/500
Epoch 97/500
Epoch 98/500
Epoch 99/500
Epoch 100/500
Epoch 101/500
Epoch 102/500
Epoch 103/500
Epoch 104/500
Epoch 105/500
Epoch 106/500
Epoch 107/500
Epoch 108/500
Epoch 109/500
Epoch 110/500
Epoch 111/500
Epoch 112/500
Epoch 113/500
Epoch 114/500
Epoch 115/500
Epoch 116/500
Epoch 117/500
Epoch 118/500
Epoch 119/500
Epoch 120/500
Epoch 121/500
Epoch 122/500
Epoch 123/500
Epoch 124/500
Epoch 125/500
Epoch 126/500
Epoch 127/500
Epoch 128/500
Epoch 129/500
Epoch 130/500
Epoch 1

Epoch 168/500
Epoch 169/500
Epoch 170/500
Epoch 171/500
Epoch 172/500
Epoch 173/500
Epoch 174/500
Epoch 175/500
Epoch 176/500
Epoch 177/500
Epoch 178/500
Epoch 179/500
Epoch 180/500
Epoch 181/500
Epoch 182/500
Epoch 183/500
Epoch 184/500
Epoch 185/500
Epoch 186/500
Epoch 187/500
Epoch 188/500
Epoch 189/500
Epoch 190/500
Epoch 191/500
Epoch 192/500
Epoch 193/500
Epoch 194/500
Epoch 195/500
Epoch 196/500
Epoch 197/500
Epoch 198/500
Epoch 199/500
Epoch 200/500
Epoch 201/500
Epoch 202/500
Epoch 203/500
Epoch 204/500
Epoch 205/500
Epoch 206/500
Epoch 207/500
Epoch 208/500
Epoch 209/500
Epoch 210/500
Epoch 211/500
Epoch 212/500
Epoch 213/500
Epoch 214/500
Epoch 215/500
Epoch 216/500
Epoch 217/500
Epoch 218/500
Epoch 219/500
Epoch 220/500
Epoch 221/500
Epoch 222/500
Epoch 223/500
Epoch 224/500
Epoch 225/500
Epoch 226/500
Epoch 227/500
Epoch 228/500
Epoch 229/500
Epoch 230/500
Epoch 231/500
Epoch 232/500
Epoch 233/500
Epoch 234/500
Epoch 235/500
Epoch 236/500
Epoch 237/500
Epoch 238/500
Epoch 

Epoch 278/500
Epoch 279/500
Epoch 280/500
Epoch 281/500
Epoch 282/500
Epoch 283/500
Epoch 284/500
Epoch 285/500
Epoch 286/500
Epoch 287/500
Epoch 288/500
Epoch 289/500
Epoch 290/500
Epoch 291/500
Epoch 292/500
Epoch 293/500
Epoch 294/500
Epoch 295/500
Epoch 296/500
Epoch 297/500
Epoch 298/500
Epoch 299/500
Epoch 300/500
Epoch 301/500
Epoch 302/500
Epoch 303/500
Epoch 304/500
Epoch 305/500
Epoch 306/500
Epoch 307/500
Epoch 308/500
Epoch 309/500
Epoch 310/500
Epoch 311/500
Epoch 312/500
Epoch 313/500
Epoch 314/500
Epoch 315/500
Epoch 316/500
Epoch 317/500
Epoch 318/500
Epoch 319/500
Epoch 320/500
Epoch 321/500
Epoch 322/500
Epoch 323/500
Epoch 324/500
Epoch 325/500
Epoch 326/500
Epoch 327/500
Epoch 328/500
Epoch 329/500
Epoch 330/500
Epoch 331/500
Epoch 332/500
Epoch 333/500
Epoch 334/500
Epoch 335/500
Epoch 336/500
Epoch 337/500
Epoch 338/500
Epoch 339/500
Epoch 340/500
Epoch 341/500
Epoch 342/500
Epoch 343/500
Epoch 344/500
Epoch 345/500
Epoch 346/500
Epoch 347/500
Epoch 348/500
Epoch 

Epoch 388/500
Epoch 389/500
Epoch 390/500
Epoch 391/500
Epoch 392/500
Epoch 393/500
Epoch 394/500
Epoch 395/500
Epoch 396/500
Epoch 397/500
Epoch 398/500
Epoch 399/500
Epoch 400/500
Epoch 401/500
Epoch 402/500
Epoch 403/500
Epoch 404/500
Epoch 405/500
Epoch 406/500
Epoch 407/500
Epoch 408/500
Epoch 409/500
Epoch 410/500
Epoch 411/500
Epoch 412/500
Epoch 413/500
Epoch 414/500
Epoch 415/500
Epoch 416/500
Epoch 417/500
Epoch 418/500
Epoch 419/500
Epoch 420/500
Epoch 421/500
Epoch 422/500
Epoch 423/500
Epoch 424/500
Epoch 425/500
Epoch 426/500
Epoch 427/500
Epoch 428/500
Epoch 429/500
Epoch 430/500
Epoch 431/500
Epoch 432/500
Epoch 433/500
Epoch 434/500
Epoch 435/500
Epoch 436/500
Epoch 437/500
Epoch 438/500
Epoch 439/500
Epoch 440/500
Epoch 441/500
Epoch 442/500
Epoch 443/500
Epoch 444/500
Epoch 445/500
Epoch 446/500
Epoch 447/500
Epoch 448/500
Epoch 449/500
Epoch 450/500
Epoch 451/500
Epoch 452/500
Epoch 453/500
Epoch 454/500
Epoch 455/500
Epoch 456/500
Epoch 457/500
Epoch 458/500
Epoch 

Epoch 498/500
Epoch 499/500
Epoch 500/500
CPU times: user 5h 36min 49s, sys: 14min 57s, total: 5h 51min 46s
Wall time: 5h 50min 16s


We can see that the validation and training set results diverge when the model achieves about 45% accuracy over the validation set, just as with the CNN model.

In [24]:
#make clear this is test set
y_pred=model.predict_classes(X_test)
print('Classification Report (test set)')
print(classification_report(y_test,y_pred))

Classification Report (test set)
              precision    recall  f1-score   support

           0       0.43      0.60      0.50        96
           1       0.29      0.22      0.25        98
           2       0.28      0.12      0.17       100
           3       0.58      0.57      0.58       100
           4       0.26      0.45      0.33       100
           5       0.50      0.21      0.30       100
           6       0.21      0.22      0.21       100
           7       0.41      0.50      0.45       100

    accuracy                           0.36       794
   macro avg       0.37      0.36      0.35       794
weighted avg       0.37      0.36      0.35       794



Over the set of:  
* 8 classes
* 1000 samples each class

The CNN-LSTM model shows a modest improvement over both the CNN and LSTM models

| Model | Accuracy(all classes) | Train Time |
| --- | --- | --- |
| CNN | .38 | 15m |
| LSTM | .34 | 90m |
| CNN-LSTM | .36 | 5m |
