In [7]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sn
from keras.models import Sequential, Model, load_model
from keras.layers import Dense, Dropout, Flatten, LSTM
from keras.layers import Conv2D, MaxPooling2D, Conv1D, MaxPooling1D, MaxPool1D, GaussianNoise, GlobalMaxPooling1D
from keras.layers import BatchNormalization
from keras.layers import LeakyReLU
from keras.callbacks import ModelCheckpoint
from sklearn.metrics import confusion_matrix

Load in the preprocessed data

In [8]:
from save_utils import load_sliced_numpy_array

# Features are read through the project helper (presumably it reassembles the
# numbered 'melspec_features_XXX.npy' slices -- confirm in save_utils);
# the labels live in a single .npy file.
melspec_data = load_sliced_numpy_array('melspec_features')
labels = np.load('data/labels.npy')

# Report the shape of each loaded array.
for tag, arr in (("Features:", melspec_data), ("Labels:", labels)):
    print(tag, arr.shape)

Loaded 12 files:
    melspec_features_001.npy
    melspec_features_002.npy
    melspec_features_003.npy
    melspec_features_004.npy
    melspec_features_005.npy
    melspec_features_006.npy
    melspec_features_007.npy
    melspec_features_008.npy
    melspec_features_009.npy
    melspec_features_010.npy
    melspec_features_011.npy
    melspec_features_012.npy
Features: (4068, 33088)
Labels: (4068, 5)


Split the data into train and test set

In [12]:
# Fraction of the samples used for training; the remainder is held out for testing.
data_split = 0.8

# Features and labels are split by row index, so they must line up row-for-row.
assert melspec_data.shape[0] == labels.shape[0], "features/labels row count mismatch"

# Derive the boundary from data_split so changing the constant actually
# changes the split (it was previously hard-coded to 0.8 here).
split_index = int(data_split*melspec_data.shape[0])

# NOTE(review): this is a contiguous, unshuffled split. If the samples are
# stored grouped by class, the test set will not be representative --
# confirm the data was shuffled during preprocessing.
mel_train = melspec_data[:split_index,:]
mel_test = melspec_data[split_index:,:]

lab_train = labels[:split_index,:]
lab_test = labels[split_index:,:]

print(mel_train.shape)
print(mel_test.shape)

print(lab_train.shape)
print(lab_test.shape)

(3254, 33088)
(814, 33088)
(3254, 5)
(814, 5)


Normalize the data by the training-set maximum (the data is reshaped to (3254, 64, 517, 1) in the next step)

In [19]:
def normalization(mel_train, mel_test):
    """Scale both splits by the maximum value of the training split.

    Only the training data determines the scale factor, so no statistic of
    the test data is used for the normalization.

    Parameters
    ----------
    mel_train : numpy.ndarray
        Training features; its global maximum is the divisor.
    mel_test : numpy.ndarray
        Test features, divided by the same training maximum.

    Returns
    -------
    tuple of numpy.ndarray
        ``(mel_train, mel_test)`` scaled and cast to ``float32``.

    Raises
    ------
    ValueError
        If the training maximum is zero, since dividing by it would
        silently fill the result with NaN/inf.
    """
    maximum = np.amax(mel_train)
    if maximum == 0:
        raise ValueError("cannot normalize: maximum of mel_train is 0")
    mel_train = mel_train/maximum
    mel_test = mel_test/maximum
    return (mel_train.astype(np.float32), mel_test.astype(np.float32))

# Scale train and test with the training-set maximum; results are float32.
mel_train_n, mel_test_n = normalization(mel_train, mel_test)

Reshape the mel-spectrograms to the model's input dimensions

In [21]:
# Number of mel bands per spectrogram; each flattened feature row is unfolded
# into n_mels rows.
n_mels = 64


def _to_model_input(mel):
    """Reshape one (samples, features) matrix to (samples, n_mels, frames, 1)."""
    n, m = mel.shape
    # Integer arithmetic instead of int(m / n_mels); a non-zero remainder means
    # the row cannot be unfolded into n_mels equally long bands.
    frames, remainder = divmod(m, n_mels)
    if remainder:
        raise ValueError(
            "feature length {} is not divisible by n_mels={}".format(m, n_mels))
    return mel.reshape((n, n_mels, frames, 1))


# Reshaping Mel-Spectrogram
def reshape_melspectogram(mel_train, mel_test):
    """Unfold flattened mel-spectrograms into 4-D arrays for the 2-D CNN.

    Parameters
    ----------
    mel_train, mel_test : numpy.ndarray
        2-D arrays of shape ``(samples, n_mels * frames)``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(mel_train, mel_test)``, each of shape
        ``(samples, n_mels, frames, 1)``; the trailing axis of size 1 is the
        single channel consumed by the Conv2D input.

    Raises
    ------
    ValueError
        If an input is not 2-D or its feature length is not a multiple of
        the module-level ``n_mels``.
    """
    return _to_model_input(mel_train), _to_model_input(mel_test)

# The normalized arrays are replaced by their 4-D versions. Re-running this
# cell on its own fails, because the function unpacks a 2-D shape.
mel_train_n, mel_test_n = reshape_melspectogram(mel_train_n, mel_test_n)

for reshaped in (mel_train_n, mel_test_n):
    print(reshaped.shape)

(3254, 64, 517, 1)
(814, 64, 517, 1)


Define the short-chunk CNN model

In [28]:
# Define the model
def shortchunckcnn(input_shape, output_shape):
    """Build and compile the short-chunk CNN classifier.

    The network stacks five stages of ``Conv2D`` (3x3 kernel, ReLU, 'same'
    padding) followed by ``MaxPooling2D`` (4x4, 'same' padding), with
    8, 16, 32, 64 and 64 filters respectively. The result is flattened and
    passed through a 32-unit ReLU dense layer and a softmax output layer.

    Parameters
    ----------
    input_shape : tuple
        Shape of one input sample, given to the first convolution.
    output_shape : int
        Number of units in the softmax output layer.

    Returns
    -------
    The compiled ``Sequential`` model (Adam optimizer, categorical
    cross-entropy loss). The model summary is printed as a side effect.
    """
    stage_filters = (8, 16, 32, 64, 64)

    model = Sequential()
    for stage, n_filters in enumerate(stage_filters):
        conv_kwargs = {"activation": 'relu', "padding": 'same'}
        if stage == 0:
            # Only the first layer declares the input shape.
            conv_kwargs["input_shape"] = input_shape
        model.add(Conv2D(n_filters, (3,3), **conv_kwargs))
        model.add(MaxPooling2D((4,4), padding='same'))

    # Classification head.
    model.add(Flatten())
    model.add(Dense(32, activation='relu'))
    model.add(Dense(output_shape, activation='softmax'))

    model.compile(optimizer='Adam', loss='categorical_crossentropy')
    model.summary()

    return model

# Per-sample input shape (everything after the batch axis) and the number of
# label columns determine the network's input and output sizes.
sample_shape = mel_train_n.shape[1:]
n_outputs = labels.shape[1]
model = shortchunckcnn(sample_shape, n_outputs)

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_10 (Conv2D)          (None, 64, 517, 8)        80        
                                                                 
 max_pooling2d_5 (MaxPooling  (None, 16, 130, 8)       0         
 2D)                                                             
                                                                 
 conv2d_11 (Conv2D)          (None, 16, 130, 16)       1168      
                                                                 
 max_pooling2d_6 (MaxPooling  (None, 4, 33, 16)        0         
 2D)                                                             
                                                                 
 conv2d_12 (Conv2D)          (None, 4, 33, 32)         4640      
                                                                 
 max_pooling2d_7 (MaxPooling  (None, 1, 9, 32)        

Train the defined model

In [30]:
# Save a checkpoint at the end of every epoch. An integer save_freq counts
# *batches*, not epochs, so save_freq=5 rewrote the same epoch-numbered file
# every 5 batches; 'epoch' matches the {epoch:03d} filename pattern.
checkpoint = ModelCheckpoint("models/short_chunk_cnn_{epoch:03d}.h5", save_freq='epoch')

epochs=3

model.fit(mel_train_n, lab_train, epochs=epochs, callbacks=[checkpoint], batch_size=32, verbose=1)

# Persist the final model; the filename records the number of epochs trained.
model.save("models/short_chunk_cnn_complete_{epochs}.h5".format(epochs=epochs))

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [32]:
# Reload the model saved at the end of training.
model = load_model("models/short_chunk_cnn_complete_{epochs}.h5".format(epochs=epochs))

# Training Accuracy: compare the arg-max of the predicted class scores with
# the arg-max of the one-hot labels.
train_scores = model.predict(mel_train_n)
y_pred = train_scores.argmax(axis=-1)
y_true = lab_train.argmax(axis=-1)

correct = int((y_pred == y_true).sum())
acc = np.round(correct / len(y_pred), 4) * 100

print(acc)

51.51


In [None]:

# Report the training accuracy computed in the previous cell (correct, y_pred
# and acc still hold the training values at this point).
print("Train Accuracy: ", correct, "/", len(y_pred), " = ", acc, "%")

# Testing Accuracy
# Predict on the normalized, reshaped test set -- the raw mel_test array is
# 2-D and unscaled, so it does not match what the model was trained on.
y_pred = model.predict(mel_test_n)
y_pred = np.argmax(y_pred, axis= -1)
# The held-out labels are stored in lab_test.
y_true = np.argmax(lab_test, axis= -1)

correct = len(y_pred) - np.count_nonzero(y_pred - y_true)
acc = correct/ len(y_pred)
acc = np.round(acc, 4) * 100
print("Testing Accuracy", acc)

n_classes = lab_test.shape[1]
class_names = ["Blues", "Classical", "Country", "Disco", "Hiphop", "Jazz", "Metal", "Pop", "Reggae", "Rock"]
if len(class_names) != n_classes:
    # TODO(review): the list above has ten genre names but the labels have
    # n_classes columns; replace these placeholders with the real class names.
    class_names = ["Class {}".format(i) for i in range(n_classes)]

# Pass the full label set explicitly so the matrix is always
# n_classes x n_classes, even if a class never occurs in the test split.
conf_mat = confusion_matrix(y_true, y_pred, labels=np.arange(n_classes), normalize= 'true')
conf_mat = np.round(conf_mat, 2)

conf_mat_df = pd.DataFrame(conf_mat, columns= class_names, index= class_names)

plt.figure(figsize = (10,7), dpi = 200)
sn.set(font_scale=1.4)
sn.heatmap(conf_mat_df, annot=True, annot_kws={"size": 16}) # font size
plt.tight_layout()
plt.savefig(os.getcwd() + "/ensemble_mel_conf_mat1.png")