In [None]:
!pip install librosa


In [38]:
import pandas as pd
import numpy as np
import librosa
import os
from glob import glob
import matplotlib.pyplot as plt
from glob import iglob

In [39]:
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.utils import to_categorical
import tensorflow.keras.backend as K
from tensorflow.keras import regularizers
from tensorflow.keras.layers import Lambda,Conv1D, MaxPooling1D,Activation, Dense,BatchNormalization
from tensorflow.keras.models import Sequential

import pickle


In [40]:
DATA_AUDIO_DIR= './Padmaja/EdvancerClass/DeepLearning/Project2MusicGenreIdentification/audio'
TARGET_SR=8000
OUTPUT_DIR='./Padmaja/EdvancerClass/DeepLearning/Project2MusicGenreIdentification/output'
OUTPUT_DIR_TRAIN= os.path.join(OUTPUT_DIR,"train")
OUTPUT_DIR_TEST=os.path.join(OUTPUT_DIR,"test")
AUDIO_LENGTH=10000

In [41]:
os.makedirs('/Padmaja/EdvancerClass/DeepLearning/Project2MusicGenreIdentification/output/train')
os.makedirs('/Padmaja/EdvancerClass/DeepLearning/Project2MusicGenreIdentification/output/test')

In [42]:
class_ids={
    'blues':0,
    'classical':1,
    'country':2,
    'disco':3,
    'hiphop':4,
    'jazz':5,
    'metal':6,
    'pop':7,
    'reggae':8,
    'rock':9    
}
def extract_class_id(wav_filename):
    if 'blues' in wav_filename:
        return class_ids.get('blues')
    elif 'classical' in wav_filename:
        return class_ids.get('classical')
    elif 'country' in wav_filename:
        return class_ids.get('country')
    elif 'disco' in wav_filename:
        return class_ids.get('disco')
    elif 'hiphop' in wav_filename:
        return class_ids.get('hiphop')
    elif 'jazz' in wav_filename:
        return class_ids.get('jazz')
    elif 'metal' in wav_filename:
        return class_ids.get('metal')
    elif 'pop' in wav_filename:
        return class_ids.get('pop')
    elif 'reggae' in wav_filename:
        return class_ids.get('reggae')
    elif 'rock' in wav_filename:
        return class_ids.get('rock')
    else:
        return class_ids.get('unlabelled')

In [43]:
def read_audio_from_filename(filename, target_sr):
    audio, _ = librosa.load(filename, sr=target_sr, mono=True)
    audio = audio.reshape(-1, 1)
    return audio

In [44]:
def convert_data():
    for i, wav_filename in enumerate(iglob(os.path.join(DATA_AUDIO_DIR, '**/**.wav'), recursive=True)):
        class_id = extract_class_id(wav_filename)
        audio_buf = read_audio_from_filename(wav_filename, target_sr=TARGET_SR)
        # normalize mean 0, variance 1
        audio_buf = (audio_buf - np.mean(audio_buf)) / np.std(audio_buf)
        original_length = len(audio_buf)
        print(i, wav_filename, original_length, np.round(np.mean(audio_buf), 4), np.std(audio_buf))
        if original_length < AUDIO_LENGTH:
            audio_buf = np.concatenate((audio_buf, np.zeros(shape=(AUDIO_LENGTH - original_length, 1))))
            print('PAD New length =', len(audio_buf))
        elif original_length > AUDIO_LENGTH:
            audio_buf = audio_buf[0:AUDIO_LENGTH]
            print('CUT New length =', len(audio_buf))

        output_folder = OUTPUT_DIR_TRAIN
        if i // 10 == 0:
            output_folder = OUTPUT_DIR_TEST

        output_filename = os.path.join(output_folder, str(i) + '.pkl')

        out = {'class_id': class_id,
               'audio': audio_buf,
               'sr': TARGET_SR}
        w=open(output_filename,'wb')
        pickle.dump(out,w)
        w.close()

In [45]:
convert_data()

In [46]:
def m5(num_classes=5):
    print("Using model M5:")
    m=Sequential()
    m.add(Conv1D(128,
                input_shape=[AUDIO_LENGTH,1],
               kernel_size=80,
               strides=4,
               padding='same',
               kernel_initializer='glorot_uniform',
               kernel_regularizer=regularizers.l2(l=0.0001)))
    m.add(BatchNormalization())
    m.add(Activation('relu'))
    m.add(MaxPooling1D(pool_size=4,strides=None))
    m.add(Conv1D(128,
                kernel_size=3,
                strides=1,
                padding='same',
                kernel_initializer='glorot_uniform',
                kernel_regularizer=regularizers.l2(l=0.0001)))
    m.add(BatchNormalization())
    m.add(Activation('relu'))
    m.add(MaxPooling1D(pool_size=4,strides=None))
    m.add(Conv1D(256,
                kernel_size=3,
                strides=1,
                padding='same',
                kernel_initializer='glorot_uniform',
                kernel_regularizer=regularizers.l2(l=0.0001)))
    m.add(BatchNormalization())
    m.add(Activation('relu'))
    m.add(MaxPooling1D(pool_size=4,strides=None))
    m.add(Conv1D(512,
                kernel_size=3,
                strides=1,
                padding='same',
                kernel_initializer='glorot_uniform',
                kernel_regularizer=regularizers.l2(l=0.0001)))
    m.add(BatchNormalization())
    m.add(Activation('relu'))
    m.add(MaxPooling1D(pool_size=4,strides=None))
    m.add(Lambda(lambda x: K.mean(x, axis=1)))   # Same as GAP for 1D Conv Layer
    m.add(Dense(num_classes, activation='softmax'))
    return m

In [47]:
def get_data(file_list):
    def load_into(_filename, _x, _y):
        with open(_filename, 'rb') as f:
            audio_element=pickle.load(f)
            _x.append(audio_element['audio'])
            _y.append(audio_element['class_id'])
            
    x, y = [], []
    for filename in file_list:
        load_into(filename,x,y)
    return np.array(x),np.array(y)

In [48]:
num_classes=5
model = m5(num_classes=num_classes)

if model is None:
    exit("Something went wrong!!")
    
model.compile(optimizer='adam',
             loss='categorical_crossentropy',
             metrics=['accuracy'])
print(model.summary())

train_files = glob(os.path.join(OUTPUT_DIR_TRAIN, '**.pkl'))
x_tr, y_tr = get_data(train_files)
y_tr = to_categorical(y_tr, num_classes=num_classes)

test_files = glob(os.path.join(OUTPUT_DIR_TEST, '**.pkl'))
x_te, y_te = get_data(test_files)
y_te = to_categorical(y_te, num_classes=num_classes)

print('x_tr.shape =', x_tr.shape)
print('y_tr.shape =', y_tr.shape)
print('x_te.shape =', x_te.shape)
print('y_te.shape =', y_te.shape)

# if the accuracy does not increase over 10 epochs, reduce the learning rate by half.
#reduce_lr = ReduceLROnPlateau(monitor='acc', factor=0.5, patience=10, min_lr=0.0001, verbose=1)
batch_size = 128
model.fit(x=x_tr,
          y=y_tr,
          batch_size=batch_size,
          epochs=400,
          verbose=1,
          shuffle=True,
          validation_data=(x_te, y_te))   #callbacks=[reduce_lr]

Using model M5:
Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_16 (Conv1D)           (None, 2500, 128)         10368     
_________________________________________________________________
batch_normalization_16 (Batc (None, 2500, 128)         512       
_________________________________________________________________
activation_16 (Activation)   (None, 2500, 128)         0         
_________________________________________________________________
max_pooling1d_16 (MaxPooling (None, 625, 128)          0         
_________________________________________________________________
conv1d_17 (Conv1D)           (None, 625, 128)          49280     
_________________________________________________________________
batch_normalization_17 (Batc (None, 625, 128)          512       
_________________________________________________________________
activation_17 (Activation)   (None, 62

ValueError: Error when checking input: expected conv1d_16_input to have 3 dimensions, but got array with shape (0, 1)