In [10]:
### Load necessary libraries ###
import glob
import os
import librosa
import librosa.display
import numpy as np
#import cupy as cp
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, confusion_matrix

import tensorflow as tf
from tensorflow import keras

ModuleNotFoundError: No module named 'cupy'

# Global Configuration variables

In [8]:
# Debug and test variables
REDUCED_MODE = 1  # Do not process all the audio in the dataset (for faster execution during script development)
REDUCED_MODE_AUDIONUM = 100 # Number of dataset audios per folder to process in the reduced-mode.

# Audio processing variables
AUDIO_SR = 44100 # Audio sampling in Hertz
AUDIO_N_FFT = 2048 # Samples per FFT window
AUDIO_WINSIZE = 1024
AUDIO_HOP_LEN = 512
AUDIO_N_MELS = 60
AUDIOSEG_SIZE = 41 # windows per subAudio / segment
AUDIOSEG_OVERLAP = 0.5 # normalized


# Methods for preprocessing and feature extraction

In [None]:
# Divide an audio file into shorter parts.
def audioSegmentation(audioData, subAudioSize, overlapFactor = 0.5):
    start = 0
    while start < len(audioData):
        yield int(start), int(start + subAudioSize)
        start += ( subAudioSize // (1/(1-overlapFactor)) )     

In [25]:
### Define helper functions ###
def extractSoundFeature(audiosPath,audiosSubPaths,audiosExtension="*.wav", n_mels=AUDIO_N_MELS, n_windows=AUDIOSEG_SIZE, hopLength = AUDIO_HOP_LEN):
    
    subAudioSize = hopLength * (n_windows - 1)
    features, classes = [], []
    
    # Map all audio samples paths inside the folder
    allAudiosPath = glob.glob(os.path.join(audiosPath, audiosSubPath, audiosExtension));

    # Iterate and extract features from each audio
    idx = 0
    for audioPath in allAudiosPath:
        # Early stop for reduced mode
        if REDUCED_MODE==1 and idx == REDUCED_MODE_AUDIONUM:
            break;

        print('Processing sample: ', idx)
        idx=idx+1;
        
        subAudioLogSpect = []
        subAudioClass = []

        # Extract '.wav' audio to an 1-D array
        audioData, sr = librosa.load(audioPath, sr = AUDIO_SR)

        # Extract audio classification from file name
        audioClass = int(audioPath.split('/')[2].split('-')[1])

        # Loop and extract all audio segments
        for (start,end) in audioSegmentation(audioData, subAudioSize):
            
            if(len(audioData[start:end]) == subAudioSize):
                signal = audioData[start:end]
                melspec = librosa.feature.melspectrogram(y=signal,n_mels=n_mels)
                #print('melspec shape: ', np.shape(melspec))


                logspec = librosa.amplitude_to_db(melspec)
                #print('logspec shape: ', np.shape(logspec))


                logspec = logspec.T.flatten()[:, np.newaxis].T
                #print('logspec aft flatten shape: ', np.shape(logspec))


                subAudioLogSpect.append(logspec)
                #print('subAudioLogSpect shape: ', np.shape(subAudioLogSpect))
                subAudioClass.append(audioClass)
            
        subAudioLogSpect = np.asarray(subAudioLogSpect).reshape(len(subAudioLogSpect),n_mels,n_windows,1)
        #print('subAudioLogSpect shape final : ', np.shape(subAudioLogSpect))    

        segment_features = np.concatenate( (subAudioLogSpect, np.zeros(np.shape(subAudioLogSpect))), axis=3 )
        #print('segment_features shape final: ', np.shape(segment_features)) # add an array of zeros in the last dimension

            
        for i in range(len(segment_features)): 
            segment_features[i, :, :, 1] = librosa.feature.delta(segment_features[i, :, :, 0]) #Calculate delta / derivative of axis 1
        
        if len(segment_features) > 0: # if not empty, concatenate in features / classes array
            features.append(segment_features)
            classes.append(subAudioClass)
    
        print('features shape final: ', np.shape(features))
    
    return features, classes

# CNN Model Definition

In [9]:
### Define convolutional network architecture ###
def CNN_modelDefinition():
    keras.backend.clear_session()
    
    num_filters = [24,32,64,128] 
    pool_size = (2, 2) 
    kernel_size = (3, 3)  
    input_shape = (60, 41, 2)
    num_classes = 10
    
    
    model = keras.models.Sequential()
    model.add(keras.layers.Conv2D(24, kernel_size, padding="same", input_shape=input_shape))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.Activation("relu"))
    model.add(keras.layers.MaxPooling2D(pool_size=pool_size))

    model.add(keras.layers.Conv2D(32, kernel_size, padding="same"))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.Activation("relu"))  
    model.add(keras.layers.MaxPooling2D(pool_size=pool_size))
    
    model.add(keras.layers.Conv2D(64, kernel_size, padding="same"))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.Activation("relu"))  
    model.add(keras.layers.MaxPooling2D(pool_size=pool_size))
    
    model.add(keras.layers.Conv2D(128, kernel_size, padding="same"))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.Activation("relu"))  

    model.add(keras.layers.GlobalMaxPooling2D())
    model.add(keras.layers.Dense(128, activation="relu"))
    model.add(keras.layers.Dense(num_classes, activation="softmax"))

    model.compile(optimizer=keras.optimizers.Adam(1e-4), 
        loss=keras.losses.SparseCategoricalCrossentropy(), 
        metrics=["accuracy"])
    return model

# Script Execution

In [26]:
# Pre-process and extract feature from the data
audiosPath = 'UrbanSounds8K/audio/'
save_dir = "UrbanSounds8K/processed/"
folds = audiosSubPaths = np.array(['fold1','fold2','fold3','fold4', 'fold5','fold6','fold7','fold8', 'fold9','fold10'])
for audiosSubPath in audiosSubPaths:
    features, labels = extractSoundFeature(audiosPath,audiosSubPath)
    #np.savez("{0}{1}".format(save_dir, audiosSubPath), features=features, labels=labels);

Processing sample:  0
features shape final:  (1, 16, 60, 41, 2)
Processing sample:  1
features shape final:  (2, 16, 60, 41, 2)
Processing sample:  2
features shape final:  (3, 16, 60, 41, 2)
Processing sample:  3
features shape final:  (4,)
Processing sample:  4
features shape final:  (5,)
Processing sample:  5
features shape final:  (6,)
Processing sample:  6
features shape final:  (7,)
Processing sample:  7
features shape final:  (8,)
Processing sample:  8
features shape final:  (9,)
Processing sample:  9
features shape final:  (10,)
Processing sample:  10
features shape final:  (11,)
Processing sample:  11
features shape final:  (12,)
Processing sample:  12
features shape final:  (13,)
Processing sample:  13
features shape final:  (14,)
Processing sample:  14
features shape final:  (15,)
Processing sample:  15
features shape final:  (16,)
Processing sample:  16
features shape final:  (17,)
Processing sample:  17
features shape final:  (18,)
Processing sample:  18
features shape fin

In [11]:
### Train and evaluate via 10-Folds cross-validation ###
accuracies = []
folds = np.array(['fold1','fold2','fold3','fold4',
                  'fold5','fold6','fold7','fold8',
                  'fold9','fold10'])
load_dir = "UrbanSounds8K/processed/"
kf = KFold(n_splits=10)

In [12]:
idx = 0

for train_index, test_index in kf.split(folds):
    idx= idx+1
    
    print('--> Starting a loop!')

    x_train, y_train = [], []

    # ---------------- STEP 1 ----------------
    # Load training data from 9 of 10 folders 
    # Loop through all training folders and gather data in single feature array
    for idxTrainFolder in train_index:
        print('-> Getting data from TRAINING folder ', idxTrainFolder+1)
        # Read pre-saved features or segments of an audio file (pre-processed)
        train_data = np.load("{0}/{1}.npz".format(load_dir,folds[idxTrainFolder]), allow_pickle=True)

        # Get the 'features' and 'labels' from current train folder
        features = np.concatenate(train_data["features"], axis=0) 
        labels = np.concatenate(train_data["labels"], axis=0)

        # Append all the 'features' and 'labels' train datasets 
        # in a single list containing all train folders data
        x_train.append(features)
        y_train.append(labels)

    # Merge all separate feature datasets in a single one (as if it was a single 'big' folder)
    x_train = np.concatenate(x_train, axis = 0).astype(np.float32)
    y_train = np.concatenate(y_train, axis = 0).astype(np.float32)
    
    # ---------------- STEP 2 ----------------
    # Load test data from 1 of 10 folders 
    # Load test data from the test folder
    print('-> Getting data from TESTING folder ', test_index+1)
    test_data = np.load("{0}/{1}.npz".format(load_dir, folds[test_index][0]), allow_pickle=True)
    x_test = test_data["features"]
    y_test = test_data["labels"]

    model = CNN_modelDefinition()
    
    print('--> Fitting model!')
    
    model.fit(x_train, y_train, epochs = 5, batch_size = 24, verbose = 1)
    
    # evaluate on test set/fold
    y_true, y_pred = [], []
    for x, y in zip(x_test, y_test):
        # average predictions over segments of a sound clip
        avg_p = np.argmax(np.mean(model.predict(x), axis = 0))
        y_pred.append(avg_p) 
        # pick single label via np.unique for a sound clip
        y_true.append(np.unique(y)[0]) 
    
    accuracies.append(accuracy_score(y_true, y_pred))    

    if idx == 1:
        print('early break')
        break;

    print('Confusion matrix - loop: ', idx);
    #allConfusionMatrix(idx) = confusion_matrix(y_pred,y_true)
    print(confusion_matrix)

print("Average 10 Folds Accuracy: {0}".format(np.mean(accuracies)))

--> Starting a loop!
-> Getting data from TRAINING folder  2
-> Getting data from TRAINING folder  3
-> Getting data from TRAINING folder  4
-> Getting data from TRAINING folder  5
-> Getting data from TRAINING folder  6
-> Getting data from TRAINING folder  7
-> Getting data from TRAINING folder  8
-> Getting data from TRAINING folder  9
-> Getting data from TRAINING folder  10
-> Getting data from TESTING folder  [1]
--> Fitting model!
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
early break
Average 10 Folds Accuracy: 0.5324519230769231


In [13]:
allConfusionMatrix = []
allConfusionMatrix.append(confusion_matrix(y_pred,y_true))

Class identifiers according to UrbanSounds 8k :

0 = air_conditioner
1 = car_horn
2 = children_playing
3 = dog_bark
4 = drilling
5 = engine_idling
6 = gun_shot
7 = jackhammer
8 = siren
9 = street_music

In [16]:
allConfusionMatrix[0]

array([[ 7,  0,  1,  0,  2,  1,  0,  5,  0,  1],
       [ 0, 15,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 1,  0, 38,  2,  2,  0,  0,  0,  0,  0],
       [ 2,  0,  9, 65,  1,  0,  5,  0,  0,  0],
       [ 0,  0,  6,  1, 57, 24,  7, 54,  0,  2],
       [22,  1,  3,  5,  3, 63,  0, 34,  6,  8],
       [ 0,  0,  0,  0,  0,  0, 21,  0,  0,  0],
       [ 2,  0,  0,  0,  5,  1,  0, 23,  0,  4],
       [ 0,  0,  4,  5,  8,  3,  0,  4, 76,  7],
       [66,  1, 39,  9, 15,  4,  0,  0,  4, 78]], dtype=int64)

# Trying with a single feature

In [27]:
features_old = features;
features = features[:,:,:,0]
featuresShape = np.shape(features)
features = features.reshape(featuresShape[0],featuresShape[1],featuresShape[2],1)


In [31]:
### Define convolutional network architecture ###
def CNN_modelDefinition():
    keras.backend.clear_session()
    
    num_filters = [24,32,64,128] 
    pool_size = (2, 2) 
    kernel_size = (3, 3)  
    input_shape = (60, 41, 1)
    num_classes = 10
    
    
    model = keras.models.Sequential()
    model.add(keras.layers.Conv2D(24, kernel_size, padding="same", input_shape=input_shape))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.Activation("relu"))
    model.add(keras.layers.MaxPooling2D(pool_size=pool_size))

    model.add(keras.layers.Conv2D(32, kernel_size, padding="same"))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.Activation("relu"))  
    model.add(keras.layers.MaxPooling2D(pool_size=pool_size))
    
    model.add(keras.layers.Conv2D(64, kernel_size, padding="same"))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.Activation("relu"))  
    model.add(keras.layers.MaxPooling2D(pool_size=pool_size))
    
    model.add(keras.layers.Conv2D(128, kernel_size, padding="same"))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.Activation("relu"))  

    model.add(keras.layers.GlobalMaxPooling2D())
    model.add(keras.layers.Dense(128, activation="relu"))
    model.add(keras.layers.Dense(num_classes, activation="softmax"))

    model.compile(optimizer=keras.optimizers.Adam(1e-4), 
        loss=keras.losses.SparseCategoricalCrossentropy(), 
        metrics=["accuracy"])
    return model

In [None]:
np.asarray(x_test).reshape(xShape[0],xShape[1],xShape[2],1)

In [39]:
idx = 0

for train_index, test_index in kf.split(folds):
    idx= idx+1
    
    print('--> Starting a loop!')

    x_train, y_train = [], []

    # ---------------- STEP 1 ----------------
    # Load training data from 9 of 10 folders 
    # Loop through all training folders and gather data in single feature array
    for idxTrainFolder in train_index:
        print('-> Getting data from TRAINING folder ', idxTrainFolder+1)
        # Read pre-saved features or segments of an audio file (pre-processed)
        train_data = np.load("{0}/{1}.npz".format(load_dir,folds[idxTrainFolder]), allow_pickle=True)

        # Get the 'features' and 'labels' from current train folder
        features = np.concatenate(train_data["features"], axis=0) 
        labels = np.concatenate(train_data["labels"], axis=0)

        # REDUCE Features dimensionality **********************************************************************************************
        features = features[:,:,:,0]
        featuresShape = np.shape(features)
        features = features.reshape(featuresShape[0],featuresShape[1],featuresShape[2],1)

        # Append all the 'features' and 'labels' train datasets 
        # in a single list containing all train folders data
        x_train.append(features)
        y_train.append(labels)

    # Merge all separate feature datasets in a single one (as if it was a single 'big' folder)
    x_train = np.concatenate(x_train, axis = 0).astype(np.float32)
    y_train = np.concatenate(y_train, axis = 0).astype(np.float32)
    
    # ---------------- STEP 2 ----------------
    # Load test data from 1 of 10 folders 
    # Load test data from the test folder
    print('-> Getting data from TESTING folder ', test_index+1)
    test_data = np.load("{0}/{1}.npz".format(load_dir, folds[test_index][0]), allow_pickle=True)
    x_test = test_data["features"]
    y_test = test_data["labels"]


    x_test = np.asarray(x_test)
    x_test = np.concatenate(x_test, axis = 0)
    x_test = x_test[:,:,:,0]
    xTestShape = np.shape(x_test)
    x_test = x_test.reshape(xTestShape[0],xTestShape[1],xTestShape[2],1)

    model = CNN_modelDefinition()
    
    print('--> Fitting model!')
    
    model.fit(x_train, y_train, epochs = 5, batch_size = 24, verbose = 1)
    
    # evaluate on test set/fold
    y_true, y_pred = [], []
    for x, y in zip(x_test, y_test):
        # average predictions over segments of a sound clip
        avg_p = np.argmax(np.mean(model.predict(x), axis = 0))
        y_pred.append(avg_p) 
        # pick single label via np.unique for a sound clip
        y_true.append(np.unique(y)[0]) 
    
    accuracies.append(accuracy_score(y_true, y_pred))    

    if idx == 1:
        print('early break')
        break;

    print('Confusion matrix - loop: ', idx);
    #allConfusionMatrix(idx) = confusion_matrix(y_pred,y_true)
    print(confusion_matrix)

print("Average 10 Folds Accuracy: {0}".format(np.mean(accuracies)))

--> Starting a loop!
-> Getting data from TRAINING folder  2
-> Getting data from TRAINING folder  3
-> Getting data from TRAINING folder  4
-> Getting data from TRAINING folder  5
-> Getting data from TRAINING folder  6
-> Getting data from TRAINING folder  7
-> Getting data from TRAINING folder  8
-> Getting data from TRAINING folder  9
-> Getting data from TRAINING folder  10
-> Getting data from TESTING folder  [1]
--> Fitting model!
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


ValueError: in user code:

    File "c:\Users\SZM1JVL\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\engine\training.py", line 1801, in predict_function  *
        return step_function(self, iterator)
    File "c:\Users\SZM1JVL\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\engine\training.py", line 1790, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\Users\SZM1JVL\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\engine\training.py", line 1783, in run_step  **
        outputs = model.predict_step(data)
    File "c:\Users\SZM1JVL\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\engine\training.py", line 1751, in predict_step
        return self(x, training=False)
    File "c:\Users\SZM1JVL\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\utils\traceback_utils.py", line 67, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "c:\Users\SZM1JVL\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\engine\input_spec.py", line 264, in assert_input_compatibility
        raise ValueError(f'Input {input_index} of layer "{layer_name}" is '

    ValueError: Input 0 of layer "sequential" is incompatible with the layer: expected shape=(None, 60, 41, 1), found shape=(None, 41, 1)
