This demo has been adapted from https://wandb.ai/mostafaibrahim17/ml-articles/reports/An-Introduction-to-Audio-Classification-with-Keras--Vmlldzo0MDQzNDUy

In [1]:
import arrow
import os
import numpy as np
import pandas as pd
import librosa
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split

# Sample rate (Hz) passed to librosa when loading and analysing audio.
SAMPLE_RATE = 22050
# Root folder of the dataset; load_data looks for fold<N> sub-directories here.
DATA = '/kaggle/input/urbansound8k'
# Metadata CSV read by load_data (one row per audio slice).
METADATA = '/kaggle/input/urbansound8k/UrbanSound8K.csv'


def load_data(data_path, metadata_path):
    """Build one mean-MFCC feature vector and one label per row of the metadata CSV.

    For every metadata row the audio file
    ``<data_path>/fold<fold>/<slice_file_name>`` is loaded at ``SAMPLE_RATE``,
    40 MFCCs are computed, and the coefficients are averaged over the time
    frames so each clip is summarised by a single fixed-length vector.

    Parameters
    ----------
    data_path : str
        Directory that contains the ``fold<N>`` sub-directories with the audio.
    metadata_path : str
        CSV file providing the ``fold``, ``slice_file_name`` and ``class`` columns.

    Returns
    -------
    tuple of numpy.ndarray
        ``(features, labels)`` where ``features`` holds the per-clip vectors and
        ``labels`` holds the matching ``class`` values, both in metadata row order.
    """
    # Time progress from a local start instead of relying on a notebook-level
    # `time_start` global, which made the function fail with NameError when
    # called before that global had been assigned.
    load_start = arrow.now()
    features = []
    labels = []
    metadata = pd.read_csv(metadata_path)

    for index, row in metadata.iterrows():
        file_path = os.path.join(data_path, 'fold{}'.format(row['fold']), '{}'.format(row['slice_file_name']))
        # librosa.load also returns the sample rate; it equals the requested
        # SAMPLE_RATE, so it is not needed here.
        audio, _ = librosa.load(file_path, sr=SAMPLE_RATE)
        mfccs = librosa.feature.mfcc(y=audio, sr=SAMPLE_RATE, n_mfcc=40)
        # Average across frames: clips of any length map to the same vector size.
        mfccs_scaled = np.mean(mfccs.T, axis=0)
        features.append(mfccs_scaled)
        labels.append(row['class'])
        # Lightweight progress log every 1000 rows (skipping row 0).
        if index > 0 and index % 1000 == 0:
            print('{}: row: {} file: {}'.format(arrow.now() - load_start, index, row['slice_file_name']))
    return np.array(features), np.array(labels)

# Record the start time so the total duration of the load can be reported.
time_start = arrow.now()
features, labels = load_data(DATA, METADATA)
print('{} data load complete'.format(arrow.now() - time_start))

2024-04-06 19:17:48.104696: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-06 19:17:48.104881: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-06 19:17:48.283152: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


0:01:13.458540: row: 1000 file: 122690-6-0-0.wav
0:02:05.553570: row: 2000 file: 147926-0-0-44.wav
0:03:02.418866: row: 3000 file: 162434-6-2-0.wav




0:03:57.713684: row: 4000 file: 178260-7-1-9.wav
0:04:57.318837: row: 5000 file: 195969-0-0-19.wav
0:05:49.187344: row: 6000 file: 30204-0-0-11.wav
0:06:37.693113: row: 7000 file: 60605-9-0-90.wav
0:07:28.561668: row: 8000 file: 77751-4-9-1.wav




0:08:05.378160 data load complete


In [2]:
from sklearn.preprocessing import LabelEncoder

# Map the string class names to integer codes, then one-hot encode the codes
# to match the categorical cross-entropy loss used by the model.
encoder = LabelEncoder()
labels_encoded = encoder.fit_transform(labels)
labels_onehot = to_categorical(labels_encoded)


In [3]:
# Hold out 20% of the clips for testing. The split is stratified on the
# one-hot labels and seeded so that it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels_onehot,
    test_size=0.2,
    random_state=42,
    stratify=labels_onehot,
)


In [4]:
from keras.layers import Activation
from keras.layers import Conv1D
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import Flatten
from keras.layers import Input
from keras.layers import MaxPooling1D
from keras.models import Sequential

# 1-D CNN over the MFCC vector: two conv/pool/dropout stages followed by a
# dense classifier head with one softmax output per encoded class.
model = Sequential([
    # Each sample is a column of X_train.shape[1] coefficients with one channel.
    Input(shape=(X_train.shape[1], 1)),
    Conv1D(64, 3, padding='same', activation='relu'),
    MaxPooling1D(pool_size=2),
    Dropout(0.25),
    Conv1D(128, 3, padding='same', activation='relu'),
    MaxPooling1D(pool_size=2),
    Dropout(0.25),
    Flatten(),
    Dense(512, activation='relu'),
    Dropout(0.5),
    Dense(len(encoder.classes_), activation='softmax'),
])
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

In [5]:
# Append a trailing channel axis of size 1 so the arrays match the
# (steps, channels) input shape declared for the Conv1D model.
X_train = X_train.reshape(*X_train.shape[:2], 1)
X_test = X_test.reshape(*X_test.shape[:2], 1)
print(len(X_train), len(X_test))

6985 1747


In [6]:
def make_predictions(model, le, file_path):
    """Predict the class label of a single audio file.

    The clip is converted to a mean-MFCC vector in the same way as in
    ``load_data`` (same sample rate, 40 coefficients, averaged over frames),
    reshaped to a batch of one sample with one channel, and fed to ``model``.

    Parameters
    ----------
    model
        Object whose ``predict`` method returns per-class scores for a batch.
    le
        Fitted label encoder; its ``inverse_transform`` maps class indices
        back to the original labels.
    file_path : str
        Path of the audio file to classify.

    Returns
    -------
    The decoded label of the highest-scoring class.
    """
    # Use the shared SAMPLE_RATE constant rather than a repeated literal so the
    # inference features cannot drift from the ones the model was trained on.
    audio, _ = librosa.load(file_path, sr=SAMPLE_RATE)
    mfccs = librosa.feature.mfcc(y=audio, sr=SAMPLE_RATE, n_mfcc=40)
    mfccs_scaled = np.mean(mfccs.T, axis=0)
    # Shape (1, n_coefficients, 1): a batch of one sample with one channel.
    features = mfccs_scaled.reshape(1, mfccs_scaled.shape[0], 1)
    predicted_vector = model.predict(features)
    predicted_class_index = np.argmax(predicted_vector, axis=-1)
    return le.inverse_transform(predicted_class_index)[0]


In [7]:
# A handful of sample clips, each paired with a human-readable description of
# what it contains.
test_files = [
    ('/kaggle/input/urbansound8k/fold1/101415-3-0-2.wav', 'Dog bark'),
    ('/kaggle/input/urbansound8k/fold1/101415-3-0-3.wav', 'Dog bark'),
    ('/kaggle/input/urbansound8k/fold1/102305-6-0-0.wav', 'Gun shots'),
    ('/kaggle/input/urbansound8k/fold1/103074-7-0-2.wav', 'Jack hammer'),
    ('/kaggle/input/urbansound8k/fold1/103074-7-4-3.wav', 'Jack hammer'),
]

# Predictions from the model as it stands before the training cell is run.
old_predictions = {
    path: make_predictions(model, encoder, path)
    for path, _expected in test_files
}

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 232ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step


In [8]:
import arrow

# Train silently (verbose=0) and report only the total wall-clock time.
# The held-out split is passed as validation data so per-epoch validation
# metrics are recorded in `history`.
time_start = arrow.now()
history = model.fit(
    X_train,
    y_train,
    batch_size=256,
    epochs=100,
    validation_data=(X_test, y_test),
    verbose=0,
    callbacks=None,
)
print('{} done training'.format(arrow.now() - time_start))

0:02:10.752219 done training


In [9]:
from plotly import express
# One row per epoch, one column per recorded metric; the row index becomes
# an explicit `epoch` column for the x axis.
training_curves = (
    pd.DataFrame(data=history.history)
    .reset_index()
    .rename(columns={'index': 'epoch'})
)
express.line(data_frame=training_curves, x='epoch', y=list(history.history))

In [10]:
# Run the same sample clips through the model again, now that it has been fit.
new_predictions = {
    path: make_predictions(model, encoder, path)
    for path, _expected in test_files
}
new_predictions

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step


{'/kaggle/input/urbansound8k/fold1/101415-3-0-2.wav': 'dog_bark',
 '/kaggle/input/urbansound8k/fold1/101415-3-0-3.wav': 'dog_bark',
 '/kaggle/input/urbansound8k/fold1/102305-6-0-0.wav': 'gun_shot',
 '/kaggle/input/urbansound8k/fold1/103074-7-0-2.wav': 'jackhammer',
 '/kaggle/input/urbansound8k/fold1/103074-7-4-3.wav': 'jackhammer'}