This is a simple demo based on the code found here:
https://wandb.ai/mostafaibrahim17/ml-articles/reports/An-Introduction-to-Audio-Classification-with-Keras--Vmlldzo0MDQzNDUy

Loading and encoding all the data takes about 90 minutes on a CPU. Let's start our investigation by loading up all the file names and picking some species that are well represented to examine more closely.

In [1]:
import arrow
import glob
import os
import pandas as pd

PATHNAME = '/kaggle/input/birdclef-2024/train_audio/*/*.ogg'

files_df = pd.DataFrame(data=list(glob.glob(pathname=PATHNAME, recursive=True)), columns=['file'])
files_df['short name'] = files_df['file'].apply(func=os.path.basename)
files_df['species'] = files_df['file'].apply(func=lambda x: x.split('/')[-2])
print('file count: {}'.format(len(files_df)))
time_start = arrow.now()
print('{} done'.format(arrow.now() - time_start))

file count: 24459
0:00:00.000152 done


In [2]:
files_df['species'].nunique()

182

We have a lot of classes and they are not all equally well represented, so let's look at the top classes.

In [3]:
from plotly import express
express.histogram(data_frame=files_df['species'].value_counts().to_frame().head(n=30).reset_index(), x='species', y='count')

Let's start with a top quantile.

In [4]:
import arrow
import librosa
import numpy as np
import os
import pandas as pd
from glob import glob

SAMPLE_RATE = 22050
SPECIES = files_df['species'].value_counts().head(n=22).index.tolist()
MFCC = 40 # this is a lot of features; do we need them all?

def load_data(metadata):
    features = []
    labels = []
    count = 0
    for index, row in metadata[metadata['species'].isin(SPECIES)].iterrows():
        file_path = row['file']
        audio, sample_rate = librosa.load(file_path, sr=SAMPLE_RATE)
        mfccs = librosa.feature.mfcc(y=audio, sr=SAMPLE_RATE, n_mfcc=MFCC)
        mfccs_scaled = np.mean(mfccs.T, axis=0)
        features.append(mfccs_scaled)
        labels.append(row['species'])
        if count > 0 and count % 1000 == 0:
            print('{}: row: {} file: {}'.format(arrow.now() - time_start, index, row['short name']))
        count += 1
    return np.array(features), np.array(labels)

time_start = arrow.now()
features, labels = load_data(metadata=files_df)
print('{} data load complete'.format(arrow.now() - time_start))

0:03:03.734303: row: 2193 file: XC603386.ogg
0:05:51.877765: row: 5526 file: XC500457.ogg
0:08:28.179878: row: 7774 file: XC650350.ogg
0:13:10.320006: row: 9998 file: XC748166.ogg
0:15:46.414380: row: 11545 file: XC591023.ogg
0:19:04.861299: row: 14304 file: XC550542.ogg
0:21:49.808727: row: 15554 file: XC181196.ogg
0:30:15.670521: row: 16865 file: XC782753.ogg
0:32:49.908951: row: 18702 file: XC215034.ogg
0:36:25.510222: row: 22763 file: XC142466.ogg
0:39:45.494087 data load complete


In [5]:
import arrow
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

time_start = arrow.now()
encoder = LabelEncoder()
labels_onehot = to_categorical(encoder.fit_transform(labels))
print('{} encoded labels'.format(arrow.now() - time_start))

2024-04-06 21:32:50.202854: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-06 21:32:50.203100: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-06 21:32:50.360179: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


0:00:00.002754 encoded labels


In [6]:
import arrow
from sklearn.model_selection import train_test_split

time_start = arrow.now()
X_train, X_test, y_train, y_test = train_test_split(features, labels_onehot, test_size=0.2, random_state=2024, stratify=labels_onehot)
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)
print('{} split: train: {} test: {}'.format(arrow.now() - time_start, len(X_train), len(X_test)))

0:00:00.333035 split: train: 8800 test: 2200


In [7]:
from keras.layers import Activation
from keras.layers import Conv1D
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import Flatten
from keras.layers import Input
from keras.layers import MaxPooling1D
from keras.models import Sequential

model = Sequential()
model.add(Input(shape=(X_train.shape[1], 1)))
model.add(Conv1D(64, 3, padding='same', activation='relu',))
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.25))
model.add(Conv1D(128, 3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.25))
model.add(Flatten())
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(len(encoder.classes_), activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

In [8]:
import arrow

time_start = arrow.now()
history = model.fit(X_train[:-len(X_test)], y_train[:-len(y_test)], batch_size=256, epochs=100, validation_data=(X_train[-len(X_test):], y_test[-len(y_test):]), verbose=0, callbacks=None)
print('{} done training'.format(arrow.now() - time_start))

0:02:04.934477 done training


In [9]:
from plotly import express
express.line(data_frame=pd.DataFrame(data=history.history,).reset_index().rename(columns={'index': 'epoch'}), x='epoch', y=list(history.history.keys()))

In [10]:
y_pred = model.predict(x=X_test)

[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step


In [11]:
from sklearn.metrics import classification_report
print(classification_report(y_pred=[np.argmax(item) for item in y_pred], y_true=[np.argmax(item) for item in y_test]))

              precision    recall  f1-score   support

           0       0.33      0.31      0.32       100
           1       0.26      0.21      0.23       100
           2       0.36      0.46      0.40       100
           3       0.39      0.38      0.38       100
           4       0.55      0.56      0.56       100
           5       0.55      0.56      0.56       100
           6       0.24      0.27      0.25       100
           7       0.38      0.40      0.39       100
           8       0.28      0.30      0.29       100
           9       0.33      0.39      0.36       100
          10       0.73      0.62      0.67       100
          11       0.39      0.36      0.37       100
          12       0.21      0.25      0.23       100
          13       0.50      0.49      0.49       100
          14       0.39      0.39      0.39       100
          15       0.45      0.42      0.44       100
          16       0.52      0.45      0.48       100
          17       0.38    

In [12]:
def make_predictions(model, le, file_path):
    audio, sample_rate = librosa.load(file_path, sr=22050)
    mfccs = librosa.feature.mfcc(y=audio, sr=22050, n_mfcc=40)
    mfccs_scaled = np.mean(mfccs.T, axis=0)
    predicted_vector = model.predict(x=mfccs_scaled.reshape(1, mfccs_scaled.shape[0], 1), verbose=0)
    predicted_class_index = np.argmax(predicted_vector, axis=-1)
    return le.inverse_transform(predicted_class_index)[0]
