# Yet another attempt to train a model that doesn't take ages

### Imports

In [1]:
import pandas as pd
import numpy as np
import random
from tqdm import tqdm

import tensorflow as tf
# Set logging level to avoid unnecessary messages
tf.get_logger().setLevel('ERROR')
# Set autograph verbosity to avoid unnecessary messages
tf.autograph.set_verbosity(0)

import keras
import tensorflow_io as tfio
import tensorflow_probability as tfp
import tensorflow_extra as tfe

import matplotlib.pyplot as plt
import librosa
import librosa.display
from IPython.display import Audio

from pathlib import Path
import sys
import os
import time
import warnings
# suppress all warnings
warnings.filterwarnings("ignore")

XC_ROOTDIR = '../../data/' # directory to save data in
XC_DIR = 'test_dataset10' # subdirectory name of dataset

2024-07-02 11:28:20.023055: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-02 11:28:20.023108: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-02 11:28:20.024290: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-02 11:28:20.031967: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


### Read data

In [2]:
df = pd.read_csv("../../data/dataset10.csv")
df["fullfilename"] = "../" + df["fullfilename"]

### Configuration

In [3]:
class cfg:
    # random seed
    seed = 42

    # audio clip settings
    sr = 22050
    duration = 15 # the duration of the clips
    
    n_samples = duration*sr
    
    hop_length = 2048 # "stepsize" of the fft for the melspectrograms
    nfft = 4096 # windowsize of the fft for the melspectrograms
    n_mels = 128 # number of mel frequency bins
    fmax = sr/2 # maximum frequency in the melspectrograms
    input_dim = (int(duration*sr/hop_length + 1), n_mels)
    
    # training settings
    batch_size = 16
    n_epochs = 2
    
    # class labels/names
    names = list(np.unique(df.en))
    n_classes = len(names)
    labels = list(range(n_classes))
    label2name = dict(zip(labels, names))
    name2label = {v:k for k,v in label2name.items()}

# set random seed in keras
tf.keras.utils.set_random_seed(cfg.seed)

### Data generator and helper functions

In [4]:
# Generates random integer # from https://www.kaggle.com/code/wengsilu/birdclef24pretraining
def random_int(shape=[], minval=0, maxval=1):
    return tf.random.uniform(shape=shape, minval=minval, maxval=maxval, dtype=tf.int32)

# Generats random float
def random_float(shape=[], minval=0.0, maxval=1.0):
    rnd = tf.random.uniform(shape=shape, minval=minval, maxval=maxval, dtype=tf.float32)
    return rnd

In [5]:
def slow_load(filepath):
    audio, sr = librosa.load(filepath, sr = cfg.sr)
    # randomly pad clip if shorter
    if len(audio) < cfg.duration*sr:
        _ = np.zeros(window*sr)
        rdm = random_int(maxval = cfg.duration*sr-len(audio))
        _[rdm:rdm + len(audio)] = audio
        audio = _
    else: # select random window if clip longer
        rdm = random_int(maxval = len(audio) - cfg.duration*sr)
        audio = audio[rdm:rdm + cfg.duration*sr]
    audio = tf.convert_to_tensor(audio, dtype = tf.float32)
    return audio

def load_audio(filepath):
    # read audio
    try:
        audio = tfio.audio.AudioIOTensor(filepath, dtype = tf.float32) # lazy load the file
    except Exception as e:
        print(f"Error loading audio file {filepath} with TensorFlow I/O: {e}")
        print("Proceeding to slow load file")
        return slow_load(filepath)
    if audio.shape[0] == 0:
        print(f"Failed to load audio file {filepath.numpy} with TensorFlow I/O: shape[0] = 0")
        print("Proceeding to slow load file")
        return slow_load(filepath)
        
    rate = audio.rate
    # cut out clip of specified duration at random position
    num_samples = cfg.duration*rate
    length = tf.cast(audio.shape[0], tf.int32)
    if num_samples < length:
        rdm = random_int(maxval = length - num_samples)
        audio = audio[rdm:rdm+num_samples]
    else:
        audio = audio.to_tensor()
    audio = tf.cast(audio, tf.float32)
    # resample if necessary
    audio = tfio.audio.resample(audio, tf.cast(rate, tf.int64), cfg.sr) if rate != cfg.sr else audio
    # remove noise (tfio.audio.split() or tfio.audio.trim()?)# can't do this when the clip is already cut
    # stereo to mono
    audio = tf.reduce_mean(audio, axis=-1) if tf.shape(audio)[-1] == 2 else tf.squeeze(audio, axis = -1)
    # pad if necessary
    if tf.size(audio) < cfg.n_samples:
        missing = cfg.n_samples - tf.size(audio)
        rdm = random_int(maxval = missing)
        audio = tf.pad(audio, [[rdm, missing-rdm]]) # pad rdm zeros left and missing-rdm zeros rigth
    audio = tf.reshape(audio, [cfg.sr*cfg.duration])
    return audio

class DataGenerator(keras.utils.Sequence):
    'Generates data for Keras'
    def __init__(self, 
                 list_IDs, 
                 labels, 
                 batch_size=cfg.batch_size, 
                 dim=cfg.n_samples,
                 n_classes=cfg.n_classes, 
                 shuffle=True
                ):
        'Initialization'
        self.dim = dim
        self.batch_size = batch_size
        self.labels = labels
        self.list_IDs = list_IDs
        self.n_classes = n_classes
        self.shuffle = shuffle
        self.on_epoch_end()

    def __len__(self):
        'Denotes the number of batches per epoch'
        return int(np.floor(len(self.list_IDs) / self.batch_size))

    def __getitem__(self, index):
        'Generate one batch of data'
        # Generate indexes of the batch
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]

        # Find list of IDs
        list_IDs_temp = [self.list_IDs[k] for k in indexes]

        # Generate data
        X, y = self.__data_generation(list_IDs_temp)

        return X, y

    def on_epoch_end(self):
        'Updates indexes after each epoch'
        self.indexes = np.arange(len(self.list_IDs))
        if self.shuffle == True:
            np.random.shuffle(self.indexes)

    def __data_generation(self, list_IDs_temp):
        'Generates data containing batch_size samples' # X : (n_samples, dim)
        # Initialization
        X = np.empty((self.batch_size, self.dim))
        y = np.empty((self.batch_size), dtype=int)
        # Generate data
        for i, ID in enumerate(list_IDs_temp):
            # Store sample
            X[i,] = load_audio(df.iloc[ID].fullfilename)
            # Store class
            y[i] = cfg.name2label[df.iloc[ID].en]
        X = X.reshape(len(X), self.dim)
        return X, keras.utils.to_categorical(y, num_classes=self.n_classes)

## Model

In [6]:
from tensorflow.keras.layers import Layer, Dense, Dropout, Activation, Flatten, Conv2D, MaxPooling2D, Input

melspec_layer = tfe.layers.MelSpectrogram(n_fft=cfg.nfft, 
                                          hop_length=cfg.hop_length, 
                                          sr=cfg.sr, 
                                          fmin=0,
                                          fmax=cfg.fmax,
                                         )

class ExpandDimsLayer(Layer):
    def __init__(self, axis, **kwargs):
        super(ExpandDimsLayer, self).__init__(**kwargs)
        self.axis = axis

    def call(self, inputs):
        return tf.expand_dims(inputs, axis=self.axis)

zscore_layer = tfe.layers.ZScoreMinMax()

def build_model():
    inp = Input(shape=(cfg.n_samples,))
    
    # Spectrogram
    x = melspec_layer(inp)
    
    # Normalize
    x = zscore_layer(x)
    
    # Add a channel dimension
    x = ExpandDimsLayer(axis=-1)(x)
    
    # Base model
    x = Conv2D(32, kernel_size=(3, 3), padding='valid', activation='relu')(x)
    x = MaxPooling2D(pool_size=(2, 2), padding="valid")(x)
    x = Conv2D(64, kernel_size=(3, 3), padding='valid', activation='relu')(x)
    x = MaxPooling2D(pool_size=(2, 2), padding="valid")(x)
    #x = Dropout(0.4)(x)
    x = Flatten()(x)
    x = Dense(64, activation='relu')(x)
    #x = Dropout(0.25)(x)
    x = Dense(32, activation='relu')(x)
    output = Dense(cfg.n_classes, activation='softmax')(x)
    
    model = tf.keras.models.Model(inputs=inp, outputs=output, name = "Basemodel")
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [7]:
model = build_model()
model.summary()

2024-07-02 11:28:25.227173: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-07-02 11:28:25.285862: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-07-02 11:28:25.286212: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-07-02 11:28:25.287429: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-07-02 11:28:25.287780: I external/local_xla/xla/stream_executor

Model: "Basemodel"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 330750)]          0         
                                                                 
 mel_spectrogram (MelSpectr  (None, 128, 162)          0         
 ogram)                                                          
                                                                 
 z_score_min_max (ZScoreMin  (None, 128, 162)          0         
 Max)                                                            
                                                                 
 expand_dims_layer (ExpandD  (None, 128, 162, 1)       0         
 imsLayer)                                                       
                                                                 
 conv2d (Conv2D)             (None, 126, 160, 32)      320       
                                                         

## Training

In [8]:
from sklearn.model_selection import train_test_split

id_train, id_val, y_train, y_val = train_test_split(range(len(df)), df["en"].to_list(), test_size = 0.3, random_state = cfg.seed)

training_generator = DataGenerator(id_train, y_train)
validation_generator = DataGenerator(id_val, y_val)

In [None]:
hist = model.fit_generator(generator=training_generator,
                    validation_data=validation_generator,
                    use_multiprocessing=True, verbose = 2, epochs = cfg.n_epochs)

2024-07-02 11:28:26.578858: I tensorflow_io/core/kernels/cpu_check.cc:128] Your CPU supports instructions that this TensorFlow IO binary was not compiled to use: SSE3 SSE4.1 SSE4.2 AVX AVX2 FMA
2024-07-02 11:28:26.582853: W tensorflow_io/core/kernels/audio_video_mp3_kernels.cc:271] libmp3lame.so.0 or lame functions are not available


Epoch 1/2


2024-07-02 11:28:33.489605: F external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:156] Failed setting context: CUDA_ERROR_NOT_INITIALIZED: initialization error
2024-07-02 11:28:33.524019: F external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:156] Failed setting context: CUDA_ERROR_NOT_INITIALIZED: initialization error
2024-07-02 11:28:33.558327: F external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:156] Failed setting context: CUDA_ERROR_NOT_INITIALIZED: initialization error
2024-07-02 11:28:33.595494: F external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:156] Failed setting context: CUDA_ERROR_NOT_INITIALIZED: initialization error
2024-07-02 11:28:33.628416: F external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:156] Failed setting context: CUDA_ERROR_NOT_INITIALIZED: initialization error
2024-07-02 11:28:33.662970: F external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:156] Failed setting context: CUDA_ERROR_NOT_INITIALIZED: initialization erro