# Build Dataset
In this notebook the build dataset function, needed for the datapipeline is implemented

This notebook is inspired by and partly copied from https://www.kaggle.com/code/wengsilu/birdclef24pretraining

### Imports

In [2]:
import pandas as pd
import numpy as np
import random
from tqdm import tqdm
import tensorflow as tf
# Set logging level to avoid unnecessary messages
tf.get_logger().setLevel('ERROR')
# Set autograph verbosity to avoid unnecessary messages
tf.autograph.set_verbosity(0)

import tensorflow_io as tfio
import tensorflow_extra as tfe
import tensorflow_probability as tfp

2024-06-28 14:15:51.993002: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-28 14:15:51.993109: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-28 14:15:51.994737: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-06-28 14:15:52.005434: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Set device and strategy
If GPUs are available, use them

In [3]:
tf.config.set_visible_devices([], 'GPU')
#print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

#gpus = tf.config.list_logical_devices('GPU')
#ngpu = len(gpus) # Check number of GPUs
#if ngpu:
#    # Set GPU strategy
#    strategy = tf.distribute.MirroredStrategy(gpus) # single-GPU or multi-GPU
#    # Print GPU details
#    print("> Running on GPU", end=' | ')
#    print("Num of GPUs: ", ngpu)
#    device='GPU'
#else:
#    # If no GPUs are available, use CPU
print("> Running on CPU")
strategy = tf.distribute.get_strategy()
device='CPU'

> Running on CPU


In [4]:
# To find out which devices your operations and tensors are assigned to
#tf.debugging.set_log_device_placement(True)

# Create some tensors and perform an operation
a = tf.constant([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
b = tf.constant([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
c = tf.matmul(a, b)

print(c)

tf.Tensor(
[[22. 28.]
 [49. 64.]], shape=(2, 2), dtype=float32)


## Load dataframe (for testing)

In [5]:
df = pd.read_csv("../../data/dataset10.csv")

## Configurations

In [6]:
class cfg:
    # random seed
    seed = 42

    # audio clip settings
    sr = 24000
    duration = 10
    desired_length = duration*sr
    
    duration = 15 # the duration of the clips
    n_samples = duration*sr
    hop_length = 2048 # "stepsize" of the fft for the melspectrograms
    nfft = 4096 # windowsize of the fft for the melspectrograms
    n_mels = 128 # number of mel frequency bins
    fmax = sr/2 # maximum frequency in the melspectrograms
    input_dim = (int(duration*sr/hop_length + 1), n_mels)
    
    # data processing settings
    batch_size = 16
    shuffle_buffer = 256 # idk Number of elements from the dataset to buffer for shuffling.
    
    # class labels/names
    names = list(np.unique(df.en))
    num_classes = len(names)
    labels = list(range(num_classes))
    label2name = dict(zip(labels, names))
    name2label = {v:k for k,v in label2name.items()}

    # set device
    device = device

# set random seed in keras
tf.keras.utils.set_random_seed(cfg.seed)

# Function to load and prepare audio files

In [7]:
# Generates random integer # from https://www.kaggle.com/code/wengsilu/birdclef24pretraining
def random_int(shape=[], minval=0, maxval=1):
    return tf.random.uniform(shape=shape, minval=minval, maxval=maxval, dtype=tf.int32)

# Generats random float
def random_float(shape=[], minval=0.0, maxval=1.0):
    rnd = tf.random.uniform(shape=shape, minval=minval, maxval=maxval, dtype=tf.float32)
    return rnd

In [8]:
def audio_loader(with_labels = True, cfg = cfg, num_classes = cfg.num_classes):
    def decode(filepath):
        # read audio
        audio = tfio.audio.AudioIOTensor(filepath, dtype = tf.float32) # lazy load the file
        rate = audio.rate
        # cut out clip of specified duration at random position
        num_samples = cfg.duration*rate
        length = tf.cast(audio.shape[0], tf.int32)
        if num_samples < length:
            rdm = random_int(maxval = length - num_samples)
            audio = audio[rdm:rdm+num_samples]
        else:
            audio = audio.to_tensor()
        audio = tf.cast(audio, tf.float32)
        # resample if necessary
        audio = tfio.audio.resample(audio, tf.cast(rate, tf.int64), cfg.sr) if rate != cfg.sr else audio
        # remove noise (tfio.audio.split() or tfio.audio.trim()?)# can't do this when the clip is already cut
        # stereo to mono
        audio = tf.reduce_mean(audio, axis=-1) if tf.shape(audio)[-1] == 2 else tf.squeeze(audio, axis = -1)
        # pad if necessary
        if tf.size(audio) < cfg.desired_length:
            missing = cfg.desired_length - tf.size(audio)
            rdm = random_int(maxval = missing)
            audio = tf.pad(audio, [[rdm, missing-rdm]]) # pad rdm zeros left and missing-rdm zeros rigth
        audio = tf.reshape(audio, [cfg.sr*cfg.duration])
        return audio

    def get_target(target):          
        target = tf.reshape(target, [1])
        target = tf.cast(tf.one_hot(target, num_classes), tf.float32) 
        target = tf.reshape(target, [num_classes])
        return target
    
    def decode_with_labels(path, label):
        label = get_target(label)
        return decode(path), label

    return decode_with_labels if with_labels else decode

# Build dataset

In [9]:
def build_dataset(paths, labels=None, batch_size=cfg.batch_size, target_size=[128, 256], # idk yet about target size
                  audio_decode_fn=None,
                  num_classes=cfg.num_classes,
                  cache=True, cache_dir="", drop_remainder=False,
                  repeat=True, shuffle=cfg.shuffle_buffer):
    """
    Creates a TensorFlow dataset from the given paths and labels.
    
    Args:
        paths (list): A list of file paths to the audio files.
        labels (list): A list of corresponding labels for the audio files.
        batch_size (int): Batch size for the created dataset.
        target_size (list): A list of target image size for the spectrograms.
        audio_decode_fn (function): A function to decode the audio file.
        cache (bool): Whether to cache the dataset or not.
        cache_dir (str): Directory path to cache the dataset.
        drop_remainder (bool): Whether to drop the last batch if it is smaller than batch_size.
        repeat (bool): Whether to repeat the dataset or not.
        shuffle (int): Number of elements from the dataset to buffer for shuffling.
        
    Returns:
        ds (tf.data.Dataset): A TensorFlow dataset.
    """
    # Create cache directory if cache is enabled
    if cache_dir != "" and cache is True:
        os.makedirs(cache_dir, exist_ok=True)
    # Set default audio decode function if not provided
    if audio_decode_fn is None:
        audio_decode_fn = audio_loader(with_labels = labels is not None, cfg = cfg, num_classes = cfg.num_classes)
        
    # Set TensorFlow AUTOTUNE option
    AUTO = tf.data.experimental.AUTOTUNE # hopefully optimizes data pipeline and decreases loading times
    # Create slices based on whether labels are provided
    slices = (paths,) if labels is None else (paths, labels)
    # Create TensorFlow dataset from slices
    ds = tf.data.Dataset.from_tensor_slices(slices)
    # Map audio decode function to dataset
    ds = ds.map(audio_decode_fn, num_parallel_calls=AUTO)
    # Cache dataset in memory if cache is enabled
    ds = ds.cache(cache_dir) if cache else ds
    # Repeat dataset indefinitely if repeat is enabled
    ds = ds.repeat() if repeat else ds
    # Create TensorFlow dataset options
    opt = tf.data.Options()
    # Shuffle dataset if shuffle is enabled
    if shuffle: 
        ds = ds.shuffle(shuffle, seed=cfg.seed)
        opt.experimental_deterministic = False
    if cfg.device=='GPU':
        # If the device is a GPU, turn off auto-sharding to avoid performance issues
        opt.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF
    # Set the options for the dataset
    ds = ds.with_options(opt)
    # Batch the dataset with the specified batch size
    ds = ds.batch(batch_size, drop_remainder=drop_remainder)
    # Prefetch the next batch of data to improve performance
    ds = ds.prefetch(AUTO)
    return ds

In [10]:
df_sample = df.sample(frac=0.2, replace=False, random_state=cfg.seed)
df_sample["fullfilename"] = "../" + df_sample["fullfilename"]
paths = df_sample.fullfilename.tolist()
names = df_sample.en.tolist()
labels = []
for name in names:
    labels.append(cfg.name2label[name])

ds = build_dataset(paths, labels, cache=False)

2024-06-28 14:15:57.761009: I tensorflow_io/core/kernels/cpu_check.cc:128] Your CPU supports instructions that this TensorFlow IO binary was not compiled to use: SSE3 SSE4.1 SSE4.2 AVX AVX2 FMA
2024-06-28 14:15:57.766077: W tensorflow_io/core/kernels/audio_video_mp3_kernels.cc:271] libmp3lame.so.0 or lame functions are not available


In [11]:
def plot_batch(batch, row=3, col=3, label2name=None,):
    """Plot one batch data"""
    if isinstance(batch, tuple) or isinstance(batch, list):
        audios, tars = batch
    else:
        audios = batch
        tars = None
    plt.figure(figsize=(col*5, row*3))
    for idx in range(row*col):
        ax = plt.subplot(row, col, idx+1)
        plt.plot(audios[idx].numpy(), color=cmap(0.1))
        if tars is not None:
            label = tars[idx].numpy().argmax()
            name = label2name[label]
            plt.title(name)
    plt.tight_layout()
    plt.show()

In [12]:
#audios, labels = next(iter(ds))

In [13]:
#plot_batch((audios, labels), label2name=cfg.label2name2)

In [14]:
from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten, Conv2D, MaxPooling2D, Input

melspec_layer = tfe.layers.MelSpectrogram(n_fft=cfg.nfft, 
                                          hop_length=cfg.hop_length, 
                                          sr=cfg.sr, 
                                          fmin=0,
                                          fmax=cfg.fmax,
                                         )

zscore_layer = tfe.layers.ZScoreMinMax()

def build_model():
    inp = Input(shape=(cfg.n_samples,))
    
    # Spectrogram
    x = melspec_layer(inp)
    
    # Normalize
    x = zscore_layer(x)
    
    # Add a channel dimension
    x = tf.expand_dims(x, -1)
    
    # Base model
    x = Conv2D(32, kernel_size=(3, 3), padding='valid', activation='relu')(x)
    x = MaxPooling2D(pool_size=(2, 2), padding="valid")(x)
    x = Conv2D(64, kernel_size=(3, 3), padding='valid', activation='relu')(x)
    x = MaxPooling2D(pool_size=(2, 2), padding="valid")(x)
    x = Dropout(0.4)(x)
    x = Flatten()(x)
    x = Dense(64, activation='relu')(x)
    x = Dropout(0.25)(x)
    x = Dense(32, activation='relu')(x)
    output = Dense(cfg.num_classes, activation='softmax')(x)
    
    model = tf.keras.models.Model(inputs=inp, outputs=output)
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [15]:
# Build the model
model = build_model()
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 360000)]          0         
                                                                 
 mel_spectrogram (MelSpectr  (None, 128, 176)          0         
 ogram)                                                          
                                                                 
 z_score_min_max (ZScoreMin  (None, 128, 176)          0         
 Max)                                                            
                                                                 
 tf.expand_dims (TFOpLambda  (None, 128, 176, 1)       0         
 )                                                               
                                                                 
 conv2d (Conv2D)             (None, 126, 174, 32)      320       
                                                             

In [16]:
df["fullfilename"] = "../" + df["fullfilename"]

In [17]:
from sklearn.model_selection import train_test_split
id_train, id_val, y_train, y_val = train_test_split(range(len(df)), df["en"].to_list(), test_size = 0.3, random_state = cfg.seed)

#paths, labels=None, batch_size=cfg.batch_size, target_size=[128, 256], # idk yet about target size
#                  audio_decode_fn=None,
#                  num_classes=cfg.num_classes,
#                  cache=True, cache_dir="", drop_remainder=False,
#                  repeat=True, shuffle=cfg.shuffle_buffer):

paths_train = list(df.iloc[id_train].fullfilename)
paths_val = list(df.iloc[id_val].fullfilename)

label_train = []
for y in y_train:
    label_train.append(cfg.name2label[y])
label_val = []
for y in y_val:
    label_val.append(cfg.name2label[y])

In [18]:
train_ds = build_dataset(paths_train, label_train)
valid_ds = build_dataset(paths_val, label_val)

In [19]:
history = model.fit(
        train_ds, 
        epochs=2, 
        validation_data=valid_ds,
        steps_per_epoch=len(paths_train)/cfg.batch_size
    )

Epoch 1/2


InvalidArgumentError: Graph execution error:

Detected at node Reshape_2 defined at (most recent call last):
<stack traces unavailable>
Input to reshape is a tensor with 240000 values, but the requested shape has 360000
	 [[{{node Reshape_2}}]]
	 [[IteratorGetNext]] [Op:__inference_train_function_2865]