# Build Dataset
In this notebook the build dataset function, needed for the datapipeline is implemented

This notebook is inspired by and partly copied from https://www.kaggle.com/code/wengsilu/birdclef24pretraining

### Imports

In [2]:
import tensorflow_extra as tfe
import pandas as pd
import numpy as np
import random
from tqdm import tqdm
import tensorflow as tf
# Set logging level to avoid unnecessary messages
tf.get_logger().setLevel('ERROR')
# Set autograph verbosity to avoid unnecessary messages
tf.autograph.set_verbosity(0)

import tensorflow_io as tfio
import tensorflow_probability as tfp

2024-06-24 12:08:49.061085: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-24 12:08:49.061142: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-24 12:08:49.062178: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-06-24 12:08:49.070052: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Set device and strategy
If GPUs are available, use them

In [3]:
tf.config.set_visible_devices([], 'GPU')
#print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

#gpus = tf.config.list_logical_devices('GPU')
#ngpu = len(gpus) # Check number of GPUs
#if ngpu:
#    # Set GPU strategy
#    strategy = tf.distribute.MirroredStrategy(gpus) # single-GPU or multi-GPU
#    # Print GPU details
#    print("> Running on GPU", end=' | ')
#    print("Num of GPUs: ", ngpu)
#    device='GPU'
#else:
#    # If no GPUs are available, use CPU
print("> Running on CPU")
strategy = tf.distribute.get_strategy()
device='CPU'

> Running on CPU


2024-06-24 12:08:51.635602: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-06-24 12:08:51.665302: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-06-24 12:08:51.666076: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.


In [4]:
# To find out which devices your operations and tensors are assigned to
tf.debugging.set_log_device_placement(True)

# Create some tensors and perform an operation
a = tf.constant([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
b = tf.constant([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
c = tf.matmul(a, b)

print(c)

Executing op _EagerConst in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op _EagerConst in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op MatMul in device /job:localhost/replica:0/task:0/device:CPU:0
tf.Tensor(
[[22. 28.]
 [49. 64.]], shape=(2, 2), dtype=float32)


2024-06-24 12:08:51.703883: I tensorflow/core/common_runtime/placer.cc:125] input: (_Arg): /job:localhost/replica:0/task:0/device:CPU:0
2024-06-24 12:08:51.703942: I tensorflow/core/common_runtime/placer.cc:125] _EagerConst: (_EagerConst): /job:localhost/replica:0/task:0/device:CPU:0
2024-06-24 12:08:51.703951: I tensorflow/core/common_runtime/placer.cc:125] output_RetVal: (_Retval): /job:localhost/replica:0/task:0/device:CPU:0
2024-06-24 12:08:51.716015: I tensorflow/core/common_runtime/placer.cc:125] a: (_Arg): /job:localhost/replica:0/task:0/device:CPU:0
2024-06-24 12:08:51.716099: I tensorflow/core/common_runtime/placer.cc:125] b: (_Arg): /job:localhost/replica:0/task:0/device:CPU:0
2024-06-24 12:08:51.716117: I tensorflow/core/common_runtime/placer.cc:125] MatMul: (MatMul): /job:localhost/replica:0/task:0/device:CPU:0
2024-06-24 12:08:51.716233: I tensorflow/core/common_runtime/placer.cc:125] product_RetVal: (_Retval): /job:localhost/replica:0/task:0/device:CPU:0


## Load dataframe (for testing)

In [5]:
df = pd.read_csv("../../data/dataset10.csv")

## Configurations

In [6]:
class cfg:
    # random seed
    seed = 42

    # audio clip settings
    sr = 24000
    duration = 10
    desired_length = duration*sr

    # data processing settings
    batch_size = 16
    shuffle_buffer = 256 # idk Number of elements from the dataset to buffer for shuffling.
    
    # class labels/names
    names = list(np.unique(df.en))
    num_classes = len(names)
    labels = list(range(num_classes))
    label2name = dict(zip(labels, names))
    name2label = {v:k for k,v in label2name.items()}

    # set device
    device = device

# set random seed in keras
tf.keras.utils.set_random_seed(cfg.seed)

# Function to load and prepare audio files

In [7]:
# Generates random integer # from https://www.kaggle.com/code/wengsilu/birdclef24pretraining
def random_int(shape=[], minval=0, maxval=1):
    return tf.random.uniform(shape=shape, minval=minval, maxval=maxval, dtype=tf.int32)

# Generats random float
def random_float(shape=[], minval=0.0, maxval=1.0):
    rnd = tf.random.uniform(shape=shape, minval=minval, maxval=maxval, dtype=tf.float32)
    return rnd

In [8]:
def audio_loader(with_labels = True, cfg = cfg, num_classes = cfg.num_classes):
    def decode(filepath):
        # read audio
        audio = tfio.audio.AudioIOTensor(filepath, dtype = tf.float32) # lazy load the file
        rate = audio.rate
        # cut out clip of specified duration at random position
        num_samples = cfg.duration*rate
        length = tf.cast(audio.shape[0], tf.int32)
        if num_samples < length:
            rdm = random_int(maxval = length - num_samples)
            audio = audio[rdm:rdm+num_samples]
        else:
            audio = audio.to_tensor()
        audio = tf.cast(audio, tf.float32)
        # resample if necessary
        audio = tfio.audio.resample(audio, tf.cast(rate, tf.int64), cfg.sr) if rate != cfg.sr else audio
        # remove noise (tfio.audio.split() or tfio.audio.trim()?)# can't do this when the clip is already cut
        # stereo to mono
        audio = tf.reduce_mean(audio, axis=-1) if tf.shape(audio)[-1] == 2 else tf.squeeze(audio, axis = -1)
        # pad if necessary
        if tf.size(audio) < cfg.desired_length:
            missing = cfg.desired_length - tf.size(audio)
            rdm = random_int(maxval = missing)
            audio = tf.pad(audio, [[rdm, missing-rdm]]) # pad rdm zeros left and missing-rdm zeros rigth
        audio = tf.reshape(audio, [cfg.sr*cfg.duration])
        return audio

    def get_target(target):          
        target = tf.reshape(target, [1])
        target = tf.cast(tf.one_hot(target, num_classes), tf.float32) 
        target = tf.reshape(target, [num_classes])
        return target
    
    def decode_with_labels(path, label):
        label = get_target(label)
        return decode(path), label

    return decode_with_labels if with_labels else decode

# Build dataset

In [9]:
def build_dataset(paths, labels=None, batch_size=cfg.batch_size, target_size=[128, 256], # idk yet about target size
                  audio_decode_fn=None,
                  num_classes=cfg.num_classes,
                  cache=True, cache_dir="", drop_remainder=False,
                  repeat=True, shuffle=cfg.shuffle_buffer):
    """
    Creates a TensorFlow dataset from the given paths and labels.
    
    Args:
        paths (list): A list of file paths to the audio files.
        labels (list): A list of corresponding labels for the audio files.
        batch_size (int): Batch size for the created dataset.
        target_size (list): A list of target image size for the spectrograms.
        audio_decode_fn (function): A function to decode the audio file.
        cache (bool): Whether to cache the dataset or not.
        cache_dir (str): Directory path to cache the dataset.
        drop_remainder (bool): Whether to drop the last batch if it is smaller than batch_size.
        repeat (bool): Whether to repeat the dataset or not.
        shuffle (int): Number of elements from the dataset to buffer for shuffling.
        
    Returns:
        ds (tf.data.Dataset): A TensorFlow dataset.
    """
    # Create cache directory if cache is enabled
    if cache_dir != "" and cache is True:
        os.makedirs(cache_dir, exist_ok=True)
    # Set default audio decode function if not provided
    if audio_decode_fn is None:
        audio_decode_fn = audio_loader(with_labels = labels is not None, cfg = cfg, num_classes = cfg.num_classes)
        
    # Set TensorFlow AUTOTUNE option
    AUTO = tf.data.experimental.AUTOTUNE # hopefully optimizes data pipeline and decreases loading times
    # Create slices based on whether labels are provided
    slices = (paths,) if labels is None else (paths, labels)
    # Create TensorFlow dataset from slices
    ds = tf.data.Dataset.from_tensor_slices(slices)
    # Map audio decode function to dataset
    ds = ds.map(audio_decode_fn, num_parallel_calls=AUTO)
    # Cache dataset in memory if cache is enabled
    ds = ds.cache(cache_dir) if cache else ds
    # Repeat dataset indefinitely if repeat is enabled
    ds = ds.repeat() if repeat else ds
    # Create TensorFlow dataset options
    opt = tf.data.Options()
    # Shuffle dataset if shuffle is enabled
    if shuffle: 
        ds = ds.shuffle(shuffle, seed=cfg.seed)
        opt.experimental_deterministic = False
    if cfg.device=='GPU':
        # If the device is a GPU, turn off auto-sharding to avoid performance issues
        opt.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF
    # Set the options for the dataset
    ds = ds.with_options(opt)
    # Batch the dataset with the specified batch size
    ds = ds.batch(batch_size, drop_remainder=drop_remainder)
    # Prefetch the next batch of data to improve performance
    ds = ds.prefetch(AUTO)
    return ds

In [10]:
df_sample = df.sample(frac=0.2, replace=False, random_state=cfg.seed)
df_sample["fullfilename"] = "../" + df_sample["fullfilename"]
paths = df_sample.fullfilename.tolist()
names = df_sample.en.tolist()
labels = []
for name in names:
    labels.append(cfg.name2label[name])

ds = build_dataset(paths, labels, cache=False)

Executing op _EagerConst in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op _EagerConst in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op TensorSliceDataset in device /job:localhost/replica:0/task:0/device:CPU:0


2024-06-24 12:08:56.866926: I tensorflow/core/common_runtime/placer.cc:125] input: (_Arg): /job:localhost/replica:0/task:0/device:CPU:0
2024-06-24 12:08:56.866989: I tensorflow/core/common_runtime/placer.cc:125] _EagerConst: (_EagerConst): /job:localhost/replica:0/task:0/device:CPU:0
2024-06-24 12:08:56.866998: I tensorflow/core/common_runtime/placer.cc:125] output_RetVal: (_Retval): /job:localhost/replica:0/task:0/device:CPU:0
2024-06-24 12:08:56.871269: I tensorflow/core/common_runtime/placer.cc:125] input: (_Arg): /job:localhost/replica:0/task:0/device:CPU:0
2024-06-24 12:08:56.871332: I tensorflow/core/common_runtime/placer.cc:125] _EagerConst: (_EagerConst): /job:localhost/replica:0/task:0/device:CPU:0
2024-06-24 12:08:56.871341: I tensorflow/core/common_runtime/placer.cc:125] output_RetVal: (_Retval): /job:localhost/replica:0/task:0/device:CPU:0
2024-06-24 12:08:56.877661: I tensorflow/core/common_runtime/placer.cc:125] components_0: (_Arg): /job:localhost/replica:0/task:0/device

Executing op _EagerConst in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op ParallelMapDatasetV2 in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op RepeatDataset in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op _EagerConst in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op _EagerConst in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op _EagerConst in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op Equal in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op Equal in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op LogicalAnd in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op _EagerConst in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op SelectV2 in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op DummySeedGenerator in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op ShuffleDatasetV3 in devic

2024-06-24 12:08:57.678326: I tensorflow/core/common_runtime/placer.cc:125] input: (_Arg): /job:localhost/replica:0/task:0/device:CPU:0
2024-06-24 12:08:57.678381: I tensorflow/core/common_runtime/placer.cc:125] _EagerConst: (_EagerConst): /job:localhost/replica:0/task:0/device:CPU:0
2024-06-24 12:08:57.678390: I tensorflow/core/common_runtime/placer.cc:125] output_RetVal: (_Retval): /job:localhost/replica:0/task:0/device:CPU:0
2024-06-24 12:08:57.682172: I tensorflow/core/common_runtime/placer.cc:125] input__dataset: (_Arg): /job:localhost/replica:0/task:0/device:CPU:0
2024-06-24 12:08:57.682222: I tensorflow/core/common_runtime/placer.cc:125] num__parallel__calls: (_Arg): /job:localhost/replica:0/task:0/device:CPU:0
2024-06-24 12:08:57.682233: I tensorflow/core/common_runtime/placer.cc:125] ParallelMapDatasetV2: (ParallelMapDatasetV2): /job:localhost/replica:0/task:0/device:CPU:0
2024-06-24 12:08:57.682239: I tensorflow/core/common_runtime/placer.cc:125] handle_RetVal: (_Retval): /jo

In [11]:
def plot_batch(batch, row=3, col=3, label2name=None,):
    """Plot one batch data"""
    if isinstance(batch, tuple) or isinstance(batch, list):
        audios, tars = batch
    else:
        audios = batch
        tars = None
    plt.figure(figsize=(col*5, row*3))
    for idx in range(row*col):
        ax = plt.subplot(row, col, idx+1)
        plt.plot(audios[idx].numpy(), color=cmap(0.1))
        if tars is not None:
            label = tars[idx].numpy().argmax()
            name = label2name[label]
            plt.title(name)
    plt.tight_layout()
    plt.show()

In [None]:
audios, labels = next(iter(ds))

Executing op AnonymousIteratorV3 in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op MakeIterator in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op IteratorGetNext in device /job:localhost/replica:0/task:0/device:CPU:0


2024-06-24 12:09:20.245221: I tensorflow/core/common_runtime/placer.cc:125] handle_RetVal: (_Retval): /job:localhost/replica:0/task:0/device:CPU:0
2024-06-24 12:09:20.245285: I tensorflow/core/common_runtime/placer.cc:125] AnonymousIteratorV3: (AnonymousIteratorV3): /job:localhost/replica:0/task:0/device:CPU:0
2024-06-24 12:09:20.248309: I tensorflow/core/common_runtime/placer.cc:125] dataset: (_Arg): /job:localhost/replica:0/task:0/device:CPU:0
2024-06-24 12:09:20.248359: I tensorflow/core/common_runtime/placer.cc:125] iterator: (_Arg): /job:localhost/replica:0/task:0/device:CPU:0
2024-06-24 12:09:20.248372: I tensorflow/core/common_runtime/placer.cc:125] MakeIterator: (MakeIterator): /job:localhost/replica:0/task:0/device:CPU:0
2024-06-24 12:09:20.300748: I tensorflow/core/common_runtime/placer.cc:125] iterator: (_Arg): /job:localhost/replica:0/task:0/device:CPU:0
2024-06-24 12:09:20.300810: I tensorflow/core/common_runtime/placer.cc:125] IteratorGetNext: (IteratorGetNext): /job:loca

_RetVal: (_Retval): /job:localhost/replica:0/task:0/device:CPU:0
input__dataset: (_Arg): /job:localhost/replica:0/task:0/device:CPU:0
buffer__size: (_Arg): /job:localhost/replica:0/task:0/device:CPU:0
PrefetchDataset: (PrefetchDataset): /job:localhost/replica:0/task:0/device:CPU:0
handle_RetVal: (_Retval): /job:localhost/replica:0/task:0/device:CPU:0
handle_RetVal: (_Retval): /job:localhost/replica:0/task:0/device:CPU:0
AnonymousIteratorV3: (AnonymousIteratorV3): /job:localhost/replica:0/task:0/device:CPU:0
dataset: (_Arg): /job:localhost/replica:0/task:0/device:CPU:0
iterator: (_Arg): /job:localhost/replica:0/task:0/device:CPU:0
MakeIterator: (MakeIterator): /job:localhost/replica:0/task:0/device:CPU:0
iterator: (_Arg): /job:localhost/replica:0/task:0/device:CPU:0
IteratorGetNext: (IteratorGetNext): /job:localhost/replica:0/task:0/device:CPU:0
components_0_RetVal: (_Retval): /job:localhost/replica:0/task:0/device:CPU:0
components_1_RetVal: (_Retval): /job:localhost/replica:0/task:0/de

In [12]:
plot_batch((audios, labels), label2name=cfg.label2name2)

NameError: name 'audios' is not defined