In [1]:
!pip install -q datasets transformers huggingface_hub audiomentations

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.0/77.0 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
from datasets import load_dataset
from datasets import Audio, ClassLabel, Image, IterableDataset
import numpy as np
import PIL
from PIL import Image
import shutil
import os

### Download FMA archive



In [3]:
%time
!wget https://os.unil.cloud.switch.ch/fma/fma_small.zip
!unzip -qq fma_small.zip

# Remove bad files
import os
os.remove('/content/fma_small/108/108925.mp3')
os.remove('/content/fma_small/099/099134.mp3')
os.remove('/content/fma_small/133/133297.mp3')

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 7.39 µs
--2023-12-16 18:58:44--  https://os.unil.cloud.switch.ch/fma/fma_small.zip
Resolving os.unil.cloud.switch.ch (os.unil.cloud.switch.ch)... 86.119.28.16, 2001:620:5ca1:201::214
Connecting to os.unil.cloud.switch.ch (os.unil.cloud.switch.ch)|86.119.28.16|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7679594875 (7.2G) [application/zip]
Saving to: ‘fma_small.zip’


2023-12-16 19:07:47 (13.5 MB/s) - ‘fma_small.zip’ saved [7679594875/7679594875]



### Configuration and setup

In [4]:
import huggingface_hub
# Interactive Hugging Face login (renders a widget asking for a token).
# Presumably required so that the `push_to_hub` calls further down can authenticate — confirm.
huggingface_hub.login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [5]:
# Settings shared by the dataset-building cells of this notebook.
# NOTE: the key 'augmenatation' is misspelled, but it is looked up verbatim
# elsewhere in the notebook, so the spelling must stay as is.
config = dict(
    augmenatation='large',          # name of the augmentation policy to apply
    num_bg_sounds=40,               # how many background-noise clips to fetch
    num_train_augmentations=8,      # augmented images generated per track
    num_val_augmentations=0,
)

### Download and preprocess dataset of background noises
Used as a component of audio augmentations

In [6]:
import random
import soundfile as sf

BG_NOISE_FOLDER = 'bg_noise'

# Write every background-noise clip to BG_NOISE_FOLDER as a 16 kHz audio file
def save_bg_noise(bg_noise):
    """Resample each noise record to 16 kHz and store it under BG_NOISE_FOLDER.

    Inputs:
        bg_noise: iterable of records exposing 'filename' and
                  'audio' -> {'array', 'sampling_rate'}
    Output:
        None (files are written to disk)
    """
    target_rate = 16000

    # Create the destination folder on first use
    folder_missing = not os.path.isdir(BG_NOISE_FOLDER)
    if folder_missing:
        os.mkdir(BG_NOISE_FOLDER)

    for record in bg_noise:
        clip = record['audio']
        destination = os.path.join(BG_NOISE_FOLDER, record['filename'])
        resampled = librosa.resample(
            clip['array'],
            orig_sr=clip['sampling_rate'],
            target_sr=target_rate)
        sf.write(destination, resampled, target_rate)

In [7]:
%%time
# normally takes 30 sec
from datasets import load_dataset
import librosa
import os

# Fetch the first `config['num_bg_sounds']` clips of the ESC-50 dataset
# (used as background noise for augmentation) and store each one locally.
save_bg_noise(
    load_dataset(
        'ashraq/esc50',
        split="train[:{}]".format(config['num_bg_sounds']),
    )
)

Downloading readme:   0%|          | 0.00/345 [00:00<?, ?B/s]



Downloading metadata:   0%|          | 0.00/1.61k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/387M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/387M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/2000 [00:00<?, ? examples/s]

### Libraries for spectrogram images creation

In [8]:
import PIL
from PIL import Image

def spec2image( spec ):
    '''
    Convert a spectrogram to an 8-bit greyscale image.

    The values are min-max scaled to the range 0..255 and truncated to uint8.
    A constant spectrogram (max == min, e.g. digital silence) has no dynamic
    range to scale; it is mapped to an all-zero image instead of dividing by
    zero and casting NaN values.

    Inputs:
        spec: Spectrogram as a numpy array
    Output:
        PIL Image
    '''
    lowest = spec.min()
    value_range = spec.max() - lowest

    if value_range == 0:
        # Flat input: avoid 0/0 -> NaN -> undefined uint8 cast
        scaled = np.zeros_like(spec)
    else:
        scaled = (spec - lowest) / value_range * 255

    pil_image = PIL.Image.fromarray(scaled.astype('uint8'))

    return pil_image

In [9]:
from transformers import WhisperFeatureExtractor
whisper_feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-small")

def audio2image( array ):
    """Turn an audio signal into an RGB spectrogram image.

    The signal is passed to the Whisper feature extractor (declared as 16 kHz
    audio); the first entry of its 'input_features' is used as the spectrogram,
    rendered with spec2image, resized and converted to RGB.

    Inputs:
        array: audio samples
    Output:
        resized RGB image
    """
    whisper_out = whisper_feature_extractor(
        array,
        sampling_rate=16_000,
        padding=True
    )
    mel = whisper_out['input_features'][0]

    # PIL's resize takes (width, height): the result is 298 wide and 224 high.
    # The height matches the 224x224 input of the
    # google/vit-base-patch16-224-in21k AutoImageProcessor.
    target_size = (298, 224)
    resized = spec2image( mel ).resize(target_size, Image.Resampling.LANCZOS)

    return resized.convert('RGB')

preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

In [10]:
from audiomentations import Compose, AddGaussianNoise, TimeStretch, PitchShift, Shift, SomeOf, AddBackgroundNoise, PolarityInversion
import numpy as np
import os

# Augmentation policies
# Maps a policy name (the value stored under config['augmenatation']) to a
# callable audiomentations transform; augment_audio() looks the policy up by name.
# NOTE: the `sounds_path` lists are built with os.listdir() at the moment this
# dict is evaluated, so BG_NOISE_FOLDER must already exist and be populated.
augment_policies = {
    # 'large': 4 of the 5 transforms below (num_transforms=4)
    # NOTE(review): how SomeOf combines its selection with each transform's own `p`
    # is not visible here — confirm against the audiomentations documentation.
    'large' : SomeOf(
        transforms= [
            AddGaussianNoise(
                min_amplitude=0.001, max_amplitude=0.015, p=0.5),
            TimeStretch(
                min_rate=0.8, max_rate=1.25, p=0.5),
            PitchShift(
                min_semitones=-4, max_semitones=4, p=0.5),
            Shift(
                min_shift=-0.5, max_shift=0.5, shift_unit='fraction', p=0.5),
            AddBackgroundNoise(
                sounds_path= [os.path.join(BG_NOISE_FOLDER, file) for file in os.listdir(BG_NOISE_FOLDER)],
                min_snr_in_db=3.0, max_snr_in_db=30.0,
                noise_transform=PolarityInversion(),
                p=1.0)],
        num_transforms=4),

    # 'medium': same transform pool, but num_transforms=2, a narrower SNR range
    # for the background noise (3..10 dB) and no noise_transform
    'medium': SomeOf(
        transforms= [
            AddGaussianNoise(
                min_amplitude=0.001, max_amplitude=0.015, p=0.5),
            TimeStretch(
                min_rate=0.8, max_rate=1.25, p=0.5),
            PitchShift(
                min_semitones=-4, max_semitones=4, p=0.5),
            Shift(
                min_shift=-0.5, max_shift=0.5, shift_unit='fraction', p=0.5),
            AddBackgroundNoise(
                sounds_path= [os.path.join(BG_NOISE_FOLDER, file) for file in os.listdir(BG_NOISE_FOLDER)],
                min_snr_in_db=3.0, max_snr_in_db=10.0,
                p=1.0)],
        num_transforms=2),

    # 'small': a single low-amplitude Gaussian-noise transform (not wrapped in SomeOf)
    'small': AddGaussianNoise( min_amplitude=0.001, max_amplitude=0.005, p=0.5)
}

def augment_audio(array):
    """Apply the augmentation policy currently selected in `config`.

    The policy is looked up in `augment_policies` by config['augmenatation']
    at call time and invoked on `array`, declared as 16 kHz audio.
    """
    policy_name = config['augmenatation']
    policy = augment_policies[policy_name]
    return policy(samples=array, sample_rate=16_000)

### Libraries for image dataset creation

In [11]:
from pathlib import Path
import librosa
from tqdm import tqdm
import os
from datasets import load_dataset
from multiprocessing import Process, Queue

fma_content_path = '/content'
folder = '.'

def create_track_samples( track_id, dataset):
    """Generate the augmented spectrogram images of one FMA track.

    The track's mp3 is decoded, augmented config['num_train_augmentations']
    times, and every augmented version is saved as
    `{folder}/{dataset}/train/{track_id}/{track_id}_{i:03d}.png`
    (the sub-folder name, i.e. the track id, serves as the class label).

    Inputs:
        track_id: six-char string representation of a track id. By FMA archive
                  structure the file resides in a folder named after the first
                  3 chars, with filename = {track_id}.mp3
        dataset:  name of the dataset folder to write into
    Output:
        None. Failures are reported on stdout and the track is skipped.

    NOTE: the audio is decoded at 16 kHz. `librosa.load` otherwise resamples to
    its 22 050 Hz default, while both `augment_audio` and `audio2image` declare
    the signal as 16 kHz. Images produced before this fix were computed from
    mislabelled 22 050 Hz audio and are therefore not identical to new ones.
    """
    f = f"{fma_content_path}/fma_small/{track_id[:3]}/{track_id}.mp3"
    path = f"{folder}/{dataset}/train/{track_id}"    # label is track_id

    try:
        # Decode at the sampling rate the rest of the pipeline assumes
        array, _ = librosa.load(f, sr=16_000)

        os.makedirs(path, exist_ok=True)

        for i in range( config['num_train_augmentations'] ):

            # Generate augmented audio
            augmented_audio = augment_audio( array )

            # Create image from augmented audio
            img = audio2image( augmented_audio )

            # Store image file of augmented audio
            img.save(f"{path}/{track_id}_{i:03d}.png")
    except Exception as err:
        # Best effort: one unreadable track must not abort the whole run, but the
        # reason is reported. `Exception` (not a bare except) so KeyboardInterrupt
        # and SystemExit still propagate.
        print(f"Bad file {track_id}: {err!r}")


def create_cls_dataset( dataset_basename: str, cluster_tracks):
    """Build a spectrogram-image dataset sequentially and publish it.

    The dataset name is `{dataset_basename}_{policy}_{n}` where policy is
    config['augmenatation'] and n is config['num_train_augmentations'].
    Images for every track are generated one after another, then the folder
    is loaded as an "imagefolder" dataset and pushed to the `arieg` namespace.

    Inputs:
        dataset_basename: prefix of the dataset name
        cluster_tracks:   iterable of track ids
    Output:
        the loaded dataset object
    """
    policy = config['augmenatation']
    per_track = config['num_train_augmentations']
    my_dataset = f"{dataset_basename}_{policy}_{per_track}"
    dataset_dir = f"{folder}/{my_dataset}"

    if not os.path.exists(dataset_dir):
        os.makedirs(dataset_dir)

    # Generate the images track by track, with a progress bar
    for one_track in tqdm(cluster_tracks):
        create_track_samples(one_track, my_dataset)

    # Load what was written and publish it
    ds = load_dataset("imagefolder", data_dir=dataset_dir)
    ds.push_to_hub(f"arieg/{my_dataset}")

    return ds

def create_cls_dataset_multiproc( dataset_basename: str, cluster_tracks, batch_size: int = 10):
    """Build a spectrogram-image dataset with one process per track and publish it.

    Tracks are handled in consecutive batches of `batch_size`: one process per
    track of the batch is started, and all of them are joined before the next
    batch begins, so at most `batch_size` processes run at any time.

    The previous implementation sliced with a stride of
    `len(cluster_tracks) // batch_size` instead of `batch_size`. That started
    about len/10 processes at once, and silently dropped tracks for short lists
    whose length is a multiple of 10 (e.g. only 25 of 50 tracks were processed).

    Inputs:
        dataset_basename: prefix of the dataset name; the full name is
                          `{dataset_basename}_{policy}_{n}` with
                          policy = config['augmenatation'] and
                          n = config['num_train_augmentations']
        cluster_tracks:   sliceable sequence of track ids
        batch_size:       maximum number of simultaneous processes (default 10)
    Output:
        the loaded dataset object (also pushed to the `arieg` namespace)
    Raises:
        ValueError: if batch_size is smaller than 1
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    my_dataset = f"{dataset_basename}_{config['augmenatation']}_{config['num_train_augmentations']}"

    os.makedirs(f"{folder}/{my_dataset}", exist_ok=True)

    for start in range(0, len(cluster_tracks), batch_size):
        batch = cluster_tracks[start : start + batch_size]
        processes = [
            Process(target=create_track_samples, args=(track_id, my_dataset,), name=f'{track_id}')
            for track_id in batch
        ]

        # start all processes of this batch
        for process in processes:
            process.start()
        # wait for the whole batch before launching the next one
        for process in processes:
            process.join()

    # Publish dataset
    ds = load_dataset("imagefolder", data_dir=f"{folder}/{my_dataset}")
    ds.push_to_hub(f"arieg/{my_dataset}")

    return ds

### Mapping of tracks to clusters

In [12]:
import numpy as np
# Name of the embedding whose clustering artefacts are loaded below
embedding_method = 'siamese'
num_clusters = 80

# I maps indices to clusters
# NOTE(review): both files are read from Google Drive; no drive-mount call is
# visible in this notebook — the drive presumably has to be mounted at
# /content/drive beforehand. TODO confirm.
centroids = np.load(f"/content/drive/MyDrive/Colab Notebooks/Zama/{embedding_method}_centroids.npy")
I = np.load(f"/content/drive/MyDrive/Colab Notebooks/Zama/{embedding_method}_cluster_idxs.npy")

# Make reverse mapping, from cluster number to track_ids of that cluster
# First, map indices to track_ids
from datasets import load_dataset

# The class-label names of the source dataset serve as the index -> track_id table
source_ds = 'fma_small_images'
idx2track = load_dataset(f'arieg/{source_ds}', split='train').features['label'].names

Downloading readme:   0%|          | 0.00/209k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/433M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/7997 [00:00<?, ? examples/s]

### Create datasets for training intra-cluster classification

In [None]:
# Dataset for training intra-cluster classification
# One dataset is built and published per cluster; `num_clusters` comes from the
# cluster-mapping cell instead of being repeated here as a literal.
for cluster_idx in range(num_clusters):

    # Make a list of track_ids for a given cluster:
    # pick the indices of I that point to cluster `cluster_idx`
    # and take the track_ids of these
    tracks = [idx2track[idx] for idx in np.where(I==cluster_idx)[0]]

    num_classes = len(tracks)

    # A cluster without tracks yields no images: nothing to build or publish
    if num_classes == 0:
        print(f"Cluster {cluster_idx:02d} has no tracks, skipping")
        continue

    # Build a spectrogram images dataset for classification of the tracks belonging to this cluster
    spec = create_cls_dataset_multiproc( f"cluster{cluster_idx:02d}", tracks)

Resolving data files:   0%|          | 0/23400 [00:00<?, ?it/s]

Downloading data files:   0%|          | 0/23400 [00:00<?, ?it/s]

Downloading data files: 0it [00:00, ?it/s]

Extracting data files: 0it [00:00, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Uploading the dataset shards:   0%|          | 0/3 [00:00<?, ?it/s]

Map:   0%|          | 0/7800 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/78 [00:00<?, ?ba/s]

Map:   0%|          | 0/7800 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/78 [00:00<?, ?ba/s]

Map:   0%|          | 0/7800 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/78 [00:00<?, ?ba/s]

Resolving data files:   0%|          | 0/8550 [00:00<?, ?it/s]

Downloading data files:   0%|          | 0/8550 [00:00<?, ?it/s]

Downloading data files: 0it [00:00, ?it/s]

Extracting data files: 0it [00:00, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/8550 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/86 [00:00<?, ?ba/s]

Resolving data files:   0%|          | 0/14700 [00:00<?, ?it/s]

Downloading data files:   0%|          | 0/14700 [00:00<?, ?it/s]

Downloading data files: 0it [00:00, ?it/s]

Extracting data files: 0it [00:00, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Uploading the dataset shards:   0%|          | 0/2 [00:00<?, ?it/s]

Map:   0%|          | 0/7350 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/74 [00:00<?, ?ba/s]

Map:   0%|          | 0/7350 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/74 [00:00<?, ?ba/s]

Resolving data files:   0%|          | 0/24000 [00:00<?, ?it/s]

Downloading data files:   0%|          | 0/24000 [00:00<?, ?it/s]

Downloading data files: 0it [00:00, ?it/s]

Extracting data files: 0it [00:00, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Uploading the dataset shards:   0%|          | 0/3 [00:00<?, ?it/s]

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/80 [00:00<?, ?ba/s]

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/80 [00:00<?, ?ba/s]

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/80 [00:00<?, ?ba/s]

Resolving data files:   0%|          | 0/19800 [00:00<?, ?it/s]

Downloading data files:   0%|          | 0/19800 [00:00<?, ?it/s]

Downloading data files: 0it [00:00, ?it/s]

Extracting data files: 0it [00:00, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Uploading the dataset shards:   0%|          | 0/3 [00:00<?, ?it/s]

Map:   0%|          | 0/6600 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/66 [00:00<?, ?ba/s]

Map:   0%|          | 0/6600 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/66 [00:00<?, ?ba/s]

Map:   0%|          | 0/6600 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/66 [00:00<?, ?ba/s]

Resolving data files:   0%|          | 0/13950 [00:00<?, ?it/s]

Downloading data files:   0%|          | 0/13950 [00:00<?, ?it/s]

Downloading data files: 0it [00:00, ?it/s]

Extracting data files: 0it [00:00, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Uploading the dataset shards:   0%|          | 0/2 [00:00<?, ?it/s]

Map:   0%|          | 0/6975 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/70 [00:00<?, ?ba/s]

Map:   0%|          | 0/6975 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/70 [00:00<?, ?ba/s]

Resolving data files:   0%|          | 0/11250 [00:00<?, ?it/s]

Downloading data files:   0%|          | 0/11250 [00:00<?, ?it/s]

Downloading data files: 0it [00:00, ?it/s]

Extracting data files: 0it [00:00, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Uploading the dataset shards:   0%|          | 0/2 [00:00<?, ?it/s]

Map:   0%|          | 0/5625 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/57 [00:00<?, ?ba/s]

Map:   0%|          | 0/5625 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/57 [00:00<?, ?ba/s]

Resolving data files:   0%|          | 0/17100 [00:00<?, ?it/s]

Downloading data files:   0%|          | 0/17100 [00:00<?, ?it/s]

Downloading data files: 0it [00:00, ?it/s]

Extracting data files: 0it [00:00, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Uploading the dataset shards:   0%|          | 0/2 [00:00<?, ?it/s]

Map:   0%|          | 0/8550 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/86 [00:00<?, ?ba/s]

Map:   0%|          | 0/8550 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/86 [00:00<?, ?ba/s]

Resolving data files:   0%|          | 0/14700 [00:00<?, ?it/s]

Downloading data files:   0%|          | 0/14700 [00:00<?, ?it/s]

Downloading data files: 0it [00:00, ?it/s]

Extracting data files: 0it [00:00, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Uploading the dataset shards:   0%|          | 0/2 [00:00<?, ?it/s]

Map:   0%|          | 0/7350 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/74 [00:00<?, ?ba/s]

Map:   0%|          | 0/7350 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/74 [00:00<?, ?ba/s]

Resolving data files:   0%|          | 0/14400 [00:00<?, ?it/s]

Downloading data files:   0%|          | 0/14400 [00:00<?, ?it/s]

Downloading data files: 0it [00:00, ?it/s]

Extracting data files: 0it [00:00, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Uploading the dataset shards:   0%|          | 0/2 [00:00<?, ?it/s]

Map:   0%|          | 0/7200 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/72 [00:00<?, ?ba/s]

Map:   0%|          | 0/7200 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/72 [00:00<?, ?ba/s]

Resolving data files:   0%|          | 0/19500 [00:00<?, ?it/s]

Downloading data files:   0%|          | 0/19500 [00:00<?, ?it/s]

Downloading data files: 0it [00:00, ?it/s]

Extracting data files: 0it [00:00, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Uploading the dataset shards:   0%|          | 0/2 [00:00<?, ?it/s]

Map:   0%|          | 0/9750 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/98 [00:00<?, ?ba/s]

Map:   0%|          | 0/9750 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/98 [00:00<?, ?ba/s]

Resolving data files:   0%|          | 0/12750 [00:00<?, ?it/s]

Downloading data files:   0%|          | 0/12750 [00:00<?, ?it/s]

Downloading data files: 0it [00:00, ?it/s]

Extracting data files: 0it [00:00, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Uploading the dataset shards:   0%|          | 0/2 [00:00<?, ?it/s]

Map:   0%|          | 0/6375 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/64 [00:00<?, ?ba/s]

Map:   0%|          | 0/6375 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/64 [00:00<?, ?ba/s]

### Create dataset for cluster classification

In [None]:
# Dataset for cluster classification

# Make a list of all track_ids in the library.
# The *values* of I are cluster numbers (see `np.where(I==cluster_idx)` in the
# per-cluster cells), so they must not be used as track indices: that would
# select only the tracks whose index happens to equal a cluster number, many
# times over. The track index is the row position in I.
tracks = [idx2track[idx] for idx in range(I.shape[0])]

num_classes = len(tracks)

# Build a spectrogram images dataset covering every track of the library.
# NOTE(review): create_cls_dataset labels each image with its track_id; if the
# label is meant to be the cluster number, the mapping through I still has to
# be applied downstream — confirm.
spec = create_cls_dataset( "cluster_cls", tracks)

 82%|████████▏ | 6528/7997 [5:31:46<1:22:27,  3.37s/it]

### Create end-to-end test datasets

In [None]:
# End-to-end test dataset
# NOTE: this cell mutates the shared `config` dict (augmentation count and policy).
config['num_train_augmentations'] = 10    # per track

# Clusters for which classifier training has been completed
clusters = range(6)

# One dataset per (augmentation policy, cluster) pair
for augm in ('small', 'medium', 'large'):
    config['augmenatation'] = augm

    for cluster_idx in clusters:

        # Track ids of the current cluster: row positions of I whose value is
        # `cluster_idx`, translated to track ids through idx2track
        tracks = list(map(idx2track.__getitem__, np.where(I == cluster_idx)[0]))

        num_classes = len(tracks)

        # Build and publish the spectrogram images dataset of this cluster
        spec = create_cls_dataset_multiproc(f"cluster{cluster_idx:02d}", tracks)