## Train intra-cluster classification models

In [None]:
!pip install -q datasets

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import huggingface_hub
# Log in to the Hugging Face Hub for this session; later cells load datasets from it.
huggingface_hub.login()

### Dataset load and postprocessing

In [None]:
from datasets import load_dataset, DatasetDict

def load_cls_dataset(cluster_id):
    """Load the classification dataset of one cluster from the Hugging Face Hub.

    The dataset name is assembled from the cluster id (zero-padded to two
    digits) and two entries of the notebook-level ``config`` dict.

    Args:
        cluster_id: Integer cluster number.

    Returns:
        Whatever ``load_dataset`` returns for ``arieg/<dataset name>``.
    """
    # NOTE(review): the key is spelled 'augmenatation' here; it has to match the
    # spelling used where `config` is defined (not visible in this section).
    my_dataset = f"cluster{cluster_id:02d}_{config['augmenatation']}_{config['num_train_augmentations']}"
    repo_id = f"arieg/{my_dataset}"
    return load_dataset(repo_id)


def postprocess_cls_dataset(spec, batch_size=16, seed=None):
    """Split a cluster dataset 90/5/5 and turn each split into a batched TF dataset.

    Args:
        spec: Dataset to split. It must provide ``train_test_split`` and a
            ``label`` feature exposing ``names`` (the caller in this file
            passes the ``'train'`` split returned by ``load_cls_dataset``).
        batch_size: Number of examples per batch in each returned dataset.
        seed: Optional seed forwarded to both ``train_test_split`` calls to make
            the split reproducible. ``None`` (default) leaves it unseeded.

    Returns:
        A list ``[train, validation, test]`` of datasets created with
        ``to_tf_dataset`` from the ``image`` column and the ``label`` column.

    Raises:
        AssertionError: If the number of label names differs from the
            notebook-level global ``num_classes``.
    """
    # Split into:
    # train      90%
    # validation  5%
    # test        5%
    # following https://stackoverflow.com/questions/76001128/splitting-dataset-into-train-test-and-validation-using-huggingface-datasets-fun
    train_and_rest = spec.train_test_split(test_size=0.1, seed=seed)
    data_train = train_and_rest['train']

    # Halve the held-out 10% into validation and test.
    data_val_test = train_and_rest['test'].train_test_split(test_size=0.5, seed=seed)

    data_splitted = DatasetDict({
        'train': data_train,                         # 90%
        'validation': data_val_test['train'],        #  5%
        'test': data_val_test['test'],               #  5%
    })

    # Sanity check against `num_classes`, a global set by the training loop
    # before this function is called.
    labels = data_train.features["label"].names
    assert num_classes == len(labels), (
        f"expected {num_classes} classes, dataset has {len(labels)} label names"
    )

    data_splitted.set_format('tf')

    # Only the training split is reshuffled when loaded; validation and test
    # keep a fixed order so their metrics are comparable between runs.
    return [
        data_splitted[split].to_tf_dataset(
            columns="image",
            label_cols="label",
            shuffle=(split == 'train'),
            batch_size=batch_size,
        )
        for split in ['train', 'validation', 'test']
    ]

#### Libraries for model creation

In [None]:
import tensorflow as tf
import keras
from keras import layers
import tensorflow_datasets as tfds

from keras.optimizers import Adam
from keras.losses import binary_crossentropy
from transformers.keras_callbacks import KerasMetricCallback, PushToHubCallback

from keras.applications.vgg19 import VGG19
from keras.applications.vgg19 import preprocess_input

# Image dimensions (pixels) of the square input given to the VGG19 backbone;
# also the target size of the RandomCrop layer in create_cls_model().
rows = 224
cols = 224

def create_cls_model(num_classes):
    """Build and compile a VGG19-based image classifier for one cluster.

    The model is a Sequential stack of: an input layer for 224x298 RGB images,
    a crop to ``rows`` x ``cols``, a rescaling layer, the frozen convolutional
    layers of an ImageNet-pretrained VGG19, and a new dense head that ends in
    a ``num_classes``-way softmax.

    Args:
        num_classes: Number of output classes (the caller passes the number
            of tracks in the cluster).

    Returns:
        The compiled model (SGD optimizer, sparse categorical cross-entropy
        loss, sparse categorical accuracy metric).

    Raises:
        ValueError: If ``num_classes`` is smaller than 1.
    """
    # Fail fast: a softmax over fewer than one class is meaningless, and
    # checking here avoids instantiating VGG19 first.
    if num_classes < 1:
        raise ValueError(f"num_classes must be at least 1, got {num_classes}")

    # create a vgg19 model and discard the top layer
    backbone = VGG19(include_top=False, weights='imagenet', input_shape=(rows, cols, 3))

    # following https://keras.io/guides/transfer_learning/
    # Freeze the pretrained layers so only the new head is trained.
    backbone.trainable = False

    # Create a custom model
    model = tf.keras.Sequential()

    # Preprocessing layers
    # following https://keras.io/guides/transfer_learning/#an-endtoend-example-finetuning-an-image-classification-model-on-a-cats-vs-dogs-dataset

    # Input image size as in the dataset
    model.add(layers.InputLayer(input_shape=(224, 298, 3)))
    model.add(layers.RandomCrop(rows, cols))                      # 224, 224
    # Maps 0..255 pixel values to [-1, 1].
    # NOTE(review): this file imports VGG19's `preprocess_input` but does not
    # use it; confirm this rescaling is the intended input normalization for
    # the ImageNet weights.
    model.add(layers.Rescaling(scale=1.0 / 127.5, offset=-1))

    # Copy vgg19 layers into our model
    for layer in backbone.layers:
        model.add(layer)

    # Add top layers
    # top layers architecture follows:
    #   https://iq.opengenus.org/vgg19-architecture/
    #   https://www.kaggle.com/code/willstone98/transfer-learning-fine-tuning-with-vgg16
    # dropout:
    #   https://keras.io/guides/transfer_learning/#an-endtoend-example-finetuning-an-image-classification-model-on-a-cats-vs-dogs-dataset
    # The training loop in this file reads the head's weights via layer indices
    # -4, -2 and -1, so keep this order: Dense, Dropout, Dense, Dense.
    model.add(layers.Flatten())
    model.add(layers.Dense(2048, activation='relu'))
    model.add(layers.Dropout(0.2))
    model.add(layers.Dense(2048, activation='relu'))
    model.add(layers.Dense(num_classes, activation='softmax'))

    # Compile model
    # The last layer already applies softmax, hence from_logits=False.
    model.compile(
        optimizer=keras.optimizers.SGD(learning_rate=0.001),
        loss=keras.losses.SparseCategoricalCrossentropy(from_logits=False),
        metrics=[keras.metrics.SparseCategoricalAccuracy()],
    )

    return model

### Load mapping of tracks to clusters






In [None]:
import numpy as np
# Name of the embedding method; it selects which saved .npy files are loaded below.
embedding_method = 'siamese'
# Number of clusters; the training loop iterates over range(num_clusters).
num_clusters = 80

# Presumably the cluster centroids for this embedding method (not used in the visible code).
centroids = np.load(f"/content/drive/MyDrive/Colab Notebooks/Zama/{embedding_method}_centroids.npy")
# I maps indices to clusters: I[idx] is the cluster number assigned to index idx
I = np.load(f"/content/drive/MyDrive/Colab Notebooks/Zama/{embedding_method}_cluster_idxs.npy")

# Make the reverse mapping, from cluster number to the track_ids of that cluster.
# First, map indices to track_ids: idx2track[idx] is the label name at position idx
from datasets import load_dataset

source_ds = 'fma_small_images'
idx2track = load_dataset(f'arieg/{source_ds}', split='train').features['label'].names

Downloading readme:   0%|          | 0.00/209k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/433M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/7997 [00:00<?, ? examples/s]

### For each cluster

In [None]:
# Train one classifier per cluster and save the weights of its dense head.

# Number of training epochs, identical for every cluster.
epochs = 20

for cluster_idx in range(num_clusters):

    # Every iteration builds a new VGG19-based model. Clear Keras' global state
    # left over from the previous one so memory does not grow across clusters.
    keras.backend.clear_session()

    # Make a list of track_ids for a given cluster:
    # pick the indices of I that point to cluster cluster_idx
    # and take the track_ids of these
    tracks = [idx2track[idx] for idx in np.where(I == cluster_idx)[0]]
    print(tracks)

    # NOTE: this must remain a module-level name; postprocess_cls_dataset()
    # asserts the dataset's label count against it.
    num_classes = len(tracks)

    # Load the spectrogram images dataset for classification of the tracks belonging to this cluster.
    # (It was presumably built beforehand with create_cls_dataset(cluster_idx, tracks),
    # which is not visible in this section.)
    spec = load_cls_dataset(cluster_idx)

    # Postprocess dataset
    # NOTE(review): tf_test_dataset is created but not evaluated in this loop.
    tf_train_dataset, tf_eval_dataset, tf_test_dataset = postprocess_cls_dataset(spec['train'])

    # Make a classification model for this cluster
    model = create_cls_model(num_classes)

    # Training
    history = model.fit(tf_train_dataset, epochs=epochs, validation_data=tf_eval_dataset, callbacks=[], verbose=1)

    # Store top classification layers' weights for end-to-end evaluation.
    # Indices -4, -2 and -1 are the three Dense layers of the head; the Dropout
    # layer at -3 is skipped.
    for i, lyr in enumerate([-4, -2, -1]):
        weights, biases = model.layers[lyr].get_weights()
        np.save(f"/content/drive/MyDrive/Colab Notebooks/Zama/w_cluster{cluster_idx:02d}_top{i+1}", weights)
        np.save(f"/content/drive/MyDrive/Colab Notebooks/Zama/b_cluster{cluster_idx:02d}_top{i+1}", biases)