Load the one-hot encoding and the mix  
Create a model that takes the mix as input and outputs the one-hot encoding  
Train the model  
Save the model  
Load the model  
Test the model  
Save the test results  
Load the test results  
Plot  

In [40]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow.keras as keras
from sklearn.model_selection import train_test_split

In [18]:
# Example encoding of a label [1, 1, 0, 1, 0, 0]

slakh_mapping_labels = {
    "Bass": "bass",
    "Chromatic Percussion": "drums",
    "Drums": "drums",
    "Percussive": "drums",
    "Guitar": "guitar",
    "Piano": "piano",
    "Organ": "other",
    "Strings": "other",
    "Strings (continued)": "other",
    "Brass": "other",
    "Reed": "other",
    "Pipe": "other",
    "Synth Lead": "other",
    "Synth Pad": "other",
    "Sound Effects": "other",
    "Ethnic": "other",
    "Sound effects": "other",
}

def map_slakh_labels(label):
    return slakh_mapping_labels[label]

def label_to_index(label):
    return label_names.index(label)

label_names = ["vocal", "guitar", "bass", "drums", "piano","other"]
n_classes = len(label_names)


In [4]:
# Load labels from a CSV file
labels_df = pd.read_csv('datasets/custom/data.csv')

# Example structure of labels_df
# | png_path         | label       |
# |------------------|-------------|
# | dir/file1.wav    | [1,0,0,1,0] |
# | dir/file2.wav    | [0,0,0,1,0] |

In [41]:
from skimage import io


# Function to load an audio file and convert it to a spectrogram
def load_spectogram_file(file_path):
    # Load specrogram png image using skimage
    spectrogram = io.imread(file_path, as_gray=True)
    return spectrogram


# Load labels from a CSV file
labels_df = pd.read_csv('datasets/custom/data.csv')

print(labels_df)

# Directory containing spectogram files
spectrogram_dir = 'datasets/custom/'

# Initialize lists to hold data and labels
data = []
labels = []

# Iterate over each row in the labels DataFrame
for index, row in labels_df.iterrows():
    spectrogram = load_spectogram_file(row['png_path'])
    data.append(spectrogram)
    label = np.fromstring(row['label'].strip("[]"), sep=' ')
    labels.append(label)

# Convert lists to numpy arrays
data = np.array(data)
labels = np.array(labels)

# Example shapes
print(data.shape)  # (num_samples, height, width, 1)
print(labels.shape)  # (num_samples, num_classes)

# Split data into training and testing sets
train_data, test_data, train_labels, test_labels = train_test_split(data, labels, test_size=0.1, random_state=42)


                               png_path                label
0      datasets/custom/Track00006_0.png  [0. 0. 0. 0. 0. 1.]
1      datasets/custom/Track00006_5.png  [0. 0. 0. 0. 0. 1.]
2     datasets/custom/Track00006_10.png  [0. 0. 0. 1. 1. 1.]
3     datasets/custom/Track00006_15.png  [0. 0. 1. 1. 1. 1.]
4     datasets/custom/Track00006_20.png  [0. 1. 1. 1. 1. 1.]
..                                  ...                  ...
959  datasets/custom/Track00020_320.png  [0. 1. 1. 1. 1. 1.]
960  datasets/custom/Track00020_325.png  [0. 1. 1. 1. 1. 1.]
961  datasets/custom/Track00020_330.png  [0. 1. 1. 1. 1. 1.]
962  datasets/custom/Track00020_335.png  [0. 1. 1. 1. 1. 1.]
963  datasets/custom/Track00020_340.png  [0. 0. 0. 0. 0. 1.]

[964 rows x 2 columns]
(964, 128, 157)
(964, 6)


In [6]:
def add_conv_block(model, num_filters=32, dropout_rate=0.25):
    model.add(keras.layers.Conv2D(num_filters, kernel_size=(3, 3), activation='relu'))
    model.add(keras.layers.Conv2D(num_filters, kernel_size=(3, 3), activation='relu'))
    model.add(keras.layers.MaxPooling2D(pool_size=(2, 2)))
    model.add(keras.layers.Dropout(dropout_rate))
    return model

In [35]:
import tensorflow.keras.backend as K

def hamming_loss(y_true, y_pred):
  return K.mean(y_true*(1-y_pred)+(1-y_true)*y_pred)

def hamming_loss_1(y_true, y_pred):
  tmp = K.abs(y_true-y_pred)
  return K.mean(K.cast(K.greater(tmp,0.5),dtype=float))

def subset_accuracy(y_true, y_pred):
    # From https://stackoverflow.com/questions/56739708/how-to-implement-exact-match-subset-accuracy-as-a-metric-for-keras

    threshold = tf.constant(.5, tf.float32)
    gtt_pred = tf.math.greater(y_pred, threshold)
    gtt_true = tf.math.greater(y_true, threshold)
    accuracy = tf.reduce_mean(tf.cast(tf.equal(gtt_pred, gtt_true), tf.float32), axis=-1)
    return accuracy

In [43]:
model = keras.Sequential()
model.add(keras.Input(shape=(data.shape[1], data.shape[2], 1)))

add_conv_block(model, num_filters=32, dropout_rate=0.25)
add_conv_block(model, num_filters=64, dropout_rate=0.35)
add_conv_block(model, num_filters=128, dropout_rate=0.45)
add_conv_block(model, num_filters=256, dropout_rate=0.5)

model.add(keras.layers.Dense(512))
model.add(keras.layers.Dropout(0.75))

model.add(keras.layers.Dense(128))
model.add(keras.layers.Flatten())
model.add(keras.layers.Dense(n_classes, activation='sigmoid'))

model.compile(optimizer=keras.optimizers.Adam(learning_rate=1e-3), loss='binary_crossentropy', metrics=['accuracy'])
#Class Imbalance: If some instruments are much more common than others, consider using class weights to balance the loss function.

model.summary()

In [44]:
# history = model.fit(x=data, y=labels, batch_size=32, epochs=10, validation_split=0.2)
history = model.fit(x=train_data, y=train_labels, batch_size=32, epochs=10, validation_split=0.2)

Epoch 1/10
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 159ms/step - accuracy: 0.1096 - loss: 2.5290 - val_accuracy: 0.0172 - val_loss: 0.4780
Epoch 2/10
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 127ms/step - accuracy: 0.0614 - loss: 0.3926 - val_accuracy: 0.0172 - val_loss: 0.4119
Epoch 3/10
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 125ms/step - accuracy: 0.0464 - loss: 0.3671 - val_accuracy: 0.0172 - val_loss: 0.3798
Epoch 4/10
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 124ms/step - accuracy: 0.0425 - loss: 0.3552 - val_accuracy: 0.0172 - val_loss: 0.3807
Epoch 5/10
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 130ms/step - accuracy: 0.0385 - loss: 0.3662 - val_accuracy: 0.0172 - val_loss: 0.3651
Epoch 6/10
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 127ms/step - accuracy: 0.0366 - loss: 0.3590 - val_accuracy: 0.0172 - val_loss: 0.3698
Epoch 7/10
[1m22/22[0m [3