In [11]:
## from time import time
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
import time
from tensorflow import keras
import tensorflow as tf
import numpy as np


# Alternative dataset root (disabled):
# ROOT_FOLDER = r"D/Users/david.isaacspaternostro/broncode/rimworld-of-sound"
ROOT_FOLDER = r"C:\Users\david.isaacspaternos\broncode\data\stft"

# Key into `label_shapes` / `get_label` selecting the labelling scheme.
LABEL = 'instrument_and_pitch_single_label'

# Training hyper-parameters.
BATCH_SIZE = 64
EPOCHS = 50
LEARNING_RATE = 1e-5

# Unique run name: fixed prefix plus the current Unix time in whole seconds.
modelsavename = f"instrument_and_pitch_single_label_model_{int(time.time())}"
print(modelsavename)

# Length of the label vector produced for each label type.
label_shapes = dict(
    # 11 instrument families (indices 0..10 in get_instrument_label). The
    # previous value of 1 made the one-hot vector too short for every family
    # except index 0.
    instrument=11,
    instrument_subtype=33,  # 11 families * 3 subtypes
    pitch=128,
    instrument_subtype_and_pitch=5+112,  # 4 instruments, 1 other, 112 pitches
    instrument_and_pitch_single_label=5*112,  # one class per (instrument, pitch) pair
)

def get_label(filename, label_type, label_size):
    """Build the label vector for one file.

    Args:
        filename: File name the label is parsed from.
        label_type: One of "instrument", "instrument_subtype", "pitch",
            "instrument_subtype_and_pitch" or
            "instrument_and_pitch_single_label".
        label_size: Length of the one-hot vector; only used for label types
            whose labeler returns a single class index.

    Returns:
        A numpy array. For the index-based label types this is a one-hot
        array of shape (label_size, 1); for the two combined label types it
        is whatever vector the labeler itself returns.

    Raises:
        KeyError: If `label_type` is not a known label type.
    """
    switch = {
        "instrument": get_instrument_label,
        "instrument_subtype": get_instrument_subtype_label,
        "pitch": get_pitch_label,
        "instrument_subtype_and_pitch": get_multi_label,
        "instrument_and_pitch_single_label": get_instrument_and_pitch
    }
    # These labelers already return a complete label vector. Using their
    # result as an index (as was previously done for
    # "instrument_subtype_and_pitch") raises IndexError, because a float
    # array is not a valid index.
    dense_label_types = (
        "instrument_subtype_and_pitch",
        "instrument_and_pitch_single_label",
    )
    labeler = switch[label_type]
    if label_type in dense_label_types:
        return labeler(filename)

    sparse_label = labeler(filename)
    # NOTE(review): this is a column vector (label_size, 1) while the dense
    # labelers return 1-D vectors -- confirm which shape the model expects.
    label = np.zeros((label_size, 1))
    label[sparse_label] = 1
    return label

def get_instrument_label(filename):
    """Return the instrument-family index (0..10) encoded in `filename`.

    The family is everything before the last two underscore-separated
    fields, so multi-word families such as "synth_lead" are kept intact.

    Raises:
        KeyError: If the extracted family name is not one of the known ones.
    """
    families = (
        'bass', 'brass', 'flute', 'guitar', 'keyboard', 'mallet',
        'organ', 'reed', 'string', 'synth_lead', 'vocal',
    )
    family_index = {name: position for position, name in enumerate(families)}

    fields = filename.split('_')
    family = "_".join(fields[:-2])
    return family_index[family]

def get_instrument_subtype_label(filename):
    """Return a combined (family, subtype) class index for `filename`.

    The index is `family_index * 3 + subtype_index`, where the subtype is the
    second-to-last underscore-separated field of the file name.

    Raises:
        KeyError: If the family or the subtype is not a known one.
    """
    subtype_index = {
        "acoustic": 0,
        "electronic": 1,
        "synthetic": 2
    }
    family = get_instrument_label(filename)
    subtype_name = filename.split('_')[-2]
    return family * len(subtype_index) + subtype_index[subtype_name]


def get_pitch_label(filename):
    """Return the pitch as an int: the second dash-separated field of `filename`."""
    dash_fields = filename.split('-')
    return int(dash_fields[1])

import time
def get_multi_label(filename):
    """Build a multi-hot label: one instrument slot plus one pitch slot.

    Layout of the returned vector (length 5 + 112):
        0: bass_electronic, 1: organ_electronic, 2: string_acoustic,
        3: vocal_acoustic, 4: any other instrument,
        5..116: pitches 9..120 (9 is the lowest and 120 the highest pitch
        expected in the nsynth dataset).

    Args:
        filename: File name the instrument subtype and pitch are parsed from.

    Returns:
        1-D numpy array of length 117. If the pitch lies outside 9..120 a
        message is printed and only the instrument slot is set.
    """
    pitch_label = get_pitch_label(filename)
    subtype_label = get_instrument_subtype_label(filename)

    # Subtype index (family * 3 + subtype) -> slot of the instruments that
    # are recognised best; everything else shares the "other" slot.
    recognizable = {
        1: 0,   # bass_electronic
        19: 1,  # organ_electronic
        24: 2,  # string_acoustic
        30: 3,  # vocal_acoustic
    }
    instrument_label = recognizable.get(subtype_label, 4)

    label = np.zeros(5+112)
    label[instrument_label] = 1

    # Explicit range check instead of a bare `except`: a pitch below 9 does
    # not raise, it silently writes into an instrument slot or wraps around
    # to the end of the vector. The former 10 second sleep per bad file is
    # dropped; the file is still reported.
    if 9 <= pitch_label <= 120:
        # +5 because the first 5 slots are instruments, -9 because 9 is the
        # lowest pitch.
        label[pitch_label+5-9] = 1
    else:
        print(f"{filename}: pitch {pitch_label} is outside 9..120, pitch slot left empty")

    return label

def get_instrument_and_pitch(filename):
    """Build a one-hot label with one class per (instrument, pitch) pair.

    There are 5 instrument groups (bass_electronic, organ_electronic,
    string_acoustic, vocal_acoustic, other) and 112 pitches (9..120); the
    class index is `instrument * 112 + (pitch - 9)`.

    Args:
        filename: File name the instrument subtype and pitch are parsed from.

    Returns:
        1-D numpy array of length 5 * 112 with exactly one entry set to 1.

    Raises:
        ValueError: If the pitch is outside 9..120. Such a pitch would
            otherwise be written into a neighbouring instrument's class
            range (or wrap around) and silently mislabel the sample.
    """
    pitch_label = get_pitch_label(filename)
    subtype_label = get_instrument_subtype_label(filename)

    # Subtype index (family * 3 + subtype) -> instrument group.
    recognizable = {
        1: 0,   # bass_electronic
        19: 1,  # organ_electronic
        24: 2,  # string_acoustic
        30: 3,  # vocal_acoustic
    }
    instrument_label = recognizable.get(subtype_label, 4)  # 4 = "other"

    if not 9 <= pitch_label <= 120:
        raise ValueError(
            f"pitch {pitch_label} in {filename!r} is outside the supported range 9..120"
        )

    label = np.zeros(5*112)
    label[instrument_label*112 + pitch_label-9] = 1
    return label

def reset(batch_size, label_size):
    """Return zero-filled (images, labels) buffers for one batch.

    Images have shape (batch_size, 126, 1025, 1) and labels have shape
    (batch_size, label_size).
    """
    image_batch_shape = (batch_size, 126, 1025, 1)
    label_batch_shape = (batch_size, label_size)
    return np.zeros(image_batch_shape), np.zeros(label_batch_shape)

def get_dataset(path, label_type, label_size, batch_size):
    """Create a batched grayscale image dataset with labels parsed from file names.

    Args:
        path: Directory that is walked recursively for image files.
        label_type: Label scheme passed on to `get_label`.
        label_size: Label vector length passed on to `get_label`.
        batch_size: Number of images per batch.

    Returns:
        The dataset returned by `image_dataset_from_directory`, with images
        resized to 126 x 1025.
    """
    # Keras requires explicit labels to follow the alphanumeric order of the
    # image file paths. A plain os.walk yields entries in arbitrary,
    # filesystem-dependent order, so directories and file names are sorted to
    # keep every label paired with its own image.
    # NOTE(review): every file is listed here, whatever its extension --
    # confirm the folders contain only images Keras will load.
    filenames = [
        name
        for _root, _dirs, names in sorted(os.walk(path), key=lambda entry: entry[0])
        for name in sorted(names)
    ]
    labels = [get_label(filename, label_type, label_size) for filename in filenames]
    dataset = tf.keras.preprocessing \
        .image_dataset_from_directory(
            directory=path,
            labels=labels,
            color_mode='grayscale',
            batch_size=batch_size,
            image_size=(126, 1025)
        )
    return dataset

# Required folder structure:
# ROOT_FOLDER\train\anything\all_your_imgs.png
# ROOT_FOLDER\valid\anything\all_your_imgs.png

# Baseline CNN: two convolution / max-pooling stages followed by a dense
# classifier head with one softmax output per class of the selected label type.
model = keras.Sequential()
model.add(keras.layers.Input(shape=(126, 1025, 1)))
model.add(keras.layers.Conv2D(8, kernel_size=(5, 10), activation='relu'))
model.add(keras.layers.MaxPooling2D(pool_size=(3, 10)))
model.add(keras.layers.Conv2D(16, kernel_size=(5, 10), activation='relu'))
model.add(keras.layers.MaxPooling2D(pool_size=(3, 10)))
model.add(keras.layers.Flatten())
model.add(keras.layers.Dense(512, activation='relu'))
model.add(keras.layers.Dense(label_shapes[LABEL], activation='softmax'))

model.summary()



# Build the training and validation datasets from ROOT_FOLDER/train and
# ROOT_FOLDER/valid.
train_folder = os.path.join(ROOT_FOLDER, 'train')
train_dataset = get_dataset(train_folder, LABEL, label_shapes[LABEL], BATCH_SIZE)
valid_folder = os.path.join(ROOT_FOLDER, 'valid')
valid_dataset = get_dataset(valid_folder, LABEL, label_shapes[LABEL], BATCH_SIZE)

# Continue training from a previously saved run; this replaces the `model`
# object built above.
model = tf.keras.models.load_model('instrument_and_pitch_single_label_model_1621681850')

# Stop once the validation loss has not improved for 2 consecutive epochs.
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2)

# Keep only the best weights. The monitored quantity is a loss, so "best"
# means lowest: mode must be 'min' (with 'max' the worst epoch was kept).
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath="checkpoints/"+modelsavename,
    save_weights_only=True,
    monitor='val_loss',
    mode='min',
    save_best_only=True)

adam = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)
# Metrics are passed as a list, the documented form for `compile`.
model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['categorical_accuracy'])
model.fit(train_dataset, epochs=EPOCHS, validation_data=valid_dataset, callbacks=[model_checkpoint_callback, early_stop])

model.save(modelsavename)

instrument_and_pitch_single_label_model_1621692125
Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_16 (Conv2D)           (None, 122, 1016, 8)      408       
_________________________________________________________________
max_pooling2d_16 (MaxPooling (None, 40, 101, 8)        0         
_________________________________________________________________
conv2d_17 (Conv2D)           (None, 36, 92, 16)        6416      
_________________________________________________________________
max_pooling2d_17 (MaxPooling (None, 12, 9, 16)         0         
_________________________________________________________________
flatten_8 (Flatten)          (None, 1728)              0         
_________________________________________________________________
dense_13 (Dense)             (None, 512)               885248    
_________________________________________________________________
den

In [12]:
import keras
from glob import glob
from keras.layers import Input, Dense, Add, BatchNormalization, Conv2D, ReLU, MaxPool2D, Flatten
from keras.models import Model


def relu_bn(inputs):
    """Apply a ReLU activation followed by batch normalization to `inputs`."""
    activated = ReLU()(inputs)
    return BatchNormalization()(activated)
def residual_block(x, downsample: bool, filters: int, kernel_size: int = 3):
    """Residual block: two "same"-padded convolutions plus a skip connection.

    Args:
        x: Input tensor of the block.
        downsample: When True, the first convolution uses stride 2 and the
            skip path is passed through a 1x1 stride-2 convolution so both
            branches can be added.
        filters: Number of filters of every convolution in the block.
        kernel_size: Kernel size of the two main-branch convolutions.

    Returns:
        The sum of both branches after ReLU and batch normalization.
    """
    first_stride = 2 if downsample else 1

    # Main branch: conv -> ReLU/BN -> conv.
    branch = Conv2D(filters=filters,
                    kernel_size=kernel_size,
                    strides=first_stride,
                    padding="same")(x)
    branch = relu_bn(branch)
    branch = Conv2D(filters=filters,
                    kernel_size=kernel_size,
                    strides=1,
                    padding="same")(branch)

    # Skip branch: identity, or a strided 1x1 projection when downsampling.
    shortcut = x
    if downsample:
        shortcut = Conv2D(filters=filters,
                          kernel_size=1,
                          strides=2,
                          padding="same")(x)

    merged = Add()([shortcut, branch])
    return relu_bn(merged)
def create_res_net():
    """Build and compile a small residual network for 126 x 1025 x 1 inputs.

    The network is a batch-normalized stem convolution, three stages of
    residual blocks (2, 5 and 2 blocks) whose filter count starts at 16 and
    doubles after each stage, a 4x4 max-pool and a sigmoid dense layer with
    `label_shapes[LABEL]` outputs. It is compiled with binary cross-entropy,
    Adam at LEARNING_RATE and a Precision metric.

    Returns:
        The compiled Keras model.
    """
    inputs = Input(shape=(126, 1025, 1))
    num_filters = 16

    # Stem.
    t = BatchNormalization()(inputs)
    t = Conv2D(filters=num_filters,
               kernel_size=3,
               strides=1,
               padding="same")(t)
    t = relu_bn(t)

    # Residual stages: every stage except the first one halves the spatial
    # resolution in its first block.
    blocks_per_stage = [2, 5, 2]
    for stage, num_blocks in enumerate(blocks_per_stage):
        for block in range(num_blocks):
            starts_later_stage = (block == 0 and stage != 0)
            t = residual_block(t, downsample=starts_later_stage, filters=num_filters)
        num_filters *= 2

    # Classifier head.
    t = MaxPool2D(4)(t)
    t = Flatten()(t)
    outputs = Dense(label_shapes[LABEL], activation='sigmoid')(t)
    model = Model(inputs, outputs)

    metric = tf.keras.metrics.Precision(thresholds=None, top_k=None, class_id=None, name=None, dtype=None)
    opt = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)
    model.compile(loss='binary_crossentropy', optimizer=opt, metrics=metric)
    return model


    
# Length of the label vector produced for each label type.
label_shapes = dict(
    # 11 instrument families; a length of 1 cannot hold their one-hot label.
    instrument=11,
    instrument_subtype=33,  # 11 families * 3 subtypes
    pitch=128,
    instrument_subtype_and_pitch=5+112,  # 4 instruments, 1 other, 112 pitches
    # Restored: LABEL is set to this key, so leaving it out makes every
    # label_shapes[LABEL] lookup below raise KeyError.
    instrument_and_pitch_single_label=5*112,
)

# Build the training and validation datasets from ROOT_FOLDER/train and
# ROOT_FOLDER/valid, then create a fresh residual network.
train_folder = os.path.join(ROOT_FOLDER, 'train')
train_dataset = get_dataset(train_folder, LABEL, label_shapes[LABEL], BATCH_SIZE)
valid_folder = os.path.join(ROOT_FOLDER, 'valid')
valid_dataset = get_dataset(valid_folder, LABEL, label_shapes[LABEL], BATCH_SIZE)
model = create_res_net()

# Stop once the validation loss has not improved for 2 consecutive epochs.
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2)

# Keep only the best weights. The monitored quantity is a loss, so "best"
# means lowest: mode must be 'min' (with 'max' the worst epoch was kept).
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath="checkpoints/"+modelsavename,
    save_weights_only=True,
    monitor='val_loss',
    mode='min',
    save_best_only=True)

model.fit(train_dataset, epochs=EPOCHS, validation_data=valid_dataset, callbacks=[model_checkpoint_callback, early_stop])
model.save(modelsavename)