In [1]:
# Final project final code

In [2]:
!pip install openunmix

Collecting openunmix
  Using cached openunmix-1.2.1-py3-none-any.whl (46 kB)
Installing collected packages: openunmix
Successfully installed openunmix-1.2.1


In [3]:
!pip install soundfile



In [4]:
!pip install tensorflow

Collecting tensorflow
  Using cached tensorflow-2.8.0-cp39-cp39-manylinux2010_x86_64.whl (497.6 MB)
Collecting google-pasta>=0.1.1
  Using cached google_pasta-0.2.0-py3-none-any.whl (57 kB)
Collecting libclang>=9.0.1
  Using cached libclang-13.0.0-py2.py3-none-manylinux1_x86_64.whl (14.5 MB)
Collecting absl-py>=0.4.0
  Using cached absl_py-1.0.0-py3-none-any.whl (126 kB)
Collecting grpcio<2.0,>=1.24.3
  Using cached grpcio-1.44.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.3 MB)
Collecting keras-preprocessing>=1.1.1
  Using cached Keras_Preprocessing-1.1.2-py2.py3-none-any.whl (42 kB)
Collecting keras<2.9,>=2.8.0rc0
  Using cached keras-2.8.0-py2.py3-none-any.whl (1.4 MB)
Collecting tf-estimator-nightly==2.8.0.dev2021122109
  Using cached tf_estimator_nightly-2.8.0.dev2021122109-py2.py3-none-any.whl (462 kB)
Collecting opt-einsum>=2.3.2
  Using cached opt_einsum-3.3.0-py3-none-any.whl (65 kB)
Collecting termcolor>=1.1.0
  Using cached termcolor-1.1.0-py3-none-any.whl
Co

In [5]:
!pip install tensorflow_io

Collecting tensorflow_io
  Using cached tensorflow_io-0.24.0-cp39-cp39-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (23.4 MB)
Installing collected packages: tensorflow-io
Successfully installed tensorflow-io-0.24.0


In [6]:
# Import dependencies

# import torch for use with Open-Unmix
import torch
import torchaudio

# import tensorflow for CNN architecture
import tensorflow as tf
import tensorflow_io as tfio

import numpy as np
import scipy
import os
from IPython.display import Audio, display

import librosa
import sklearn
import math
import torch.nn as nn
import torch.nn.functional as nnF
from tensorflow.keras import layers, models

import soundfile
import librosa.display
import matplotlib.pyplot as plt
from librosa.feature import melspectrogram as mel_spec

from openunmix import predict as UMXpredict

# set to GPU if available
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

In [7]:
# TRAIN/BUILD CNN

# path to IRMAS training data 
train_ds  = os.path.expanduser("~/teaching_material/own_work/train_data")

# path to drums training data
drum_ds  = os.path.expanduser("~/drum_data/")

In [8]:
# function to shuffle two arrays in same permutation
def shuffled_arrays(x, y):
    p = np.random.permutation(x.shape[0])
    return x[p], y[p]

In [9]:
# function to convert training data into spectrograms
def convert_audio_to_spectrogram(infile, win_size = 0.05, hop_size = 0.025, n_mels = 64):
   
    # load audio file and get sample rate
    snd, fs = librosa.load(infile)
    
    # compute mel spec parameters
    win_len = int(win_size * fs)
    hop_len = int(hop_size * fs)
    
    # compute mel spectrogram from raw audio
    spec = mel_spec(snd, sr=fs, n_mels=n_mels, hop_length=hop_len, 
                                          win_length=win_len, window='hann')
    
    # rescale output range to be logarithmic
    # corresponds to human perception
    spec = tfio.audio.dbscale(spec, top_db=80)
    
    # augment spectrogram in terms of frequency
    augmented_spec = tfio.audio.freq_mask(spec, param=10)
    
    # convert to original spectrogram to tensor
    # add new axis to account for channels
    spec = spec[..., tf.newaxis]
    spec = tf.convert_to_tensor(spec)
    # reshape to tensor shape
    spec = tf.reshape(spec, (64, 121, 1))
    
    # convert to augmented spectrogram to tensor
    augmented_spec = augmented_spec[..., tf.newaxis]
    augmented_spec = tf.convert_to_tensor(augmented_spec)
    augmented_spec = tf.reshape(augmented_spec, (64, 121, 1))
    
    # output both
    return spec, augmented_spec

In [10]:
# function to get training data labels from file name
def get_label(infile):
    
    # get section of file name associated with instrument
    file_name = infile[1:4] 

    # encode as one-hot vectors
    if file_name == 'cel':
        label = [1,0,0,0,0,0,0,0,0,0,0,0,0]
        
    elif file_name == 'cla':
        label = [0,1,0,0,0,0,0,0,0,0,0,0,0]
        
    elif file_name == 'flu':
        label = [0,0,1,0,0,0,0,0,0,0,0,0,0]
        
    elif file_name == 'gac':
        label = [0,0,0,1,0,0,0,0,0,0,0,0,0]
        
    elif file_name == 'gel':
        label = [0,0,0,0,1,0,0,0,0,0,0,0,0]
        
    elif file_name == 'org':
        label = [0,0,0,0,0,1,0,0,0,0,0,0,0]
        
    elif file_name == 'pia':
        label = [0,0,0,0,0,0,1,0,0,0,0,0,0]
        
    elif file_name == 'sax':
        label = [0,0,0,0,0,0,0,1,0,0,0,0,0]
        
    elif file_name == 'tru':
        label = [0,0,0,0,0,0,0,0,1,0,0,0,0]
        
    elif file_name == 'vio':
        label = [0,0,0,0,0,0,0,0,0,1,0,0,0]
        
    elif file_name == 'voi':
        label = [0,0,0,0,0,0,0,0,0,0,1,0,0]
        
    else:
        label = [0,0,0,0,0,0,0,0,0,0,0,0,1]
    
    return label

In [11]:
# store labels names as strings in an array for output use
label_names = ['Cello', 'Clarinet', 'Flute', 'Acoustic Guitar', 'Electric Guitar', 
               'Organ', 'Piano', 'Sax', 'Trumpet', 'Violin', 'Voice', 'Drums', 'Other']

In [12]:
# function to convert vectorised labels back to words
def vector_to_label(vector):
    
    word = "Undefined"
    
    # check one hot vector values for label in words
    if vector[0] == 1:
        word = "Cello"
        
    elif vector[1] == 1:
        word = "Clarinet"
        
    elif vector[2] == 1:
        word = "Flute"
            
    elif vector[3] == 1:
        word = "Acoustic Guitar"
                
    elif vector[4] == 1:
        word = "Electric Guitar"
                    
    elif vector[5] == 1:
        word = "Organ"
                        
    elif vector[6] == 1:
        word = "Piano"
                            
    elif vector[7] == 1:
        word = "Sax"
                                
    elif vector[8] == 1:
        word = "Trumpet"
                                    
    elif vector[9] == 1:
        word = "Violin"
                                        
    elif vector[10] == 1:
        word = "Voice"
        
    elif vector[11] == 1:
        word = "Drums"

    return word

In [13]:
# check total number of training samples
no_of_entries = len([name for name in os.listdir(train_ds)])
print(no_of_entries)

3261


In [14]:
# initialise an array for storing output spectrogram tensors
spectrograms = np.zeros((no_of_entries * 2, 64, 121, 1))

# initialise counter for counting number of spectrograms calculated
# also used for location for storing spectrograms
counter = 0

# scan through training data directory and compute spectrograms
for entry in os.scandir(train_ds):
    
    # if file is not .wav then skip
    if str(os.listdir(train_ds)[counter][-3:]) != 'wav':
        continue
    
    # add output spectrograms to spectrogram array
    spec, augmented_spec = convert_audio_to_spectrogram(entry)
    spectrograms[counter] = spec
    spectrograms[counter + 1] = augmented_spec
    # advance counter
    counter += 2

2022-04-13 11:15:51.693037: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-04-13 11:16:09.437626: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 3728 MB memory:  -> device: 0, name: NVIDIA A40, pci bus id: 0000:17:00.0, compute capability: 8.6
2022-04-13 11:16:09.439361: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 41734 MB memory:  -> device: 1, name: NVIDIA A40, pci bus id: 0000:65:00.0, compute capability: 8.6
2022-04-13 11:16:09.441080: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:2 with 4

50
100
150
200
250
300
350
400
450
500
550
600
650
700
750
800
850
900
950
1000
1050
1100
1150
1200
1250
1300
1350
1400
1450
1500
1550
1600
1650
1700
1750
1800
1850
1900
1950
2000
2050
2100
2150
2200
2250
2300
2350
2400
2450
2500
2550
2600
2650
2700
2750


In [15]:
# initalise array for storing label data
labels = np.zeros((no_of_entries * 2, 13))

# counter for inputting labels in array
label_counter = 0

for name in os.listdir(train_ds):
    
    # skip file if not .wav
    if str(os.listdir(train_ds)[label_counter][-3:]) != 'wav':
        continue
    
    # output labels to array
    labels[label_counter] = get_label(name)
    labels[label_counter + 1] = get_label(name)    
    # advance counter
    label_counter += 2

In [16]:
# get spectrograms and label arrays to correct length
spectrograms = spectrograms[:counter]
labels = labels[:label_counter]

# get total number of samples from spectrogram counter
total_length = counter

In [17]:
# initialise array for storing drum spectrograms
drum_spectrograms = []

# loop through each file in the directory
for drum_file in os.scandir(drum_ds):
    
    # initialise counter to zero
    spec_counter = 0
    
    # load audio
    snd, sr = librosa.load(drum_file)
    
    # calculate a three second window 
    # 3 seconds should match IRMAS data
    three_seconds = 3 * sr
    
    # set spectrogram variables
    n_mels = 64
    win_len = int(0.05 * sr)
    hop_len = int(0.025 * sr)
    
    # loop through length of audio
    for i in range(len(snd)):
        
        # only calulate enough spectrograms to keep dataset balanced
        if spec_counter > 13:
            break
        
        # take splits of audio at three seconds
        if (i % three_seconds) == 0:

            # compute mel spectrogram from raw audio
            spec = mel_spec(snd[i:i+three_seconds], sr=sr, n_mels=n_mels, hop_length=hop_len, 
                                                  win_length=win_len, window='hann')
            
            # advance spectrogram counter
            spec_counter += 1

            # if spectrogram is too small, skip over
            if(spec.shape[1] < 121):
                continue
            
            # rescale output range to be logarithmic
            # corresponds to human perception
            spec = tfio.audio.dbscale(spec, top_db=80)

            # augment spectrogram in terms of frequency
            augmented_spec = tfio.audio.freq_mask(spec, param=10)

            # convert to original spectrogram to tensor
            # add new axis to account for channels
            spec = spec[..., tf.newaxis]
            spec = tf.convert_to_tensor(spec)
            # reshape to tensor shape
            spec = tf.reshape(spec, (64, 121, 1))
            
            # convert to augmented spectrogram to tensor
            augmented_spec = augmented_spec[..., tf.newaxis]
            augmented_spec = tf.convert_to_tensor(augmented_spec)
            augmented_spec = tf.reshape(augmented_spec, (64, 121, 1))
           
            # output spectrograms to array
            drum_spectrograms.append(spec)
            drum_spectrograms.append(augmented_spec)
            



520


In [19]:
# initialise array for storing drum labels
drum_labels = np.zeros((len(drum_spectrograms), 13))

# attache one hot vector for evbery drum spectrogram
for i in range(len(drum_spectrograms)):
    drum_labels[i] = [0,0,0,0,0,0,0,0,0,0,0,1,0]
    

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]


In [34]:
# shuffle drum spectrograms 
drum_spectrograms = np.array(drum_spectrograms)
shuffled_drum_spectrograms, shuffled_drum_labels = shuffled_arrays(drum_spectrograms, drum_labels)

# keep first 250 spectrograms to keep dataset balanced
shuffled_drum_spectrograms = shuffled_drum_spectrograms[:250]
drum_labels = drum_labels[:250]

In [36]:
# get largest value from across spectrograms
most_max_regular = np.amax(spectrograms)

# normalise all specs according to maximum value
normalised_specs = np.abs(spectrograms // most_max_regular)
# if any spectrograms NAN, replace with 0
normalised_specs = np.where(np.isnan(normalised_specs), 0, normalised_specs)

# same for drums - get largest value
most_max_drums = np.amax(shuffled_drum_spectrograms)

# normalise all specs according to maximum value
normalised_drum_specs = np.abs(shuffled_drum_spectrograms // most_max_drums)
# if any spectrograms NAN, replace with 0
normalised_drum_specs = np.where(np.isnan(normalised_drum_specs), 0, normalised_drum_specs)

66.93445587158203
67.121185


  normalised_drum_specs = np.abs(shuffled_drum_spectrograms // most_max_drums)


In [37]:
# concatenate IRMAS and drum data sepctrograms into one array
all_spectrograms = [normalised_specs, normalised_drum_specs]
all_spectrograms = np.concatenate(all_spectrograms)

# concatenate IRMAS and drum data labels into one array
all_labels = [labels, drum_labels]
all_labels = np.concatenate(all_labels)

# print shapes for checking/debugging
print(all_spectrograms.shape)
print(all_labels.shape)

(3044, 64, 121, 1)
(3044, 13)


In [38]:
# shuffle spectrograms and labels with same permutation 
suffled_spectrograms, shuffled_labels = shuffled_arrays(all_spectrograms, all_labels)

In [39]:
# split data into train and test
# make train data 80% of all data
train_specs = suffled_spectrograms[:int(total_length * 0.8)]

# val data is 10%
val_specs = suffled_spectrograms[int(total_length * 0.8):int(total_length * 0.9)]

# test data is the last 10%
test_specs = suffled_spectrograms[int(total_length * 0.9):total_length]

# split labels same way as spectrograms
train_labels = shuffled_labels[:int(total_length * 0.8)]
val_labels = shuffled_labels[int(total_length * 0.8):int(total_length * 0.9)]
test_labels = shuffled_labels[int(total_length * 0.9):total_length]

# convert spectrograms into tensor for training
train_dataset = tf.data.Dataset.from_tensor_slices(train_specs)

In [44]:
# BUILD CNN MODEL

# use He initialisation
initializer = tf.keras.initializers.HeNormal()

# set up model as sequential keras model
model = models.Sequential()

# add convolutional layer followed by max pooling
model.add(layers.Conv2D(32, (2, 2), activation='tanh', input_shape=(64, 121, 1), kernel_initializer=initializer))
model.add(layers.MaxPooling2D((2, 2)))

# add convolutional layer followed by max pooling
model.add(layers.Conv2D(64, (2, 2), activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.001)))
model.add(layers.MaxPooling2D((2, 2)))

# add convolutional layer followed by max pooling
model.add(layers.Conv2D(128, (2, 2), activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.001)))
model.add(layers.MaxPooling2D((2, 2)))

# normalise the batch 
model.add(layers.BatchNormalization(momentum = 0.9))

# add convolutional layer - this is ususally 16 not 128
model.add(layers.Conv2D(128, (2, 2), activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.001)))

# flatten
model.add(layers.Flatten())

# three dense layers in sequence to 12 possibel outputs
model.add(layers.Dense(1024, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.001)))
model.add(layers.Dense(512, activation='tanh', kernel_regularizer=tf.keras.regularizers.l2(0.001)))
# final dense layer with 13 outputs - each output corresponds to possible label
model.add(layers.Dense(13, activation='softmax'))

# summarise models
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_4 (Conv2D)           (None, 63, 120, 32)       160       
                                                                 
 max_pooling2d_3 (MaxPooling  (None, 31, 60, 32)       0         
 2D)                                                             
                                                                 
 conv2d_5 (Conv2D)           (None, 30, 59, 64)        8256      
                                                                 
 max_pooling2d_4 (MaxPooling  (None, 15, 29, 64)       0         
 2D)                                                             
                                                                 
 conv2d_6 (Conv2D)           (None, 14, 28, 128)       32896     
                                                                 
 max_pooling2d_5 (MaxPooling  (None, 7, 14, 128)      

In [46]:
# TRAIN THE NETWORK

# initialise optimizer
optimizer = tf.keras.optimizers.RMSprop(learning_rate=0.0001, momentum = 0.1)

# compile the model
model.compile(optimizer=optimizer,
              loss=tf.keras.losses.CategoricalCrossentropy(),
              metrics=['accuracy']
)

# save model checkpoints
callbacks = [
             tf.keras.callbacks.ModelCheckpoint(
                  filepath='./training_checkpoints/ckpt_{epoch}',
                  save_weights_only=True
             ),
]

# train the model iver 50 epochs
history = model.fit(train_specs, train_labels, epochs=50, callbacks=callbacks,
                   validation_data=(val_specs, val_labels))


Epoch 1: LearningRateScheduler setting learning rate to 1e-05.
Epoch 1/50

Epoch 2: LearningRateScheduler setting learning rate to 1e-05.
Epoch 2/50

Epoch 3: LearningRateScheduler setting learning rate to 1e-05.
Epoch 3/50

Epoch 4: LearningRateScheduler setting learning rate to 1e-05.
Epoch 4/50

Epoch 5: LearningRateScheduler setting learning rate to 1e-05.
Epoch 5/50

Epoch 6: LearningRateScheduler setting learning rate to 1e-05.
Epoch 6/50

Epoch 7: LearningRateScheduler setting learning rate to 1e-05.
Epoch 7/50

Epoch 8: LearningRateScheduler setting learning rate to 1e-05.
Epoch 8/50

Epoch 9: LearningRateScheduler setting learning rate to 1e-05.
Epoch 9/50

Epoch 10: LearningRateScheduler setting learning rate to 1e-05.
Epoch 10/50

Epoch 11: LearningRateScheduler setting learning rate to 1e-05.
Epoch 11/50

Epoch 12: LearningRateScheduler setting learning rate to 1e-05.
Epoch 12/50

Epoch 13: LearningRateScheduler setting learning rate to 1e-05.
Epoch 13/50

Epoch 14: Learni

In [47]:
# EVAULATION PORTION OF THE MODEL

# get predictions of test spectrograms 
predictions = model.predict(test_specs)

In [49]:
# function to calculate accuracies of the model
# outputs true positive, false positives, true negatives and false negatives
# compares ground truths to predictions for specified instrument
def counters(ground_truths, predictions, instrument):
    
    # initialise counters for each outcome
    TP = 0
    TN = 0
    FP = 0
    FN = 0
    
    # cycle through output predictions
    for i in range(len(predictions)):
        
        # check if prediction matches specified instrument
        if predictions[i] == instrument:
            
            # if ground truth matches instrument, add one to true positives
            if ground_truths[i] == instrument:
                TP += 1
                
            # else if ground truth does not match instrument, add one to false positives
            if ground_truths[i] != instrument:
                FP += 1
            
        # check if prediction does not match specified instrument
        if predictions[i] != instrument:
            
            # if ground truth also does not match instrument, add one to true negatives
            if ground_truths != instrument:
                TN += 1
            
            # else if ground truth is instrument, add one to false negatives
            if ground_truths[i] == instrument:
                FN += 1
            
    return TP, FP, TN, FN

In [50]:
# get number of predictions made
no_of_predictions = len(predictions)

# intialise arrays for storing labels and classifications as words
true_labels = []
classification = []

# loop through all predictions
for i in range(no_of_predictions):  
    # convert vector labels to word labels
    new_true = vector_to_label(test_labels[i])
    true_labels.append(new_true)
    
    # get index where vector equals one
    # vector should be 1-hot encoded
    label_no = np.argmax(predictions[i])
    # output word classification to array
    classification.append(label_names[label_no])
    
# check stats for classification vs ground truth for each instrument
for label in label_names:
    TP, FP, TN, FN = counters(true_labels, classification, label)

    # print results
    print(label, "TP:", TP)
    print(label, "FP:",  FP)
    print(label, "TN:", TN)
    print(label, "FN:", FN)
    print(" ")

False Positive for:  Cello
Sax
False Positive for:  Cello
Violin
False Positive for:  Cello
Acoustic Guitar
False Positive for:  Cello
Acoustic Guitar
False Positive for:  Cello
Sax
False Positive for:  Cello
Sax
Cello TP: 20
Cello FP: 6
Cello TN: 254
Cello FN: 7
 
False Positive for:  Clarinet
Piano
False Positive for:  Clarinet
Acoustic Guitar
False Positive for:  Clarinet
Electric Guitar
False Positive for:  Clarinet
Flute
False Positive for:  Clarinet
Violin
False Positive for:  Clarinet
Violin
False Positive for:  Clarinet
Trumpet
False Positive for:  Clarinet
Voice
False Positive for:  Clarinet
Sax
False Positive for:  Clarinet
Piano
False Positive for:  Clarinet
Voice
Clarinet TP: 20
Clarinet FP: 11
Clarinet TN: 249
Clarinet FN: 4
 
False Positive for:  Flute
Violin
False Positive for:  Flute
Sax
False Positive for:  Flute
Cello
Flute TP: 20
Flute FP: 3
Flute TN: 257
Flute FN: 4
 
False Positive for:  Acoustic Guitar
Sax
False Positive for:  Acoustic Guitar
Cello
False Positive 

In [140]:
# Upload test case audio
test_audio = "bensound-clapandyell.mp3"
case_study_snd, sr = librosa.load(test_audio)

In [141]:
# initialise array for storing output audio
audio = []

# having some issues with that there sample rate
# sr = 22050

estimates = UMXpredict.separate(
    torch.as_tensor(case_study_snd).float(),
    rate = sr,
    device = device
)
for target, estimates in estimates.items():
    print(target)
    output = estimates.detach().cpu().numpy()[0]
    audio.append(output)
    display(Audio(output, rate=sr*2))

In [53]:
# calculate 3 second window
three_seconds = 3 * sr

# initialise array for storing case study spectrograms
case_study_spectrograms = []

# compute mel spec parameters
win_len = int(0.05 * sr)
hop_len = int(0.025 * sr)
n_mels = 64

# loop through each audio output from open-unmix
for n, output in enumerate(audio):
    
    # iniitalise variabel for counting number of spectrograms per output
    spec_counter = 0
    
    # convert output to mono audio
    output_mono = librosa.to_mono(output)
    
    for i in range(len(snd)):
        
        # cut audio at three second windows
        if (i % three_seconds) == 0:

            # compute mel spectrogram from raw audio
            spec = mel_spec(output_mono[i:i+three_seconds], sr=sr, n_mels=n_mels, hop_length=hop_len, 
                                                  win_length=win_len, window='hann')

            # if spectrogram is smaller than network input, skip
            if(spec.shape[1] < 121):
                continue
            
            # rescale output range to be logarithmic
            # corresponds to human perception
            spec = tfio.audio.dbscale(spec, top_db=80)

            # augment spectrogram in terms of frequency
            augmented_spec = tfio.audio.freq_mask(spec, param=10)

            # convert to original spectrogram to tensor
            # add new axis to account for channels
            spec = spec[..., tf.newaxis]
            spec = tf.convert_to_tensor(spec)
            # reshape to tensor shape
            spec = tf.reshape(spec, (64, 121, 1))
           
            case_study_spectrograms.append(spec)
            spec_counter += 1
        
    # count number of spectrograms per output
    print(n, ":", spec_counter)
    
print("Total no of spectrograms: ", len(case_study_spectrograms))

0 : 58
1 : 58
2 : 58
3 : 58
Total no of spectrograms:  232


In [54]:
# get largest value from across spectrograms
case_study_max = np.amax(case_study_spectrograms)

# normalise all specs according to maximum value
normalised_case_study = np.abs(case_study_spectrograms // case_study_max)
# if value in spectrogram is NAN, replace with 0
normalised_case_study = np.where(np.isnan(normalised_case_study), 0, normalised_case_study)

63.44747


In [55]:
# use model to get label predictions
case_study_predictions = model.predict(normalised_case_study)

# initialise array for storing classifications
case_study_classifications = []

# loop through all spectrograms and get classification
for k in range(len(case_study_predictions)):
    case_label_no = np.argmax(predictions[k])
    # output word classification to array
    case_study_classifications.append(label_names[case_label_no])

In [58]:
# split classifications into groups of input source
voice_class = case_study_classifications[:58]
drums_class = case_study_classifications[58:116]
bass_class = case_study_classifications[116:174]
other_class = case_study_classifications[174:]

# count classifications of each instrument per output
unique_voice = np.unique(voice_class, return_counts=True)
max_value_location = np.argmax(unique_voice[1])
final_voice_classification = unique_voice[0][max_value_location]

unique_drums = np.unique(drums_class, return_counts=True)
max_value_location_d = np.argmax(unique_drums[1])
final_drums_classification = unique_drums[0][max_value_location_d]

unique_bass = np.unique(bass_class, return_counts=True)
max_value_location_b = np.argmax(unique_bass[1])
final_bass_classification = unique_bass[0][max_value_location_b]

unique_other = np.unique(other_class, return_counts=True)
max_value_location_o = np.argmax(unique_other[1])
final_other_classification = unique_other[0][max_value_location_o]

# print final output classification
print(final_voice_classification)
print(final_drums_classification)
print(final_bass_classification)
print(final_other_classification)

Flute
Clarinet
Sax
Trumpet
