In [1]:
import numpy as np
import pandas as pd

import IPython.display as ipd

import librosa
import librosa.display

import matplotlib.pyplot as plt

import cv2
%matplotlib inline
import numpy as np
import matplotlib
# prikaz vecih slika 
matplotlib.rcParams['figure.figsize'] = 16,12

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from sklearn.svm import SVC
from sklearn import metrics

from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten, Conv2D, Conv3D, BatchNormalization, MaxPool2D, MaxPooling2D
from keras.optimizers import Adam
from keras.utils import to_categorical

from keras.callbacks import ModelCheckpoint
from datetime import datetime

Using TensorFlow backend.


In [2]:
def extract_spectogram(file_name):
    try:
        audio, sample_rate = librosa.load(file_name,res_type="kaiser_fast" ,sr=22050)
        a, index = librosa.effects.trim(audio, top_db=30, frame_length=2048, hop_length=512)
        y_out = a[:44100]
        spectrogram = librosa.feature.melspectrogram(y=y_out, sr=sample_rate, n_fft=2048, hop_length=1024)
        #print(spectrogram)
        spec_shape = spectrogram.shape
        if(spec_shape[1] < 44):
            print(spec_shape)
            print(file_name)
        spectrogram = librosa.power_to_db(spectrogram, ref=np.max)
        #imarray = np.reshape(spectrogram, (128, -1, 3))
        #img_gray = np.dot(spectrogram[...,:3], [0.299, 0.587, 0.114])
        #img_resized = cv2.resize(img_gray, (128,128), interpolation=cv2.INTER_CUBIC)
        #print(img_resized.shape)
        #redgram = block_reduce(img_gray, block_size = (3,3), func = np.mean)
        #return img_gray
        #mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
        #mfccscaled = np.mean(mfccs.T, axis=0)
        return spectrogram
    
    except Exception as e:
        print(e)
        print("Error encountered while parsing file: ", file_name)
        return None
    
    return mfccscaled    

def my_rgb2gray(img_rgb):
    img_gray = np.ndarray((img_rgb.shape[0], img_rgb.shape[1]))  # zauzimanje memorije za sliku (nema trece dimenzije)
    img_gray = 0.21*img_rgb[:, :] + 0.72*img_rgb[:, :] #+ 0.07*img_rgb[:, :, 2]
#     img_gray = img_gray.astype('uint8')  # u prethodnom koraku smo mnozili sa float, pa sada moramo da vratimo u [0,255] opseg
    return img_gray

def normalize_gray(array):
    return (array - array.min())/(array.max() - array.min())

In [3]:
folder_path = "dataset/"
folder_list = ["Accordion","Clarinet_Bb","Contrabass","Horn","Viola","Violin","Violoncello"]
features = []
onlyfiles = []

from os import listdir
from os.path import isfile, join

for folder_inst in folder_list:
    onlyfiles = onlyfiles + [f for f in listdir("dataset/" + folder_inst) if isfile(join("dataset/" + folder_inst + "/", f))]
    


for file in onlyfiles: 
    dodatak = ''
    if (file.split("-")[0] == "Acc"):
        dodatak = "Accordion"
    elif (file.split("-")[0] == "ClBb"):
        dodatak = "Clarinet_Bb"
    elif (file.split("-")[0] == "Cb"):
        dodatak = "Contrabass"
    elif (file.split("-")[0] == "Hn"):
        dodatak = "Horn"
    elif (file.split("-")[0] == "Va"):
        dodatak = "Viola"
    elif (file.split("-")[0] == "Vn"):
        dodatak = "Violin"
    elif (file.split("-")[0] == "Vc"):
        dodatak = "Violoncello"
    
    file_name = folder_path + dodatak + "/" + file
    a = file.split("-")
    instrument = a[0]
    pitch = a[2]
    if(len(pitch) == 3):
        pitch = pitch[:2]
    else:
        pitch = pitch[:1]
    data = extract_spectogram(file_name)
    features.append([file_name, data, instrument, pitch])
    #print(img)
    

In [4]:
#menjanje boje slike
features_gray = []
for list in features:
    imggray = my_rgb2gray(list[1])
    features_gray.append([list[0],normalize_gray(imggray), list[2], list[3]])
    
features_df = pd.DataFrame(features_gray, columns=["file", "feature", "instrument", "pitch"])
features_df.head()

Unnamed: 0,file,feature,instrument,pitch
0,dataset/Accordion/Acc-ord-A#3-ff-alt1-N.wav,"[[0.29073387, 0.33191147, 0.2776536, 0.2996274...",Acc,A#
1,dataset/Accordion/Acc-ord-A#3-ff-alt2-N.wav,"[[0.3070023, 0.3617015, 0.27213687, 0.36119562...",Acc,A#
2,dataset/Accordion/Acc-ord-A#3-ff-N-N.wav,"[[0.47353348, 0.42426324, 0.3613015, 0.4068834...",Acc,A#
3,dataset/Accordion/Acc-ord-A#3-mf-alt1-N.wav,"[[0.42782307, 0.46233648, 0.38736117, 0.385382...",Acc,A#
4,dataset/Accordion/Acc-ord-A#3-mf-alt2-N.wav,"[[0.6518973, 0.62134737, 0.5658642, 0.48233718...",Acc,A#


In [5]:
X = np.array(features_df.feature.tolist())
yinst = np.array(features_df.instrument.tolist())

leinst = LabelEncoder()
yyinst = leinst.fit_transform(yinst)

x_train, x_test, y_train, y_test = train_test_split(X, yyinst, test_size=0.2, shuffle=True, random_state=42)
x_validation = x_train.reshape(len(x_train),128,44,1)
print(x_validation.shape)

x_test = x_test.reshape(len(x_test),128,44,1)
print(x_test.shape)

num_classes = len(leinst.classes_)
y_train = to_categorical(y_train, num_classes=num_classes)
y_test = to_categorical(y_test, num_classes=num_classes)

(1161, 128, 44, 1)
(291, 128, 44, 1)


In [6]:
model = Sequential()
model.add(Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=(128,44,1 )))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes, activation='softmax'))
#Compile
model.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer='adam')
print(model.summary())

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 126, 42, 32)       320       
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 63, 21, 32)        0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 63, 21, 32)        0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 42336)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               5419136   
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 7)                

In [7]:
num_epochs = 60
batch_size = 32

checkpointer = ModelCheckpoint(filepath="best_weights.hdf5", verbose=1, save_best_only=True)

start = datetime.now()
model.fit(x_validation, y_train, batch_size=batch_size, epochs=num_epochs, validation_data=(x_test, y_test), 
           callbacks=[checkpointer], verbose=1)

duration = datetime.now() - start
print("Training completed in: ", duration)

Train on 1161 samples, validate on 291 samples
Epoch 1/60

Epoch 00001: val_loss improved from inf to 1.29042, saving model to best_weights.hdf5
Epoch 2/60

Epoch 00002: val_loss improved from 1.29042 to 0.65364, saving model to best_weights.hdf5
Epoch 3/60

Epoch 00003: val_loss improved from 0.65364 to 0.37752, saving model to best_weights.hdf5
Epoch 4/60

Epoch 00004: val_loss improved from 0.37752 to 0.19722, saving model to best_weights.hdf5
Epoch 5/60

Epoch 00005: val_loss improved from 0.19722 to 0.16623, saving model to best_weights.hdf5
Epoch 6/60

Epoch 00006: val_loss improved from 0.16623 to 0.14418, saving model to best_weights.hdf5
Epoch 7/60

Epoch 00007: val_loss improved from 0.14418 to 0.10063, saving model to best_weights.hdf5
Epoch 8/60

Epoch 00008: val_loss improved from 0.10063 to 0.07017, saving model to best_weights.hdf5
Epoch 9/60

Epoch 00009: val_loss did not improve from 0.07017
Epoch 10/60

Epoch 00010: val_loss did not improve from 0.07017
Epoch 11/60

E

Epoch 39/60

Epoch 00039: val_loss improved from 0.00490 to 0.00477, saving model to best_weights.hdf5
Epoch 40/60

Epoch 00040: val_loss did not improve from 0.00477
Epoch 41/60

Epoch 00041: val_loss improved from 0.00477 to 0.00299, saving model to best_weights.hdf5
Epoch 42/60

Epoch 00042: val_loss did not improve from 0.00299
Epoch 43/60

Epoch 00043: val_loss did not improve from 0.00299
Epoch 44/60

Epoch 00044: val_loss improved from 0.00299 to 0.00221, saving model to best_weights.hdf5
Epoch 45/60

Epoch 00045: val_loss did not improve from 0.00221
Epoch 46/60

Epoch 00046: val_loss did not improve from 0.00221
Epoch 47/60

Epoch 00047: val_loss did not improve from 0.00221
Epoch 48/60

Epoch 00048: val_loss did not improve from 0.00221
Epoch 49/60

Epoch 00049: val_loss did not improve from 0.00221
Epoch 50/60

Epoch 00050: val_loss did not improve from 0.00221
Epoch 51/60

Epoch 00051: val_loss did not improve from 0.00221
Epoch 52/60

Epoch 00052: val_loss did not improve 

In [8]:
train_score = model.evaluate(x_train, y_train, verbose=1)
print("Training Accuracy: ", train_score[1])

ValueError: Error when checking input: expected conv2d_1_input to have 4 dimensions, but got array with shape (1161, 128, 44)