In [None]:
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
import pandas as pd
import librosa
import librosa.display
import keras
from keras.layers import Activation, Dense, Dropout, Conv2D, Flatten, MaxPooling2D
from keras.models import Sequential
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras.layers import GaussianNoise

from keras import optimizers
import warnings
warnings.filterwarnings('ignore')
from sklearn import preprocessing
import random
import matplotlib.pyplot as plt
from keras.layers.normalization import BatchNormalization
from sklearn.preprocessing import Normalizer
from keras.utils import to_categorical

In [None]:
# Read the two CSV indexes from the working directory: `train` is later used for
# fitting (its "path" and "word" columns), `test` for the final predictions.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

# Data exploration

In [None]:
# Label distribution: number of rows per category, drawn as a bar chart.
# The modelling cells use "word" as the target column, so fall back to it when the
# CSV has no "label" column instead of failing with a KeyError.
# NOTE(review): confirm which column name train.csv actually carries.
label_column = 'label' if 'label' in train.columns else 'word'
category_group = train.groupby([label_column]).count()
plot = category_group.unstack().plot(kind='bar', stacked=True, title="Number of Audio Samples per Category", figsize=(16,10))
plot.set_xlabel("Category")
plot.set_ylabel("Number of Samples");

In [None]:
import IPython.display as ipd  # provides the inline audio player widget

# Pick one clip and render a player for it as the cell output.
# NOTE(review): the original comment tagged this clip as "Hi-hat"; that label is not
# verifiable from this notebook. This path uses ./wav/ while extract_mfcc reads from
# ./wav/wav/ -- confirm which directory layout is correct.
sample_dir = './wav/'
sample_name = 'cc499e63eee4a3bcca48b5b452df04990df83570.wav'
fname1 = sample_dir + sample_name
ipd.Audio(fname1)

In [None]:
# Plot the raw waveform of the sample clip.
# `data` was never assigned anywhere in the notebook, so this cell raised a NameError
# after Restart & Run All. Load the preview clip (fname1) explicitly, at the same
# sampling rate that extract_mfcc uses.
# NOTE(review): assumes the waveform of interest is the clip selected in the audio cell.
data, sr = librosa.load(fname1, sr=22050)

plt.figure(figsize=(16, 4))
plt.plot(data, '-')
plt.title('Waveform')
plt.xlabel('Sample index')
plt.ylabel('Amplitude');

In [None]:
# Spectrogram of the sample clip: STFT magnitude converted to dB relative to its peak.
# `wav` was never assigned anywhere in the notebook (NameError on a fresh kernel), so
# load the preview clip (fname1) here.
# NOTE(review): assumes the spectrogram is meant for the clip selected in the audio cell.
wav, sr = librosa.load(fname1, sr=22050)

D = librosa.amplitude_to_db(np.abs(librosa.stft(wav)), ref=np.max)
librosa.display.specshow(D, sr=sr, x_axis='time', y_axis='linear')
plt.colorbar(format='%+2.0f dB')
plt.title('Linear-frequency spectrogram (dB)');

In [None]:
def extract_mfcc(paths):
    """Compute one MFCC matrix (20 coefficients) per audio file.

    Args:
        paths: iterable of file names, each resolved relative to ./wav/wav/.

    Returns:
        A list with one MFCC array per entry of `paths`, in the same order.
        Every file is loaded resampled to 22050 Hz.
    """
    def _mfcc_for(filename):
        signal, rate = librosa.load('./wav/wav/{}'.format(filename), sr = 22050)
        return librosa.feature.mfcc(y=signal, sr=rate, n_mfcc=20)

    return [_mfcc_for(filename) for filename in paths]

# File names listed in each index.
trainfiles = train["path"].tolist()
testfiles = test["path"].tolist()

# Store one MFCC matrix per row in a new "mfcc" column of each frame.
train["mfcc"] = extract_mfcc(trainfiles)
test["mfcc"] = extract_mfcc(testfiles)

In [None]:
def add_padding(mfccs, n_frames=44):
    """Bring every MFCC matrix to a common width and transpose it.

    Matrices whose second axis is not `n_frames` long are adjusted along that axis
    with librosa.util.fix_length; every matrix is then transposed so the result has
    `n_frames` rows.

    Args:
        mfccs: iterable of 2-D arrays.
        n_frames: target length of the second axis (default 44, the value the
            downstream reshape to (44, 20, 1) relies on).

    Returns:
        A list of transposed arrays, one per input, in the same order.
    """
    result = []
    for mfcc in mfccs:
        if mfcc.shape[1] != n_frames:
            # `size` must be passed by keyword: it is keyword-only in librosa >= 0.10,
            # so the former positional call raised a TypeError there.
            mfcc = librosa.util.fix_length(mfcc, size=n_frames, axis=1)
        result.append(mfcc.T)
    return result

# Derive the fixed-size "feature" column from the raw MFCCs in both frames ...
train["feature"] = add_padding(train["mfcc"])
test["feature"] = add_padding(test["mfcc"])

# ... then discard the variable-width "mfcc" column, which is no longer needed.
train = train.drop(columns="mfcc")
test = test.drop(columns="mfcc")

In [None]:

# Hold out 20% of the labelled rows for validation; the fixed seed keeps the split repeatable.
features, labels = train["feature"], train["word"]
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=288)


def _to_cnn_input(matrices):
    """Reshape each matrix to (44, 20, 1) and stack them into a single array."""
    return np.array([matrix.reshape((44, 20, 1)) for matrix in matrices])


# Add the trailing channel axis the Conv2D layers expect.
X_train = _to_cnn_input(X_train)
X_test = _to_cnn_input(X_test)
X_predict = _to_cnn_input(test["feature"])

# Map words to integer ids; the encoder is fitted on the training split only.
le = preprocessing.LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

# One-hot encode the integer ids over 35 classes.
y_train, y_test = (np.array(keras.utils.to_categorical(ids, 35)) for ids in (y_train, y_test))


In [None]:
# CNN classifier: four convolutional stages followed by a dense head.
input_shape = (44, 20, 1)  # same shape the feature matrices are reshaped to

model = Sequential()

# Stage 1: 16 filters, 3x3 kernel. Only the first layer needs the input shape.
model.add(Conv2D(16, kernel_size=(3,3), activation="relu", input_shape = input_shape))
model.add(BatchNormalization())

# Stage 2: 32 filters, 3x3 kernel, then 2x2 max pooling and 20% dropout.
model.add(Conv2D(32, kernel_size=(3,3), activation="relu"))
model.add(MaxPooling2D((2,2), strides=(2,2)))
model.add(Dropout(0.2))
model.add(BatchNormalization())

# Stage 3: 64 filters, 3x3 kernel, 20% dropout.
model.add(Conv2D(64, kernel_size=(3,3), activation="relu"))
model.add(Dropout(0.2))
model.add(BatchNormalization())

# Stage 4: 128 filters, 5x5 kernel, then 2x2 max pooling and 20% dropout.
model.add(Conv2D(128, kernel_size=(5,5), activation="relu"))
model.add(MaxPooling2D((2,2), strides=(2,2)))
model.add(Dropout(0.2))
model.add(BatchNormalization())

# Additive Gaussian noise (stddev 0.1) on the feature maps.
model.add(GaussianNoise(0.1))

model.add(Flatten())

# Hidden dense layer. It used to be constructed as a bare expression and discarded,
# so it never became part of the network; add it to the model.
model.add(Dense(105, activation = "relu"))

# Output layer: one softmax unit per class (35, matching the one-hot label width).
model.add(Dense(35, activation="softmax"))



# Training setup: Adam optimiser, categorical cross-entropy loss, accuracy as the metric.
model.compile(optimizer = "Adam", loss = "categorical_crossentropy", metrics = ['accuracy'])

# Train for 20 epochs in batches of 100, scoring the held-out split after every epoch.
fit_settings = {
    "epochs": 20,
    "batch_size": 100,
    "validation_data": (X_test, y_test),
}
history = model.fit(x = X_train, y = y_train, **fit_settings)

# Print the layer-by-layer summary of the model.
model.summary()

# Learning curves: one figure per tracked quantity, training vs. held-out values per epoch.
# Each row is (training key, validation key, figure title); the training key doubles
# as the y-axis label.
curves = (
    ('accuracy', 'val_accuracy', 'model accuracy'),
    ('loss', 'val_loss', 'model loss'),
)
for train_key, val_key, title in curves:
    plt.plot(history.history[train_key])
    plt.plot(history.history[val_key])
    plt.title(title)
    plt.ylabel(train_key)
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='upper left')
    plt.show()

# Score the trained model on the held-out split and print loss and accuracy.
score = model.evaluate(x = X_test, y = y_test)
held_out_loss, held_out_accuracy = score[0], score[1]
print('Test loss:', held_out_loss)
print('Test accuracy:', held_out_accuracy)

In [None]:
# Run the trained model on the test-set features (inference only, nothing is fitted here).
y_predict = model.predict(X_predict)

# Take the highest-scoring class index of every row and map it back to its word.
results = le.inverse_transform(np.argmax(y_predict, axis=1))

# Attach the predicted words, drop the bulky feature arrays, and write the CSV.
test["word"] = results
test = test.drop(columns="feature")
test.to_csv('test_results.csv')