In [1]:
import glob
import os
import librosa
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pyplot import specgram
import time
%matplotlib inline
plt.style.use('ggplot')

plt.rcParams['font.family'] = 'serif'
plt.rcParams['font.serif'] = 'Ubuntu'
plt.rcParams['font.monospace'] = 'Ubuntu Mono'
plt.rcParams['font.size'] = 12
plt.rcParams['axes.labelsize'] = 11
plt.rcParams['axes.labelweight'] = 'bold'
plt.rcParams['axes.titlesize'] = 14
plt.rcParams['xtick.labelsize'] = 10
plt.rcParams['ytick.labelsize'] = 10
plt.rcParams['legend.fontsize'] = 11
plt.rcParams['figure.titlesize'] = 13

In [2]:
EPSILON = 10e-10
def extract_feature(file_name):
    X, sample_rate = librosa.load(file_name)
    stft = np.abs(librosa.stft(X))
    mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T,axis=0)
    #chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T,axis=0)
    raw_mel = librosa.feature.melspectrogram(X, sr=sample_rate).T
    mel = np.mean(raw_mel, axis=0)
    processed_mel = raw_mel + EPSILON
    logMel = np.mean(np.log(processed_mel),axis=0)
    #contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T,axis=0)
    #tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(X), sr=sample_rate).T,axis=0)
    return mfccs,logMel, mel

def parse_audio_files(parent_dir,sub_dirs,file_ext='*.wav'):
    features, labels, namePath = np.empty((0,296)), np.empty(0), np.empty(0)
    for label, sub_dir in enumerate(sub_dirs):
        for fn in glob.glob(os.path.join(parent_dir, sub_dir, file_ext)):
            mfccs, logmel, mel = extract_feature(fn)
            ext_features = np.hstack([mfccs,logmel, mel])
            features = np.vstack([features,ext_features])
            labels = np.append(labels, fn.split('/')[2].split('-')[1])
            namePath = np.append(namePath, fn)
    return np.array(features), np.array(labels, dtype = np.int), np.array(namePath, dtype=np.string_)

def one_hot_encode(labels):
    n_labels = len(labels)
    n_unique_labels = len(np.unique(labels))
    one_hot_encode = np.zeros((n_labels,n_unique_labels))
    one_hot_encode[np.arange(n_labels), labels] = 1
    return one_hot_encode

def one_hot_decode(labels):
    n_labels, n_unique_labels = np.shape(labels)
    one_hot_decode = np.zeros((n_labels))
    for idx in range(n_labels):
        for l in range(n_unique_labels):
            if(labels[idx, l] == 1):
                one_hot_decode[idx] = l
                break;
    return one_hot_decode

In [3]:
x = extract_feature("audio/fold1/7061-6-0-0.wav")
nfMfccs = np.shape(x[0])[0]
nfLogMel = np.shape(x[1])[0]
nfMel = np.shape(x[2])[0]

In [4]:
print "Number of features"
print "* Mfcc: {0}".format(nfMfccs)
print "* logMel {0}".format(nfLogMel)
print "* Mel {0}".format(nfMel)

Number of features
* Mfcc: 40
* logMel 128
* Mel 128


In [5]:
features = np.load("nn_features_fold_full.npy", allow_pickle=True)
labels = np.load("nn_labels_fold_full.npy", allow_pickle=True)
file_names = np.load("nn_file_names_fold_full.npy", allow_pickle=True)

In [6]:
import tensorflow as tf
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import confusion_matrix

In [7]:
# Initialize parameters
training_epochs = 20000 # number of iteration
n_dim = 168 # number of features
n_classes = 10 # number of classes
n_hidden_units_one = 280  # number of units in hidden layer 1
n_hidden_units_two = 300 # number of units in hidden layer 2
sd = 1 / np.sqrt(n_dim) # std of initial weight and bias
learning_rate = 0.01 

# Randomly initialize params for each layers
X = tf.placeholder(tf.float32,[None,n_dim])
Y = tf.placeholder(tf.float32,[None,n_classes])

# Hidden layer 1 params
W_1 = tf.Variable(tf.random_normal([n_dim,n_hidden_units_one], mean = 0, stddev=sd))
b_1 = tf.Variable(tf.random_normal([n_hidden_units_one], mean = 0, stddev=sd))
h_1 = tf.nn.tanh(tf.matmul(X,W_1) + b_1)

# Hidden layer 2 params
W_2 = tf.Variable(tf.random_normal([n_hidden_units_one,n_hidden_units_two], mean = 0, stddev=sd))
b_2 = tf.Variable(tf.random_normal([n_hidden_units_two], mean = 0, stddev=sd))
h_2 = tf.nn.sigmoid(tf.matmul(h_1,W_2) + b_2)

# Softmax classification initialization
W = tf.Variable(tf.random_normal([n_hidden_units_two,n_classes], mean = 0, stddev=sd))
b = tf.Variable(tf.random_normal([n_classes], mean = 0, stddev=sd))
y_ = tf.nn.softmax(tf.matmul(h_2,W) + b)


init = tf.global_variables_initializer()

# cost_function = 
cost_function = tf.reduce_mean(-tf.reduce_sum(Y * tf.log(y_), reduction_indices=[1])) 
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost_function)

correct_prediction = tf.equal(tf.argmax(y_,1), tf.argmax(Y,1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

In [8]:
import pandas as pd

In [11]:
test_label_df_2 = pd.read_csv("test_sound/test_csv.csv")
test_label_df_2 = test_label_df_2.iloc[:,1:3]
test_label_df_2.head()

Unnamed: 0,File name,Label
0,test_sound/engine_idling_1.wav,5.0
1,test_sound/carhorn_6.wav,1.0
2,test_sound/children_8.wav,2.0
3,test_sound/drill_17.wav,4.0
4,test_sound/aircon_19.wav,0.0


In [12]:
def parse_audio_test_files(parent_dir,sub_dir="",file_ext='*.wav'):
    features, namePath = np.empty((0,296)), np.empty(0)
    for fn in glob.glob(os.path.join(parent_dir, sub_dir, file_ext)):
        mfccs, logmel, mel = extract_feature(fn)
        ext_features = np.hstack([mfccs,logmel, mel])
        features = np.vstack([features,ext_features])
        namePath = np.append(namePath, fn)
    return np.array(features), np.array(namePath, dtype=np.string_)
test_features, test_fileNames = parse_audio_test_files("test_sound")

In [13]:
test_label_df_2.shape

(180, 2)

In [14]:
np.shape(test_fileNames)

(180,)

In [16]:
test_label_df_2["Label"].unique()

array([ 5.,  1.,  2.,  4.,  0.,  7.,  8.,  6.,  3.])

In [17]:
def special_one_hot_encode(labels):
    n_labels = len(labels)
    one_hot_encode = np.zeros((n_labels,10))
    for idx in range(n_labels):
        for l in range(10):
            if(labels[idx] == l):
                one_hot_encode[idx,l] = 1
    return one_hot_encode

In [18]:
test_label_np = test_label_df_2.as_matrix(["Label"])
test_x = test_features[:,0:168]
test_y = special_one_hot_encode(test_label_np[:,0])
np.shape(test_y)

(180, 10)

In [32]:
p

array([ 0.41666667,  0.61538462,  0.71428571,  0.52380952,  0.3       ,
        0.26315789,  1.        ,  0.27777778,  0.90909091,  0.        ])

In [33]:
r

array([ 0.25,  0.4 ,  0.25,  0.55,  0.45,  0.25,  0.05,  0.5 ,  0.5 ,  0.  ])

In [31]:
saver = tf.train.Saver()
with tf.Session() as sess:
    saver.restore(sess,"model/nn_model_full.ckpt")
    print "Test"
    y_pred = sess.run(tf.argmax(y_,1),feed_dict={X: test_x})
    y_true = sess.run(tf.argmax(test_y,1))
    p,r,f,s = precision_recall_fscore_support(y_true, y_pred)
    print "F-Score: {0}".format(f)
    print "Confusion Matrix"
    print confusion_matrix(y_true, y_pred)

Test
F-Score: [ 0.3125      0.48484848  0.37037037  0.53658537  0.36        0.25641026
  0.0952381   0.35714286  0.64516129  0.        ]
Confusion Matrix
[[ 5  0  0  0  0  2  0  3  0 10]
 [ 1  8  0  8  2  1  0  0  0  0]
 [ 2  0  5  0  4  3  0  2  0  4]
 [ 3  0  0 11  0  0  0  0  0  6]
 [ 0  0  0  1  9  2  0  7  0  1]
 [ 0  0  1  0  4  5  0  5  0  5]
 [ 0  0  0  1  3  4  1  7  1  3]
 [ 0  4  1  0  5  0  0 10  0  0]
 [ 1  1  0  0  3  2  0  2 10  1]
 [ 0  0  0  0  0  0  0  0  0  0]]


In [25]:
classID = {
    0: "AirCon",
    1: "Car horn",
    2: "Children playing",
    3: "Dog bark",
    4: "Drilling",
    5: "Engine idling",
    6: "Gun shot",
    7: "Jackhammer",
    8: "Siren",
    9: "Street music"
}

In [28]:
for ID, name in classID.items():
    x = y_pred[(y_true == ID)*(y_pred != ID)]
    names = test_fileNames[(y_true == ID)*(y_pred != ID)]
    
    unique_class, unique_count = np.unique(x, return_counts=True)
    print "************************************************"
    print " Class {0} is mistaken with: ".format(name)
    for c, y in zip(unique_class, unique_count):        
        print "# {0}: {1} samples".format(classID[c], y)
        for name in names[x == c]:
            true_name = name.split("/")[1]
            print "--- {0}".format(true_name)
print "************************************************"

************************************************
 Class AirCon is mistaken with: 
# Engine idling: 2 samples
--- aircon_12.wav
--- aircon_13.wav
# Jackhammer: 3 samples
--- aircon_19.wav
--- aircon_17.wav
--- aircon_18.wav
# Street music: 10 samples
--- aircon_4.wav
--- aircon_20.wav
--- aircon_1.wav
--- aircon_11.wav
--- aircon_8.wav
--- aircon_3.wav
--- aircon_16.wav
--- aircon_14.wav
--- aircon_15.wav
--- aircon_2.wav
************************************************
 Class Car horn is mistaken with: 
# AirCon: 1 samples
--- carhorn_5.wav
# Dog bark: 8 samples
--- carhorn_16.wav
--- carhorn_12.wav
--- carhorn_1.wav
--- carhorn_4.wav
--- carhorn_14.wav
--- carhorn_15.wav
--- carhorn_3.wav
--- carhorn_13.wav
# Drilling: 2 samples
--- carhorn_8.wav
--- carhorn_20.wav
# Engine idling: 1 samples
--- carhorn_17.wav
************************************************
 Class Children playing is mistaken with: 
# AirCon: 2 samples
--- children_17.wav
--- children_2.wav
# Drilling: 4 samples
--- 

In [9]:
test_label_df = pd.read_csv("data/clean_testset/label_testset.csv")
test_label_df = test_label_df.iloc[:,1:3]
test_label_df.head()

Unnamed: 0,File name,Label
0,data\clean_testset\104817__longshot__drilling-...,4
1,data\clean_testset\104817__longshot__drilling-...,4
2,data\clean_testset\104817__longshot__drilling-...,4
3,data\clean_testset\104817__longshot__drilling-...,4
4,data\clean_testset\104817__longshot__drilling-...,4


In [11]:
def parse_audio_test_files(parent_dir,sub_dir="",file_ext='*.wav'):
    features, namePath = np.empty((0,296)), np.empty(0)
    for fn in glob.glob(os.path.join(parent_dir, sub_dir, file_ext)):
        mfccs, logmel, mel = extract_feature(fn)
        ext_features = np.hstack([mfccs,logmel, mel])
        features = np.vstack([features,ext_features])
        namePath = np.append(namePath, fn)
    return np.array(features), np.array(namePath, dtype=np.string_)

In [12]:
test_features, test_fileNames = parse_audio_test_files("data","clean_testset")

In [13]:
test_label_df.shape

(430, 2)

In [14]:
np.shape(test_fileNames)

(430,)

In [15]:
test_label_df["Label"].unique()

array([4, 7, 6, 9, 8, 0, 2, 3])

In [16]:
def special_one_hot_encode(labels):
    n_labels = len(labels)
    one_hot_encode = np.zeros((n_labels,10))
    for idx in range(n_labels):
        for l in range(10):
            if(labels[idx] == l):
                one_hot_encode[idx,l] = 1
    return one_hot_encode

In [17]:
test_label_np = test_label_df.as_matrix(["Label"])
test_x = test_features[:,0:168]
test_y = special_one_hot_encode(test_label_np[:,0])
np.shape(test_y)

(430, 10)

In [18]:
test_file_names = test_label_df.as_matrix(["File name"])

In [19]:
y_true

NameError: name 'y_true' is not defined

In [71]:
y_pred

array([7, 4, 4, 4, 4, 2, 4, 3, 3, 3, 6, 8, 3, 4, 9, 9, 4, 6, 2, 7, 6, 7, 3,
       4, 3, 9, 7, 9, 0, 7, 4, 3, 9, 4, 3, 3, 5, 8, 2, 7, 2, 4, 2, 5, 7, 4,
       2, 4, 8, 4, 3, 2, 9, 4, 4, 7, 8, 3, 4, 4, 4, 8, 6, 2, 7, 7, 4, 4, 4,
       3])

In [20]:
saver = tf.train.Saver()
with tf.Session() as sess:
    saver.restore(sess,"model/nn_model_full.ckpt")
    print "Test"
    y_pred = sess.run(tf.argmax(y_,1),feed_dict={X: test_x})
    y_true = sess.run(tf.argmax(test_y,1))
    p,r,f,s = precision_recall_fscore_support(y_true, y_pred, average='micro')
    print "F-Score: {0}".format(f)
    print "Confusion Matrix"
    print confusion_matrix(y_true, y_pred)

Test
F-Score: 0.220930232558
Confusion Matrix
[[ 0  1  1  0  0  0  1  0  0]
 [ 0  0  3  5  0  0  4  1  0]
 [ 0  0  0  1  0  0  8  1  1]
 [ 0  4  8 22  2  1 26  3  1]
 [ 0  0  0  0  0  0  0  0  0]
 [ 0  2  1  5  0  0  3  0  0]
 [ 0  6 20 50  2  1 57 13  4]
 [ 2 11 27 46  3  3 53 15  4]
 [ 0  1  0  4  0  0  2  0  1]]


In [21]:
classID = {
    0: "AirCon",
    1: "Car horn",
    2: "Children playing",
    3: "Dog bark",
    4: "Drilling",
    5: "Engine idling",
    6: "Gun shot",
    7: "Jackhammer",
    8: "Siren",
    9: "Street music"
}

In [29]:
for ID, name in classID.items():
    x = y_pred[(y_true == ID)*(y_pred != ID)]
    names = test_file_names[(y_true == ID)*(y_pred != ID)][:,0]
    
    unique_class, unique_count = np.unique(x, return_counts=True)
    print "************************************************"
    print " Class {0} is mistaken with: ".format(name)
    for c, y in zip(unique_class, unique_count):        
        print "# {0}: {1} samples".format(classID[c], y)
        for name in names[x == c]:
            #true_name = name.split("/")[2]
            print name
            #print "--- {0}".format(true_name)
print "************************************************"

************************************************
 Class AirCon is mistaken with: 
# Children playing: 1 samples
data\clean_testset\Background Noise-SoundBible.com-190168996_1.wav
# Dog bark: 1 samples
data\clean_testset\Turn On Air Conditioning-SoundBible.com-472411363_1.wav
# Jackhammer: 1 samples
data\clean_testset\Background Noise-SoundBible.com-190168996_2.wav
************************************************
 Class Car horn is mistaken with: 
************************************************
 Class Children playing is mistaken with: 
# Dog bark: 3 samples
data\clean_testset\Children Playing-SoundBible.com-591301488_4.wav
data\clean_testset\Children Playing-SoundBible.com-591301488_5.wav
data\clean_testset\Children Playing-SoundBible.com-591301488_8.wav
# Drilling: 5 samples
data\clean_testset\Children Playing-SoundBible.com-591301488_2.wav
data\clean_testset\Children Playing-SoundBible.com-591301488_9.wav
data\clean_testset\Children Playing-SoundBible.com-591301488_11.wav
data\clean

In [50]:
test_file_names[(y_true == ID)*(y_pred != ID)]

array([['test_sound/Background Noise-SoundBible.com-190168996.wav'],
       ['test_sound/Turn On Air Conditioning-SoundBible.com-472411363.wav']], dtype=object)

In [65]:
np.shape(test_file_names[(y_true == ID)*(y_pred != ID)])

(2, 1)

In [66]:
test_file_names[(y_true == ID)*(y_pred != ID)][:,0]

array(['test_sound/Background Noise-SoundBible.com-190168996.wav',
       'test_sound/Turn On Air Conditioning-SoundBible.com-472411363.wav'], dtype=object)