In [5]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt

# NOTE: librosa dependencies apparently require specific versions of numpy, try numpy==1.21.4
import librosa
import librosa.display

import IPython.display

In [19]:
# Load the pre-computed per-clip feature table and show it.
eq_csv_path = './large_data/eq_with_harmony2.csv'
eq_df = pd.read_csv(eq_csv_path)
display(eq_df)

Unnamed: 0,class,eq_0,eq_10,eq_20,eq_30,eq_40,eq_60,eq_80,eq_120,eq_160,...,eq_10000,eq_15000,eq_20000,crestfactor,salience,harmonic_power,percussive_power,harmonic_hits,percussive_hits,fold
0,dog_bark,0.042073,0.037768,0.037040,0.040564,0.103124,0.126494,0.166781,0.199152,0.419491,...,7.060339,5.324175,4.841554,7.006769,1.0,0.000005,2.825963e-03,5.0,1.0,5.0
1,children_playing,7.185887,6.910740,7.921071,7.460572,7.698635,7.091304,6.637689,5.694018,4.534828,...,0.556369,0.417778,0.379089,11.940308,1.0,0.000006,2.081879e-05,11.0,0.0,5.0
2,children_playing,6.597605,6.398027,6.016003,6.052477,5.801635,5.684627,5.734884,4.941867,4.015981,...,0.319594,0.240222,0.218051,8.454538,1.0,0.000016,3.140698e-07,6.0,2.0,5.0
3,children_playing,5.929216,5.157348,5.165380,5.563031,5.046141,5.564920,5.776320,6.222735,5.007891,...,0.808725,0.608560,0.552146,9.255638,1.0,0.000011,8.845374e-05,11.0,4.0,5.0
4,children_playing,8.538324,6.440980,6.393698,7.740646,7.195990,7.131680,6.614982,5.932448,4.940455,...,0.450104,0.338348,0.307133,6.102388,1.0,0.000028,5.712090e-07,16.0,0.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8728,car_horn,2.228170,2.972869,3.430412,3.801715,5.101242,5.173664,5.687871,5.096503,5.210139,...,0.587284,0.441381,0.400527,7.153451,2.0,0.000120,3.828546e-07,12.0,0.0,7.0
8729,car_horn,2.685533,2.139769,2.083099,2.181417,2.495794,2.707547,2.518987,2.219531,2.064120,...,1.508850,1.136151,1.032083,7.721215,2.0,0.001297,5.428997e-07,4.0,1.0,7.0
8730,car_horn,3.132223,2.452053,2.546355,2.729121,3.563305,4.425730,4.677102,4.777865,7.052980,...,2.370569,1.788467,1.622412,15.893921,2.0,0.000040,3.741536e-06,14.0,3.0,7.0
8731,car_horn,1.599208,1.839636,3.051158,3.095292,5.155525,5.937903,5.650943,5.174344,4.879232,...,0.955482,0.718008,0.651918,7.465270,2.0,0.000432,3.024667e-07,17.0,1.0,7.0


After the initial attempts performed relatively poorly, I decided to add another feature for the classifier: the crest factor. The crest factor is the ratio of the maximum amplitude of a signal to its root mean square. As such, I expect short, loud sounds like gunshots to exhibit large crest factors.

After initial attempts with the crest factor, I noticed that a number of the audio samples contain significant "room noise". In an attempt to clean this up, I take the Hilbert transform of the signal, which extracts its instantaneous amplitude (envelope). Smoothing this envelope and dividing by the root-mean-square power should then amplify the parts of the signal that are large in amplitude, while suppressing the parts that are simply a constant-amplitude "hum". Low-crest-factor signals will be largely unaffected, because for such signals the smoothed envelope is similar to the root mean square of the signal. Finally, the result is rescaled so that its maximum amplitude equals that of the input signal.

In [20]:
# Boolean mask of rows whose crest factor is missing; show only the flagged rows.
nansvec = eq_df['crestfactor'].isna()
nansvec[nansvec]

402    True
417    True
Name: crestfactor, dtype: bool

In [25]:
# Drop every row that contains a NaN in any column (not only 'crestfactor').
# NOTE: this rebinds eq_df, so the raw frame is only recoverable by re-reading the CSV.
eq_df = eq_df.dropna()

In [26]:
# Re-run the missing-value check after dropna(); an empty Series means no NaN crest factors remain.
nansvec = eq_df['crestfactor'].isna()
nansvec[nansvec]

Series([], Name: crestfactor, dtype: bool)

In [27]:
# try classifying with the log instead?
# Columns 1:-7 are the eq_* band columns (column 0 is the class label and the last
# seven columns are the non-spectral features plus 'fold').  Transform the whole
# block in one vectorised assignment instead of looping over rows one at a time.
# NOTE: this overwrites eq_df in place, so re-running the cell applies log10 twice.
eq_df.iloc[:, 1:-7] = np.log10(eq_df.iloc[:, 1:-7].to_numpy(dtype=float))

In [28]:
# Encode the string class labels as integers 0-9.
# Mapping only the 'class' column (rather than a frame-wide replace) avoids scanning
# every feature column and does not depend on pandas silently downcasting the
# replaced object column to an integer dtype.
class_codes = {'air_conditioner':0, 'car_horn':1, 'children_playing':2, 'dog_bark':3, 'drilling':4,
               'engine_idling':5, 'gun_shot':6, 'jackhammer':7, 'siren':8, 'street_music':9}
# Binary alternative (gun_shot vs. everything else): map 'gun_shot' to 1 and every other class to 0.
eq_df2 = eq_df.copy()
eq_df2['class'] = eq_df2['class'].map(class_codes)
assert eq_df2['class'].notna().all(), "unexpected class label not present in class_codes"

The dataset documentation advises against shuffling the data. Many of the audio files are sections cut from longer recordings, so shuffling can place pieces of the same recording in both the training and the validation set and give anomalous results. Instead, the dataset includes a pseudorandom "fold" column that defines the splits to use for cross-validation.

In [29]:
# Fold number held out as the validation split; rows from all other folds are used for training.
dropfold = 1

In [30]:
# Training frame: remove every row that belongs to the held-out fold.
held_out_index = eq_df2.index[eq_df2['fold'] == dropfold]
eq_df3 = eq_df2.drop(index=held_out_index)

In [31]:
# Sanity check: no rows of the held-out fold should remain in the training frame (expect an empty array).
leftover_rows = eq_df3.loc[eq_df3['fold'] == dropfold]
leftover_rows.to_numpy()

array([], shape=(0, 31), dtype=float64)

In [32]:
# The fold id is bookkeeping, not a feature, so remove it before building the training arrays.
eq_df3 = eq_df3.drop(columns=['fold'])

In [33]:
# Peek at the feature values of the second training row (everything after the label column).
second_row = eq_df3.iloc[1]
second_row.iloc[1:].to_numpy()

array([ 8.56480394e-01,  8.39524552e-01,  8.98783931e-01,  8.72772106e-01,
        8.86413715e-01,  8.50726106e-01,  8.22016906e-01,  7.55418850e-01,
        6.56560810e-01,  5.85956768e-01,  4.96065809e-01,  4.21664324e-01,
        3.32610262e-01,  2.76890419e-01,  2.62928563e-01,  2.67268994e-01,
        2.56538704e-01,  1.87876224e-01,  2.71809221e-02, -8.41177638e-02,
       -2.54637387e-01, -3.79054138e-01, -4.21258627e-01,  1.19403085e+01,
        1.00000000e+00,  5.82140216e-06,  2.08187863e-05,  1.10000000e+01,
        0.00000000e+00])

In [34]:
# Training feature matrix: every column after the first (label) column.
feature_frame = eq_df3.iloc[:, 1:]
X_train = feature_frame.to_numpy()
X_train

array([[-1.37599495e+00, -1.42287451e+00, -1.43132843e+00, ...,
         2.82596265e-03,  5.00000000e+00,  1.00000000e+00],
       [ 8.56480394e-01,  8.39524552e-01,  8.98783931e-01, ...,
         2.08187863e-05,  1.10000000e+01,  0.00000000e+00],
       [ 8.19386294e-01,  8.06046073e-01,  7.79308059e-01, ...,
         3.14069827e-07,  6.00000000e+00,  2.00000000e+00],
       ...,
       [ 4.95852622e-01,  3.89529936e-01,  4.05918896e-01, ...,
         3.74153624e-06,  1.40000000e+01,  3.00000000e+00],
       [ 2.03904895e-01,  2.64732008e-01,  4.84464720e-01, ...,
         3.02466653e-07,  1.70000000e+01,  1.00000000e+00],
       [ 1.81455386e-01,  3.51759398e-01,  5.34100032e-01, ...,
         1.99398517e-07,  1.00000000e+01,  1.00000000e+00]])

In [35]:
# Training targets: the first column holds the integer-encoded class label.
label_column = eq_df3.iloc[:, 0]
y_train = label_column.to_numpy()
y_train

array([3, 2, 2, ..., 1, 1, 1])

In [36]:
# Validation split: select the held-out fold once, drop the bookkeeping column,
# then separate the features (columns 1..) from the label (column 0).
val_frame = eq_df2.loc[eq_df2['fold'] == dropfold].drop(columns='fold')
X_val = val_frame.iloc[:, 1:].to_numpy()
y_val = val_frame.iloc[:, 0].to_numpy()

In [37]:
# Display the validation feature matrix.
X_val

array([[-1.00008089e+00, -1.11176166e+00, -9.80423083e-01, ...,
         6.51133602e-04,  1.00000000e+01,  2.00000000e+00],
       [-9.52418631e-01, -1.07985395e+00, -1.00313153e+00, ...,
         1.38349884e-04,  7.00000000e+00,  3.00000000e+00],
       [-4.17118162e-01, -4.24880154e-01, -4.34902521e-01, ...,
         7.26269327e-04,  3.00000000e+00,  2.00000000e+00],
       ...,
       [ 7.55280147e-01,  5.02799032e-01,  3.46976735e-01, ...,
         2.45415438e-06,  1.50000000e+01,  0.00000000e+00],
       [ 7.71719225e-01,  5.37098468e-01,  4.10464048e-01, ...,
         1.69385740e-06,  1.10000000e+01,  0.00000000e+00],
       [ 6.63271678e-01,  4.19493950e-01,  2.85181458e-01, ...,
         1.63094784e-06,  1.00000000e+00,  1.00000000e+00]])

In [38]:
# Display the validation labels.
y_val

array([3, 3, 3, 3, 6, 3, 3, 3, 3, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
       7, 7, 7, 7, 7, 7, 7, 7, 7, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 5, 5, 5, 2, 2, 2, 2, 2, 2, 2, 2, 8, 8, 8, 8, 9, 9, 9,
       9, 9, 9, 9, 2, 2, 2, 2, 2, 2, 2, 2, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 3, 3, 3, 3, 3, 3, 3, 3, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
       8, 8, 3, 5, 5, 5, 5, 5, 5, 5, 5, 6, 9, 9, 9, 9, 9, 9, 3, 3, 3, 3,
       0, 3, 9, 9, 9, 9, 9, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 9, 9, 9,
       9, 9, 9, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 9, 9, 9, 9, 9, 9, 4, 4,
       5, 2, 2, 2, 2, 2, 2, 2, 2, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 9, 9, 9, 9, 9, 9, 3, 2, 2, 2, 1, 8, 8, 8, 8, 8,
       8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
       8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,

In [39]:
from sklearn.neural_network import MLPClassifier

In [40]:
# Multi-layer perceptron with five hidden layers of 150 units each.
# random_state fixes the weight initialisation and batch shuffling so that
# Restart & Run All reproduces the same fitted model and the same scores.
mlp = MLPClassifier(hidden_layer_sizes=(150,150,150,150,150,), max_iter=50000, random_state=0)

In [41]:
# Fit the network on the training arrays built above.
mlp.fit(X_train, y_train)

MLPClassifier(hidden_layer_sizes=(150, 150, 150, 150, 150), max_iter=50000)

In [42]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

Here we consider the True Positive Rate (recall) for the gunshot class as an indicator of the goodness of fit; the code below prints it labelled as "accuracy". Interestingly, this is often higher when the classifier distinguishes all ten categories than when it only distinguishes gunshot from not-gunshot.

In [43]:
# Per-class recall (true positive rate) for gun_shot (label 6): the number of gun
# shots predicted as gun shots divided by all actual gun shots (row 6 of the
# confusion matrix).  The printed text says "accuracy", but the value is this recall.
#
# Alternatives tried earlier: intersection-over-union,
#   confmat[k,k] / (confmat[:,k].sum() + confmat[k,:].sum() - confmat[k,k]),
# and the same metrics for car_horn (k = 1).
#
# labels= pins the matrix to all ten classes, so index 6 always refers to gun_shot
# even if some class is absent from both the true and the predicted labels of a split.
confmat = confusion_matrix(y_train, mlp.predict(X_train), labels=list(range(10)))
acc = 100*confmat[6,6] / np.sum(confmat[6,:])
print("Training accuracy of", acc)
confmat = confusion_matrix(y_val, mlp.predict(X_val), labels=list(range(10)))
acc = 100*confmat[6,6] / np.sum(confmat[6,:])
print("Validation accuracy of", acc)

Training accuracy of 99.41002949852508
Validation accuracy of 74.28571428571429


In [44]:
# Validation confusion matrix as a labelled table (rows = actual class, columns = predicted class).
# labels= guarantees an n_classes x n_classes matrix, so it always matches the
# row/column labels below even when a class is missing from the validation fold.
# For the binary gun_shot / not-gun_shot encoding, set n_classes = 2.
n_classes = 10
pd.DataFrame(confusion_matrix(y_val, mlp.predict(X_val), labels=list(range(n_classes))),
            columns=["predicted "+str(i) for i in range(n_classes)],
            index=["actual "+str(i) for i in range(n_classes)])

Unnamed: 0,predicted 0,predicted 1,predicted 2,predicted 3,predicted 4,predicted 5,predicted 6,predicted 7,predicted 8,predicted 9
actual 0,26,6,17,3,6,17,2,9,0,14
actual 1,1,13,6,2,8,1,0,0,4,1
actual 2,5,1,56,10,6,6,3,0,7,6
actual 3,2,2,16,68,1,1,1,0,3,6
actual 4,4,5,8,1,67,2,3,8,1,1
actual 5,11,0,2,1,3,41,0,8,17,13
actual 6,0,0,6,0,3,0,26,0,0,0
actual 7,0,0,0,0,56,17,1,46,0,0
actual 8,1,2,10,6,6,0,2,1,56,2
actual 9,4,5,14,0,16,6,2,7,2,44


As suggested by the dataset documentation, the most reliable estimate of a classifier's performance is achieved by averaging over all of the possible train/validation splits.

In [45]:
# Leave-one-fold-out cross-validation over the ten predefined folds.
# For each fold: train on the other nine folds, predict the held-out fold, and
# record the gun_shot recall (true positives / all actual gun shots) in percent.

class_codes = {'air_conditioner':0, 'car_horn':1, 'children_playing':2, 'dog_bark':3, 'drilling':4,
               'engine_idling':5, 'gun_shot':6, 'jackhammer':7, 'siren':8, 'street_music':9}
# Binary alternative (gun_shot vs. everything else): map 'gun_shot' to 1 and every
# other class to 0, and read the recall from row 1 instead of row 6.
gun_shot_label = class_codes['gun_shot']
n_folds = 10

# The label encoding does not depend on the fold, so do it once outside the loop.
eq_df2 = eq_df.copy()
eq_df2['class'] = eq_df2['class'].map(class_codes)

accuracy_vec = np.zeros(n_folds)
for dropfold in range(1, n_folds + 1):
    in_val_fold = eq_df2['fold'] == dropfold

    # Training split: every fold except the held-out one, without the bookkeeping column.
    eq_df3 = eq_df2.loc[~in_val_fold].drop(columns='fold')
    X_train = eq_df3.iloc[:,1:].values
    y_train = eq_df3.iloc[:,0].values

    # Validation split: the held-out fold, filtered once for both features and labels.
    val_df = eq_df2.loc[in_val_fold].drop(columns='fold')
    X_val = val_df.iloc[:,1:].values
    y_val = val_df.iloc[:,0].values

    # random_state makes the per-fold scores reproducible across runs.
    mlp = MLPClassifier(hidden_layer_sizes=(150,150,150,150,150,), max_iter=50000, random_state=0)
    mlp.fit(X_train, y_train)

    # labels= keeps row/column gun_shot_label tied to gun_shot even if a class
    # is missing from this fold.
    confmat = confusion_matrix(y_val, mlp.predict(X_val), labels=list(class_codes.values()))
    acc = 100*confmat[gun_shot_label,gun_shot_label] / np.sum(confmat[gun_shot_label,:])
    accuracy_vec[dropfold-1] = acc

print(accuracy_vec)
accuracy_vec.mean()

[74.28571429 82.85714286 86.11111111 65.78947368 95.         76.08695652
 68.62745098 73.33333333 67.74193548 56.25      ]


74.60831182575143