In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.signal

# NOTE: librosa dependencies apparently require specific versions of numpy, try numpy==1.21.4
import librosa
import librosa.display

import IPython.display

In [3]:
# Load the pre-computed equalizer-band features (one row per audio clip)
# and show the resulting frame.
eq_data_path = './large_data/filt_equilizer_data.csv'
eq_df = pd.read_csv(eq_data_path)
display(eq_df)

Unnamed: 0,class,eq_0,eq_20,eq_40,eq_80,eq_160,eq_300,eq_600,eq_1200,eq_2400,eq_5000,eq_10000,eq_20000,crestfactor,fold
0,dog_bark,0.313727,0.509609,15.972056,23.806945,19.013657,9.896539,5.242837,3.614684,3.614684,3.614684,3.614684,3.614684,9.334427,5
1,children_playing,6.910740,7.460572,7.091304,5.694018,3.854400,2.640367,1.891866,1.850414,1.541261,0.823915,0.417778,0.379089,11.940308,5
2,children_playing,6.398027,6.052477,5.684627,4.941867,3.432045,2.278605,1.887060,1.422939,0.894123,0.473454,0.240222,0.218051,8.454538,5
3,children_playing,5.157348,5.563031,5.564920,6.222735,4.337063,4.436010,3.365371,2.909730,2.174156,1.172847,0.608560,0.552146,9.255638,5
4,children_playing,6.440980,7.740646,7.131680,5.932448,4.267275,2.873736,1.954626,1.828848,1.255049,0.665879,0.338348,0.307133,6.102388,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8727,car_horn,2.972869,3.801715,5.173664,5.096503,5.008788,3.656169,3.386407,2.462293,1.602434,0.850675,0.441381,0.400527,7.153451,7
8728,car_horn,2.059369,2.637125,2.212017,1.901393,2.249080,1.947132,1.428574,4.025899,2.130836,1.181452,1.001693,1.001693,7.917514,7
8729,car_horn,2.434537,2.865060,4.436447,5.714574,6.115657,6.417108,5.769391,5.151178,4.586839,2.883944,1.625564,1.625564,15.879704,7
8730,car_horn,2.968306,5.328083,5.457131,4.693414,4.343679,4.973550,4.037891,3.220036,1.704732,0.900501,0.646166,0.646166,7.518558,7


After the initial attempts performed relatively poorly, I decided to add another feature for the classifier: the crest factor. The crest factor is the ratio of the maximum amplitude of a signal to its root mean square. As such, I expect short, loud sounds like gunshots to exhibit large crest factors.

After initial attempts with the crest factor, I noticed that a number of the audio samples contain significant "room noise". In an attempt to clean this up, I take the magnitude of the analytic signal obtained from the Hilbert transform, which gives the instantaneous amplitude (envelope) of the signal. Multiplying the signal by a smoothed version of this envelope and dividing by the root-mean-square amplitude should then amplify the parts of the signal that are large in amplitude, while suppressing the parts that are simply a constant-amplitude "hum". Low-crest-factor signals will be largely unaffected, because for such signals the smoothed envelope is similar to the root mean square of the signal. Finally, the filtered signal is rescaled so that its maximum amplitude equals that of the input signal.

In [None]:
# Add a noise-suppressed crest factor (peak amplitude / RMS) to the data frame.
#
# NOTE(review): `load_data` and `metadata` are not defined anywhere in the
# visible notebook. They must be provided before this cell can run --
# presumably a per-clip audio loader returning (samples, sample_rate) and a
# metadata table whose rows are aligned with `eq_df`; confirm.

SMOOTHING_SIGMA_HZ = 10  # width of the Gaussian low-pass applied to the envelope


def smoothed_crest_factor(y, sr, sigma_hz=SMOOTHING_SIGMA_HZ):
    """Return the crest factor of `y` after envelope-based noise suppression.

    The signal is weighted by a low-pass-filtered copy of its own amplitude
    envelope, so loud transients are emphasised relative to constant
    background hum, and the crest factor (peak / RMS) of the weighted signal
    is returned.

    Parameters
    ----------
    y : 1-D array of audio samples.
    sr : sample rate of `y`, used to express the FFT bins in Hz.
    sigma_hz : standard deviation of the Gaussian low-pass filter.

    Returns
    -------
    float : peak absolute amplitude divided by RMS of the weighted signal.
    """
    # The original cell used an undefined `dt`; the sample period 1/sr is
    # assumed to be what was intended, which puts `freqs` in Hz.
    freqs = np.fft.fftfreq(len(y), d=1.0 / sr)

    # Instantaneous amplitude: magnitude of the analytic signal.
    envelope = np.abs(scipy.signal.hilbert(y))

    # Gaussian low-pass in the frequency domain, then back to the time domain.
    gaussian = np.exp(-freqs * freqs / (2 * sigma_hz**2))
    smooth_env = np.abs(np.fft.ifft(np.fft.fft(envelope) * gaussian))
    smooth_env *= envelope.max() / smooth_env.max()

    # Weight the signal by the smoothed envelope, normalised by the input RMS.
    filt_y = y * smooth_env / np.sqrt((y * y).mean())
    # Rescale so the peak amplitude matches the input signal's peak.
    filt_y *= np.abs(y).max() / np.abs(filt_y).max()

    # Peak is taken on |filt_y| so a dominant negative excursion counts too.
    peak = np.abs(filt_y).max()
    rms = np.sqrt((filt_y * filt_y).mean())
    return peak / rms


Crfac = np.zeros((len(eq_df), 1))
for i in range(len(eq_df)):
    y, sr = load_data(metadata.loc[i])
    Crfac[i] = smoothed_crest_factor(y, sr)

eq_df['crestfactor'] = Crfac

In [None]:
# Encode the class labels as integers for the classifier.
#
# The mapping is applied to the 'class' column only and cast to int explicitly,
# instead of running a whole-frame `replace`, so the label dtype does not
# depend on pandas silently downcasting an object column.
# For the binary gun_shot / not-gun_shot experiment, map 'gun_shot' to 1 and
# every other class to 0 instead.
CLASS_TO_ID = {'air_conditioner': 0, 'car_horn': 1, 'children_playing': 2, 'dog_bark': 3, 'drilling': 4,
               'engine_idling': 5, 'gun_shot': 6, 'jackhammer': 7, 'siren': 8, 'street_music': 9}

eq_df2 = eq_df.copy()
# An unknown class name maps to NaN, which makes the int cast raise instead of
# letting a stray string reach the classifier.
eq_df2['class'] = eq_df2['class'].map(CLASS_TO_ID).astype(int)

Documentation of the dataset advises against shuffling it. Many of the audio files are sections cut from longer recordings, so shuffling would place pieces of the same recording in both the training and validation sets and give anomalous (overly optimistic) results. Instead, the dataset provides a pseudorandom "fold" column that defines the splits to use for cross-validation.

In [None]:
# Fold number that is held out of training and used for validation.
dropfold = 1

In [None]:
# Training rows: every clip that is not in the held-out fold.
holdout_index = eq_df2.index[(eq_df2['fold'] == dropfold).to_numpy()]
eq_df3 = eq_df2.drop(index=holdout_index)

In [None]:
# Sanity check: list any held-out-fold rows still present in the training frame.
eq_df3.loc[eq_df3['fold'] == dropfold].to_numpy()

In [None]:
# Remove the fold column so it is not used as a feature.
# Re-assigning with errors='ignore' keeps the cell idempotent: running it a
# second time no longer raises KeyError because 'fold' is already gone.
eq_df3 = eq_df3.drop(columns='fold', errors='ignore')

In [None]:
# Peek at one training row, skipping the first (class label) column.
eq_df3.iloc[1, 1:].to_numpy()

In [None]:
# Training feature matrix: every column after the class label.
train_features = eq_df3.iloc[:, 1:]
X_train = train_features.to_numpy()
X_train

In [None]:
# Training targets: the first column holds the class label.
train_labels = eq_df3.iloc[:, 0]
y_train = train_labels.to_numpy()
y_train

In [None]:
# Validation split: only the rows of the held-out fold.
val_rows = eq_df2.loc[eq_df2['fold'] == dropfold]

# Features are every column after the class label, with 'fold' removed;
# targets are the class label in the first column.
X_val = val_rows.drop(columns='fold').iloc[:, 1:].to_numpy()
y_val = val_rows.iloc[:, 0].to_numpy()

In [None]:
# Inspect the validation feature matrix.
X_val

In [None]:
# Inspect the validation class labels.
y_val

In [None]:
from sklearn.neural_network import MLPClassifier

In [None]:
# Five hidden layers of 150 units each. The random seed is fixed so the weight
# initialisation and batch shuffling -- and therefore the reported scores --
# are reproducible across kernel restarts.
mlp = MLPClassifier(hidden_layer_sizes=(150,) * 5, max_iter=50000, random_state=0)

In [None]:
# Train the network on the training features and labels.
mlp.fit(X_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

Here we consider the true positive rate (recall) for the gunshot class as an indicator of the goodness of fit. Interestingly, this is often higher when the classifier distinguishes between all ten categories than when it only separates gunshot from not-gunshot.

In [None]:
# Gunshot true positive rate (recall), in percent, on the training and
# validation splits: correctly predicted gunshots / all actual gunshots.
GUN_SHOT = 6                 # integer code assigned to 'gun_shot'
ALL_CLASSES = np.arange(10)  # pin the matrix layout so row/column 6 is always gun_shot,
                             # even if some class is absent from a split

confmat = confusion_matrix(y_train, mlp.predict(X_train), labels=ALL_CLASSES)
acc = 100*confmat[GUN_SHOT, GUN_SHOT] / np.sum(confmat[GUN_SHOT, :])
print("Training gun_shot true positive rate (%):", acc)

confmat = confusion_matrix(y_val, mlp.predict(X_val), labels=ALL_CLASSES)
acc = 100*confmat[GUN_SHOT, GUN_SHOT] / np.sum(confmat[GUN_SHOT, :])
print("Validation gun_shot true positive rate (%):", acc)

In [None]:
# Validation confusion matrix as a labelled table (rows: actual class,
# columns: predicted class). Passing `labels` fixes the matrix at 10x10 so it
# always matches the hard-coded headers, even if a class is missing from the
# validation fold.
class_ids = list(range(10))
pd.DataFrame(confusion_matrix(y_val, mlp.predict(X_val), labels=class_ids),
             columns=["predicted "+str(i) for i in class_ids],
             index=["actual "+str(i) for i in class_ids])

As suggested by the dataset documentation, the most reliable estimate of a classifier's performance is achieved by averaging over the different possible train/validation splits.

In [None]:
# Leave-one-fold-out cross-validation: for each fold, train on the other folds
# and record the gun_shot true positive rate (in percent) on the held-out fold.
CLASS_TO_ID = {'air_conditioner': 0, 'car_horn': 1, 'children_playing': 2, 'dog_bark': 3, 'drilling': 4,
               'engine_idling': 5, 'gun_shot': 6, 'jackhammer': 7, 'siren': 8, 'street_music': 9}
GUN_SHOT = CLASS_TO_ID['gun_shot']
CLASS_IDS = sorted(CLASS_TO_ID.values())
N_FOLDS = 10

# The label encoding does not depend on the held-out fold, so it is done once
# here, not on every iteration. Mapping only the 'class' column and
# casting to int avoids relying on a whole-frame replace to downcast the
# column; an unknown class name becomes NaN and makes the cast raise.
eq_df2 = eq_df.copy()
eq_df2['class'] = eq_df2['class'].map(CLASS_TO_ID).astype(int)

accuracy_vec = np.zeros(N_FOLDS)
for dropfold in range(1, N_FOLDS + 1):
    is_val = eq_df2['fold'] == dropfold

    # Training split: all other folds, with the fold column removed.
    eq_df3 = eq_df2.loc[~is_val].drop(columns='fold')
    X_train = eq_df3.iloc[:, 1:].values
    y_train = eq_df3.iloc[:, 0].values

    # Validation split: the held-out fold only.
    val_df = eq_df2.loc[is_val].drop(columns='fold')
    X_val = val_df.iloc[:, 1:].values
    y_val = val_df.iloc[:, 0].values

    # Fixed seed so the per-fold scores are reproducible between runs.
    mlp = MLPClassifier(hidden_layer_sizes=(150,) * 5, max_iter=50000, random_state=0)
    mlp.fit(X_train, y_train)

    # `labels` pins the matrix layout so row GUN_SHOT is gun_shot in every fold.
    confmat = confusion_matrix(y_val, mlp.predict(X_val), labels=CLASS_IDS)
    acc = 100*confmat[GUN_SHOT, GUN_SHOT] / np.sum(confmat[GUN_SHOT, :])
    accuracy_vec[dropfold-1] = acc

print(accuracy_vec)
accuracy_vec.mean()