In [1]:
import os
import librosa
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import ModelCheckpoint
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [2]:
SR=22050
N_MFCC=40
N_MELS=128
DURATION=1
SAMPLES_PER_TRACK=SR*DURATION
CLASSES=['AR','Sniper','nogun']

In [3]:
dataset = pd.read_csv('dataset1.csv')

In [None]:
def extract_features(file_path):
    y,sr=librosa.load(file_path,sr=SR,duration=DURATION)
    if len(y)<SAMPLES_PER_TRACK:
        y=np.pad(y,(0,SAMPLES_PER_TRACK-len(y)))
    else:
        y=y[:SAMPLES_PER_TRACK]
    mfcc=librosa.feature.mfcc(y=y,sr=sr,n_mfcc=N_MFCC)
    mfcc=librosa.util.fix_length(mfcc,size=174,axis=1)
    mfcc=mfcc[...,np.newaxis]

    mel_spec=librosa.feature.melspectrogram(y=y,sr=sr)
    mel_spec=librosa.power_to_db(mel_spec,ref=np.max)
    mel_spec=librosa.util.fix_length(mel_spec,size=174,axis=1)
    mel_spec=mel_spec[...,np.newaxis]
    return mfcc,mel_spec

In [5]:
def load_dataset(dataset_path):
    x_mfcc,x_melspec,gun_type,direction,distance=[],[],[],[],[]
    suffixes=['_gain.wav','_pan.wav','_pitch.wav','_reverse.wav','_speed.wav','_noise.wav','_original.wav']
    for i,class_name in enumerate(CLASSES):
        class_dir=os.path.join(dataset_path,class_name)
        for file_name in os.listdir(class_dir):
            if file_name.endswith(".wav"):
                file_path=os.path.join(class_dir,file_name)
                mfcc,mel_spec=extract_features(file_path)
                x_mfcc.append(mfcc)
                x_melspec.append(mel_spec)
                if class_name=='Sniper':
                    for suffix in suffixes:
                        if file_name.endswith(suffix):
                            file_name=file_name[:-len(suffix)]+'.wav'
                gun_type.append(i)
                if class_name=='nogun':
                    direction.append('None')
                    distance.append('None')
                else:
                    row=dataset[dataset['name']==file_name]
                    direction.append(row['dire'].values[0])
                    distance.append(row['dist'].values[0])
    x_mfcc=np.array(x_mfcc)
    x_melspec=np.array(x_melspec)
    gun_encoder=LabelEncoder()
    gun_types_encoded=gun_encoder.fit_transform(gun_type)
    gun_types_categorical=to_categorical(gun_types_encoded)
    gun_types_categorical=np.array(gun_types_categorical)
    direction_encoder=LabelEncoder()
    direction_types_encoded=direction_encoder.fit_transform(direction)
    direction_types_categorical=to_categorical(direction_types_encoded)
    direction_types_categorical=np.array(direction_types_categorical)
    distance_encoder=LabelEncoder()
    distance_types_encoded=distance_encoder.fit_transform(distance)
    distance_types_categorical=to_categorical(distance_types_encoded)
    distance_types_categorical=np.array(distance_types_categorical)
    return x_mfcc,x_melspec,gun_types_categorical,direction_types_categorical,distance_types_categorical


In [6]:
X_mfcc,X_melspec,y_gun,y_direction,y_distance=load_dataset('gun_sound_v9')

In [7]:
X_melspec_shape=X_melspec.shape
X_melspec_shape

(2807, 128, 174, 1)

In [8]:
X_mfcc_shape=X_mfcc.shape
X_mfcc_shape

(2807, 40, 174, 1)

In [9]:
y_gun.shape

(2807, 3)

In [10]:
y_direction.shape

(2807, 6)

In [None]:
X_mfcc_trainval, X_mfcc_test, X_melspec_trainval, X_melspec_test, y_gun_trainval, y_gun_test,y_distance_trainval,y_distance_test,y_direction_trainval,y_direction_test = train_test_split(
    X_mfcc, X_melspec, y_gun,y_distance,y_direction,test_size=0.3, random_state=42
)

X_mfcc_train, X_mfcc_val, X_melspec_train, X_melspec_val, y_gun_train, y_gun_val,y_distance_train,y_distance_val,y_direction_train,y_direction_val = train_test_split(
    X_mfcc_trainval, X_melspec_trainval, y_gun_trainval,y_distance_trainval,y_direction_trainval, test_size=0.3, random_state=42
)

In [12]:
y_gun_train=np.array(y_gun_train)
y_gun_train.shape

(1374, 3)

In [13]:
X_mfcc_test.shape

(843, 40, 174, 1)

In [14]:
X_mfcc_val.shape

(590, 40, 174, 1)

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Conv2D,MaxPooling2D,Dropout,GlobalAveragePooling2D,Dense,Input,GlobalAveragePooling1D,Reshape,Bidirectional,LSTM,Layer
from tensorflow.keras.models import Model
from tensorflow.keras import backend as K

# class Attention(Layer):
#     def _init_(self, **kwargs):
#         super(Attention, self)._init_(**kwargs)

#     def build(self, input_shape):
#         self.W = self.add_weight(shape=(input_shape[-1], input_shape[-1]),
#                                  initializer='glorot_uniform',
#                                  trainable=True)
#         self.b = self.add_weight(shape=(input_shape[-1],),
#                                  initializer='zeros',
#                                  trainable=True)
#         self.u = self.add_weight(shape=(input_shape[-1], 1),
#                                  initializer='glorot_uniform',
#                                  trainable=True)
#         super(Attention, self).build(input_shape)

#     def call(self, inputs, **kwargs):
#         # v: (batch_size, sequence_length, hidden_dim)
#         v = K.tanh(K.dot(inputs, self.W) + self.b)
        
#         # vu: (batch_size, sequence_length, 1) -> alignment scores
#         vu = K.dot(v, self.u)
        
#         # Apply softmax to compute attention weights
#         attention_weights = K.softmax(vu, axis=1)
        
#         # Compute context vector as a weighted sum of inputs
#         # weighted_sum: (batch_size, hidden_dim)
#         weighted_sum = K.sum(inputs * attention_weights, axis=1)
#         return weighted_sum

#     def compute_output_shape(self, input_shape):
#         return input_shape[0], input_shape[-1]
import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Layer

class Attention(Layer):
    def _init_(self, **kwargs):
        super(Attention, self)._init_(**kwargs)

    def build(self, input_shape):
        self.W = self.add_weight(shape=(input_shape[-1], input_shape[-1]),
                                 initializer='glorot_uniform',
                                 trainable=True)
        self.b = self.add_weight(shape=(input_shape[-1],),
                                 initializer='zeros',
                                 trainable=True)
        self.u = self.add_weight(shape=(input_shape[-1], 1),
                                 initializer='glorot_uniform',
                                 trainable=True)
        super(Attention, self).build(input_shape)

    def call(self, inputs, **kwargs):        
        v = K.tanh(K.dot(inputs, self.W) + self.b)
        vu = K.dot(v, self.u) 
        attention_weights = K.softmax(vu, axis=1) 
        context_vectors = inputs * attention_weights  
        return context_vectors 

    def compute_output_shape(self, input_shape):
        return input_shape 
    
def create_model(input_shape_mfcc,input_shape_melspec,num_classes=3,num_directions=6,num_distances=7):
    input_mfcc=Input(shape=input_shape_mfcc,name='mfcc_input')
    x_mfcc=Conv2D(16,kernel_size=(3,3),activation='relu')(input_mfcc)
    x_mfcc=MaxPooling2D(pool_size=(2,2))(x_mfcc)
    x_mfcc=Dropout(0.3)(x_mfcc)

    x_mfcc=Conv2D(32,kernel_size=(3,3),activation='relu')(x_mfcc)
    x_mfcc=MaxPooling2D(pool_size=(2,2))(x_mfcc)
    x_mfcc=Dropout(0.3)(x_mfcc)

    x_mfcc=Conv2D(64,kernel_size=(3,3),activation='relu')(x_mfcc)
    x_mfcc=MaxPooling2D(pool_size=(1,2))(x_mfcc)
    x_mfcc=Dropout(0.3)(x_mfcc)
    
    x_mfcc=Conv2D(128,kernel_size=(3,3),activation='relu')(x_mfcc)
    x_mfcc=MaxPooling2D(pool_size=(2,2))(x_mfcc)
    x_mfcc=Dropout(0.3)(x_mfcc)
    
    #x_mfcc=GlobalAveragePooling2D()(x_mfcc)
    #x_mfcc = Flatten()(x_mfcc)
    #x_mfcc = Reshape((x_mfcc.shape[1] * x_mfcc.shape[2], 128))(x_mfcc)
    x_mfcc = Reshape((x_mfcc.shape[-1],-1))(x_mfcc)
    x_mfcc=Bidirectional(LSTM(64,return_sequences=False))(x_mfcc)
    x_mfcc=Attention()(x_mfcc)


    input_melspec=Input(shape=input_shape_melspec,name='melspec_input')
    x_melspec=Conv2D(16,kernel_size=(3,3),activation='relu')(input_melspec)
    x_melspec=MaxPooling2D(pool_size=(2,2))(x_melspec)
    x_melspec=Dropout(0.3)(x_melspec)

    x_melspec=Conv2D(32,kernel_size=(3,3),activation='relu')(x_melspec)
    x_melspec=MaxPooling2D(pool_size=(2,2))(x_melspec)
    x_melspec=Dropout(0.3)(x_melspec)

    x_melspec=Conv2D(64,kernel_size=(3,3),activation='relu')(x_melspec)
    x_melspec=MaxPooling2D(pool_size=(2,2))(x_melspec)
    x_melspec=Dropout(0.3)(x_melspec)

    x_melspec=Conv2D(128,kernel_size=(3,3),activation='relu')(x_melspec)
    x_melspec=MaxPooling2D(pool_size=(2,2))(x_melspec)
    x_melspec=Dropout(0.3)(x_melspec)

    #x_melspec = GlobalAveragePooling2D()(x_melspec)
    #x_melspec =  Flatten()(x_melspec)
    #x_melspec = Reshape((x_melspec.shape[1] * x_melspec.shape[2], 40))(x_melspec)
    x_melspec = Reshape((x_melspec.shape[-1],-1))(x_melspec)
    x_melspec=Bidirectional(LSTM(64,return_sequences=False))(x_melspec)
    x_melspec=Attention()(x_melspec)
        
    concatenated=tf.keras.layers.concatenate([x_mfcc,x_melspec])
    common_dense=Dense(128,activation='relu')(concatenated)
    gunshot_output=Dense(num_classes,activation='softmax',name='gunshot_output')(common_dense)
    direction_output=Dense(num_directions,activation='softmax',name='direction_output')(common_dense)
    distance_output=Dense(num_distances,activation='softmax',name='distance_output')(common_dense)

    model=Model(inputs=[input_mfcc,input_melspec],outputs=[gunshot_output,direction_output,distance_output])
    
    return model

In [16]:
input_shape_mfcc=(40,174,1)
input_shape_melspec=(128,174,1)
model=create_model(input_shape_mfcc,input_shape_melspec,num_classes=3,num_directions=6,num_distances=7)

In [17]:
from tensorflow.keras.metrics import AUC
model.compile(optimizer='adam',
              loss={
                    'gunshot_output':'categorical_crossentropy',
                    'direction_output':'categorical_crossentropy',
                    'distance_output':'categorical_crossentropy'},
                metrics={
                    'gunshot_output': ['accuracy', 'Precision', 'Recall', AUC()],
                    'distance_output': ['accuracy', 'Precision', 'Recall', AUC()],
                    'direction_output': ['accuracy', 'Precision', 'Recall', AUC()]
                    })
model.summary()

In [18]:
y_direction_test.shape

(843, 6)

In [19]:
from keras.callbacks import ModelCheckpoint
batch_size=32
valid_batch_size=32
callbacks=[ModelCheckpoint("best_model.keras",monitor='val_loss')]
history=model.fit([X_mfcc_train,X_melspec_train],
                  {
                      'gunshot_output':y_gun_train,
                      'direction_output':y_direction_train,
                      'distance_output':y_distance_train
                  },
                  validation_data=([X_mfcc_val,X_melspec_val],
                                    {
                                        'gunshot_output':y_gun_val,
                                        'direction_output':y_direction_val,
                                        'distance_output':y_distance_val
                                    }),
                    epochs=150,batch_size=32,callbacks=[callbacks])


Epoch 1/150
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 418ms/step - direction_output_Precision: 0.0000e+00 - direction_output_Recall: 0.0000e+00 - direction_output_accuracy: 0.1923 - direction_output_auc_2: 0.5315 - distance_output_Precision: 0.0000e+00 - distance_output_Recall: 0.0000e+00 - distance_output_accuracy: 0.1919 - distance_output_auc_1: 0.5706 - gunshot_output_Precision: 0.5544 - gunshot_output_Recall: 0.2025 - gunshot_output_accuracy: 0.4756 - gunshot_output_auc: 0.6708 - loss: 4.7521 - val_direction_output_Precision: 0.0000e+00 - val_direction_output_Recall: 0.0000e+00 - val_direction_output_accuracy: 0.2475 - val_direction_output_auc_2: 0.6047 - val_distance_output_Precision: 0.0000e+00 - val_distance_output_Recall: 0.0000e+00 - val_distance_output_accuracy: 0.2254 - val_distance_output_auc_1: 0.6289 - val_gunshot_output_Precision: 0.6822 - val_gunshot_output_Recall: 0.6186 - val_gunshot_output_accuracy: 0.6593 - val_gunshot_output_auc: 0.7966 - val

In [20]:
import plotly.graph_objects as go
plt.clf()
fig = go.Figure()
fig.add_trace(go.Scatter(
                    y=history.history['gunshot_output_accuracy'],
                    name='Train'))
fig.add_trace(go.Scatter(
                    y=history.history['val_gunshot_output_accuracy'],
                    name='Valid'))
fig.update_layout(height=500, 
                  width=700,
                  title='Accuracy for gunshot feature',
                  xaxis_title='Epoch',
                  yaxis_title='Accuracy')
fig.show()

<Figure size 640x480 with 0 Axes>

In [21]:
plt.clf()
fig = go.Figure()
fig.add_trace(go.Scatter(
                    y=history.history['direction_output_accuracy'],
                    name='Train'))
fig.add_trace(go.Scatter(
                    y=history.history['val_direction_output_accuracy'],
                    name='Valid'))
fig.update_layout(height=500, 
                  width=700,
                  title='Accuracy for direction feature',
                  xaxis_title='Epoch',
                  yaxis_title='Accuracy')
fig.show()

<Figure size 640x480 with 0 Axes>

In [22]:
plt.clf()
fig = go.Figure()
fig.add_trace(go.Scatter(
                    y=history.history['distance_output_accuracy'],
                    name='Train'))
fig.add_trace(go.Scatter(
                    y=history.history['val_distance_output_accuracy'],
                    name='Valid'))
fig.update_layout(height=500, 
                  width=700,
                  title='Accuracy for distance feature',
                  xaxis_title='Epoch',
                  yaxis_title='Accuracy')
fig.show()

<Figure size 640x480 with 0 Axes>

In [23]:
# checkpoint=tf.keras.callbacks.ModelCheckpoint('best_model.keras',monitor='val_gunshot_output_accuracy',save_best_only=True,mode='max')
# history=model.fit([X_mfcc_train,X_melspec_train],
#                   {
#                       'gunshot_output':y_gun_train,
#                       'direction_output':y_direction_train,
#                       'distance_output':y_distance_train
#                   },
#                   validation_data=([X_mfcc_val,X_melspec_val],
#                                     {
#                                         'gunshot_output':y_gun_val,
#                                         'direction_output':y_direction_val,
#                                         'distance_output':y_distance_val
#                                     }),
#                     epochs=30,batch_size=32,callbacks=[checkpoint])
# model = create_model(input_shape_mfcc, input_shape_melspec)
# model=tf.keras.models.load_model('best_model.keras')
# model.load_weights('best_model.keras')
# results=model.evaluate([X_mfcc_test,X_melspec_test],{
#                                         'gunshot_output':y_gun_test,
#                                         'direction_output':y_direction_test,
#                                         'distance_output':y_distance_test
#                                     })
# test_loss=results[0]
# test_gun_accuracy=results[1]
# test_direction_accuracy=results[2]
# test_distance_accuracy=results[3]

# # print(f'Test Accuracy:{test_accuracy*100:.2f}%')
# # print(f'Test Loss:{test_loss*100:.2f}%')

In [24]:
test_results = model.evaluate([X_mfcc_test, X_melspec_test], 
                              [y_gun_test, y_direction_test, y_distance_test])

[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 133ms/step - direction_output_Precision: 0.7577 - direction_output_Recall: 0.6747 - direction_output_accuracy: 0.7167 - direction_output_auc_2: 0.9535 - distance_output_Precision: 0.9208 - distance_output_Recall: 0.8870 - distance_output_accuracy: 0.8949 - distance_output_auc_1: 0.9848 - gunshot_output_Precision: 0.9362 - gunshot_output_Recall: 0.9348 - gunshot_output_accuracy: 0.9348 - gunshot_output_auc: 0.9877 - loss: 1.2717
