In [None]:
import os
import pathlib
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import models
import wave
from scipy.io.wavfile import read
from sklearn.model_selection import train_test_split
import librosa
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import Sequential
import sounddevice as sd
from scipy.io.wavfile import write
import matplotlib.pyplot as plt
from keras.models import Model
from keras.layers import  Conv1D,  GRU, Dropout, Input, Dense, Conv2D, Reshape, MaxPooling2D
import random
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve
from scipy import linalg
# Seed both RNGs the augmentation code draws from: Python's `random` and
# NumPy's global generator (spec_augment picks mask positions with np.random).
SEED = 111
random.seed(SEED)
np.random.seed(SEED)

# Sampling rate in Hz; every clip in this notebook is loaded with sr=16000.
fs = 16000

# Training data: keyword clips (positive class) and background clips (negative class).
Keyword_path = "../keyword_sound"
Noise_path =  "../background_sound"

# Validation data, already split into positive and negative folders.
Val_path_pos = "../validation_data/positives"
Val_path_neg = "../validation_data/negatives"

In [None]:
def spec_augment(spec: np.ndarray, num_mask=2,
                 freq_masking_max_percentage=0.10, time_masking_max_percentage=0.2):
    """Return a copy of ``spec`` with random frequency and time bands set to zero.

    The input is indexed as ``(frames, freqs)``: axis 0 is treated as time and
    axis 1 as frequency. On each of the ``num_mask`` rounds one contiguous band
    of columns (frequency) and one contiguous band of rows (time) is zeroed.

    All randomness comes from Python's ``random`` module, so a single
    ``random.seed(...)`` call makes the masks reproducible.

    Args:
        spec: 2-D spectrogram laid out as ``(frames, freqs)``. It is not modified.
        num_mask: number of (frequency band, time band) pairs to apply.
        freq_masking_max_percentage: upper bound, as a fraction of the number
            of columns, on the width of each frequency band.
        time_masking_max_percentage: upper bound, as a fraction of the number
            of rows, on the length of each time band.

    Returns:
        A new array with the same shape as ``spec``.

    Raises:
        ValueError: if ``spec`` is not two-dimensional.
    """
    masked = spec.copy()
    if len(masked.shape) != 2:
        raise ValueError(
            f"spec_augment expects a 2-D (frames, freqs) array, got shape {masked.shape}"
        )

    # The shape never changes while masking, so look it up once.
    num_frames, num_freqs = masked.shape

    for _ in range(num_mask):
        # Frequency band: random width, then a start that keeps it in bounds.
        freq_width = int(random.uniform(0.0, freq_masking_max_percentage) * num_freqs)
        freq_start = random.randint(0, num_freqs - freq_width)
        masked[:, freq_start:freq_start + freq_width] = 0

        # Time band: same scheme along the frame axis.
        time_width = int(random.uniform(0.0, time_masking_max_percentage) * num_frames)
        time_start = random.randint(0, num_frames - time_width)
        masked[time_start:time_start + time_width, :] = 0

    return masked


In [None]:
sample_rate = 16000

# Normalised log-mel features, one (frames, n_mels) matrix per clip.
keyword_data = []
noise_data = []
test_pos = []
test_neg = []
val_pos = []
val_neg = []

# SpecAugment-ed counterparts of the training features.
keyword_data_aug = []
noise_data_aug = []

# Un-normalised dB spectrograms of the keyword clips (plain / augmented).
ebin_augment = []
ebin = []

# Raw waveforms, kept for the waveform-mixing augmentation in a later cell.
keyword_audio = []
noise_audio = []


def _load_clips(directory):
    """Yield the waveform of every file in ``directory``, resampled to ``sample_rate``.

    Files are visited in sorted name order: os.listdir returns entries in
    arbitrary order, which would make any index-based subsampling of the
    loaded clips differ between machines and runs.
    """
    for filename in sorted(os.listdir(directory)):
        audio, _ = librosa.load(os.path.join(directory, filename), sr=sample_rate)
        yield audio


def _mel_power(audio):
    """Return the mel power spectrogram of ``audio`` as librosa lays it out, (n_mels, frames)."""
    return librosa.feature.melspectrogram(y=audio, sr=sample_rate, n_fft=512,
                                          hop_length=160, n_mels=48, fmax=8000)


def _log_mel_features(mel_power):
    """Convert a mel power spectrogram to ``(mel_db, mel_db_norm)``, both (frames, n_mels).

    ``mel_db`` is the dB spectrogram relative to its own maximum, transposed so
    that time is axis 0. ``mel_db_norm`` is ``mel_db`` min-max scaled to [0, 1];
    a constant spectrogram maps to all zeros instead of dividing by zero.
    """
    mel_db = librosa.power_to_db(mel_power, ref=np.max).T
    lowest = np.min(mel_db)
    span = np.max(mel_db) - lowest
    if span == 0:
        return mel_db, np.zeros_like(mel_db)
    return mel_db, (mel_db - lowest) / span


def _augmented(mel_power):
    """Apply spec_augment to an (n_mels, frames) spectrogram.

    spec_augment indexes its input as (frames, freqs), so the matrix is
    transposed on the way in and back on the way out; otherwise the frequency
    and time masking limits would be applied to the wrong axes.
    """
    return spec_augment(mel_power.T).T


for audio in _load_clips(Keyword_path):
    keyword_audio.append(audio)
    mel_spec = _mel_power(audio)

    mel_spec_db, mel_spec_db_norm = _log_mel_features(mel_spec)
    keyword_data.append(mel_spec_db_norm)
    ebin.append(mel_spec_db)

    mel_spec_db, mel_spec_db_norm = _log_mel_features(_augmented(mel_spec))
    ebin_augment.append(mel_spec_db)
    keyword_data_aug.append(mel_spec_db_norm)

for audio in _load_clips(Val_path_pos):
    val_pos.append(_log_mel_features(_mel_power(audio))[1])

for audio in _load_clips(Noise_path):
    noise_audio.append(audio)
    mel_spec = _mel_power(audio)

    noise_data.append(_log_mel_features(mel_spec)[1])
    noise_data_aug.append(_log_mel_features(_augmented(mel_spec))[1])

for audio in _load_clips(Val_path_neg):
    val_neg.append(_log_mel_features(_mel_power(audio))[1])

In [None]:
def time_shift(audio):
    """Circularly shift ``audio`` by a random fraction of a second.

    An integer divisor is drawn uniformly from 2..10 and the signal is rolled
    by ``int(sample_rate / divisor)`` samples, i.e. between one tenth and one
    half of a second. np.roll wraps the samples that fall off the end back to
    the start, so the length is unchanged.
    """
    divisor = random.randint(2, 10)
    shift_in_samples = int(sample_rate / divisor)
    return np.roll(audio, shift_in_samples)



# Waveform-mixing augmentation: within each class, take every tenth clip and
# add to it a time-shifted, attenuated copy of every tenth clip (itself
# included). Keyword mixes are generated first, then noise mixes.
# np.add requires the two clips to have compatible lengths.
for clips, features in ((keyword_audio, keyword_data), (noise_audio, noise_data)):
    every_tenth = clips[::10]
    for base_clip in every_tenth:
        for overlay_clip in every_tenth:
            # Draw the shift before the gain so the random sequence is stable.
            overlay = time_shift(overlay_clip)
            gain = random.uniform(0.6, 0.9)
            mixed = np.add(base_clip, overlay * gain)

            mel_power = librosa.feature.melspectrogram(
                y=mixed, sr=sample_rate, n_fft=512, hop_length=160, n_mels=48, fmax=8000)
            mel_db = librosa.power_to_db(mel_power, ref=np.max).T
            mel_min = np.min(mel_db)
            mel_max = np.max(mel_db)
            features.append((mel_db - mel_min) / (mel_max - mel_min))

# Final training pools: originals + waveform mixes + SpecAugment-ed copies.
# NOTE: this cell is not idempotent; re-running it appends the mixes and the
# augmented copies a second time.
keyword_data = keyword_data + keyword_data_aug
noise_data = noise_data + noise_data_aug

In [None]:
import librosa.display


def _save_mel_figure(features, title, out_path):
    """Draw one stored feature matrix as a mel spectrogram and save it to ``out_path``.

    ``features`` is a (frames, n_mels) matrix as stored in keyword_data /
    noise_data. specshow expects frequency on axis 0, so it is transposed here.
    ``hop_length=160`` matches the hop used when the features were extracted,
    which lets specshow label the time axis in real seconds without
    hand-written tick labels.
    """
    fig, ax = plt.subplots()
    img = librosa.display.specshow(features.T, x_axis='time', y_axis='mel',
                                   sr=16000, hop_length=160, fmax=8000, ax=ax)
    # The stored features are min-max normalised to [0, 1], not dB values.
    fig.colorbar(img, ax=ax, format='%.1f', label='Normalised log-mel energy')
    ax.set(title=title)
    ax.set_xlabel("Time (s)")
    ax.set_ylabel("Frequency (Hz)")
    fig.savefig(out_path)


print(keyword_data[17].shape)

# NOTE(review): keyword_data starts with the unaugmented clips, so index 17 is
# only an augmented sample if fewer than 18 keyword files were loaded. Confirm
# which sample this "Augmented" figure is meant to show.
_save_mel_figure(keyword_data[17], 'Augmented Mel-frequency Spectrogram', 'augmented_mel_spec.png')

# First entry of the background-noise training features.
_save_mel_figure(noise_data[0], 'Mel-frequency Spectrogram', 'mel_spec.png')

In [None]:
# Show the stacked shape of every split before converting the lists to arrays.
for split in (keyword_data, noise_data, val_pos, val_neg):
    print(np.shape(split))

# Stack each list of per-clip feature matrices into a single array.
keyword_data = np.squeeze(keyword_data)
noise_data = np.squeeze(noise_data)
val_pos = np.squeeze(val_pos)
val_neg = np.squeeze(val_neg)

# Class 1 = keyword, class 0 = background.
keyword_labels = np.ones(keyword_data.shape[0])
noise_labels = np.zeros(noise_data.shape[0])
val_pos_labels = np.ones(val_pos.shape[0])
val_neg_labels = np.zeros(val_neg.shape[0])

# Positives come first in both the feature and the label arrays.
X_train = np.concatenate([keyword_data, noise_data])
X_val = np.concatenate([val_pos, val_neg])

# One-hot encode the labels for the two-unit softmax output.
y_train = tf.keras.utils.to_categorical(
    np.concatenate([keyword_labels, noise_labels]), 2)
y_val = tf.keras.utils.to_categorical(
    np.concatenate([val_pos_labels, val_neg_labels]), 2)

In [None]:
# Size of axis 1 and axis 2 of the keyword feature array, one per line;
# these two values define the model's input shape.
print(keyword_data.shape[1], keyword_data.shape[2], sep="\n")

In [None]:
from keras.backend import manual_variable_initialization
# NOTE(review): presumably a workaround for variable handling when the saved
# model is reloaded. Confirm it is still required with the installed Keras.
manual_variable_initialization(True)

# Seed TensorFlow's global RNG so weight initialisation and dropout masks are
# repeatable each time this cell is re-run (the notebook otherwise only seeds
# Python's `random`).
tf.random.set_seed(111)

num_units = 48        # filters per Conv1D layer and units in the GRU
dropout_ratio = 0.3   # drop probability after each convolution
checkpoint_filepath = '../keyword_model_1D_CRNN'
checkpoint_dir = os.path.dirname(checkpoint_filepath)

# NOTE(review): early stopping watches the *training* loss while the
# checkpoint keeps the best *validation* loss. Confirm that mismatch is intended.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=120, mode='min')
best_checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath, save_weights_only=False,
    monitor='val_loss', mode='min', save_best_only=True)
my_callbacks = [early_stopping, best_checkpoint]

# 1-D CRNN: two convolutions along the time axis, a GRU that summarises the
# sequence into one vector, and a two-way softmax (background vs. keyword).
model = Sequential([
    Input(shape=(keyword_data.shape[1], keyword_data.shape[2])),
    Conv1D(num_units, 3, padding="same", activation='relu', name='layer1'),
    Dropout(dropout_ratio),
    Conv1D(num_units, 3, padding="same", activation='relu', name='layer2'),
    Dropout(dropout_ratio),
    GRU(num_units, name='RNN_1', return_sequences=False),
    Dense(2, activation='softmax', name='dense_a'),
])

model.compile(loss='BinaryCrossentropy',
              optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
              metrics=['accuracy'])

model.summary()

In [None]:
# Train for up to 2000 epochs; the callbacks may stop early and keep the
# checkpoint with the lowest validation loss.
history = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_val, y_val),
    batch_size=32,
    epochs=2000,
    callbacks=my_callbacks,
)

In [None]:
# Replace the in-memory model with the full model that ModelCheckpoint saved
# under checkpoint_filepath, then score it on the validation set.
model = tf.keras.models.load_model(checkpoint_filepath)

model.evaluate(X_val, y_val, batch_size=32)

In [None]:
# Per-epoch training and validation loss on the same axes.
train_loss = history.history['loss']
val_loss = history.history['val_loss']

for curve, legend_label in ((train_loss, 'Training Loss'), (val_loss, 'Validation Loss')):
    plt.plot(curve, label=legend_label)

plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:

Test_path_pos = "../test_data/positives/test_female_positives"
Test_path_neg = "../test_data/negatives/test_female_negatives"
test_pos = []
test_neg = []

# Extract normalised log-mel features for the positive folder, then the
# negative one, using the same settings as the training data.
for clip_dir, features in ((Test_path_pos, test_pos), (Test_path_neg, test_neg)):
    for clip_name in os.listdir(clip_dir):
        audio, fs = librosa.load(os.path.join(clip_dir, clip_name), sr=16000)

        mel_power = librosa.feature.melspectrogram(
            y=audio, sr=sample_rate, n_fft=512, hop_length=160, n_mels=48, fmax=8000)
        # Transpose so that each feature matrix is (frames, n_mels).
        mel_db = librosa.power_to_db(mel_power, ref=np.max).T
        mel_min = np.min(mel_db)
        mel_max = np.max(mel_db)
        features.append((mel_db - mel_min) / (mel_max - mel_min))
   

In [None]:
# Show the stacked shape of both test splits before converting them to arrays.
for split in (test_pos, test_neg):
    print(np.shape(split))

test_pos = np.squeeze(test_pos)
test_neg = np.squeeze(test_neg)

# Class 1 = keyword, class 0 = background.
test_pos_labels = np.ones(test_pos.shape[0])
test_neg_labels = np.zeros(test_neg.shape[0])

# Positives first, then negatives; labels are one-hot encoded for the softmax.
X_test = np.concatenate([test_pos, test_neg])
y_test = tf.keras.utils.to_categorical(
    np.concatenate([test_pos_labels, test_neg_labels]), 2)

In [None]:

# Not used below; the import is kept only so the name stays available.
from sklearn.metrics import precision_recall_curve

model.evaluate(X_test, y_test, batch_size=32)
hypothesis = model.predict(X_test, batch_size=32)

# Column 1 of the one-hot labels and of the softmax output is the keyword class.
y_true = y_test[:, 1]
# Hard decision: round the keyword probability to 0 or 1.
y_pred = np.round(hypothesis[:, 1])

# labels=[0, 1] guarantees a 2x2 matrix even if one class never occurs in
# y_true / y_pred; ravel() then yields the counts in the order tn, fp, fn, tp.
CM = confusion_matrix(y_true, y_pred, labels=[0, 1])
TN, FP, FN, TP = CM.ravel()

# Report 0.0 instead of dividing by zero when a denominator is empty
# (no predicted positives, no actual positives, or both scores zero).
Precision = TP/(TP+FP) if (TP+FP) else 0.0
Recall = TP/(TP+FN) if (TP+FN) else 0.0
F1_score = 2*((Precision*Recall)/(Precision+Recall)) if (Precision+Recall) else 0.0
print(f"true negatives: {TN}, false negatives: {FN}, true positives {TP}, false positives {FP}")
print(f" Precision: {Precision}")
print(f" Recall: {Recall}")
print(f" F1 score: {F1_score}")

In [None]:
Test_path_pos = "../test_data/positives/test_male_positives"
Test_path_neg = "../test_data/negatives/test_male_negatives"
test_pos = []
test_neg = []

# Extract normalised log-mel features for the positive folder, then the
# negative one, using the same settings as the training data.
for clip_dir, features in ((Test_path_pos, test_pos), (Test_path_neg, test_neg)):
    for clip_name in os.listdir(clip_dir):
        audio, fs = librosa.load(os.path.join(clip_dir, clip_name), sr=16000)

        mel_power = librosa.feature.melspectrogram(
            y=audio, sr=sample_rate, n_fft=512, hop_length=160, n_mels=48, fmax=8000)
        # Transpose so that each feature matrix is (frames, n_mels).
        mel_db = librosa.power_to_db(mel_power, ref=np.max).T
        mel_min = np.min(mel_db)
        mel_max = np.max(mel_db)
        features.append((mel_db - mel_min) / (mel_max - mel_min))

In [None]:
# Show the stacked shape of both test splits before converting them to arrays.
for split in (test_pos, test_neg):
    print(np.shape(split))

test_pos = np.squeeze(test_pos)
test_neg = np.squeeze(test_neg)

# Class 1 = keyword, class 0 = background.
test_pos_labels = np.ones(test_pos.shape[0])
test_neg_labels = np.zeros(test_neg.shape[0])

# Positives first, then negatives; labels are one-hot encoded for the softmax.
X_test = np.concatenate([test_pos, test_neg])
y_test = tf.keras.utils.to_categorical(
    np.concatenate([test_pos_labels, test_neg_labels]), 2)

In [None]:
# Not used below; the import is kept only so the name stays available.
from sklearn.metrics import precision_recall_curve

model.evaluate(X_test, y_test, batch_size=32)
hypothesis = model.predict(X_test, batch_size=32)

# Column 1 of the one-hot labels and of the softmax output is the keyword class.
y_true = y_test[:, 1]
# Hard decision: round the keyword probability to 0 or 1.
y_pred = np.round(hypothesis[:, 1])

# labels=[0, 1] guarantees a 2x2 matrix even if one class never occurs in
# y_true / y_pred; ravel() then yields the counts in the order tn, fp, fn, tp.
CM = confusion_matrix(y_true, y_pred, labels=[0, 1])
TN, FP, FN, TP = CM.ravel()

# Report 0.0 instead of dividing by zero when a denominator is empty
# (no predicted positives, no actual positives, or both scores zero).
Precision = TP/(TP+FP) if (TP+FP) else 0.0
Recall = TP/(TP+FN) if (TP+FN) else 0.0
F1_score = 2*((Precision*Recall)/(Precision+Recall)) if (Precision+Recall) else 0.0
print(f"true negatives: {TN}, false negatives: {FN}, true positives {TP}, false positives {FP}")
print(f" Precision: {Precision}")
print(f" Recall: {Recall}")
print(f" F1 score: {F1_score}")