# Anomaly Detection with Autoencoders

In [None]:
import librosa 
from scipy.io import wavfile as wav
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
import librosa

### Extract an MFCC for each cough audio file in the dataset and store it in a Panda Dataframe along with it's class label 

In [None]:
def extract_mfccs(file_name):
   
    try:
        audio, sample_rate = librosa.load(file_name, res_type='kaiser_fast') 
        mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
        mfccsnorm = np.mean(mfccs.T,axis=0)
        
    except Exception as e:
        print("file could not be loaded: ", file_name)
        return None 
     
    return mfccsnorm

In [None]:
# Set the path to the barks and not_barks datasets
bark_folder = 'barks_10s'
not_bark_folder = 'not_barks'
bark_path = os.path.join(os.getcwd(), bark_folder)
not_bark_path = os.path.join(os.getcwd(), not_bark_folder)

csv_file = 'barks.csv'
metadata = pd.read_csv(os.path.join(os.getcwd(),csv_file))

features = []

# Iterate through each sound file and extract the features 
for index, row in metadata.iterrows():
    
    class_label = row["class_name"]
    
    if class_label == 'bark':
        file_name = os.path.join(os.path.abspath(bark_path),str(row["file_name"]))
    else:
        file_name = os.path.join(os.path.abspath(not_bark_path),str(row["file_name"]))
    
    data = extract_mfccs(file_name)
    
    features.append([data, class_label])

# Convert into a Panda dataframe 
featuresdataframe = pd.DataFrame(features, columns=['feature','class_label'])

print('features extracted from ', len(featuresdataframe), ' files') 

In [None]:
print(featuresdataframe)

In [None]:
barks_dataframe = featuresdataframe[featuresdataframe['class_label'] == 'bark']
not_barks_dataframe = featuresdataframe[featuresdataframe['class_label'] == 'not_bark']

### Convert the categorical text data into model-understandable numerical data. 

In [None]:
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical

# Convert features and Class labels into numpy arrays
X = np.array(barks_dataframe.feature.tolist())
y = np.array(barks_dataframe.class_label.tolist())

# Encode the classification labels
labelen = LabelEncoder()
classen = to_categorical(labelen.fit_transform(y)) 

### Split the dataset into training and testing sets. The testing set size will be 10% and we will set a random state. 


In [None]:
# split the dataset 
from sklearn.model_selection import train_test_split 

x_train, x_test, y_train, y_test = train_test_split(X, classen, test_size=0.1, random_state = 42)

In [None]:
print(f"train count: {len(x_train)}")

In [None]:
input_dim = x_train.shape[1]
print(input_dim)

In [None]:
print(X.shape[1])

### Create the Autoencoder Model with L1 Sparsity Penalty as 1e-5 and Loss function as MSE

In [None]:
from sklearn import metrics
import numpy as np
import pandas as pd
from IPython.display import display, HTML 
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.regularizers import l1

model = Sequential()
model.add(Dense(512, input_dim=X.shape[1], activity_regularizer=l1(0.00001), activation='relu'))
model.add(Dense(224, activation='relu'))
model.add(Dense(512, activation='relu'))
model.add(Dense(X.shape[1])) # Multiple output neurons

model.summary()

In [None]:
model.compile(loss='mse', optimizer='adam', metrics=['accuracy'])

In [None]:
history=model.fit(x_train,x_train,verbose=1,epochs=90)

In [None]:
plt.plot(history.history['loss'], label='train loss')
#plt.plot(history.history['accuracy'], label='Accuracy')
plt.legend()
plt.show()

### Calculate the MSE for the Test Dataset (out of sample) and on the Whole Dataset (In + Out Sample)

In [106]:
def compute_acc(obs, preds):
    count = 0

    for prediction,observation in zip(preds,obs):
        loss = np.sqrt(metrics.mean_squared_error(prediction,observation))
        if loss < 1: count += 1 

    accuracy = count / len(x_test)
    
    return accuracy

In [108]:
pred = model.predict(x_test)
score1 = np.sqrt(metrics.mean_squared_error(pred,x_test))
accuracy1 = compute_acc(x_test,pred)
pred = model.predict(X)
score2 = np.sqrt(metrics.mean_squared_error(pred,X))
accuracy2 = compute_acc(X,pred)

print(f"Out of Sample Score (RMSE): {score1}")
print(f"Out of Sample accuracy: {accuracy1}")
print(f"Insample Normal Score (RMSE): {score2}")
print(f"Insample accuracy): {accuracy2}")

Out of Sample Score (RMSE): 0.935054361820221
Out of Sample accuracy: 0.6666666666666666
Insample Normal Score (RMSE): 0.9366835355758667
Insample accuracy): 6.583333333333333


### Use the model to predict the MSE for validation set 

In [None]:
#print(not_barks_dataframe.feature[0:2])
#print(np.array(not_barks_dataframe.feature[0:2])) 
#print(np.array(not_barks_dataframe.feature.tolist()[0:2]))

In [107]:
pred = model.predict(X_not_barks)
print(compute_acc(X_not_barks, pred))

0.0


In [93]:
X_not_barks = np.array(not_barks_dataframe.feature.tolist())
pred = model.predict(X_not_barks)
score3 = np.sqrt(metrics.mean_squared_error(pred,X_not_barks))
print(f"Validation sample (RMSE): {score3}")

(428, 40)
Validation sample (RMSE): 11.18336009979248


### Save the Model

In [None]:
model.save('C:\\Users\\\\Documents\\Cough Detection\\Data\\autoencoder_model_4.h5')