In [2]:
# CNN Algorithm Creation
# MLP Accuracy Obtained Using Nodes: 128, Epochs: 100
# Training: 95.988%, Testing: 90.66%

In [3]:
# Import libraries
import librosa
import numpy as np
import os

In [36]:
# Test MFCC values of Longer Vs. Shorter Samples
# Creating a function that extracts the MFCC features of an audio file
def extract_features(file_name, max_pad_len):
    """Load an audio file and return its MFCC matrix with exactly ``max_pad_len`` frames.

    Parameters
    ----------
    file_name : str
        Path of the audio file; passed unchanged to ``librosa.load``.
    max_pad_len : int
        Number of frames (columns) the returned matrix must have.

    Returns
    -------
    numpy.ndarray or None
        MFCC matrix computed with ``n_mfcc=40`` whose second axis has length
        ``max_pad_len``. Clips with fewer frames are zero-padded on the right;
        clips with more frames are cut off after ``max_pad_len`` frames.
        ``None`` is returned when loading the file or computing its MFCCs fails.
    """
    try:
        # 'kaiser_fast' trades resampling quality for speed
        audio, sample_rate = librosa.load(file_name, res_type='kaiser_fast')
        # n_mfcc is the number of MFCC coefficients to return per frame
        mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
    except Exception as e:
        # Best-effort extraction: report which file failed and why, then let the caller decide
        print("Error encountered while parsing file ", file_name, "-", repr(e))
        return None

    pad_width = max_pad_len - mfccs.shape[1]
    if pad_width < 0:
        # np.pad rejects negative widths, so a clip longer than max_pad_len is truncated instead
        return mfccs[:, :max_pad_len]

    # Zero-pad the frame axis on the right up to max_pad_len
    return np.pad(mfccs, pad_width=((0, 0), (0, pad_width)), mode='constant')

In [39]:
# Load various imports 
import pandas as pd
import os
import librosa

# Set the path to the full UrbanSound dataset
cwd = os.getcwd()
fulldatasetpath = cwd + '/UrbanSound8K/audio/'
# Build the metadata path with os.path.join instead of hand-written doubled separators
metadata = pd.read_csv(os.path.join(cwd, 'UrbanSound8K', 'metadata', 'UrbanSound8K.csv'))
categories = ['dog_bark', 'car_horn', 'gun_shot', 'siren']


def _audio_path(row):
    """Return the absolute path of the audio clip described by one metadata row."""
    return os.path.join(os.path.abspath(fulldatasetpath),
                        'fold' + str(row["fold"]),
                        str(row["slice_file_name"]))


# Keep only the rows whose class is one of the selected categories;
# both passes below iterate over this same subset
selected_rows = metadata[metadata["class_name"].isin(categories)]

lenVars = []

# Pass 1: record the number of MFCC frames of every selected clip
for index, row in selected_rows.iterrows():
    file_name = _audio_path(row)
    audio, sample_rate = librosa.load(file_name, res_type='kaiser_fast')
    mfccs_pre = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
    numFrames = mfccs_pre.shape[1]
    lenVars.append(numFrames)

if not lenVars:
    # max() on an empty list raises an opaque ValueError; say what actually went wrong
    raise ValueError("No metadata rows match the selected categories: " + str(categories))

# Extract max number of frames; every feature matrix is padded to this length
max_pad_length = max(lenVars)
print(max_pad_length)

features = []

# Pass 2: extract the padded MFCC features of every selected clip
for index, row in selected_rows.iterrows():
    file_name = _audio_path(row)
    class_label = row["class_name"]
    data = extract_features(file_name, max_pad_length)
    if data is None:
        # extract_features already reported the failure; a None entry would make
        # the feature column ragged when it is later stacked into one array
        continue
    features.append([data, class_label])

# Convert into a Pandas dataframe
featuresdf = pd.DataFrame(features, columns=['feature','class_label'])

print('Finished feature extraction from ', len(featuresdf), ' files')

173
Finished feature extraction from  2732  files


In [43]:
# Balance the dataframe so that every class_label has the same number of files
# (each class is randomly undersampled to the size of the smallest class).
# This ensures that no one category has an advantage when the model is being trained

print(featuresdf.class_label.count()) # 2732

# Create dictionary of dataframes, one per category
frames = {}
categories = ['dog_bark', 'car_horn', 'gun_shot', 'siren']

arr_Size = []

for label in categories:
    frames[label] = featuresdf[featuresdf['class_label'] == label]
    # Number of rows (files) available for this category
    n_rows = frames[label].shape[0]
    arr_Size.append(n_rows)
    print(label, n_rows)

# Take the minimum size from size array
minSize = min(arr_Size)

# Randomly keep minSize rows of every category.
# A fixed random_state makes the balanced subset identical on every run
for label in frames:
    frames[label] = frames[label].sample(minSize, random_state=42)
    print(frames[label].shape[0])

# Concatenate all per-category dataframes (in the order of `categories`)
result = pd.concat(frames)

# Keep the two data columns and replace the concat index with a plain 0..n-1 index
features_temp = result[["feature", "class_label"]].reset_index(drop=True)
display(features_temp)

2732
dog_bark 1000
car_horn 429
gun_shot 374
siren 929
374
374
374
374


Unnamed: 0,feature,class_label
0,"[[-533.82697, -535.1126, -538.5872, -528.69556...",dog_bark
1,"[[-341.76727, -365.21857, -405.96603, -433.467...",dog_bark
2,"[[-332.11945, -336.3666, -349.95135, -349.5728...",dog_bark
3,"[[-314.78796, -303.8232, -299.5003, -298.3605,...",dog_bark
4,"[[-377.4245, -378.74524, -378.1277, -381.29782...",dog_bark
...,...,...
1491,"[[-299.80365, -298.18814, -302.90424, -301.992...",siren
1492,"[[-330.24762, -336.24966, -349.237, -346.9751,...",siren
1493,"[[-79.5879, -86.098335, -102.86485, -108.29289...",siren
1494,"[[-66.55484, -84.70029, -109.27356, -96.35527,...",siren


In [44]:
# Rebind featuresdf to the frame held in features_temp, so that the cells below
# (which read featuresdf) work on that frame, then display it.
# NOTE: this overwrites the previous featuresdf.
featuresdf = features_temp
display(featuresdf)

Unnamed: 0,feature,class_label
0,"[[-533.82697, -535.1126, -538.5872, -528.69556...",dog_bark
1,"[[-341.76727, -365.21857, -405.96603, -433.467...",dog_bark
2,"[[-332.11945, -336.3666, -349.95135, -349.5728...",dog_bark
3,"[[-314.78796, -303.8232, -299.5003, -298.3605,...",dog_bark
4,"[[-377.4245, -378.74524, -378.1277, -381.29782...",dog_bark
...,...,...
1491,"[[-299.80365, -298.18814, -302.90424, -301.992...",siren
1492,"[[-330.24762, -336.24966, -349.237, -346.9751,...",siren
1493,"[[-79.5879, -86.098335, -102.86485, -108.29289...",siren
1494,"[[-66.55484, -84.70029, -109.27356, -96.35527,...",siren


In [45]:
# Use sklearn.preprocessing.LabelEncoder to encode the categorical text data into model-understandable numerical data

from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical

# Convert features and corresponding classification labels into numpy arrays
X = np.array(featuresdf.feature.tolist())
y = np.array(featuresdf.class_label.tolist())

# Convert the category names into their respective numerical values
le = LabelEncoder()
# fit_transform assigns an integer to every category; to_categorical turns those integers into a one-hot matrix
yy = to_categorical(le.fit_transform(y))

# Split the dataset - 10% test, 90% train
from sklearn.model_selection import train_test_split

# X holds the features, yy the one-hot labels.
# random_state=42 fixes the shuffling seed so the train/test split is the same on every run.
# stratify=y keeps the class proportions of the full dataset in both the train and the test set.
x_train, x_test, y_train, y_test = train_test_split(X, yy, test_size=0.1, random_state=42, stratify=y)

In [46]:
# Persist the split data, the one-hot labels, the fitted label encoder and the
# padding length with IPython's %store magic, so that the next notebook can
# restore them (with `%store -r`) instead of recomputing them
%store x_train
%store x_test
%store y_test
%store y_train
%store yy
%store le
%store max_pad_length

Stored 'x_train' (ndarray)
Stored 'x_test' (ndarray)
Stored 'y_test' (ndarray)
Stored 'y_train' (ndarray)
Stored 'yy' (ndarray)
Stored 'le' (LabelEncoder)
Stored 'max_pad_length' (int)
