In [25]:
#import all the libraries

import os
import numpy as np 
import pandas as pd
import librosa
import json
import pickle
from sklearn.model_selection import train_test_split
import random
import tensorflow as tf
import json
from sklearn.model_selection import train_test_split
import tensorflow.keras as keras
import matplotlib.pyplot as plt

In [26]:
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Next, we load the csv file as a pandas dataframe. This csv file stores the information about the class labels and class ID's. The data is stored in 10 folders fold1 to fold10. We need to use the dataframe to access audio files for pre-processing.


In [27]:
df = pd.read_csv('/content/drive/MyDrive/urbansound/UrbanSound8K.csv')

df.head()

Unnamed: 0,slice_file_name,fsID,start,end,salience,fold,classID,class
0,100032-3-0-0.wav,100032,0.0,0.317551,1,5,3,dog_bark
1,100263-2-0-117.wav,100263,58.5,62.5,1,5,2,children_playing
2,100263-2-0-121.wav,100263,60.5,64.5,1,5,2,children_playing
3,100263-2-0-126.wav,100263,63.0,67.0,1,5,2,children_playing
4,100263-2-0-137.wav,100263,68.5,72.5,1,5,2,children_playing


In [28]:
df[['class','classID']].value_counts()

class             classID
street_music      9          1000
jackhammer        7          1000
engine_idling     5          1000
drilling          4          1000
dog_bark          3          1000
children_playing  2          1000
air_conditioner   0          1000
siren             8           929
car_horn          1           429
gun_shot          6           374
dtype: int64

The classId column in the dataframe assigns numerical labels to each class. However, we want to assign one hot encoded labels. We can make use of np.eye() method to generate a matrix of one hot encoded labels


In [29]:
class_set = set(df['class'].tolist())       #converting the list to set gives us all the class labels only onceP

one_hot_labels = np.eye(len(class_set))

print(class_set)
print()
print(one_hot_labels)

{'car_horn', 'dog_bark', 'gun_shot', 'street_music', 'siren', 'air_conditioner', 'jackhammer', 'engine_idling', 'children_playing', 'drilling'}

[[1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]]


Next, we map all the class labels to their corresponding one hot encoded label


In [30]:
mappings = {}

for key, value in zip(class_set, one_hot_labels):
    mappings[key] = value

In [31]:
mappings

{'air_conditioner': array([0., 0., 0., 0., 0., 1., 0., 0., 0., 0.]),
 'car_horn': array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 'children_playing': array([0., 0., 0., 0., 0., 0., 0., 0., 1., 0.]),
 'dog_bark': array([0., 1., 0., 0., 0., 0., 0., 0., 0., 0.]),
 'drilling': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]),
 'engine_idling': array([0., 0., 0., 0., 0., 0., 0., 1., 0., 0.]),
 'gun_shot': array([0., 0., 1., 0., 0., 0., 0., 0., 0., 0.]),
 'jackhammer': array([0., 0., 0., 0., 0., 0., 1., 0., 0., 0.]),
 'siren': array([0., 0., 0., 0., 1., 0., 0., 0., 0., 0.]),
 'street_music': array([0., 0., 0., 1., 0., 0., 0., 0., 0., 0.])}

We define functions to extract frequency domain audio data features: Mfccs, stfts and melspectrograms


In [32]:
def mfccs_extractor(signal):
    
    NUM_MFCC = 13
    N_FFT = 2048
    HOP_LENGTH = 512
    SAMPLING_RATE = 22500
    

    mfcc = librosa.feature.mfcc(signal,sr=SAMPLING_RATE, n_mfcc=NUM_MFCC, n_fft=N_FFT, hop_length=HOP_LENGTH)
    mfcc = mfcc.T
    
    return mfcc

def stft_extractor(signal):

    N_FFT = 2048
    HOP_LENGTH = 512
    SAMPLING_RATE = 22500

    stft = librosa.stft(signal, n_fft=N_FFT, hop_length=HOP_LENGTH, window='hann', center=True, pad_mode='reflect')
    stft = stft.T
    
    return stft


def mels_extractor(signal):

    N_FFT = 2048
    HOP_LENGTH = 512
    SAMPLING_RATE = 22500
  
  
    mels = librosa.feature.melspectrogram(y=signal, sr=SAMPLING_RATE, S=None, n_fft=N_FFT, hop_length=HOP_LENGTH)
    mels = mels.T
    
    return mels

Next, we load a random audio file and check the shapes of tensors for each of our features ( mfccs, stfts and mels) for an audio of duration 4 secs
Here, the sampling rate is 22500 which mean 1 secs of audio contains 22500 samples. Which means 4 secs of audio contains 22500*4 = 90000 samples. The hop length is 512. Thus each feature is calculated on a segment of 'hop length' of 512 samples. Thus we have 90,000/512 ~ 176 feature vectors for each audio sample. Mfcc's have 13 features for each audio segment (of length 512 samples), stfts have 1025 and mels have 128 features for each segment


In [55]:
NUM_MFCC = 13
N_FFT = 2048
HOP_LENGTH = 512
SAMPLING_RATE = 22500

signal, sr = librosa.load('/content/drive/MyDrive/urbansound/fold2/100652-3-0-0.wav',sr=SAMPLING_RATE)

mfcc = librosa.feature.mfcc(signal,sr=SAMPLING_RATE, n_mfcc=NUM_MFCC, n_fft=N_FFT, hop_length=HOP_LENGTH)
mfcc = mfcc.T

stft = librosa.stft(signal, n_fft=N_FFT, hop_length=HOP_LENGTH, window='hann', center=True, pad_mode='reflect')
stft = stft.T

mels = librosa.feature.melspectrogram(y=signal, sr=SAMPLING_RATE, S=None, n_fft=N_FFT, hop_length=HOP_LENGTH)
mels = mels.T

print("Shape of mfcc tensor for each audio file of 4 secs duration : ", (len(mfcc), len(mfcc.T)))
print("Shape of stft tensor for each audio file of 4 secs duration : ",(len(stft), len(stft.T)))
print("Shape of mels tensor for each audio file of 4 secs duration : ",(len(mels), len(mels.T)))

Shape of mfcc tensor for each audio file of 4 secs duration :  (176, 13)
Shape of stft tensor for each audio file of 4 secs duration :  (176, 1025)
Shape of mels tensor for each audio file of 4 secs duration :  (176, 128)


Next, we create a function to itererate through all the audio files in all the directories and extract features(mfccs, stft's and mel spectrograms). We also divide each audio file into 4 segments to increase the number of input samples and to also ensure that each sample is of same size. We discard samples less than 1 sec in duration (1 sec corresponds to 22500 samples). All the features are then mapped to their corresponding labels in a sictionary which is then saved as a pkl file to be used later

In [None]:
def preprocess_data(dirpath, df):

      """ We divide each audio file into 4 segments of 1 sec duration.
          1 secs of audio has 22500 samples --> 4 secs of audio has 4*22500=90000 samples
          The number of segments =4, therefore each segment is of 1 sec duration and has 22500 samples
          first iteration start-->0*22500 =0, end--> 22500, 
          second iteration start--> 22500*1=22500, end= 22500 +22500=4500
          third iteration start--> 22500*2=45000, end= 45000 +22500=67500 """
        
    
    data = {
        "semantic_label": [],
        "labels" : [],
        "mfccs" : [],
         "mels" : [],
         "stft" : []   
         }
    
    NUM_OF_SEGMENTS =4
    
    for i in range(len(df)):
        filepath = os.path.join(dirpath, 'fold'+ str(df['fold'][i]), df['slice_file_name'][i])
        
        signal, sr = librosa.load(filepath, sr = 22500, res_type='kaiser_fast')
        
        NUM_OF_FRAMES_IN_SEGMENT = int(len(signal)/NUM_OF_SEGMENTS) 
      
        for num in range(NUM_OF_SEGMENTS):                    
                                                                      
            start = NUM_OF_FRAMES_IN_SEGMENT*num
            end = start + NUM_OF_FRAMES_IN_SEGMENT

            # we slice the signal into segment
            segment = signal[start:end]   

            # do not consider samples of duaration less than 1 secs
            if len(segment)==22500:                                     
                
                mfcc = mfccs_extractor(segment)
                mels = mels_extractor(segment)
                stft = stft_extractor(segment)

                semanticlabel = df['class'][i]

                class_label = mappings[semanticlabel]

                # We map features to their corresponding labels in the dictionary by appending them through a list

                data["mfccs"].append(mfcc)                             
                data["mels"].append(mels)
                data["stft"].append(stft)
                data["semantic_label"].append(semanticlabel)
                data["labels"].append(class_label)
                

    return data


def save_data(data):
    
    with open('pre_processed_data.pkl', 'wb') as output:
        pickle.dump(data, output)
  


In [None]:
def load_data(filepath):
    with open(filepath, 'rb') as pkl_file:
        dataset = pickle.load(pkl_file)
    
    
    return dataset

In [None]:
if __name__ == '__main__':
    
    dirpath = './'
    df = pd.read_csv('./UrbanSound8K.csv')
    
    dataset = preprocess_data(dirpath, df)
    
    save_data(dataset)
    
    

