In [1]:
import os
import random

import librosa
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from audiomentations import Compose, AddGaussianNoise, TimeStretch, PitchShift, Shift, Gain
from sklearn.preprocessing import MinMaxScaler

# Seed every random number generator the notebook relies on so that a fresh
# Restart & Run All reproduces the same augmented dataset:
#   - NumPy's global generator
#   - Python's `random` module, which audiomentations uses to draw its
#     transform parameters (stretch rate, shift amount, semitones, ...)
SEED = 27
np.random.seed(SEED)
random.seed(SEED)

## Configuration and mel-spectrogram helpers

In [2]:
# Audio loading and mel-spectrogram settings used throughout the notebook.
SAMPLING_RATE = 16_000  # audio sampling rate, in Hz
TIME_DURATION = 5.0     # length of audio kept per clip, in seconds
N_FFT = 1024            # FFT window length, in samples
HOP_LENGTH = 512        # hop between successive frames, in samples
N_MEL = 128             # number of mel bands
NORMALIZE = True        # min-max normalize each spectrogram when True

def get_num_samples(len_raw_audio):
    """
    Number of spectrogram frames to keep for a waveform of the given length.

    Despite the name, the result counts hop-sized frames rather than audio
    samples: it is len_raw_audio / HOP_LENGTH truncated to an integer.

    Parameters:
        len_raw_audio: int
            number of samples in the raw audio

    Returns:
        int
            len_raw_audio divided by HOP_LENGTH, truncated to an integer
    """
    return int(len_raw_audio / HOP_LENGTH)

def get_mel_spectrogram(raw_audio):
    """
    Compute a log-mel spectrogram (in dB) from a raw waveform.

    The analysis settings are not arguments: they are read from the
    module-level constants SAMPLING_RATE, N_MEL, N_FFT and HOP_LENGTH, and
    normalization is switched by NORMALIZE.

    Parameters:
        raw_audio: numpy array
            raw audio data (1-D waveform, assumed to be sampled at SAMPLING_RATE)

    Returns:
        melspectrogram_db: numpy array
            array with N_MEL rows and get_num_samples(len(raw_audio)) columns
            (time frames). Values are decibels relative to the spectrogram's
            peak or, when NORMALIZE is True, those values min-max scaled to
            [-1, 1] separately for every time frame.
    """
    melspectrogram = librosa.feature.melspectrogram(
        y=raw_audio,
        sr=SAMPLING_RATE,
        n_mels=N_MEL,
        n_fft=N_FFT,
        hop_length=HOP_LENGTH,
    )
    # convert the power spectrogram to decibel units (log-mel spectrogram),
    # referenced to its loudest bin so that the maximum value is 0 dB
    melspectrogram_db = librosa.power_to_db(melspectrogram, ref=np.max)

    # Force the time axis to the target frame count. fix_length is a no-op when
    # the length already matches, so no explicit length check is needed. With
    # librosa's default centered framing the spectrogram normally has one frame
    # more than the target, so this trims the trailing frame. Padding is only a
    # safeguard; it uses the quietest value present, because 0 dB (the default
    # pad value) is the loudest value on this scale.
    target_frames = get_num_samples(len(raw_audio))
    melspectrogram_db = librosa.util.fix_length(
        melspectrogram_db,
        size=target_frames,
        axis=1,
        constant_values=melspectrogram_db.min(),
    )

    if NORMALIZE:
        # NOTE(review): MinMaxScaler scales every column independently, and the
        # columns here are time frames. Each frame is therefore stretched to
        # [-1, 1] on its own, which discards loudness differences between
        # frames. Confirm this is intended; scaling with one global min/max
        # would preserve them.
        scaler = MinMaxScaler(feature_range=(-1, 1))
        melspectrogram_db = scaler.fit_transform(melspectrogram_db)
    return melspectrogram_db

## Pre-processing

- Extract data
- Augment data
- Save data

#### Metadata df

In [3]:
# Locations of the ESC-50 metadata CSV and of the audio clips.
metadata_path = 'ESC-50-master/meta/esc50.csv'
audio_path = 'ESC-50-master/audio/'

# Full ESC-50 metadata as a pandas DataFrame.
metadata = pd.read_csv(metadata_path)

# Names of the categories that have clips flagged as ESC-10.
esc10_cat = metadata[metadata['esc10'] == True]['category'].unique()

# Lookup table with one row per ESC-10 category and its corresponding ESC-50
# target label. It gets its own name so that re-running this cell cannot
# overwrite the per-clip `esc10_metadata` frame (built in a later cell) that
# the extraction loop iterates over.
esc10_categories = (
    metadata.loc[metadata['category'].isin(esc10_cat), ['category', 'target']]
    .drop_duplicates()
)
esc10_categories

Unnamed: 0,category,target
0,dog,0
24,chainsaw,41
54,crackling_fire,12
55,helicopter,40
62,rain,10
78,crying_baby,20
110,clock_tick,38
136,sneezing,21
141,rooster,1
148,sea_waves,11


In [4]:
# ESC-50 target label -> ESC-10 class index in the range 0..9
# (0 -> 0, 10 -> 1, 11 -> 2, 20 -> 3, 38 -> 4, 21 -> 5, 40 -> 6, 41 -> 7, 1 -> 8, 12 -> 9)
map_dict = {
    0: 0, 10: 1, 11: 2, 20: 3, 38: 4,
    21: 5, 40: 6, 41: 7, 1: 8, 12: 9,
}

# Per-clip table: keep the rows flagged esc10 and only the columns needed
# downstream, renumber the index from 0, and remap `target` through map_dict.
is_esc10 = metadata['esc10'] == True
esc10_metadata = (
    metadata.loc[is_esc10, ['filename', 'fold', 'target']]
    .reset_index(drop=True)
    .assign(target=lambda clips: clips['target'].map(map_dict))
)
esc10_metadata.head()

Unnamed: 0,filename,fold,target
0,1-100032-A-0.wav,1,0
1,1-110389-A-0.wav,1,0
2,1-116765-A-41.wav,1,7
3,1-17150-A-12.wav,1,9
4,1-172649-A-40.wav,1,6


#### Data augmentation

In [5]:
# Each clip is augmented once per entry in `augmentations`, using a single,
# different transform each time. p=1 means the transform is always applied.

# change the speed by a factor between 0.8 and 1.2
time_stretch = TimeStretch(min_rate=0.8, max_rate=1.2, p=1)
# shift in time by up to 10% of the total length, i.e. +- 0.5 s for a 5 s clip
shift = Shift(min_shift=-0.1, max_shift=0.1, p=1, shift_unit='fraction')
# change the pitch by up to 4 semitones in either direction
pitch_shift = PitchShift(min_semitones=-4, max_semitones=4, p=1)

augmentations = [time_stretch, shift, pitch_shift]

In [6]:
# Extract a mel spectrogram for every ESC-10 clip and for one augmented copy
# per transform in `augmentations`, then save everything as a pickled DataFrame.

# delete the output file if it already exists
if os.path.exists('esc10.pkl'):
    os.remove('esc10.pkl')

# one [mel_spectrogram, target, fold, original] record per (clip, variant)
features_esc10 = []

for index, row in esc10_metadata.iterrows():
    # label and fold belong to the clip, so read them once instead of once per variant
    label = row['target']
    fold = row['fold']

    # load at most TIME_DURATION seconds, resampled to SAMPLING_RATE;
    # os.path.join keeps this working whether or not audio_path ends with a separator
    audio, sr = librosa.load(os.path.join(audio_path, row['filename']),
                             sr=SAMPLING_RATE, duration=TIME_DURATION)

    # the untouched clip comes first and is flagged with original=True
    features_esc10.append([get_mel_spectrogram(audio), label, fold, True])

    # then one augmented copy per transform, each derived from the original
    # audio (not from a previously augmented copy), flagged with original=False
    for augment in augmentations:
        augmented_audio = augment(audio, sample_rate=SAMPLING_RATE)
        features_esc10.append([get_mel_spectrogram(augmented_audio), label, fold, False])

    # progress report, keyed on the row index
    if index % 100 == 0 and index != 0:
        print(f'Processed {index} files')

# build the DataFrame once from the collected records
esc10_mel = pd.DataFrame(features_esc10, columns=['mel_spectrogram', 'target', 'fold', 'original'])
# save the dataframe
esc10_mel.to_pickle('esc10.pkl')

Processed 100 files
Processed 200 files
Processed 300 files
