## 1. Library imports

In [1]:
from os import listdir
from os.path import isfile, join

from matplotlib import pyplot as plt
import torch
import torch.nn as nn
from torch.utils.data import Dataset
import numpy as np
import pandas as pd

import soundfile as sf
from scipy import signal
import random
import librosa

## 2. Loading OGG files (audio files)

In [2]:
def load_ogg_file(audio, max_ms = 5000, sr = 10989):
    """Resample a decoded audio clip to `sr` Hz and force it to a fixed duration.

    Parameters
    ----------
    audio : tuple
        `(samples, fs)` pair, e.g. the return value of `soundfile.read`.
        `samples` is indexed by time along axis 0 (1-D mono, or 2-D with one
        column per channel) and `fs` is its sampling rate in Hz.
    max_ms : int
        Duration of the returned signal, in milliseconds.
    sr : int
        Target sampling rate in Hz (the ADC rate).

    Returns
    -------
    numpy.ndarray
        Signal with exactly `int(sr * max_ms / 1000)` samples along axis 0.
        Longer clips are truncated; shorter clips are zero-padded, with the
        padding split at random between the beginning and the end.
    """
    x, fs = audio
    x = np.asarray(x)

    # Resample by the exact rate ratio. The previous integer factor `fs // sr`
    # was 0 whenever fs < sr (ZeroDivisionError) and, for non-integer ratios,
    # produced fs / (fs // sr) Hz instead of the requested `sr`.
    n_target = int(round(len(x) * sr / fs))
    if n_target > 0:
        sig = signal.resample(x, n_target)
    else:
        # Nothing to resample: keep an empty signal with the same channel layout.
        sig = np.zeros((0,) + x.shape[1:])

    sig_len = len(sig)
    max_len = int(sr * max_ms/1000)

    if sig_len > max_len:
        sig = sig[:max_len]

    elif sig_len < max_len:
        # Length of padding to add at the beginning and end of the signal
        pad_begin_len = random.randint(0, max_len - sig_len)
        pad_end_len = max_len - sig_len - pad_begin_len

        # Pad with 0s; the trailing dimensions follow `sig` so that
        # multi-channel input is padded along the time axis only.
        channel_shape = sig.shape[1:]
        pad_begin = np.zeros((pad_begin_len,) + channel_shape)
        pad_end = np.zeros((pad_end_len,) + channel_shape)

        sig = np.concatenate((pad_begin, sig, pad_end))

    return sig

## 3. Dataframe with audio resampled

In [3]:
# Sound classes used in this notebook; each one is a sub-folder of Dataset_ESC-50.
soundfiles = [
    '203 - Crackling fire',
    '205 - Chirping birds',
    '501 - Helicopter',
    '502 - Chainsaw',
    '510 - Hand saw',
]
# One row of file paths per class (regular files only, in listdir order).
original_audio = np.array([
    [folder + "/" + entry for entry in listdir(folder) if isfile(join(folder, entry))]
    for folder in ("Dataset_ESC-50/" + label for label in soundfiles)
])
# Transposed so that every column holds the paths of one class.
audio_df = pd.DataFrame(data=original_audio.T, columns=soundfiles)

## 4. Melspectrograms

In [8]:
def melspectrogram(x, Nft = 512, fs_down = 10447, Nmel = 20) :
    """
    Mel spectrogram of `x` computed on non-overlapping Hamming-windowed frames.

    Pre : x is the resampled signal, a numpy array indexed by time on axis 0.
          A 2-D array is averaged over axis 1 before the transform.
          Nft is the frame length (and FFT size), fs_down the sampling rate
          passed to the mel filter bank, Nmel the number of mel bands.
    Post : melspectrogram of x, an array of shape (Nmel, len(x)//Nft).
           Trailing samples that do not fill a whole frame are dropped.
    """
    # NOTE(review): the default fs_down (10447) differs from the 10989 Hz used
    # elsewhere in this notebook -- confirm which rate is intended.

    L = len(x)
    x_crop = x[:L-L%Nft]  # keep a whole number of frames
    x_new = x_crop if len(x.shape)==1 else np.mean(x_crop,axis=1)
    L_new = len(x_new)

    # One frame per row, each multiplied by the same Hamming window.
    audiomat = np.reshape(x_new, (L_new//Nft,Nft))
    audioham = audiomat*np.hamming(Nft)

    stft = np.fft.fft(audioham, axis=1)
    stft = np.abs(stft[:,:Nft//2].T) # Taking only positive frequencies and computing the magnitude

    mels = librosa.filters.mel(sr=fs_down, n_fft=Nft, n_mels=Nmel)
    mels = mels[:,:-1]       # drop the last bin so the bank matches the Nft//2 kept above
    mels = mels/np.max(mels) # scale the filter bank so its largest weight is 1

    melspec = mels@stft

    return melspec

## 5. Creating an entire dataset from audio path

### 5.1 Audio transformations

In [180]:
def time_shift(filename, shift_limit=0.4):
    """Load `filename` and circularly shift it by a random amount.

    The shift is drawn uniformly between 0 and `shift_limit` times the signal
    length; samples pushed past the end wrap around to the beginning.
    """
    samples = load_ogg_file(sf.read(filename))
    offset = int(random.random() * shift_limit * len(samples))
    return np.roll(samples, offset)

In [181]:
def scaling(filename, scaling_limit=5):
    """Load `filename` and multiply it by a random gain drawn uniformly from [0, scaling_limit)."""
    samples = load_ogg_file(sf.read(filename))
    gain = np.random.uniform(0, scaling_limit)
    return gain * samples

In [182]:
def add_noise(filename, sigma=0.05):
    """Load `filename` and add zero-mean Gaussian noise to it.

    Parameters
    ----------
    filename : str
        Path of the audio file to read.
    sigma : float
        Standard deviation of the added noise.

    Returns
    -------
    numpy.ndarray
        The loaded signal plus one independent noise sample per element.
    """
    sig = load_ogg_file(sf.read(filename))
    noise = np.random.normal(loc=0.0, scale=sigma, size=sig.shape)
    # The noise was previously generated but never applied, so the function
    # returned the clean signal unchanged.
    return sig + noise

### 5.2 Dataset matching specifications
- The ADC samples the signal received from the microphone at 10989 Hz
- 20 melvectors are computed with ONLY 10 components (here we have 107)

In [64]:
def shaping_audio(audio, dic, col, nb = 10, sr=10989) :
    """Cut `audio` into `nb` equal consecutive segments and store their melspectrograms.

    Each segment's melspectrogram is appended, in order, to the list `dic[col]`.
    Samples left over after the last full segment are ignored. `sr` is not used.
    """
    segment_len = audio.shape[0] // nb
    for k in range(nb) :
        start = k * segment_len
        stop = start + segment_len
        dic[col].append(melspectrogram(audio[start:stop]))

In [63]:
def data_shape(_df) :
    """Turn a frame of signals into a frame of melspectrograms.

    Every cell of `_df` is passed to `shaping_audio`, which appends that
    signal's melspectrograms to the list of its column. The column name is
    printed when its processing starts. Returns a DataFrame built from the
    resulting {column: list of melspectrograms} dictionary.
    """
    collected = {}
    for label in _df.columns :
        print(label)
        collected[label] = []
        column_signals = _df[label].values
        for clip in column_signals :
            shaping_audio(clip, collected, label)
    return pd.DataFrame.from_dict(collected)

In [206]:
def dataset(paths, labels, transform = False) :
    """Build the melspectrogram dataset: the original clips plus 30 augmentation rounds.

    `paths` holds audio file paths with one column per entry of `labels`.
    The original clips are read, shaped by `load_ogg_file` and converted by
    `data_shape`. Each of the 30 rounds then adds a noisy, a rescaled and a
    time-shifted version of every clip. `transform` is not used.
    Returns all parts stacked row-wise with a fresh index.
    """
    path_df = pd.DataFrame(data = paths, columns = labels) # sound paths
    raw_df = path_df.applymap(lambda p : sf.read(p)) # sounds
    parts = [data_shape(raw_df.applymap(load_ogg_file))] # shaping sounds

    for round_idx in range(30) :
        print("computing dataset {}".format(round_idx))
        # All three augmented frames are built before any of them is converted.
        augmented = [path_df.applymap(fn) for fn in (add_noise, scaling, time_shift)]
        for aug_df in augmented :
            parts.append(data_shape(aug_df))

    return pd.concat(parts, axis = 0, ignore_index=True)

In [207]:
def aug_audio_dataset() :
    """Flatten the augmented melspectrogram dataset into one labelled row per spectrogram.

    Uses the module-level `original_audio` and `soundfiles`. Every
    melspectrogram is reshaped to 200 values and followed by a 'label' column
    holding the position of its class column in the dataset.
    """
    mel_df = dataset(paths = original_audio.T, labels = soundfiles)
    rows = []
    for class_idx, column in enumerate(mel_df.columns) :
        for mel in mel_df[column] :
            row = pd.DataFrame(mel.reshape(200,)).T
            row['label'] = class_idx
            rows.append(row)
    flat_df = pd.concat(rows, ignore_index = True)
    flat_df.iloc[:, -1] = flat_df.iloc[:, -1].map(int)
    return flat_df

In [2]:
#aug_audio_df = aug_audio_dataset()

In [210]:
# Scale every feature column so that its maximum is 1. The last column holds
# the class label and must keep its integer values: dividing it by its maximum
# turned the labels into fractions, which a later int() cast collapses to 0/1.
for i in aug_audio_df.columns[:-1] :
    aug_audio_df[i] /= np.max(aug_audio_df[i])

In [3]:
#aug_audio_df.describe()

## 6. Creating entire dataset from melspectrograms augmentation

In [76]:
pip install audiomentations

Collecting audiomentations
  Downloading audiomentations-0.24.0-py3-none-any.whl (67 kB)
Installing collected packages: audiomentations
Successfully installed audiomentations-0.24.0
Note: you may need to restart the kernel to use updated packages.


In [170]:
# Load dataset.csv. Presumably each row is a flattened melspectrogram
# (200 values) followed by its class label -- verify against the file.
df = pd.read_csv("dataset.csv")

In [20]:
# Take the first row without its last column and reshape it to 20 x 10.
mel = df.iloc[0,:-1].values.reshape(20,10)

In [30]:
from audiomentations import SpecCompose, SpecChannelShuffle, SpecFrequencyMask
import numpy as np

# Augmentation pipeline made of a single SpecFrequencyMask transform.
# p is presumably the probability of applying it -- confirm in the audiomentations docs.
augment = SpecCompose( [SpecFrequencyMask(p=0.5),])

In [31]:
# Augment/transform/perturb the spectrogram `mel` with the `augment` pipeline
augmented_spectrogram = augment(mel)

In [173]:
def aug_spec_dataset(df) :
    """Create 15 frequency-masked copies of a flattened-melspectrogram dataset.

    For every copy, each row (all columns except the last) is reshaped to
    20 x 10, passed through a SpecFrequencyMask built with a freshly drawn
    random `p`, and written back flattened. The copy number and row number are
    printed as progress. The copies are stacked (original row indices are kept)
    and the last column is mapped through int().
    """
    source = df.copy()
    copies = []
    for round_idx in range(15) :
        print(round_idx)
        round_df = source.copy()
        n_rows = round_df.shape[0]
        for row in range(n_rows) :
            print(row)
            spectrogram = round_df.iloc[row, :-1].values.reshape(20,10)
            masker = SpecCompose( [SpecFrequencyMask(p=np.random.random()),])
            round_df.iloc[row, :-1] = masker(spectrogram).reshape(200,)
        copies.append(round_df)

    stacked = pd.concat(copies)
    stacked.iloc[:, -1] = stacked.iloc[:, -1].map(int)
    return stacked

In [1]:
#spec_df = aug_spec_dataset(df)

In [199]:
# Scale every feature column so that its maximum is 1. The last column holds
# the class label and must keep its integer values: dividing it by its maximum
# turned the labels into fractions, which a later int() cast collapses to 0/1.
for i in spec_df.columns[:-1] :
    spec_df[i] /= np.max(spec_df[i])

In [2]:
#spec_df.describe()

In [204]:
# Save spec_df to spec_dataset.csv without writing the row index.
spec_df.to_csv("spec_dataset.csv", index = False)

In [159]:
def concat(df1, df2) :
    """Stack the rows of `df1` and then `df2` into a new DataFrame.

    Rows are copied by position, so the column names and indices of both
    inputs are discarded; the result has default integer labels.
    """
    rows = [
        list(frame.iloc[k, :].values)
        for frame in (df1, df2)
        for k in range(frame.shape[0])
    ]
    return pd.DataFrame(np.array(rows))

In [216]:
# Stack the rows of spec_df and aug_audio_df; concat() resets the column labels to 0..N-1.
# NOTE(review): both inputs are only created in cells that are commented out in
# this notebook -- confirm they exist before running this cell.
final_df = concat(spec_df, aug_audio_df)

In [217]:
# Pass every value of the last column (assumed to be the class label) through int().
final_df.iloc[:, -1] =final_df.iloc[:, -1].map(lambda x : int(x))