In [1]:
## check availability of GPU (nvidia-smi)
import os
os.environ["CUDA_VISIBLE_DEVICES"]="1"

from tensorflow.keras import mixed_precision
mixed_precision.set_global_policy('mixed_float16')

import warnings
warnings.filterwarnings("ignore")

import os 
import glob

current_dir = "/mnt/akoustos/"

data_dir = current_dir + "Data/"
labeled_data_dir = data_dir + 'Labeled_Data/'
audio_dir = data_dir + "Raw_Audio/"

spectrogram_dir = data_dir + "Extracted_Spectrogram/"
if not os.path.exists(spectrogram_dir):
    os.makedirs(spectrogram_dir)

INFO:tensorflow:Mixed precision compatibility check (mixed_float16): OK
Your GPU will likely run quickly with dtype policy mixed_float16 as it has compute capability of at least 7.0. Your GPU: Tesla V100-SXM2-32GB, compute capability 7.0


### module: Upload audio data

In [2]:
from src.load_data import Load_Data
audio_filenames = Load_Data.audio_filenames(directory = audio_dir)

Number of audio files: 37414
Time spent to upload audio files:  17.348192 seconds


### module: Upload labeled data

In [3]:
from src.load_data import Load_Data
labeled_data = Load_Data.labeled_data(labeled_data_dir = labeled_data_dir, audio_dir = audio_dir)

   Category  Count  Percentage
0        14    500       21.83
1        25    500       21.83
2      2662    407       17.77
3      4949    501       21.88
4      5620    382       16.68
Time spent to upload labeled data:  0.161514 seconds


### module: Sound Event Detection 
#### this module needs "audio_filenames" and "labeled_data" from previous two modules

In [9]:
from datetime import datetime
import glob
import pandas as pd
import numpy as np
import os
from statistics import median
import cv2
import warnings
from math import ceil, floor
from joblib import Parallel, delayed
import multiprocessing
from multiprocessing import Pool

from src.audio import Audio
from src.preprocessing import speechproc
from src.preprocessing import spectrogating
from copy import deepcopy
from scipy.signal import lfilter
    

def sound_event_detection_for_single_audio_file(annotation_base_audio_filename):
    df = pd.DataFrame(columns = list(labeled_data))
    matching_audio_filename = [audio_filename for audio_filename in audio_filenames if os.path.basename(audio_filename) == annotation_base_audio_filename]
    audio = Audio.load(matching_audio_filename.pop())
    
    ## sound event detection
    noise = audio.samples[0:1*audio.sample_rate]
    x_dn = spectrogating.removeNoise(audio_clip=audio.samples, 
                                     noise_clip=noise,
                                     n_grad_freq=2,
                                     n_grad_time=4,
                                     n_fft=2048,
                                     win_length=2048,
                                     hop_length=512,
                                     n_std_thresh=2.5,
                                     prop_decrease=1.0,
                                     verbose=False,
                                     visual=False)

    winlen, ovrlen, pre_coef, nfilter, nftt = 0.025, 0.01, 0.97, 20, 2048
    ftThres = 0.4
    vadThres = 0.2
    opts = 1
    ft, flen, fsh10, nfr10 = speechproc.sflux(x_dn, audio.sample_rate, winlen, ovrlen, nftt)
    # --spectral flatness --
    pv01 = np.zeros(nfr10)
    pv01[np.less_equal(ft, ftThres)] = 1 
    pitch = deepcopy(ft)
    pvblk = speechproc.pitchblockdetect(pv01, pitch, nfr10, opts)
    # --filtering--
    ENERGYFLOOR = np.exp(-50)
    b = np.array([0.9770,   -0.9770])
    a = np.array([0.3,   -0.3])
    fdata = lfilter(b, a, x_dn, axis=0)
    vad_seg = speechproc.snre_vad(fdata, nfr10, flen, fsh10, ENERGYFLOOR, pv01, pvblk, vadThres)  
    no_events_starttime = [0] + [i / len(vad_seg) * audio.duration() for i in range(len(vad_seg)) if vad_seg[i] == 0 and vad_seg[i-1] == 1]
    no_events_endtime = [i / len(vad_seg) * audio.duration() for i in range(len(vad_seg)) if vad_seg[i] == 1 and vad_seg[i-1] == 0] + [audio.duration()]
    for start, end in zip(no_events_starttime, no_events_endtime):
        new_row = {'Begin Time (s)': start,
                   'End Time (s)': end,
                   'Low Freq (Hz)': 0,
                   'High Freq (Hz)': 0,
                   'Begin File': annotation_base_audio_filename,
                   'Category': 'No Sound Event'}
        df = df.append(new_row, ignore_index=True)
    return df

begin = datetime.now()

annotation_base_audio_filenames = list(labeled_data['Begin File'].unique())
num_cores = multiprocessing.cpu_count()
with Pool(processes=num_cores) as pool:
    df_list = pool.map(sound_event_detection_for_single_audio_file, annotation_base_audio_filenames)
    no_sound_event_data = pd.concat(df_list, ignore_index=True)

no_sound_event_data['duration'] = no_sound_event_data['End Time (s)'] - no_sound_event_data['Begin Time (s)'] 
no_sound_event_data = no_sound_event_data.sort_values(by='duration',ascending=False)[: len(labeled_data) // len(labeled_data.Category.unique())]
no_sound_event_data = no_sound_event_data.drop(['duration'], axis = 1)
 

labeled_data = labeled_data.append(no_sound_event_data, ignore_index=True)
summary = labeled_data.groupby(['Category']).size().reset_index(name='Count')
summary['Percentage'] = round(100 * summary['Count']  / summary['Count'].sum(), 2)
print(summary)
        
end = datetime.now()
print('Time spent to preprocess data: ', (end - begin).total_seconds(), 'seconds')
        

         Category  Count  Percentage
0              14    500       17.44
1              25    500       17.44
2            2662    407       14.20
3            4949    501       17.47
4            5620    382       13.32
5  No Sound Event    577       20.13
Time spent to preprocess data:  800.842548 seconds


### module: Generate spectrograms

In [10]:
from src.spectrogram import Spectrogram
Spectrogram.clear_space(spectrogram_dir)  ## if don't want to keep previously generated spectrograms, clear space
Spectrogram.generate_spectrograms_parallel(spectrogram_duration = 4, 
                                           labeled_data = labeled_data, 
                                           audio_filenames = audio_filenames, 
                                           save_to_dir = spectrogram_dir,
                                           axis=False, 
                                           sr = 22050, 
                                           hop_length=512, 
                                           fmin=None, 
                                           fmax=None, 
                                           x_axis='time', 
                                           y_axis='linear', 
                                           cmap ='viridis')

Time spent to generate spectrograms with parallelization:  218.535261 seconds
Total number of spectrograms produced: 2821


In [11]:
from src.load_data import Load_Data
specctrogram_vector, spectrogram_filenames = Load_Data.load_spectrograms(directory=spectrogram_dir)

number of valid spectrograms: 2821
shape of vector for valid spectrograms: (224, 224, 3)
Time spent to load spectrograms as array:  417.726772 seconds
