### Data Preprocessing:
Here we select overlapping 5-second audio segments from the start and the end of each recording (two from each end when the recording is long enough), assuming that the bird call is most likely to be present at the beginning and end of the audio file.
Then we split the dataset into training, dev, and test sets.

In [1]:

import os

import random
import pandas as pd
from sklearn.model_selection import train_test_split

import numpy as np
import librosa
import soundfile

In [3]:
# List the species folders under train_short_audio and count the audio
# files available for each of them.
path = os.path.join(os.getcwd(), 'train_short_audio')

# Keep only sub-directories: a stray file in `path` would make the
# per-species os.listdir() below raise. Sorted because os.listdir() returns
# entries in arbitrary order.
bird_samples = sorted(
    name for name in os.listdir(path)
    if os.path.isdir(os.path.join(path, name))
)

# (species, number of regular files in its folder)
bird_sample_numbers = [
    (
        name,
        sum(
            os.path.isfile(os.path.join(path, name, file_name))
            for file_name in os.listdir(os.path.join(path, name))
        ),
    )
    for name in bird_samples
]
bird_sample_numbers

[('acafly', 132),
 ('acowoo', 190),
 ('aldfly', 227),
 ('ameavo', 44),
 ('amecro', 229),
 ('amegfi', 181),
 ('amekes', 82)]

In [4]:
class SplitAudio():
    '''Cut up to four fixed-length clips out of one recording and save them.

    Two clips are taken from the start of the recording and two from the
    end. The two clips of each pair overlap by a random fraction drawn
    uniformly from [overlap_min, overlap_max] on every call. Recordings that
    are too short for all four clips yield one or two clips instead (see
    _segment_starts).

    Parameters
    ----------
    sig_path : root folder holding one sub-folder of recordings per bird.
    time_sample_size : clip length in seconds.
    sr : sample rate used both for loading and for writing.
    overlap_min, overlap_max : bounds of the random overlap fraction.
    seed : optional seed for the overlap draw. When None (the default) the
        global `random` module is used, exactly as before.
    '''

    def __init__(self,sig_path,time_sample_size,sr = 32000,overlap_min = 0.05,overlap_max = 0.5,seed = None):
        self.sig_path = sig_path
        self.time_sample_size = time_sample_size
        self.overlap_min = overlap_min
        self.overlap_max = overlap_max
        self.sr = sr
        # A private generator makes the clip positions reproducible on demand
        # without touching the global random state.
        self._rng = random if seed is None else random.Random(seed)

    def _segment_starts(self,total_duration,overlap):
        '''Return the start offsets (in samples) of the clips to extract.

        total_duration is the recording length in samples and overlap the
        fraction by which the two clips of a pair overlap. The first clip
        always starts at 0; the second one is added when the recording is
        longer than two overlapping clips; the two end-aligned clips are
        added when it is longer than two full clips.
        '''
        seg_list = [0]

        if total_duration > (2 - overlap) * self.time_sample_size * self.sr:
            seg_list.append(int(np.ceil((1-overlap)*self.time_sample_size*self.sr)))

        if total_duration > 2*self.time_sample_size*self.sr:
            seg_list.append(int(np.floor(total_duration - ((1 - overlap)*self.time_sample_size + self.time_sample_size)*self.sr)))
            seg_list.append(int(np.floor(total_duration - (self.time_sample_size)*self.sr)))

        return seg_list

    def __call__ (self,save_path,bird,name):
        '''Load <sig_path>/<bird>/<name> and write its clips.

        The clips are written to <save_path>/<stem>/mel_<j>_<stem>.ogg, where
        <stem> is `name` without its extension and <j> counts the clips from
        0. The folder is created when missing. Returns None.
        '''
        x,_ = librosa.load(os.path.join(self.sig_path,bird,name),sr = self.sr)
        total_duration = len(x)
        overlap = self._rng.uniform(self.overlap_min,self.overlap_max)

        # splitext instead of name[:-4]: extensions are not always 3 characters.
        stem = os.path.splitext(name)[0]
        save_path_2 = os.path.join(save_path,stem)
        os.makedirs(save_path_2,exist_ok = True)

        # Slice bounds must be integers even when time_sample_size is a float.
        window = int(round(self.time_sample_size*self.sr))

        for j,s_start in enumerate(self._segment_starts(total_duration,overlap)):
            out = os.path.join(save_path_2,"mel_"+str(j)+"_"+stem+".ogg")
            # A recording shorter than one window produces a shorter clip.
            soundfile.write(out,x[s_start:s_start + window],samplerate = self.sr)
            

#### Generate Audio chunks

In [5]:
# Destination for the extracted clips and source of the raw recordings
# (one sub-folder per species). os.path.join keeps the paths portable.
segmented_audio_path = os.path.join(os.getcwd(), 'train_samples')
sig_path = os.path.join(os.getcwd(), 'train_short_audio')

# NOTE(review): this creates the *input* folder when it is missing, which
# hides a missing dataset; kept for compatibility with the original cell.
os.makedirs(sig_path, exist_ok=True)

time_sample_size = 5  # clip length in seconds
split_audio = SplitAudio(sig_path, time_sample_size)

for bird in bird_samples:

    save_path = os.path.join(segmented_audio_path, bird)
    os.makedirs(save_path, exist_ok=True)

    bird_dir = os.path.join(sig_path, bird)
    # Only regular files are handed to the splitter; sub-folders would make
    # the audio loader fail.
    file_list = [name for name in os.listdir(bird_dir)
                 if os.path.isfile(os.path.join(bird_dir, name))]

    for name in file_list:
        # Writes the clips of this recording into a sub-folder of save_path.
        split_audio(save_path, bird, name)

          

In [33]:
# Clip 'nocall' segments from the train soundscapes. These clips are added
# later for audio augmentation as a source of noise.

sc_list = pd.read_csv('train_soundscape_labels.csv')
# .copy() so the column added below is written to an independent frame
# instead of a slice of the full table (avoids SettingWithCopyWarning).
sc_list = sc_list[sc_list.birds == 'nocall'].copy()
# "<audio_id>_<site>" is the prefix used to match label rows to audio files.
sc_list["fileprefix"] = sc_list["audio_id"].apply(str)+"_"+sc_list["site"].apply(str)

# Folder with the soundscape recordings. NOTE: this rebinds `path`, which an
# earlier cell used for the train_short_audio folder.
path = os.path.join(os.getcwd(), 'train_soundscapes')

def getprefix(x):
    '''Return the first two "_"-separated fields of x, joined by "_".

    Example: "10534_SSW_20170429.ogg" -> "10534_SSW".
    Raises IndexError when x contains no "_".
    '''
    fields = x.split("_", 2)
    return "{}_{}".format(fields[0], fields[1])

# One row per file in the soundscape folder, plus the "<audio_id>_<site>"
# prefix that links it to the label rows.
sc_audio_names = pd.DataFrame(data = os.listdir(path),columns = ["filename"])
# Map the column directly: row-wise apply with positional x[0] relies on
# deprecated positional indexing of a label-indexed Series.
sc_audio_names["fileprefix"] = sc_audio_names["filename"].map(getprefix)


In [37]:
SOUNDSCAPE_SR = 32000     # sample rate used to load the soundscapes and write the clips
NOCALL_CLIP_SECONDS = 5   # length of each labelled soundscape segment, in seconds

outpath = os.path.join(os.getcwd(),"train_samples")
os.makedirs(outpath,exist_ok = True)

for _,row in sc_audio_names.iterrows():

    # One output folder per soundscape file: train_samples/nocall/<fileprefix>
    out_path_1 = os.path.join(outpath,'nocall',row["fileprefix"])
    os.makedirs(out_path_1,exist_ok = True)

    nocall_rows = sc_list[sc_list.fileprefix == row["fileprefix"]]
    if nocall_rows.empty:
        # Nothing to clip: skip decoding the recording.
        continue

    y,_ = librosa.load(os.path.join(path,row["filename"]),sr = SOUNDSCAPE_SR)

    for _,subrow in nocall_rows.iterrows():
        # The label columns are read by position, as in the original cell:
        # column 3 is presumably `seconds` (end time of the segment) and
        # column 0 presumably `row_id` -- TODO confirm against the CSV header.
        segment_end = subrow.iloc[3]
        s_start = max(0,int((segment_end - NOCALL_CLIP_SECONDS)*SOUNDSCAPE_SR))
        s_end = int(segment_end*SOUNDSCAPE_SR)
        out = os.path.join(out_path_1,subrow.iloc[0]+".ogg")
        soundfile.write(out,y[s_start:s_end],samplerate = SOUNDSCAPE_SR)

        

filename      10534_SSW_20170429.ogg
fileprefix                 10534_SSW
Name: 0, dtype: object
filename      11254_COR_20190904.ogg
fileprefix                 11254_COR
Name: 1, dtype: object
filename      14473_SSW_20170701.ogg
fileprefix                 14473_SSW
Name: 2, dtype: object
filename      18003_COR_20190904.ogg
fileprefix                 18003_COR
Name: 3, dtype: object
filename      20152_SSW_20170805.ogg
fileprefix                 20152_SSW
Name: 4, dtype: object
filename      21767_COR_20190904.ogg
fileprefix                 21767_COR
Name: 5, dtype: object
filename      26709_SSW_20170701.ogg
fileprefix                 26709_SSW
Name: 6, dtype: object
filename      26746_COR_20191004.ogg
fileprefix                 26746_COR
Name: 7, dtype: object
filename      2782_SSW_20170701.ogg
fileprefix                 2782_SSW
Name: 8, dtype: object
filename      28933_SSW_20170408.ogg
fileprefix                 28933_SSW
Name: 9, dtype: object
filename      31928_COR_20191004

#### Arrange files and split into training, dev, and test sets

In [61]:
segmented_audio_path = os.path.join(os.getcwd(), 'train_samples')
sig_path = os.path.join(os.getcwd(), 'train_short_audio')


def _sorted_subdirs(parent):
    '''Return the names of the immediate sub-directories of parent, sorted.'''
    return sorted(entry for entry in os.listdir(parent)
                  if os.path.isdir(os.path.join(parent, entry)))


# Build one (label, recording folder, clip file) triple per extracted clip by
# walking train_samples/<label>/<recording folder>/<clip file>. Everything is
# sorted because os.listdir() order is arbitrary and this list feeds the
# train/dev/test split.
birds = _sorted_subdirs(segmented_audio_path)
bird_numbers = [
    (bird, folder, filename)
    for bird in birds
    for folder in _sorted_subdirs(os.path.join(segmented_audio_path, bird))
    for filename in sorted(os.listdir(os.path.join(segmented_audio_path, bird, folder)))
]


In [62]:
# One row per extracted clip. `key` is "<label><recording folder>.ogg", built
# to match "<primary_label><filename>" in train_metadata.csv.
train_metadata_1 = pd.DataFrame(data = bird_numbers,columns = ['primary_label','folder','filename'])
train_metadata_1['key'] = train_metadata_1['primary_label']+train_metadata_1['folder']+'.ogg'

train_metadata_2 = pd.read_csv('train_metadata.csv')
train_metadata_2['key'] = train_metadata_2['primary_label'].astype(str)+train_metadata_2['filename'].astype(str)

# Left merge: every clip is kept and picks up the secondary labels of its
# source recording when the key matches.
train_metadata = train_metadata_1.merge(
    train_metadata_2[['key','secondary_labels']],
    on = 'key',
    how = 'left',
)[['primary_label','folder','secondary_labels','filename']]

# Clips without a matching metadata row get an empty label list. Only this
# column can hold NaN, since the other three come from the left frame.
train_metadata['secondary_labels'] = train_metadata['secondary_labels'].fillna('[]')

In [63]:
#create train_dev and test set
TEST_SPLIT_SEED = 42  # fixed seed so the hold-out split is reproducible


def _parse_secondary_labels(raw):
    '''Turn the stringified label list from the metadata into a list of str.

    A value that is already a list (the cell was run before) is returned as
    a new list, so re-running the cell does not fail.
    '''
    if not isinstance(raw, str):
        return list(raw)
    return raw.replace("[","").replace("]","").replace("'","").replace(" ","").split(",")


# Keep only the secondary labels that are also primary labels in this dataset.
valid_labels = train_metadata.primary_label.unique()
valid_label_set = set(valid_labels)
train_metadata['secondary_labels'] = train_metadata['secondary_labels'].apply(
    lambda raw: list(set(_parse_secondary_labels(raw)) & valid_label_set))

# Split on recording folders rather than on clips, so that every clip cut
# from one recording lands in the same set; stratified by species.
metadata_to_split = train_metadata.loc[:,['folder','primary_label']].drop_duplicates()
x_train_dev,x_test,y_train_dev,y_test = train_test_split(
    metadata_to_split['folder'],
    metadata_to_split['primary_label'],
    test_size = 0.05,
    stratify = metadata_to_split['primary_label'],
    random_state = TEST_SPLIT_SEED,
)

# reset_index() (not in place) returns fresh frames instead of mutating
# slices of train_metadata; the old index is kept as an 'index' column.
train_dev = train_metadata[train_metadata['folder'].isin(x_train_dev.to_list())].reset_index()
test = train_metadata[train_metadata['folder'].isin(x_test.to_list())].reset_index()

In [64]:
#split train_dev to train and dev sets
DEV_SPLIT_SEED = 42  # fixed seed so the train/dev split is reproducible

# As for the test split: split on recording folders, stratified by species,
# so all clips of one recording stay together.
metadata_to_split = train_dev.loc[:,['folder','primary_label']].drop_duplicates()
x_train,x_dev,y_train,y_dev = train_test_split(
    metadata_to_split['folder'],
    metadata_to_split['primary_label'],
    test_size = 0.1,
    stratify = metadata_to_split['primary_label'],
    random_state = DEV_SPLIT_SEED,
)

# reset_index() (not in place) returns fresh frames instead of mutating
# slices of train_dev; the previous index is kept as an extra column.
train = train_dev[train_dev['folder'].isin(x_train.to_list())].reset_index()
dev = train_dev[train_dev['folder'].isin(x_dev.to_list())].reset_index()

In [60]:
# NOTE(review): leftover debugging cell -- it only echoes the current value of
# the loop variable `bird` (presumably left over from an earlier loop; its
# execution count is out of order). Consider deleting it.
bird

'amekes'

In [65]:
import shutil

# Output root for the three sets and the folder the clips are copied from.
base_dir = os.path.join(os.getcwd(), 'train_test_dev_set')
copy_dir = os.path.join(os.getcwd(), 'train_samples')

splits = {'train': train, 'test': test, 'dev': dev}

# Write <base_dir>/<split>/<split>.csv for every split. exist_ok lets the
# cell be re-run without failing on folders created by a previous run.
for split_name, split_frame in splits.items():
    os.makedirs(os.path.join(base_dir, split_name), exist_ok=True)
    split_frame.to_csv(os.path.join(base_dir, split_name, split_name + '.csv'))

# Copy every clip to <base_dir>/<split>/<label>/, flattening the
# per-recording folders of train_samples.
for bird in birds:
    copy_files_from = os.path.join(copy_dir, bird)

    for split_name, split_frame in splits.items():
        bird_to = os.path.join(base_dir, split_name, bird)
        os.makedirs(bird_to, exist_ok=True)

        bird_rows = split_frame[split_frame['primary_label'] == bird]
        # Columns are read by name; positional row[0] / row[1] access on a
        # label-indexed Series is deprecated in pandas.
        for folder, filename in zip(bird_rows['folder'], bird_rows['filename']):
            shutil.copy(os.path.join(copy_files_from, folder, filename), bird_to)

### References:
https://www.kaggle.com/hidehisaarai1213/pytorch-training-birdclef2021-starter
https://www.kaggle.com/hidehisaarai1213/birdclef2021-infer-between-chunk
https://www.kaggle.com/hidehisaarai1213/introduction-to-sound-event-detection
