In [1]:
import pandas as pd
import soundfile as sf
import os
from tqdm import tqdm
from multiprocess import Pool
import itertools

def chunks(l, n):
    """Lazily cut ``l`` into consecutive slices of at most ``n`` items.

    Yields ``(slice, chunk_index)`` tuples, where ``chunk_index`` counts the
    slices from 0. The final slice may be shorter than ``n``.
    """
    for chunk_index, start in enumerate(range(0, len(l), n)):
        stop = start + n
        yield (l[start:stop], chunk_index)

def multiprocessing(strings, function, cores=6, returned=True):
    """Apply ``function`` to chunks of ``strings`` using a pool of worker processes.

    ``strings`` is cut into consecutive slices by ``chunks``; every call to
    ``function`` receives one ``(slice, chunk_index)`` tuple and must return an
    iterable, because the per-chunk results are concatenated.

    Args:
        strings: Sliceable sequence of work items (must support ``len``).
        function: Callable taking a single ``(slice, chunk_index)`` tuple.
        cores: Number of worker processes; also the divisor used to pick the
            chunk size (``len(strings) // cores``, at least 1). When the length
            is not a multiple of ``cores`` an extra, shorter chunk is produced.
        returned: When true, return the flattened results.

    Returns:
        A flat list of all items returned by ``function`` in chunk order when
        ``returned`` is true, otherwise ``None``.
    """
    if len(strings) == 0:
        # Nothing to distribute: skip starting worker processes entirely.
        return [] if returned else None

    # Integer division gives 0 when there are fewer items than cores, and
    # range() inside chunks() rejects a zero step, so clamp to at least 1.
    chunk_size = max(1, len(strings) // cores)
    df_split = chunks(strings, chunk_size)

    pool = Pool(cores)
    try:
        pooled = pool.map(function, df_split)
    finally:
        # Always release the workers, even when a task raised.
        pool.close()
        pool.join()

    if returned:
        return list(itertools.chain(*pooled))

In [2]:
# Load the training metadata table and preview its first rows.
df = pd.read_csv('train_metadata.csv')
df.head()

Unnamed: 0,primary_label,secondary_labels,type,latitude,longitude,scientific_name,common_name,author,date,filename,license,rating,time,url
0,acafly,['amegfi'],"['begging call', 'call', 'juvenile']",35.386,-84.125,Empidonax virescens,Acadian Flycatcher,Mike Nelson,2012-08-12,XC109605.ogg,Creative Commons Attribution-NonCommercial-Sha...,2.5,09:30,https://www.xeno-canto.org/109605
1,acafly,[],['call'],9.1334,-79.6501,Empidonax virescens,Acadian Flycatcher,Allen T. Chartier,2000-12-26,XC11209.ogg,Creative Commons Attribution-NonCommercial-Sha...,3.0,?,https://www.xeno-canto.org/11209
2,acafly,[],['call'],5.7813,-75.7452,Empidonax virescens,Acadian Flycatcher,Sergio Chaparro-Herrera,2012-01-10,XC127032.ogg,Creative Commons Attribution-NonCommercial-Sha...,3.0,15:20,https://www.xeno-canto.org/127032
3,acafly,['whwbec1'],['call'],4.6717,-75.6283,Empidonax virescens,Acadian Flycatcher,Oscar Humberto Marin-Gomez,2009-06-19,XC129974.ogg,Creative Commons Attribution-NonCommercial-Sha...,3.5,07:50,https://www.xeno-canto.org/129974
4,acafly,['whwbec1'],['call'],4.6717,-75.6283,Empidonax virescens,Acadian Flycatcher,Oscar Humberto Marin-Gomez,2009-06-19,XC129981.ogg,Creative Commons Attribution-NonCommercial-Sha...,3.5,07:50,https://www.xeno-canto.org/129981


In [3]:
def loop(indices):
    """Collect metadata records for the audio clips in one chunk of row positions.

    ``indices`` is a ``(row_positions, chunk_index)`` tuple; the chunk index is
    ignored. The metadata CSV is re-read inside the function. For each row
    position the audio file ``birdsound/<primary_label>/<filename>`` is looked
    up; rows whose file is missing, or whose clip lasts 30 seconds or more,
    are skipped.

    Returns:
        A list of dicts, one per kept row, holding the row's metadata plus
        ``len`` (duration in seconds) and ``audio_filename`` (the path read).
    """
    row_positions, _chunk_index = indices
    metadata = pd.read_csv('train_metadata.csv')
    labels = metadata['primary_label']
    filenames = metadata['filename']

    kept = []
    for pos in tqdm(row_positions):
        path = os.path.join('birdsound', labels.iloc[pos], filenames.iloc[pos])
        if os.path.exists(path):
            samples, rate = sf.read(path)
            seconds = len(samples) / rate
            # Only clips strictly shorter than 30 seconds are recorded.
            if seconds < 30:
                record = metadata.iloc[pos].to_dict()
                record['len'] = seconds
                record['audio_filename'] = path
                kept.append(record)
    return kept

In [4]:
# Run loop() in-process on row positions 0-9 (chunk index 0) as a quick check.
processed = loop((list(range(10)), 0))

100%|███████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 26.63it/s]


In [5]:
# Process every row position of df across 20 worker processes; this rebinds
# `processed`, replacing the result of the 10-row run.
processed = multiprocessing(list(range(len(df))), loop, cores = 20)

100%|███████████████████████████████████████████████████████████████████████████████████| 3143/3143 [02:09<00:00, 24.36it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████| 14/14 [00:00<00:00, 21.76it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 3143/3143 [02:15<00:00, 23.26it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 3143/3143 [02:16<00:00, 23.05it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 3143/3143 [02:21<00:00, 22.26it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 3143/3143 [02:21<00:00, 22.22it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 3143/3143 [02:22<00:00, 22.08it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 3143/3143 [02:23<00:00, 21.92it/s]


In [6]:
# Number of records that loop() kept across all chunks.
len(processed)

27740

In [7]:
import json

# Serialize the collected record dicts to BirdCLEF-2021.json (overwrites any existing file).
with open('BirdCLEF-2021.json', 'w') as fopen:
    json.dump(processed, fopen)