In [1]:
import json
import soundfile as sf
import pandas as pd
import os
from tqdm import tqdm

def new_path(f):
    """
    Map an audio path to its companion '.audioset-0.25' path.

    Every '.mp3' in `f` is replaced by '.audioset-0.25', and the first
    '/'-separated component receives the suffix '_audioset-0.25'. All
    remaining components are kept unchanged.
    """
    head, *rest = f.replace('.mp3', '.audioset-0.25').split('/')
    return '/'.join([head + '_audioset-0.25', *rest])

from multiprocess import Pool
import itertools


def chunks(l, n):
    """
    Lazily cut `l` into consecutive slices of at most `n` items.

    Yields `(slice, position)` tuples, where `position` is the 0-based
    ordinal of the slice within `l`.
    """
    for position, start in enumerate(range(0, len(l), n)):
        yield (l[start: start + n], position)


def multiprocessing(strings, function, cores=6, returned=True):
    """
    Run `function` over `strings` in a pool of worker processes.

    `strings` is cut by `chunks` into consecutive pieces of
    ``len(strings) // cores`` items (at least 1), and `function` is called
    once per ``(piece, piece_no)`` tuple.

    Parameters
    ----------
    strings : sequence
        Items to process; must support ``len`` and slicing.
    function : callable
        Receives one ``(piece, piece_no)`` tuple and returns an iterable.
    cores : int
        Size of the worker pool.
    returned : bool
        If True, return the per-piece results flattened into one list;
        otherwise return None.
    """
    if len(strings) == 0:
        # Nothing to distribute; do not start a pool for no work.
        return [] if returned else None

    # Floor division yields 0 when there are fewer items than cores, and
    # range() inside `chunks` rejects a zero step, so clamp to at least 1.
    chunk_size = max(1, len(strings) // cores)
    df_split = chunks(strings, chunk_size)

    pool = Pool(cores)
    try:
        pooled = pool.map(function, df_split)
    finally:
        # Release the workers even when a task raises.
        pool.close()
        pool.join()

    if returned:
        return list(itertools.chain(*pooled))
    
# Build two lookup tables from class_labels_indices.csv:
#   mapping: 'mid' value   -> 'index' value
#   labels:  'index' value -> 'display_name' value
df = pd.read_csv('class_labels_indices.csv')
mapping = {}
labels = {}
for mid, index, display_name in zip(
    df['mid'].values, df['index'].values, df['display_name'].values
):
    mapping[mid] = index
    labels[index] = display_name

In [2]:
# Create the output folder for the slices. exist_ok keeps this cell
# re-runnable; a plain `mkdir` errors once the folder already exists.
os.makedirs('slice-0.25', exist_ok=True)

In [3]:
# Load the clip records; `loop` indexes into `data` by position.
with open('populate.json') as source:
    data = json.load(source)

In [4]:
def loop(indices):
    """
    Cut 0.25 s slices out of each clip wherever a window's top-k classes
    overlap the clip's own labels.

    Uses the module-level `data`, `labels` and `new_path`. For each row the
    companion JSON file given by `new_path` is read; rows without that file
    are skipped. Each JSON entry is read through its 'timestamp', 'topk'
    and 'scores' keys. A slice file is written only if it does not already
    exist.

    Parameters
    ----------
    indices : tuple
        ``(rows, chunk_no)`` pair as yielded by `chunks`; `rows` are
        positions into `data`, `chunk_no` is ignored.

    Returns
    -------
    list of dict
        One dict per kept window with keys 'audio_filename' (path of the
        slice), 'labels' (matching label names, sorted) and 'scores' (the
        score of each label, in the same order as 'labels').
    """
    window = 0.25            # slice length in seconds
    out_dir = 'slice-0.25'   # folder the slices are written to

    indices, _ = indices
    results = []
    for n in tqdm(indices):
        row = data[n]
        row_labels = {labels[label_id] for label_id in row['labels']}

        f_audioset = new_path(row['audio_filename'])
        if not os.path.exists(f_audioset):
            continue

        with open(f_audioset) as fopen:
            d = json.load(fopen)

        y, sr = sf.read(row['audio_filename'])
        maxlen = len(y) / sr

        for i, pred in enumerate(d):
            start = float(pred.pop('timestamp'))
            end = min(start + window, maxlen)  # never read past the audio
            pred['start'] = start
            pred['end'] = end

            subset = set(pred['topk']) & row_labels
            try:
                scores = {l: pred['scores'][no] for no, l in enumerate(pred['topk'])}
            except Exception as e:
                # Best effort: report the malformed window and keep going.
                print(e)
                print(pred)
                continue
            if not subset:
                continue

            first, last = int(start * sr), int(end * sr)
            if last <= first:
                # The window starts at or after the end of the audio, so the
                # slice would hold no samples; do not write or list it.
                continue

            f = os.path.join(out_dir, f'{n}-{i}.mp3')
            if not os.path.exists(f):
                sf.write(f, y[first:last], sr)

            # Set iteration order of strings varies between interpreter
            # runs; sorting keeps the output reproducible.
            matched = sorted(subset)
            results.append({
                'audio_filename': f,
                'labels': matched,
                'scores': [scores[l] for l in matched]
            })

    return results

In [5]:
# Smoke test: run `loop` in-process on the first 10 rows of `data`.
# The second tuple element is the chunk number, which `loop` ignores.
r = loop((range(10), 0))

100%|██████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 14.21it/s]


In [6]:
# Inspect the first record returned by the smoke test.
r[0]

{'audio_filename': 'slice-0.25/0-0.mp3',
 'labels': ['Speech'],
 'scores': [-0.31396]}

In [7]:
# Full run: process every row of `data` with `loop` in a pool of 20 workers.
# `r` is rebound to the flattened list of slice records from all chunks.
r = multiprocessing(range(len(data)), loop, cores = 20)

100%|██████████████████████████████████████████████████████████████████████████████████| 1971/1971 [07:28<00:00,  4.40it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 1971/1971 [07:28<00:00,  4.40it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 1971/1971 [07:28<00:00,  4.39it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 1971/1971 [07:28<00:00,  4.39it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 1971/1971 [07:29<00:00,  4.38it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████| 17/17 [00:02<00:00,  7.04it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 1971/1971 [07:31<00:00,  4.36it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 1971/1971 [07:32<00:00,  4.36it/s]
100%|███

In [8]:
# Inspect the first record of the full parallel run.
r[0]

{'audio_filename': 'slice-0.25/0-0.mp3',
 'labels': ['Speech'],
 'scores': [-0.31396]}

In [9]:
# Report the total on-disk size of the slice folder.
!du -hs slice-0.25

4.0G	slice-0.25


In [10]:
# Save the slice records collected in `r` as a single JSON document.
with open('slice-0.25.json', 'w') as manifest:
    json.dump(r, manifest)