In [12]:
import json
import os
import librosa
import soundfile as sf
from tqdm import tqdm
from multiprocess import Pool
import itertools

def chunks(l, n):
    for i in range(0, len(l), n):
        yield (l[i: i + n], i // n)

def multiprocessing(strings, function, cores=6, returned=True):
    df_split = chunks(strings, len(strings) // cores)
    pool = Pool(cores)
    pooled = pool.map(function, df_split)
    pool.close()
    pool.join()

    if returned:
        return list(itertools.chain(*pooled))

In [2]:
with open('CompA-R.json') as fopen:
    train = json.load(fopen)

In [7]:
!mkdir compa_r_train_audios-mp3

In [3]:
len(train)

198648

In [4]:
train[0]

{'instruction': 'Analyze the frequency and duration of the revving sounds in the audio. Based on these characteristics, infer the type of vehicle producing these sounds.',
 'output': 'The frequent and lengthy revving sounds suggest a powerful vehicle, likely a race car or motorcycle, which fits the context of a race car event.',
 'audio_id': './compa_r_train_audios/YBaw0jIZ0STo.wav',
 'input': '',
 'dataset': 'Audioset_Strong',
 'task': 'open-ended question'}

In [16]:
def loop(rows):
    rows, _ = rows
    data = []
    for r in tqdm(rows):
        if not os.path.exists(r['audio_id']):
            continue
        audio_filename = os.path.join(
            'compa_r_train_audios-mp3', 
            os.path.split(r['audio_id'])[1].replace('.wav', '.mp3'))
        
        if not os.path.exists(audio_filename):
            y, sr = librosa.load(r['audio_id'], sr = 16000)
            sf.write(audio_filename, y, sr)
        
        data.append({
            'question': r['instruction'],
            'answer': r['output'],
            'audio_filename': audio_filename,
            'metadata': json.dumps(r),
        })
    return data

In [17]:
processed = loop((train[:10], 0))

100%|███████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 55.16it/s]


In [19]:
processed = multiprocessing(train, loop, cores = 20)

100%|███████████████████████████████████████████████████████████████████████████████████| 9932/9932 [02:33<00:00, 64.68it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:00<00:00, 58052.65it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 9932/9932 [02:42<00:00, 61.22it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 9932/9932 [02:42<00:00, 61.08it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 9932/9932 [02:44<00:00, 60.21it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 9932/9932 [02:53<00:00, 57.39it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 9932/9932 [02:54<00:00, 56.85it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 9932/9932 [03:04<00:00, 53.90it/s]


In [20]:
!du -hs compa_r_train_audios-mp3

2.9G	compa_r_train_audios-mp3


In [21]:
!rm -rf compa_r_train_audios

In [22]:
len(processed)

198648

In [23]:
processed[10]

{'question': "The sounds of keypress tones and generic impact sounds are interspersed through the audio. Analyze these and infer what they might indicate about the telephone operation and the speaker's actions during the call.",
 'answer': 'The speaker might be navigating through automated menus before and during the call, indicated by the keypress tones. The generic impact sounds could signify basic desk-related activities, like pen-clicking or paper shuffling.',
 'audio_filename': 'compa_r_train_audios-mp3/Y349kbyfz0qU.mp3',
 'metadata': '{"instruction": "The sounds of keypress tones and generic impact sounds are interspersed through the audio. Analyze these and infer what they might indicate about the telephone operation and the speaker\'s actions during the call.", "output": "The speaker might be navigating through automated menus before and during the call, indicated by the keypress tones. The generic impact sounds could signify basic desk-related activities, like pen-clicking or 

In [25]:
with open('CompA-R-test.json') as fopen:
    test = json.load(fopen)
    
len(test)

494

In [26]:
!mkdir compa_r_test_audios-mp3

In [39]:
def loop(rows):
    rows, _ = rows
    data = []
    for r in tqdm(rows):
        f = os.path.join('filtered_audios', r['audio_id'])
        if not os.path.exists(f):
            continue
        audio_filename = os.path.join(
            'compa_r_test_audios-mp3', 
            os.path.split(r['audio_id'])[1].replace('.wav', '.mp3'))
        
        if not os.path.exists(audio_filename):
            y, sr = librosa.load(f, sr = 16000)
            sf.write(audio_filename, y, sr)
        
        data.append({
            'question': r['instruction_output'][0]['instruction'],
            'answer': r['instruction_output'][0]['output'],
            'audio_filename': audio_filename,
            'metadata': json.dumps(r),
        })
    return data

In [40]:
test_processed = loop((test, 0))

100%|█████████████████████████████████████████████████████████████████████████████████████| 494/494 [00:33<00:00, 14.78it/s]


In [41]:
from datasets import Dataset

dataset = Dataset.from_list(processed)

In [46]:
dataset.push_to_hub('mesolitica/CompA-R-Instructions', split = 'train')

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/199 [00:00<?, ?ba/s]

Uploading...:   0%|          | 0.00/60.9M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/mesolitica/CompA-R-Instructions/commit/6a515fee92b73408de769166f3984836967227a2', commit_message='Upload dataset', commit_description='', oid='6a515fee92b73408de769166f3984836967227a2', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/mesolitica/CompA-R-Instructions', endpoint='https://huggingface.co', repo_type='dataset', repo_id='mesolitica/CompA-R-Instructions'), pr_revision=None, pr_num=None)

In [47]:
dataset = Dataset.from_list(test_processed)

In [48]:
dataset.push_to_hub('mesolitica/CompA-R-Instructions', split = 'test')

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading...:   0%|          | 0.00/415k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/400 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/mesolitica/CompA-R-Instructions/commit/ce9bb7b9a98474f68f8bf2c1d08f1a1a16311acd', commit_message='Upload dataset', commit_description='', oid='ce9bb7b9a98474f68f8bf2c1d08f1a1a16311acd', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/mesolitica/CompA-R-Instructions', endpoint='https://huggingface.co', repo_type='dataset', repo_id='mesolitica/CompA-R-Instructions'), pr_revision=None, pr_num=None)

In [1]:
from glob import glob

audio_files = glob('compa_r_*_audios-mp3/*.mp3')
len(audio_files)

62613

In [2]:
import zipfile

with zipfile.ZipFile('compa_r.zip', 'w', zipfile.ZIP_DEFLATED) as zipf:
    for f in audio_files:
        zipf.write(f, arcname=f)

In [None]:
!huggingface-cli upload mesolitica/CompA-R-Instructions compa_r.zip --repo-type=dataset

Uploading files using Xet Storage..
Uploading...:  99%|████████████████████████▊| 2.87G/2.89G [00:20<00:00, 574MB/s]