In [12]:
from glob import glob
import os
import librosa
import soundfile as sf
from multiprocess import Pool
import itertools

def chunks(l, n):
    for i in range(0, len(l), n):
        yield (l[i: i + n], i // n)

def multiprocessing(strings, function, cores=6, returned=True):
    df_split = chunks(strings, len(strings) // cores)
    pool = Pool(cores)
    pooled = pool.map(function, df_split)
    pool.close()
    pool.join()

    if returned:
        return list(itertools.chain(*pooled))

In [5]:
!mkdir CochlScene-audio

In [2]:
ls CochlScene

[0m[01;34mTest[0m/  [01;34mTrain[0m/  [01;34mVal[0m/


In [23]:
from tqdm import tqdm

def loop(files):
    files, _ = files
    data = []
    for f in tqdm(files):
        audio_filename = os.path.join('CochlScene-audio', f.replace('/', '_')).replace('.wav', '.mp3')
        if not os.path.exists(audio_filename):
            y, sr = librosa.load(f, sr = 16000)
            sf.write(audio_filename, y, sr)
        
        data.append({
            'audio_filename': audio_filename,
            'answer': os.path.split(f)[1].split('_')[0],
            'metadata': ''
        })
    return data

In [24]:
train_files = glob('CochlScene/Train/*/*.wav')
train = multiprocessing(train_files, loop, cores = 20)

100%|███████████████████████████████████████████████████████████████████████████████████| 3042/3042 [03:09<00:00, 16.07it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 3042/3042 [03:09<00:00, 16.02it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████| 15/15 [00:00<00:00, 16.08it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 3042/3042 [03:10<00:00, 15.99it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 3042/3042 [03:10<00:00, 15.98it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 3042/3042 [03:11<00:00, 15.88it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 3042/3042 [03:11<00:00, 15.88it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 3042/3042 [03:11<00:00, 15.88it/s]


In [25]:
len(train)

60855

In [26]:
train[0]

{'audio_filename': 'CochlScene-audio/CochlScene_Train_Street_Street_user0765_14844491_001.mp3',
 'answer': 'Street',
 'metadata': ''}

In [29]:
labels = str(list(set([r['answer'] for r in train])))

In [30]:
import random

questions = [
    'given the labels\n{labels}\n\nclassify the audio',
    'what is the label for audio\n\nthe labels: {labels}'
]

for i in range(len(train)):
    train[i]['question'] = random.choice(questions).format(labels = labels)

In [31]:
train[0]

{'audio_filename': 'CochlScene-audio/CochlScene_Train_Street_Street_user0765_14844491_001.mp3',
 'answer': 'Street',
 'metadata': '',
 'question': "given the labels\n['Park', 'Car', 'Kitchen', 'Restaurant', 'Subway', 'Cafe', 'Elevator', 'Street', 'SubwayStation', 'Bus', 'ResidentialArea', 'CrowdedIndoor', 'Restroom']\n\nclassify the audio"}

In [32]:
test_files = glob('CochlScene/Test/*/*.wav')
test = multiprocessing(test_files, loop, cores = 20)

100%|█████████████████████████████████████████████████████████████████████████████████████| 384/384 [00:25<00:00, 15.04it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 16.36it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████| 384/384 [00:26<00:00, 14.67it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████| 384/384 [00:26<00:00, 14.40it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████| 384/384 [00:26<00:00, 14.39it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████| 384/384 [00:26<00:00, 14.31it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████| 384/384 [00:26<00:00, 14.24it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████| 384/384 [00:26<00:00, 14.23it/s]


In [34]:
for i in range(len(test)):
    test[i]['question'] = random.choice(questions).format(labels = labels)

In [33]:
val_files = glob('CochlScene/Val/*/*.wav')
val = multiprocessing(val_files, loop, cores = 20)

100%|█████████████████████████████████████████████████████████████████████████████████████| 378/378 [00:25<00:00, 14.64it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████| 378/378 [00:26<00:00, 14.35it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████| 378/378 [00:26<00:00, 14.23it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████| 378/378 [00:26<00:00, 14.23it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████| 378/378 [00:26<00:00, 14.22it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████| 378/378 [00:26<00:00, 14.22it/s]
100%|████████████████████████████████████████████████████████████████████████████████████▊| 377/378 [00:26<00:00, 16.00it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████| 13/13 [00:00<00:00, 15.84it/s]


In [35]:
for i in range(len(val)):
    val[i]['question'] = random.choice(questions).format(labels = labels)

In [36]:
len(train), len(test), len(val)

(60855, 7687, 7573)

In [37]:
from datasets import Dataset

dataset = Dataset.from_list(train)
dataset.push_to_hub('mesolitica/Zeroshot-Audio-Classification-Instructions', split = 'CochlScene_train')

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/61 [00:00<?, ?ba/s]

Uploading...:   0%|          | 0.00/936k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/2.97k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/mesolitica/Zeroshot-Audio-Classification-Instructions/commit/2483209148bcf8132d2859123332cbc234edc1e9', commit_message='Upload dataset', commit_description='', oid='2483209148bcf8132d2859123332cbc234edc1e9', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/mesolitica/Zeroshot-Audio-Classification-Instructions', endpoint='https://huggingface.co', repo_type='dataset', repo_id='mesolitica/Zeroshot-Audio-Classification-Instructions'), pr_revision=None, pr_num=None)

In [38]:
dataset = Dataset.from_list(test)
dataset.push_to_hub('mesolitica/Zeroshot-Audio-Classification-Instructions', split = 'CochlScene_test')

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

Uploading...:   0%|          | 0.00/104k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.11k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/mesolitica/Zeroshot-Audio-Classification-Instructions/commit/d17921028984254584ef312355b49163079422ed', commit_message='Upload dataset', commit_description='', oid='d17921028984254584ef312355b49163079422ed', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/mesolitica/Zeroshot-Audio-Classification-Instructions', endpoint='https://huggingface.co', repo_type='dataset', repo_id='mesolitica/Zeroshot-Audio-Classification-Instructions'), pr_revision=None, pr_num=None)

In [39]:
dataset = Dataset.from_list(val)
dataset.push_to_hub('mesolitica/Zeroshot-Audio-Classification-Instructions', split = 'CochlScene_val')

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

Uploading...:   0%|          | 0.00/99.9k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.24k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/mesolitica/Zeroshot-Audio-Classification-Instructions/commit/9d8fb45810e2c7e2980a2e526b29832855eca3c1', commit_message='Upload dataset', commit_description='', oid='9d8fb45810e2c7e2980a2e526b29832855eca3c1', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/mesolitica/Zeroshot-Audio-Classification-Instructions', endpoint='https://huggingface.co', repo_type='dataset', repo_id='mesolitica/Zeroshot-Audio-Classification-Instructions'), pr_revision=None, pr_num=None)

In [40]:
from glob import glob

audio_files = glob('CochlScene-audio/*.mp3')
len(audio_files)

76115

In [41]:
!du -hs CochlScene-audio

3.3G	CochlScene-audio


In [42]:
import zipfile

with zipfile.ZipFile('CochlScene-audio.zip', 'w', zipfile.ZIP_DEFLATED) as zipf:
    for f in audio_files:
        zipf.write(f, arcname=f)

In [43]:
!huggingface-cli upload mesolitica/Zeroshot-Audio-Classification-Instructions CochlScene-audio.zip \
--repo-type=dataset

Uploading files using Xet Storage..
Uploading...: 100%|████████████████████████| 3.32G/3.32G [00:54<00:00, 61.2MB/s]
https://huggingface.co/datasets/mesolitica/Zeroshot-Audio-Classification-Instructions/blob/main/CochlScene-audio.zip


In [44]:
!rm CochlScene-audio.zip