In [5]:
# !wget https://huggingface.co/datasets/MahiA/VocalSound/resolve/main/VocalSound.zip
# !unzip VocalSound.zip
!rm VocalSound.zip

In [8]:
# List the extracted dataset directory (audio folder plus the train/test CSV manifests).
!ls -lha VocalSound

total 1.6M
drwxrwxrwx   4 mesolitica mesolitica 4.0K Nov  1  2024 .
drwxrwxr-x 131 mesolitica mesolitica  20K Jun  8 11:16 ..
drwxrwxrwx   2 mesolitica mesolitica 872K Sep 28  2024 audios
drwxrwxrwx   3 mesolitica mesolitica 4.0K Nov  1  2024 .cache
-rw-rw-rw-   1 mesolitica mesolitica 127K Sep 29  2024 test.csv
-rw-rw-rw-   1 mesolitica mesolitica 549K Sep 29  2024 train.csv


In [7]:
import pandas as pd

# Load the training manifest and show it as the cell output for a quick visual check.
train_csv_path = 'VocalSound/train.csv'
df = pd.read_csv(train_csv_path)
df

Unnamed: 0,path,classname
0,audios/m0379_0_laughter.wav,Laughter
1,audios/m0758_0_sigh.wav,Sigh
2,audios/m0446_0_sniff.wav,Sniff
3,audios/f0416_0_throatclearing.wav,Throat clearing
4,audios/f1881_0_sneeze.wav,Sneeze
...,...,...
15526,audios/m2485_0_sniff.wav,Sniff
15527,audios/f0919_0_sniff.wav,Sniff
15528,audios/m2247_0_sneeze.wav,Sneeze
15529,audios/f0146_0_sigh.wav,Sigh


In [11]:
from tqdm import tqdm
import random
import json
import os
import pandas as pd

# Prompt templates; `{labels}` is filled with the stringified list of class names.
questions = [
    'given the labels\n{labels}\n\nwhat is the label for the audio',
    'what is the label for audio\n\nthe labels: {labels}'
]

df = pd.read_csv('VocalSound/train.csv')
# Label vocabulary comes from the training split, in order of first appearance.
unique_labels = str(df['classname'].unique().tolist())

# Dedicated seeded generator: the template chosen for each row is reproducible
# across kernel restarts and does not depend on the global `random` state.
rng = random.Random(42)

train = []
missing = 0
# Iterate over plain row dicts instead of calling `df.iloc[i]` twice per row.
for metadata in tqdm(df.to_dict('records')):
    audio_filename = os.path.join('VocalSound', metadata['path'])
    if not os.path.exists(audio_filename):
        # Rows whose audio is not on disk are dropped, but counted so the loss is visible.
        missing += 1
        continue

    train.append({
        'question': rng.choice(questions).format(labels = unique_labels),
        'answer': metadata['classname'],
        'audio_filename': audio_filename,
        'metadata': json.dumps(metadata)
    })

if missing:
    print(f'skipped {missing} train rows without an audio file')

100%|██████████████████████████████████████████████████████████████████████████████| 15531/15531 [00:00<00:00, 21863.93it/s]


In [12]:
train[0]

{'question': "what is the label for audio\n\nthe labels: ['Laughter', 'Sigh', 'Sniff', 'Throat clearing', 'Sneeze', 'Cough']",
 'answer': 'Laughter',
 'audio_filename': 'VocalSound/audios/m0379_0_laughter.wav',
 'metadata': '{"path": "audios/m0379_0_laughter.wav", "classname": "Laughter"}'}

In [13]:
# Build the test records the same way as the training ones. `questions` and
# `unique_labels` are reused from the cell that builds `train`, so test prompts
# list the label vocabulary derived from the training split.
df = pd.read_csv('VocalSound/test.csv')

# Dedicated seeded generator: the template chosen for each row is reproducible
# across kernel restarts and does not depend on the global `random` state.
rng = random.Random(42)

test = []
missing = 0
# Iterate over plain row dicts instead of calling `df.iloc[i]` twice per row.
for metadata in tqdm(df.to_dict('records')):
    audio_filename = os.path.join('VocalSound', metadata['path'])
    if not os.path.exists(audio_filename):
        # Rows whose audio is not on disk are dropped, but counted so the loss is visible.
        missing += 1
        continue

    test.append({
        'question': rng.choice(questions).format(labels = unique_labels),
        'answer': metadata['classname'],
        'audio_filename': audio_filename,
        'metadata': json.dumps(metadata)
    })

if missing:
    print(f'skipped {missing} test rows without an audio file')

100%|████████████████████████████████████████████████████████████████████████████████| 3591/3591 [00:00<00:00, 21633.57it/s]


In [14]:
# Sanity check: number of test rows that were kept (rows without an audio file are skipped).
len(test)

3591

In [15]:
from datasets import Dataset

# Wrap the list of training record dicts in a `datasets.Dataset`.
dataset = Dataset.from_list(train)

In [16]:
# Publish the current `dataset` to the Hub dataset repo as the `vocalsound_train` split.
dataset.push_to_hub('mesolitica/Zeroshot-Audio-Classification-Instructions', split = 'vocalsound_train')

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/16 [00:00<?, ?ba/s]

Uploading...:   0%|          | 0.00/402k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/1.76k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/mesolitica/Zeroshot-Audio-Classification-Instructions/commit/01ea3624fd5bd1ebb37c5fdcef5d2492ee60dba5', commit_message='Upload dataset', commit_description='', oid='01ea3624fd5bd1ebb37c5fdcef5d2492ee60dba5', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/mesolitica/Zeroshot-Audio-Classification-Instructions', endpoint='https://huggingface.co', repo_type='dataset', repo_id='mesolitica/Zeroshot-Audio-Classification-Instructions'), pr_revision=None, pr_num=None)

In [17]:
# Rebind `dataset` to the test records and publish them to the same Hub repo
# as the `vocalsound_test` split.
dataset = Dataset.from_list(test)
dataset.push_to_hub('mesolitica/Zeroshot-Audio-Classification-Instructions', split = 'vocalsound_test')

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Uploading...:   0%|          | 0.00/89.3k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/1.90k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/mesolitica/Zeroshot-Audio-Classification-Instructions/commit/4ec02c5db168021ec69effcad2a48ff1e2cf4260', commit_message='Upload dataset', commit_description='', oid='4ec02c5db168021ec69effcad2a48ff1e2cf4260', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/mesolitica/Zeroshot-Audio-Classification-Instructions', endpoint='https://huggingface.co', repo_type='dataset', repo_id='mesolitica/Zeroshot-Audio-Classification-Instructions'), pr_revision=None, pr_num=None)

In [18]:
from glob import glob

# Collect every WAV in the audio folder. `glob` returns matches in arbitrary order,
# so sort them: anything built from this list (such as an archive) then gets a
# stable, reproducible entry order.
audio_files = sorted(glob('VocalSound/audios/*.wav'))
len(audio_files)

21024

In [19]:
import zipfile
# Bundle all collected WAV files into one DEFLATE-compressed archive; each entry
# is stored under the same path string that the glob returned.
archive = zipfile.ZipFile('vocalsound.zip', mode='w', compression=zipfile.ZIP_DEFLATED)
with archive:
    for wav_path in audio_files:
        archive.write(wav_path, arcname=wav_path)

In [20]:
# Upload the audio archive to the same Hub dataset repo that holds the instruction splits.
!huggingface-cli upload mesolitica/Zeroshot-Audio-Classification-Instructions vocalsound.zip \
--repo-type=dataset

Uploading files using Xet Storage..
Uploading...: 100%|████████████████████████| 4.78G/4.78G [01:09<00:00, 68.7MB/s]
https://huggingface.co/datasets/mesolitica/Zeroshot-Audio-Classification-Instructions/blob/main/vocalsound.zip


In [21]:
# Remove the local copy of the archive after the upload.
!rm vocalsound.zip