In [1]:
from huggingface_hub import snapshot_download

# Fetch the W4ng1204/Nonspeech7k dataset repository from the Hugging Face Hub
# into ./Nonspeech7k; the call's return value is displayed as the cell output.
download_kwargs = {
    'repo_id': "W4ng1204/Nonspeech7k",
    'repo_type': "dataset",
    'local_dir': './Nonspeech7k',
}
snapshot_download(**download_kwargs)

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

test.parquet:   0%|          | 0.00/123M [00:00<?, ?B/s]

train.parquet:   0%|          | 0.00/1.37G [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/2.46k [00:00<?, ?B/s]

'/home/mesolitica/stt/Nonspeech7k'

In [29]:
# Start from an empty output folder: delete any existing `nonspeech-7k`
# directory (and everything in it), then create it again.
!rm -rf nonspeech-7k
!mkdir nonspeech-7k

In [30]:
from tqdm import tqdm
import random
import json
import os
import pandas as pd

# Two phrasings of the classification prompt; `{labels}` is filled in with the
# stringified list of class names.
questions = [
    'given the labels\n{labels}\n\nwhat is the label for the audio',
    'what is the label for audio\n\nthe labels: {labels}'
]

df = pd.read_parquet('Nonspeech7k/train.parquet')
# Label vocabulary inserted into every prompt, taken from the `classname`
# column of the training split.
unique_labels = str(df['classname'].unique().tolist())

# Do not rely on an earlier shell cell having created the output folder.
os.makedirs('nonspeech-7k', exist_ok=True)

# Fixed seed so the prompt template picked for each row is the same on every
# run of this cell (the choice was previously unseeded and not reproducible).
random.seed(0)

train = []
for i in tqdm(range(len(df))):
    # Row as a plain dict; the audio entry is split off so that only the
    # remaining columns are serialised into the `metadata` JSON string.
    metadata = df.iloc[i].to_dict()
    audio = metadata.pop('audio')

    # Write the bytes stored under the row's `audio` entry to
    # nonspeech-7k/<row position>.wav.
    audio_filename = os.path.join('nonspeech-7k', f'{i}.wav')
    with open(audio_filename, 'wb') as fopen:
        fopen.write(audio['bytes'])

    train.append({
        'question': random.choice(questions).format(labels = unique_labels),
        'answer': df['classname'].iloc[i],
        'audio_filename': audio_filename,
        'metadata': json.dumps(metadata)
    })

100%|█████████████████████████████████████████████████████████████████████████████████| 6289/6289 [00:01<00:00, 4429.06it/s]


In [31]:
# Build the instruction rows for the test split. The prompt templates
# (`questions`) and the label vocabulary (`unique_labels`) come from the cell
# that processed the training split.
df = pd.read_parquet('Nonspeech7k/test.parquet')

# Do not rely on an earlier shell cell having created the output folder.
os.makedirs('nonspeech-7k', exist_ok=True)

# Fixed seed so the prompt template picked for each row is the same on every
# run of this cell (the choice was previously unseeded and not reproducible).
random.seed(1)

test = []
for i in tqdm(range(len(df))):
    # Row as a plain dict; the audio entry is split off so that only the
    # remaining columns are serialised into the `metadata` JSON string.
    metadata = df.iloc[i].to_dict()
    audio = metadata.pop('audio')

    # The `test-` prefix keeps these files distinct from the `<i>.wav` files
    # written into the same folder for the training split.
    audio_filename = os.path.join('nonspeech-7k', f'test-{i}.wav')
    with open(audio_filename, 'wb') as fopen:
        fopen.write(audio['bytes'])

    test.append({
        'question': random.choice(questions).format(labels = unique_labels),
        'answer': df['classname'].iloc[i],
        'audio_filename': audio_filename,
        'metadata': json.dumps(metadata)
    })

100%|███████████████████████████████████████████████████████████████████████████████████| 725/725 [00:00<00:00, 5029.23it/s]


In [32]:
# Row counts of the two record lists, shown as (train, test).
tuple(len(rows) for rows in (train, test))

(6289, 725)

In [33]:
from datasets import Dataset

# Wrap the list of training record dicts in a `datasets.Dataset`.
dataset = Dataset.from_list(train)

In [34]:
# Publish the current `dataset` to the Hub repository as the
# `nonspeech7k_train` split.
dataset.push_to_hub(
    'mesolitica/Zeroshot-Audio-Classification-Instructions',
    split='nonspeech7k_train',
)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/7 [00:00<?, ?ba/s]

Uploading...:   0%|          | 0.00/225k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/1.49k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/mesolitica/Zeroshot-Audio-Classification-Instructions/commit/46faec8dad174d5fb53d934e82ea2b422a466858', commit_message='Upload dataset', commit_description='', oid='46faec8dad174d5fb53d934e82ea2b422a466858', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/mesolitica/Zeroshot-Audio-Classification-Instructions', endpoint='https://huggingface.co', repo_type='dataset', repo_id='mesolitica/Zeroshot-Audio-Classification-Instructions'), pr_revision=None, pr_num=None)

In [35]:
# Rebind `dataset` to a `datasets.Dataset` built from the test record dicts.
dataset = Dataset.from_list(test)

In [36]:
# Publish the current `dataset` to the Hub repository as the
# `nonspeech7k_test` split.
dataset.push_to_hub(
    'mesolitica/Zeroshot-Audio-Classification-Instructions',
    split='nonspeech7k_test',
)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading...:   0%|          | 0.00/31.1k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/1.63k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/mesolitica/Zeroshot-Audio-Classification-Instructions/commit/c60cbe3003608dd520c54cc00a3b6d33d41d9a3b', commit_message='Upload dataset', commit_description='', oid='c60cbe3003608dd520c54cc00a3b6d33d41d9a3b', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/mesolitica/Zeroshot-Audio-Classification-Instructions', endpoint='https://huggingface.co', repo_type='dataset', repo_id='mesolitica/Zeroshot-Audio-Classification-Instructions'), pr_revision=None, pr_num=None)

In [38]:
from glob import glob

# List every .wav file directly inside the output folder and show how many
# were found.
wav_pattern = 'nonspeech-7k/*.wav'
audio_files = glob(wav_pattern)
len(audio_files)

7014

In [39]:
import zipfile
# Pack all collected audio files into a single deflate-compressed archive,
# storing each one under the same relative path it has on disk.
archive = zipfile.ZipFile('nonspeech-7k.zip', mode='w', compression=zipfile.ZIP_DEFLATED)
with archive:
    for wav_path in audio_files:
        archive.write(wav_path, arcname=wav_path)

In [40]:
# Upload the zip archive to the dataset repository on the Hugging Face Hub
# (`--repo-type=dataset` targets the dataset repo of that name).
!huggingface-cli upload mesolitica/Zeroshot-Audio-Classification-Instructions nonspeech-7k.zip \
--repo-type=dataset

Uploading files using Xet Storage..
Uploading...: 100%|████████████████████████| 1.18G/1.18G [00:25<00:00, 46.4MB/s]
https://huggingface.co/datasets/mesolitica/Zeroshot-Audio-Classification-Instructions/blob/main/nonspeech-7k.zip


In [41]:
# Remove the local zip archive from the working directory.
!rm nonspeech-7k.zip