In [15]:
from glob import glob
import os
import soundfile as sf
import librosa
from tqdm import tqdm
from multiprocess import Pool
import itertools

def chunks(l, n):
    for i in range(0, len(l), n):
        yield (l[i: i + n], i // n)

def multiprocessing(strings, function, cores=6, returned=True):
    df_split = chunks(strings, len(strings) // cores)
    pool = Pool(cores)
    pooled = pool.map(function, df_split)
    pool.close()
    pool.join()

    if returned:
        return list(itertools.chain(*pooled))

In [6]:
files = glob('Emotion Speech Dataset/*/*/*.wav')
len(files)

35000

In [10]:
emotions = list(set([f.split('/')[-2] for f in files]))
emotions = str(emotions)
emotions

"['Neutral', 'Surprise', 'Sad', 'Angry', 'Happy']"

In [13]:
!mkdir esd-emotion

In [21]:
import random

questions = [
    'given the labels\n{labels}\n\nclassify the audio',
    'what is the label for audio\n\nthe labels: {labels}'
]

def loop(files):
    files, _ = files
    data = []
    for f in tqdm(files):
        y, sr = librosa.load(f)
        label = f.split('/')[-2]
        splitted = '_'.join(f.split('/')[-2:]).replace('.wav', '.mp3')
        audio_filename = os.path.join('esd-emotion', splitted)
        if not os.path.exists(audio_filename):
            sf.write(audio_filename, y, sr)
        
        data.append({
            'question': random.choice(questions).format(labels = emotions),
            'answer': label,
            'metadata': '',
            'audio_filename': audio_filename,
        })
    return data

In [22]:
processed = multiprocessing(files, loop, cores = 20)

100%|██████████████████████████████████████████████████████████████████████████████████| 1750/1750 [00:03<00:00, 493.63it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 1750/1750 [00:03<00:00, 478.55it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 1750/1750 [00:04<00:00, 432.34it/s]
 75%|█████████████████████████████████████████████████████████████▊                    | 1319/1750 [00:04<00:00, 813.67it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 1750/1750 [00:04<00:00, 381.16it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 1750/1750 [00:04<00:00, 369.55it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 1750/1750 [00:04<00:00, 355.00it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 1750/1750 [00:05<00:00, 345.63it/s]


In [23]:
from datasets import Dataset

dataset = Dataset.from_list(processed)

In [24]:
dataset.push_to_hub('mesolitica/Zeroshot-Audio-Classification-Instructions', split = 'esd_emotion')

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/35 [00:00<?, ?ba/s]

Uploading...:   0%|          | 0.00/285k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.56k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/mesolitica/Zeroshot-Audio-Classification-Instructions/commit/59de091202185bb75163cc0d7ed8c252f7202119', commit_message='Upload dataset', commit_description='', oid='59de091202185bb75163cc0d7ed8c252f7202119', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/mesolitica/Zeroshot-Audio-Classification-Instructions', endpoint='https://huggingface.co', repo_type='dataset', repo_id='mesolitica/Zeroshot-Audio-Classification-Instructions'), pr_revision=None, pr_num=None)

In [25]:
from glob import glob

audio_files = glob('esd-emotion/*.mp3')
len(audio_files)

35000

In [26]:
!du -hs esd-emotion

662M	esd-emotion


In [27]:
import zipfile

with zipfile.ZipFile('esd-emotion.zip', 'w', zipfile.ZIP_DEFLATED) as zipf:
    for f in audio_files:
        zipf.write(f, arcname=f)

In [28]:
!huggingface-cli upload mesolitica/Zeroshot-Audio-Classification-Instructions esd-emotion.zip \
--repo-type=dataset

Uploading files using Xet Storage..
Uploading...: 100%|██████████████████████████| 610M/610M [00:25<00:00, 24.0MB/s]
https://huggingface.co/datasets/mesolitica/Zeroshot-Audio-Classification-Instructions/blob/main/esd-emotion.zip


In [30]:
!rm -rf 'Emotion Speech Dataset'