In [1]:
import soundfile as sf
import subprocess
from glob import glob
import os
from multiprocess import Pool
import itertools
from tqdm import tqdm

def chunks(l, n):
    """
    Lazily cut `l` into consecutive slices of at most `n` items.

    Yields `(slice, position)` tuples, where `position` is the zero-based
    index of the slice within the sequence of slices. The final slice may be
    shorter than `n`.
    """
    offsets = range(0, len(l), n)
    for position, start in enumerate(offsets):
        stop = start + n
        yield (l[start:stop], position)


def multiprocessing(strings, function, cores=6, returned=True):
    """
    Split `strings` into batches and map `function` over them in a process pool.

    Parameters
    ----------
    strings : sequence
        Items to process; must support `len()` and slicing.
    function : callable
        Called once per batch with the `(batch, batch_index)` tuple produced
        by `chunks`. It may return an iterable of results, or None.
    cores : int
        Number of worker processes. The batch size is `len(strings) // cores`
        (at least 1), so there can be one extra, shorter batch.
    returned : bool
        When True, flatten the per-batch results into a single list.

    Returns
    -------
    list or None
        The flattened results when `returned` is True (batches that returned
        None contribute nothing), otherwise None.
    """
    if len(strings) == 0:
        # Nothing to do; avoid spinning up workers for an empty job.
        return [] if returned else None

    # Floor division yields 0 when there are fewer items than workers, and a
    # zero step makes `range()` inside `chunks` raise. Never go below 1.
    batch_size = max(1, len(strings) // cores)
    df_split = chunks(strings, batch_size)

    pool = Pool(cores)
    try:
        pooled = pool.map(function, df_split)
    finally:
        # Release the workers even if one of the batches raised.
        pool.close()
        pool.join()

    if returned:
        # Workers that only produce side effects return None; skip those
        # instead of failing after all the work has already been done.
        return list(
            itertools.chain.from_iterable(
                part for part in pooled if part is not None
            )
        )
    
def convert_mp4_to_mp3(input_file, output_file, sample_rate=16000):
    """
    Extract the audio track of a video into a mono MP3 using the `ffmpeg` CLI.

    Parameters
    ----------
    input_file : str
        Path of the source video.
    output_file : str
        Path of the audio file to write.
    sample_rate : int
        Output sampling rate in Hz, passed to ffmpeg's `-ar`.

    Raises
    ------
    subprocess.CalledProcessError
        If ffmpeg exits with a non-zero status.
    FileNotFoundError
        If the `ffmpeg` executable cannot be found.

    Example
    -------
    convert_mp4_to_mp3('input.mp4', 'output.mp3')
    """
    cmd = [
        'ffmpeg',
        # Never wait on the keyboard: without this ffmpeg may block on its
        # "overwrite?" prompt when the output already exists.
        '-nostdin',
        '-i', input_file,
        '-vn',                    # drop the video stream
        '-ar', str(sample_rate),
        '-ac', '1',               # downmix to a single channel
        '-b:a', '48k',            # audio bitrate
        output_file
    ]
    subprocess.run(
        cmd,
        stdin=subprocess.DEVNULL,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
        check=True,
    )


def loop(files):
    """
    Pool worker: convert one batch of videos to MP3 files under `vggsound-mp3/`.

    Parameters
    ----------
    files : tuple
        `(paths, batch_index)` as produced by `chunks`; the index is unused.

    Returns
    -------
    list
        The input paths whose conversion failed. Conversion is best-effort:
        a failing file is recorded and the batch carries on.
    """
    files, _ = files
    os.makedirs('vggsound-mp3', exist_ok=True)

    failed = []
    for f in tqdm(files):
        # Swap only the real extension; a plain str.replace would also rewrite
        # '.mp4' occurring elsewhere in the file name.
        stem = os.path.splitext(os.path.basename(f))[0]
        new_f = os.path.join('vggsound-mp3', stem + '.mp3')
        try:
            convert_mp4_to_mp3(f, new_f)
        except Exception:
            # Deliberately not a bare `except`: Ctrl-C / SystemExit must still
            # be able to stop the worker.
            failed.append(f)
    return failed

In [2]:
# from huggingface_hub import snapshot_download

# snapshot_download(
#     repo_id="Loie/VGGSound",
#     repo_type="dataset",
#     allow_patterns=[
#         '*.tar.gz', 
#     ],
#     local_dir = './VGGSound'
# )

In [3]:
# from glob import glob
# import os

# files = glob('VGGSound/*.tar.gz')
# for f in files:
#     print(f)
#     os.system(f'tar -zxf {f}')

In [4]:
# !rm -rf vggsound-mp3
# !mkdir vggsound-mp3

In [5]:
# files = glob('scratch/shared/beegfs/hchen/train_data/VGGSound_final/video/*.mp4')
# len(files)

197957

In [7]:
# multiprocessing(files, loop, cores = 50)

 33%|███████████████████████████▏                                                       | 1296/3959 [13:29<28:16,  1.57it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 74%|█████████████████████████████████████████████████████████████▎                     | 2922/3959 [30:31<12:57,  1.33it/s]

In [10]:
# Report the total on-disk size of the converted audio directory.
!du -hs vggsound-mp3

12G	vggsound-mp3


In [12]:
# !wget https://huggingface.co/datasets/Loie/VGGSound/resolve/main/vggsound.csv

In [30]:
import pandas as pd

# Read with header=None, so the columns get integer labels. Later cells use
# columns 0 and 1 to build the audio file name, column 2 as the label and
# column 3 as the split name.
df = pd.read_csv('vggsound.csv', header = None)
df.shape

(199467, 4)

In [31]:
# All distinct labels (column 2), rendered once as a Python list literal so the
# same text can be substituted into every question template.
unique_labels = repr(df[2].unique().tolist())

In [32]:
# Prompt templates for the classification question; `{labels}` is filled in
# with `str.format(labels=...)` when the examples are assembled.
questions = [
    'given the labels\n{labels}\n\nwhat is the label for the audio',
    'what is the label for audio\n\nthe labels: {labels}'
]

In [48]:
from collections import defaultdict
import random
import json

# Dedicated, seeded generator so each row gets the same question template on
# every run; the unseeded module-level RNG made the dataset non-reproducible.
rng = random.Random(0)

# split name (column 3) -> list of instruction examples for that split
vggsound = defaultdict(list)

for i in tqdm(range(len(df))):
    # Fetch the row once and reuse it instead of indexing each column again.
    row = df.iloc[i]

    # Converted clips are named "<column 0>_<column 1 zero-padded to 6 digits>.mp3".
    audio_filename = os.path.join('vggsound-mp3', f"{row[0]}_{int(row[1]):06d}.mp3")
    if not os.path.exists(audio_filename):
        # No converted audio for this row, so it cannot become an example.
        continue

    split = row[3]
    d = {
        'question': rng.choice(questions).format(labels = unique_labels),
        'answer': row[2],
        'audio_filename': audio_filename,
        'metadata': json.dumps(row.to_dict())
    }
    vggsound[split].append(d)

100%|████████████████████████████████████████████████████████████████████████████| 199467/199467 [00:16<00:00, 12388.96it/s]


In [52]:
# Spot-check one assembled example; the question embeds the full label list,
# so the displayed output is long.
vggsound['test'][10]

{'question': "what is the label for audio\n\nthe labels: ['people marching', 'waterfall burbling', 'playing tennis', 'people belly laughing', 'car engine starting', 'alarm clock ringing', 'female speech, woman speaking', 'cricket chirping', 'wind noise', 'foghorn', 'people battle cry', 'playing volleyball', 'female singing', 'playing harpsichord', 'male speech, man speaking', 'playing bassoon', 'playing piano', 'people clapping', 'bee, wasp, etc. buzzing', 'baby babbling', 'people whispering', 'coyote howling', 'metronome', 'playing harp', 'airplane', 'rope skipping', 'ambulance siren', 'people coughing', 'pheasant crowing', 'bird wings flapping', 'cap gun shooting', 'child singing', 'race car, auto racing', 'male singing', 'playing bass guitar', 'playing violin, fiddle', 'playing bongo', 'playing erhu', 'tap dancing', 'playing electronic organ', 'playing congas', 'subway, metro, underground', 'car passing by', 'orchestra', 'playing acoustic guitar', 'hammering nails', 'duck quacking',

In [53]:
from datasets import Dataset

# Build a Dataset from the list of example dicts collected under the 'train' key.
dataset = Dataset.from_list(vggsound['train'])

In [55]:
# Publish the train examples to the Hub repo under the split name 'vggsound_train'.
dataset.push_to_hub('mesolitica/Zeroshot-Audio-Classification-Instructions', split = 'vggsound_train')

Uploading the dataset shards:   0%|          | 0/3 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/61 [00:00<?, ?ba/s]

Uploading...:   0%|          | 0.00/2.99M [00:00<?, ?B/s]

Creating parquet from Arrow format:   0%|          | 0/61 [00:00<?, ?ba/s]

Uploading...:   0%|          | 0.00/2.99M [00:00<?, ?B/s]

Creating parquet from Arrow format:   0%|          | 0/61 [00:00<?, ?ba/s]

Uploading...:   0%|          | 0.00/2.99M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/943 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/mesolitica/Zeroshot-Audio-Classification-Instructions/commit/6c1456f8f682ed49d8fb1bc55f21e54f895dd21e', commit_message='Upload dataset', commit_description='', oid='6c1456f8f682ed49d8fb1bc55f21e54f895dd21e', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/mesolitica/Zeroshot-Audio-Classification-Instructions', endpoint='https://huggingface.co', repo_type='dataset', repo_id='mesolitica/Zeroshot-Audio-Classification-Instructions'), pr_revision=None, pr_num=None)

In [56]:
# Same as for train: build a Dataset from the 'test' examples and publish it
# to the same repo under the split name 'vggsound_test'.
# NOTE(review): this rebinds `dataset`, so the train Dataset is no longer reachable by name.
dataset = Dataset.from_list(vggsound['test'])
dataset.push_to_hub('mesolitica/Zeroshot-Audio-Classification-Instructions', split = 'vggsound_test')

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/16 [00:00<?, ?ba/s]

Uploading...:   0%|          | 0.00/839k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/1.36k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/mesolitica/Zeroshot-Audio-Classification-Instructions/commit/9c1a4c880b7a1bf885fba8377ed9f6707f7d1526', commit_message='Upload dataset', commit_description='', oid='9c1a4c880b7a1bf885fba8377ed9f6707f7d1526', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/mesolitica/Zeroshot-Audio-Classification-Instructions', endpoint='https://huggingface.co', repo_type='dataset', repo_id='mesolitica/Zeroshot-Audio-Classification-Instructions'), pr_revision=None, pr_num=None)

In [57]:
# glob() returns paths in arbitrary, filesystem-dependent order; sort them so
# anything built from this list (e.g. the zip archive) has a stable order.
audio_files = sorted(glob('vggsound-mp3/*.mp3'))
len(audio_files)

197956

In [58]:
import zipfile

# Pack every converted clip into one archive, keeping each file's relative
# path (including the 'vggsound-mp3/' prefix) as its name inside the zip.
with zipfile.ZipFile('vggsound.zip', mode='w', compression=zipfile.ZIP_DEFLATED) as archive:
    for mp3_path in audio_files:
        archive.write(mp3_path, arcname=mp3_path)

In [59]:
# Upload the zipped audio to the same dataset repo the splits were pushed to.
!huggingface-cli upload mesolitica/Zeroshot-Audio-Classification-Instructions vggsound.zip \
--repo-type=dataset

Uploading files using Xet Storage..
Uploading...: 100%|███████████████████████▉| 11.7G/11.7G [02:41<00:00, 72.3MB/s]
https://huggingface.co/datasets/mesolitica/Zeroshot-Audio-Classification-Instructions/blob/main/vggsound.zip


In [60]:
# Remove the local archive now that the upload step has run.
!rm vggsound.zip