In [1]:
from huggingface_hub import snapshot_download

f = snapshot_download(
    repo_id="mesolitica/Zeroshot-Audio-Classification-Instructions",
    repo_type='dataset',
    allow_patterns="data/*.parquet",
    local_dir="./Zeroshot-Audio-Classification-Instructions")

Fetching 19 files:   0%|          | 0/19 [00:00<?, ?it/s]

data/fsd50k-00000-of-00001.parquet:   0%|          | 0.00/2.25M [00:00<?, ?B/s]

data/esd_emotion-00000-of-00001.parquet:   0%|          | 0.00/285k [00:00<?, ?B/s]

data/gender-00000-of-00001.parquet:   0%|          | 0.00/11.8M [00:00<?, ?B/s]

data/emotion-00000-of-00001.parquet:   0%|          | 0.00/3.69M [00:00<?, ?B/s]

data/CochlScene_test-00000-of-00001.parq(…):   0%|          | 0.00/104k [00:00<?, ?B/s]

data/CochlScene_train-00000-of-00001.par(…):   0%|          | 0.00/936k [00:00<?, ?B/s]

data/age-00000-of-00001.parquet:   0%|          | 0.00/11.8M [00:00<?, ?B/s]

data/CochlScene_val-00000-of-00001.parqu(…):   0%|          | 0.00/99.9k [00:00<?, ?B/s]

data/language-00000-of-00001.parquet:   0%|          | 0.00/12.0M [00:00<?, ?B/s]

data/nonspeech7k_test-00000-of-00001.par(…):   0%|          | 0.00/31.1k [00:00<?, ?B/s]

data/nonspeech7k_train-00000-of-00001.pa(…):   0%|          | 0.00/225k [00:00<?, ?B/s]

data/tau2022-00000-of-00001.parquet:   0%|          | 0.00/5.59M [00:00<?, ?B/s]

data/urbansound8k-00000-of-00001.parquet:   0%|          | 0.00/330k [00:00<?, ?B/s]

data/vggsound_test-00000-of-00001.parque(…):   0%|          | 0.00/839k [00:00<?, ?B/s]

data/vggsound_train-00000-of-00003.parqu(…):   0%|          | 0.00/2.99M [00:00<?, ?B/s]

data/vggsound_train-00001-of-00003.parqu(…):   0%|          | 0.00/2.99M [00:00<?, ?B/s]

data/vggsound_train-00002-of-00003.parqu(…):   0%|          | 0.00/2.99M [00:00<?, ?B/s]

data/vocalsound_test-00000-of-00001.parq(…):   0%|          | 0.00/89.3k [00:00<?, ?B/s]

data/vocalsound_train-00000-of-00001.par(…):   0%|          | 0.00/402k [00:00<?, ?B/s]

In [2]:
from glob import glob
from tqdm import tqdm
from multiprocess import Pool
import itertools
import zipfile
import os

def chunks(l, n):
    for i in range(0, len(l), n):
        yield (l[i: i + n], i // n)


def multiprocessing(strings, function, cores=6, returned=True):
    df_split = chunks(strings, len(strings) // cores)
    pool = Pool(cores)
    pooled = pool.map(function, df_split)
    pool.close()
    pool.join()

    if returned:
        return list(itertools.chain(*pooled))

def loop(files):
    files, _ = files
    for zip_file_path in tqdm(files):
        destination_folder = './'
        with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
            zip_ref.extractall(destination_folder)
        os.remove(zip_file_path)

# files = glob('*.zip')
# if len(files):
#     multiprocessing(files, loop, cores = min(len(files), 20), returned = False)

In [3]:
from transformers import AutoProcessor



In [4]:
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-Audio-7B-Instruct")
tokenizer = processor.tokenizer

In [13]:
from glob import glob
import pandas as pd

rows = []

files = glob('Zeroshot-Audio-Classification-Instructions/data/*.parquet')
files = [f for f in files if 'test' not in f]
for f in files:
    df = pd.read_parquet(f).to_dict(orient = 'records')
    rows.extend(df)

In [14]:
len(rows)

764417

In [15]:
import json
import pandas as pd

def loop(rows):
    rows, _ = rows
    data = []
    for r in tqdm(rows):
        f = r['audio_filename']
        if not os.path.exists(f):
            continue
            
        try:
            conversation = [
                {"role": "user", "content": [
                    {"type": "audio", "audio_url": "audio.wav"},
                    {"type": "text", "text": r['question']},
                ]},
                {"role": "assistant", "content": r['answer']},
            ]
            text = processor.apply_chat_template(conversation, tokenize=False)
        except Exception as e:
            continue
        

        data.append({
            'text': text,
            'audio': f,
        })
    return data

In [16]:
processed = multiprocessing(rows, loop, cores = 30)

100%|███████████████████████████████████████████████████████████████████████████████| 25480/25480 [00:04<00:00, 5360.93it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 25480/25480 [00:04<00:00, 5552.72it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 25480/25480 [00:04<00:00, 5671.32it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 25480/25480 [00:04<00:00, 5617.59it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 25480/25480 [00:04<00:00, 5360.95it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 25480/25480 [00:04<00:00, 5698.89it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 25480/25480 [00:04<00:00, 5543.68it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 25480/25480 [00:04<00:00, 5558.18it/s]


In [17]:
len(processed)

764417

In [18]:
with open('prepare-Zeroshot-Audio-Classification-Instructions.json', 'w') as fopen:
    json.dump(processed, fopen)

In [19]:
processed[0]

{'text': "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\ngiven the labels\n['people marching', 'waterfall burbling', 'playing tennis', 'people belly laughing', 'car engine starting', 'alarm clock ringing', 'female speech, woman speaking', 'cricket chirping', 'wind noise', 'foghorn', 'people battle cry', 'playing volleyball', 'female singing', 'playing harpsichord', 'male speech, man speaking', 'playing bassoon', 'playing piano', 'people clapping', 'bee, wasp, etc. buzzing', 'baby babbling', 'people whispering', 'coyote howling', 'metronome', 'playing harp', 'airplane', 'rope skipping', 'ambulance siren', 'people coughing', 'pheasant crowing', 'bird wings flapping', 'cap gun shooting', 'child singing', 'race car, auto racing', 'male singing', 'playing bass guitar', 'playing violin, fiddle', 'playing bongo', 'playing erhu', 'tap dancing', 'playing electronic organ', 'playing congas', 'subway, metro, underground'