In [1]:
from huggingface_hub import snapshot_download

# Fetch only the files matching data/*.parquet from the dataset repo into a
# local folder; the call's return value is the cell's displayed result.
snapshot_download(
    repo_id="mesolitica/AudioSet-Audio-Adversarial-Instructions",
    repo_type='dataset',
    allow_patterns="data/*.parquet",
    local_dir="./AudioSet-Audio-Adversarial-Instructions",
)

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/6.39M [00:00<?, ?B/s]

'/home/mesolitica/stt/AudioSet-Audio-Adversarial-Instructions'

In [2]:
from glob import glob
from tqdm import tqdm
from multiprocess import Pool
import itertools
import zipfile
import os

def chunks(l, n):
    """
    Lazily cut the sequence ``l`` into consecutive slices of at most ``n`` items.

    Yields ``(slice, position)`` tuples, where ``position`` is the zero-based
    ordinal of the slice within ``l``.
    """
    slice_starts = range(0, len(l), n)
    for position, start in enumerate(slice_starts):
        stop = start + n
        yield (l[start:stop], position)


def multiprocessing(strings, function, cores=6, returned=True):
    """
    Apply ``function`` to slices of ``strings`` using a pool of worker processes.

    ``strings`` is cut by ``chunks`` into slices of ``len(strings) // cores``
    items (never fewer than one item per slice). When the length is not a
    multiple of ``cores`` this produces one extra, shorter slice. Note that
    this function's name shadows the standard-library ``multiprocessing``
    module inside the notebook namespace.

    Parameters
    ----------
    strings : sequence
        Items to process; must support ``len`` and slicing.
    function : callable
        Called once per slice with a ``(slice, slice_index)`` tuple. When
        ``returned`` is true, each result must be iterable.
    cores : int, default 6
        Number of worker processes to start.
    returned : bool, default True
        Whether to collect and return the results.

    Returns
    -------
    list or None
        The per-slice results concatenated in input order when ``returned``
        is true, otherwise ``None``.

    Raises
    ------
    ValueError
        If ``cores`` is smaller than 1 while there is work to do.
    """
    if len(strings) == 0:
        # Nothing to distribute: avoid starting worker processes at all.
        return [] if returned else None

    if cores < 1:
        raise ValueError(f'cores must be >= 1, got {cores!r}')

    # Floor division gives 0 when there are fewer items than cores, and a
    # zero step makes range() inside chunks() raise; clamp to one item.
    chunk_size = max(1, len(strings) // cores)

    pool = Pool(cores)
    try:
        pooled = pool.map(function, chunks(strings, chunk_size))
    finally:
        # Release the worker processes even when a task raised.
        pool.close()
        pool.join()

    if returned:
        return list(itertools.chain.from_iterable(pooled))
    return None

In [3]:
from glob import glob
import pandas as pd
import os
import json
from transformers import AutoProcessor



In [4]:
# Load the Qwen2-Audio processor; `loop` calls its apply_chat_template to
# render each conversation as text.
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-Audio-7B-Instruct")
# Direct handle to the processor's tokenizer (not referenced by the other
# cells visible in this notebook).
tokenizer = processor.tokenizer

In [5]:
# Collect the downloaded parquet shards in sorted order and display the list.
files = sorted(glob('AudioSet-Audio-Adversarial-Instructions/data/*.parquet'))
files

['AudioSet-Audio-Adversarial-Instructions/data/train-00000-of-00001.parquet']

In [11]:
# NOTE: despite its name, `df` is a list of row dicts (orient='records'),
# not a DataFrame. Only the first shard in `files` is read.
df = pd.read_parquet(files[0]).to_dict(orient = 'records')

In [14]:
def loop(rows):
    """
    Render one chunk of dataset rows into chat-template training records.

    Parameters
    ----------
    rows : tuple
        ``(records, chunk_index)`` in the shape yielded by ``chunks``; the
        index is ignored. Each record is read for the keys
        ``audio_filename``, ``question`` and ``answer``.

    Returns
    -------
    list of dict
        One ``{'text': ..., 'audio': ...}`` entry per record whose audio file
        exists on disk and whose conversation rendered without error.
        Records that fail either condition are skipped. Rendering failures
        are reported in a single warning per chunk rather than dropped
        without a trace.
    """
    import logging

    rows, _ = rows
    data = []
    failed = 0
    first_error = None
    for r in tqdm(rows):
        new_f = r['audio_filename']
        if not os.path.exists(new_f):
            # Rows whose audio is not available locally are filtered out.
            continue

        try:
            conversation = [
                {'role': 'system', 'content': 'you are an assistant to classify an audio, only reply Yes / No only.'},
                {"role": "user", "content": [
                    # "audio.wav" looks like a placeholder; the real path is
                    # stored under 'audio' below -- TODO confirm the template
                    # never embeds audio_url in the rendered text.
                    {"type": "audio", "audio_url": "audio.wav"},
                    {"type": "text", "text": r['question']},
                ]},
                {"role": "assistant", "content": r['answer']},
            ]
            text = processor.apply_chat_template(conversation, tokenize=False)
        except Exception as e:
            # Best-effort: keep going, but remember the failure so that a
            # systematic problem does not silently shrink the dataset.
            failed += 1
            if first_error is None:
                first_error = e
            continue

        data.append({
            'text': text,
            'audio': new_f,
        })

    if failed:
        logging.getLogger(__name__).warning(
            'loop: skipped %d row(s) that could not be rendered; first error: %r',
            failed, first_error,
        )
    return data

In [15]:
# Smoke-test `loop` in-process on the first 100 rows (passed as chunk index 0)
# before launching the parallel run.
processed = loop((df[:100], 0))

100%|███████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 5373.59it/s]


In [16]:
# Inspect the last rendered record.
processed[-1]

{'text': '<|im_start|>system\nyou are an assistant to classify an audio, only reply Yes / No only.<|im_end|>\n<|im_start|>user\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\ncan you hear carnatic music sound in the audio<|im_end|>\n<|im_start|>assistant\nNo<|im_end|>\n',
 'audio': 'AudioSet-Audio-Instructions-audio/500k_part1_nonspeech-00000-of-00009-99.mp3'}

In [17]:
import IPython.display as ipd
# Play back the audio file referenced by the last processed record.
ipd.Audio(processed[-1]['audio']) 

In [20]:
# Full run over every row with at most 30 worker processes; this rebinds
# `processed`, replacing the 100-row sample result.
processed = multiprocessing(df, loop, cores = min(len(df), 30))

100%|███████████████████████████████████████████████████████████████████████████████| 10429/10429 [00:01<00:00, 5386.85it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 10429/10429 [00:01<00:00, 5399.41it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 10429/10429 [00:01<00:00, 5585.14it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 10429/10429 [00:01<00:00, 5472.79it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 10429/10429 [00:01<00:00, 5263.11it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 10429/10429 [00:01<00:00, 5553.03it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 10429/10429 [00:01<00:00, 5562.89it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 10429/10429 [00:01<00:00, 5547.93it/s]


In [21]:
# Number of records currently held in `processed`.
len(processed)

312883

In [22]:
# Persist every record in `processed` as one JSON document.
with open('prepare-AudioSet-Audio-Adversarial-Instructions.json', 'w') as fopen:
    json.dump(processed, fopen)