In [1]:
from huggingface_hub import snapshot_download

# Download the dataset repo's parquet shards into a local folder.
# The call is the cell's last expression, so its return value (the local
# directory path) is what the notebook displays.
snapshot_download(
    repo_id="mesolitica/AudioSet-Audio-Instructions",
    repo_type='dataset',
    allow_patterns="data/*.parquet",  # only fetch files matching this pattern
    local_dir="./AudioSet-Audio-Instructions",
)

Fetching 32 files:   0%|          | 0/32 [00:00<?, ?it/s]

'/home/mesolitica/stt/AudioSet-Audio-Instructions'

In [3]:
from glob import glob
from tqdm import tqdm
from multiprocess import Pool
import itertools
import zipfile
import os

def chunks(l, n):
    """
    Lazily cut `l` into consecutive slices of at most `n` items.

    Yields `(slice, index)` tuples, where `index` is the zero-based position
    of the slice in the sequence of slices.
    """
    for index, start in enumerate(range(0, len(l), n)):
        stop = start + n
        yield (l[start:stop], index)


def multiprocessing(strings, function, cores=6, returned=True):
    """
    Split `strings` into chunks and map `function` over them in a process pool.

    Parameters
    ----------
    strings : sequence
        Work items; must support `len()` and slicing.
    function : callable
        Called once per chunk with a `(chunk, chunk_index)` tuple, as produced
        by `chunks`. Must return an iterable when `returned` is True.
    cores : int, default 6
        Number of worker processes; also the divisor used for the chunk size.
    returned : bool, default True
        If True, the per-chunk results are flattened into a single list.
        If False, results are discarded and None is returned.

    Returns
    -------
    list or None
        Flattened results when `returned` is True, otherwise None.
    """
    if len(strings) == 0:
        # Nothing to process; avoids a zero chunk size and spawning idle workers.
        return [] if returned else None

    # Floor division gives 0 when there are fewer items than cores, which
    # would make range() inside `chunks` fail with a zero step.
    chunk_size = max(1, len(strings) // cores)
    df_split = chunks(strings, chunk_size)
    pool = Pool(cores)
    try:
        pooled = pool.map(function, df_split)
    finally:
        # Always release the workers, even when `function` raised.
        pool.close()
        pool.join()

    if returned:
        return list(itertools.chain.from_iterable(pooled))

In [4]:
from glob import glob
import pandas as pd
import os
import json
from transformers import AutoProcessor



In [5]:
# Load the Qwen2-Audio processor; loop() uses its chat template to render each example
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-Audio-7B-Instruct")
# Keep a direct handle on the tokenizer exposed by the processor
tokenizer = processor.tokenizer

In [6]:
# Collect the downloaded parquet shards in sorted order
files = sorted(glob('AudioSet-Audio-Instructions/data/*.parquet'))
files

['AudioSet-Audio-Instructions/data/500k_part1_nonspeech-00000-of-00009.parquet',
 'AudioSet-Audio-Instructions/data/500k_part1_nonspeech-00001-of-00009.parquet',
 'AudioSet-Audio-Instructions/data/500k_part1_nonspeech-00002-of-00009.parquet',
 'AudioSet-Audio-Instructions/data/500k_part1_nonspeech-00003-of-00009.parquet',
 'AudioSet-Audio-Instructions/data/500k_part1_nonspeech-00004-of-00009.parquet',
 'AudioSet-Audio-Instructions/data/500k_part1_nonspeech-00005-of-00009.parquet',
 'AudioSet-Audio-Instructions/data/500k_part1_nonspeech-00006-of-00009.parquet',
 'AudioSet-Audio-Instructions/data/500k_part1_nonspeech-00007-of-00009.parquet',
 'AudioSet-Audio-Instructions/data/500k_part1_nonspeech-00008-of-00009.parquet',
 'AudioSet-Audio-Instructions/data/500k_part1_speech-00000-of-00007.parquet',
 'AudioSet-Audio-Instructions/data/500k_part1_speech-00001-of-00007.parquet',
 'AudioSet-Audio-Instructions/data/500k_part1_speech-00002-of-00007.parquet',
 'AudioSet-Audio-Instructions/data/50

In [7]:
# Create the folder that receives the extracted audio files.
# exist_ok=True keeps this cell safe to re-run (plain `mkdir` fails if the folder exists).
os.makedirs('AudioSet-Audio-Instructions-audio', exist_ok=True)

In [8]:
def loop(files, output_dir='AudioSet-Audio-Instructions-audio'):
    """
    Extract audio bytes to MP3 files and render chat-template text per row.

    Parameters
    ----------
    files : tuple
        `(paths, chunk_index)` as produced by `chunks`; only `paths`, a list
        of parquet file paths, is used.
    output_dir : str
        Directory that receives the extracted audio files. It must already
        exist. Defaults to the folder used throughout this notebook.

    Returns
    -------
    list of dict
        One `{'text': ..., 'audio': ...}` record per row whose chat template
        rendered successfully. Rows that fail to render are skipped and
        counted in a per-shard summary message.
    """
    files, _ = files
    data = []
    for f in files:
        f_only = os.path.split(f)[1].replace('.parquet', '')
        df = pd.read_parquet(f)
        skipped = 0
        last_error = None
        for i in tqdm(range(len(df))):
            # Render the text first so a failing row does not leave an
            # orphaned audio file on disk.
            try:
                conversation = [
                    {"role": "user", "content": [
                        # "audio.wav" is a placeholder value; the real audio
                        # path is stored separately in the record below.
                        {"type": "audio", "audio_url": "audio.wav"},
                        {"type": "text", "text": df['question'].iloc[i]},
                    ]},
                    {"role": "assistant", "content": df['answer'].iloc[i]},
                ]
                text = processor.apply_chat_template(conversation, tokenize=False)
            except Exception as e:
                # Best-effort: skip the row, but remember it for the summary.
                skipped += 1
                last_error = e
                continue

            # The row index in the file name keeps names unique within a shard.
            new_f = os.path.join(output_dir, f'{f_only}-{i}.mp3')
            with open(new_f, 'wb') as fopen:
                fopen.write(df['audio_filename'].iloc[i]['bytes'])

            data.append({
                'text': text,
                'audio': new_f,
            })

        if skipped:
            # Surface dropped rows instead of silently swallowing them.
            print(f'{f}: skipped {skipped} rows (last error: {last_error!r})')
    return data

In [9]:
# Smoke-test on the first shard only, passing the (paths, chunk_index) tuple that loop() unpacks
processed = loop((files[:1], 0))

100%|███████████████████████████████████████████████████████████████████████████████| 10269/10269 [00:02<00:00, 3527.37it/s]


In [10]:
# Number of records produced from the single-shard run
len(processed)

10269

In [11]:
# Inspect the last record to check the rendered chat text and the audio path
processed[-1]

{'text': '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\nWhat are the prominent sounds in the audio?<|im_end|>\n<|im_start|>assistant\nThe audio captures the sounds of a boat or water vehicle, likely a motorboat or speedboat, moving through the water. You can hear the engine hum and the rush of wind, which is creating a noticeable wind noise picked up by the microphone.<|im_end|>\n',
 'audio': 'AudioSet-Audio-Instructions-audio/500k_part1_nonspeech-00000-of-00009-10268.mp3'}

In [12]:
import IPython.display as ipd
# Play back the extracted file of the last record to confirm the audio was written correctly
ipd.Audio(processed[-1]['audio']) 

In [13]:
# Full run over every shard in parallel (at most 30 workers).
# This rebinds `processed`, replacing the single-shard result from the smoke test.
processed = multiprocessing(files, loop, cores = min(len(files), 30))

100%|███████████████████████████████████████████████████████████████████████████████| 10269/10269 [00:04<00:00, 2106.81it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 8995/8995 [00:04<00:00, 1953.59it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 8995/8995 [00:04<00:00, 1946.22it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 9086/9086 [00:04<00:00, 1955.15it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 9086/9086 [00:04<00:00, 1943.97it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 8995/8995 [00:04<00:00, 1916.53it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 9086/9086 [00:04<00:00, 1931.05it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 9086/9086 [00:04<00:00, 1941.29it/s]


In [14]:
# Total number of records across all shards
len(processed)

312883

In [16]:
# Persist the list of {'text', 'audio'} records as a single JSON array
with open('prepare-AudioSet-Audio-Instructions.json', 'w') as fopen:
    json.dump(processed, fopen)