In [1]:
from huggingface_hub import hf_hub_download

# Fetch the single training parquet shard from the dataset repository.
# `f` is later handed to pd.read_parquet, so it is kept as the result name.
f = hf_hub_download(
    repo_id='mesolitica/MusicBench-Instructions',
    repo_type='dataset',
    filename='data/train-00000-of-00001.parquet',
    local_dir='./MusicBench-Instructions',
)

In [2]:
from glob import glob
from tqdm import tqdm
from multiprocess import Pool
import itertools
import zipfile
import os

def chunks(l, n):
    """Yield consecutive slices of ``l`` of length ``n`` paired with their position.

    Args:
        l: A sliceable sequence supporting ``len``.
        n: Size of each slice; the final slice may be shorter.

    Yields:
        ``(slice, chunk_index)`` tuples, where ``chunk_index`` counts up from 0.
    """
    for chunk_index, start in enumerate(range(0, len(l), n)):
        stop = start + n
        yield (l[start:stop], chunk_index)


def multiprocessing(strings, function, cores=6, returned=True):
    """Split ``strings`` into chunks and run ``function`` on them in a process pool.

    Note: this helper shares its name with the standard-library
    ``multiprocessing`` module; the pool itself comes from ``multiprocess``.

    Args:
        strings: Sequence of work items (must support ``len`` and slicing).
        function: Callable invoked once per chunk with the ``(chunk, chunk_index)``
            tuple produced by ``chunks``. When ``returned`` is true it must
            return an iterable.
        cores: Requested number of worker processes; also used to derive the
            chunk size (``len(strings) // cores``, at least 1).
        returned: When true, flatten the per-chunk results into one list and
            return it; otherwise return ``None``.

    Returns:
        A flat list of all per-chunk results, or ``None`` if ``returned`` is false.
    """
    # Nothing to distribute: skip the pool entirely. Without this guard the
    # chunk size would be 0 and range() would raise on a zero step.
    if len(strings) == 0:
        return [] if returned else None

    # Floor division gives 0 when there are fewer items than cores; clamp to 1
    # so every item still lands in a (single-element) chunk.
    chunk_size = max(1, len(strings) // cores)
    df_split = list(chunks(strings, chunk_size))

    # Never start more workers than there are chunks to process.
    pool = Pool(min(cores, len(df_split)))
    try:
        pooled = pool.map(function, df_split)
    finally:
        # Release the workers even if a chunk raised.
        pool.close()
        pool.join()

    if returned:
        return list(itertools.chain(*pooled))

def loop(files):
    """Unpack every zip archive of one chunk into the current directory, then delete it.

    NOTE: a later cell defines another function named ``loop``; once that cell
    runs, this definition is no longer reachable under this name.

    Args:
        files: ``(paths, chunk_index)`` tuple; the index is not used.
    """
    archive_paths, _chunk_index = files
    target_dir = './'
    for archive_path in tqdm(archive_paths):
        with zipfile.ZipFile(archive_path, 'r') as archive:
            archive.extractall(target_dir)
        # Only reached when extraction succeeded; the archive is then discarded.
        os.remove(archive_path)

# files = glob('*.zip')
# if len(files):
#     multiprocessing(files, loop, cores = min(len(files), 20), returned = False)

In [3]:
from transformers import AutoProcessor



In [4]:
# Load the Qwen2-Audio processor; its chat template is what the cells below
# use to render conversations into training text.
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-Audio-7B-Instruct")
# Convenience handle to the processor's tokenizer.
tokenizer = processor.tokenizer

In [5]:
import pandas as pd

# `f` is the parquet path returned by hf_hub_download in the first cell.
# Despite its name, `df` is a list of per-row dicts (orient='records'),
# not a DataFrame.
df = pd.read_parquet(f).to_dict(orient = 'records')
# Show the first record to inspect the available keys.
df[0]

{'question': 'can u describe the audio',
 'answer': 'This mellow instrumental track showcases a dominant electric guitar that opens with a descending riff, followed by arpeggiated chords, hammer-ons, and a slide. The percussion section keeps it simple with rim shots and a common time count, while the bass adds a single note on the first beat of every bar. Minimalist piano chords round out the song while leaving space for the guitar to shine. There are no vocals, making it perfect for a coffee shop or some chill background music. The key is in E major, with a chord progression that centers around that key and a straightforward 4/4 time signature.',
 'audio_filename': 'datashare/data_aug2/-0SdAVK79lg_1.wav',
 'metadata': '{"dataset": "MusicBench", "location": "data_aug2/-0SdAVK79lg_1.wav", "main_caption": "This mellow instrumental track showcases a dominant electric guitar that opens with a descending riff, followed by arpeggiated chords, hammer-ons, and a slide. The percussion section k

In [6]:
# Build a one-turn sample conversation from the first record: a user message
# holding an audio entry plus the question text, followed by the assistant's
# answer. The "audio.wav" URL is a fixed literal here, not the row's own file.
conversation = [
    {"role": "user", "content": [
        {"type": "audio", "audio_url": "audio.wav"},
        {"type": "text", "text": df[0]['question']},
    ]},
    {"role": "assistant", "content": df[0]['answer']},
]

In [7]:
# Preview the rendered prompt as a plain string (tokenize=False) for the
# sample conversation.
processor.apply_chat_template(conversation, tokenize=False)

'<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\ncan u describe the audio<|im_end|>\n<|im_start|>assistant\nThis mellow instrumental track showcases a dominant electric guitar that opens with a descending riff, followed by arpeggiated chords, hammer-ons, and a slide. The percussion section keeps it simple with rim shots and a common time count, while the bass adds a single note on the first beat of every bar. Minimalist piano chords round out the song while leaving space for the guitar to shine. There are no vocals, making it perfect for a coffee shop or some chill background music. The key is in E major, with a chord progression that centers around that key and a straightforward 4/4 time signature.<|im_end|>\n'

In [8]:
import json
import pandas as pd

def loop(rows):
    """Render the chat-template text for every row whose audio file exists on disk.

    Rows whose audio file is missing are skipped. Rows for which building or
    rendering the conversation raises are skipped as well, but they are now
    counted and reported once per chunk instead of vanishing silently.

    Args:
        rows: ``(records, chunk_index)`` tuple; the index is not used. Each
            record is read via its 'audio_filename', 'question' and 'answer' keys.

    Returns:
        A list of ``{'text': ..., 'audio': ...}`` dicts, one per kept row.
    """
    import logging

    rows, _ = rows
    data = []
    failed = 0
    first_error = None
    for r in tqdm(rows):
        # Named `audio_path` (not `f`) to avoid shadowing the notebook-level `f`.
        audio_path = r['audio_filename']
        if not os.path.exists(audio_path):
            continue

        try:
            conversation = [
                {"role": "user", "content": [
                    {"type": "audio", "audio_url": "audio.wav"},
                    {"type": "text", "text": r['question']},
                ]},
                {"role": "assistant", "content": r['answer']},
            ]
            text = processor.apply_chat_template(conversation, tokenize=False)
        except Exception as e:
            # Best-effort: keep going, but remember what went wrong.
            failed += 1
            if first_error is None:
                first_error = e
            continue

        data.append({
            'text': text,
            'audio': audio_path,
        })

    if failed:
        # One summary per chunk keeps the output readable even if many rows fail.
        logging.getLogger(__name__).warning(
            "skipped %d of %d rows because the chat template could not be rendered "
            "(first error: %r)",
            failed, len(rows), first_error,
        )
    return data

In [9]:
# Render all records in parallel with 30 worker processes. `loop` here is the
# row-rendering function from the previous cell, which replaced the earlier
# zip-extraction `loop`.
processed = multiprocessing(df, loop, cores = 30)

100%|█████████████████████████████████████████████████████████████████████████████████| 5039/5039 [00:00<00:00, 5618.21it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 5039/5039 [00:00<00:00, 5391.92it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 5039/5039 [00:00<00:00, 5659.15it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 5039/5039 [00:00<00:00, 5645.27it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 5039/5039 [00:00<00:00, 5634.72it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 5039/5039 [00:00<00:00, 5629.11it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 5039/5039 [00:00<00:00, 5618.58it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 5039/5039 [00:00<00:00, 5561.78it/s]


In [10]:
# Number of examples kept after skipping rows without an audio file or with a
# failed template render.
len(processed)

151197

In [11]:
# Persist the prepared examples (list of {'text', 'audio'} dicts) as one JSON array.
with open('prepare-MusicBench-Instructions.json', 'w') as fopen:
    json.dump(processed, fopen)

In [12]:
# Spot-check the first prepared example.
processed[0]

{'text': '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\ncan u describe the audio<|im_end|>\n<|im_start|>assistant\nThis mellow instrumental track showcases a dominant electric guitar that opens with a descending riff, followed by arpeggiated chords, hammer-ons, and a slide. The percussion section keeps it simple with rim shots and a common time count, while the bass adds a single note on the first beat of every bar. Minimalist piano chords round out the song while leaving space for the guitar to shine. There are no vocals, making it perfect for a coffee shop or some chill background music. The key is in E major, with a chord progression that centers around that key and a straightforward 4/4 time signature.<|im_end|>\n',
 'audio': 'datashare/data_aug2/-0SdAVK79lg_1.wav'}