In [1]:
from huggingface_hub import snapshot_download

# Download only the parquet files under data/ of the dataset repository
# into a local folder; `f` holds whatever snapshot_download returns.
f = snapshot_download(
    repo_id="mesolitica/Speaker-Diarization-Instructions",
    repo_type='dataset',
    allow_patterns="data/*.parquet",
    local_dir="./Speaker-Diarization-Instructions")

Fetching 10 files:   0%|          | 0/10 [00:00<?, ?it/s]

data/synthetic_speaker_diarization_datas(…):   0%|          | 0.00/1.89M [00:00<?, ?B/s]

In [2]:
from glob import glob
from tqdm import tqdm
from multiprocess import Pool
import itertools
import zipfile
import os

def chunks(l, n):
    """
    Yield consecutive slices of `l`, each paired with its 0-based position.

    Every slice holds `n` items except possibly the last one, which holds
    whatever remains. Items are yielded as `(slice, position)` tuples.
    """
    for position, start in enumerate(range(0, len(l), n)):
        stop = start + n
        yield (l[start: stop], position)


def multiprocessing(strings, function, cores=6, returned=True):
    """
    Split `strings` into chunks and map `function` over them in a process pool.

    Parameters
    ----------
    strings : sequence
        Items to distribute; must support `len()` and slicing.
    function : callable
        Called once per chunk with a `(chunk, chunk_index)` tuple, as
        produced by `chunks`. When `returned` is true it must return an
        iterable, because the per-chunk results are concatenated.
    cores : int
        Number of worker processes; also used to derive the chunk size.
    returned : bool
        When true, return the per-chunk results flattened into one list;
        otherwise return None.

    Returns
    -------
    list or None
    """
    # Floor division gives 0 when there are fewer items than cores, and
    # range() inside `chunks` rejects a zero step, so clamp to at least 1.
    chunk_size = max(1, len(strings) // cores)
    df_split = chunks(strings, chunk_size)
    pool = Pool(cores)
    try:
        pooled = pool.map(function, df_split)
    finally:
        # Always release the worker processes, even if a worker raised.
        pool.close()
        pool.join()

    if returned:
        return list(itertools.chain(*pooled))
    return None

def loop(files):
    """
    Extract each zip archive of a chunk into the current directory, then
    delete the archive.

    `files` is a `(paths, chunk_index)` tuple; the index is not used.
    """
    archive_paths, _ = files
    target_dir = './'
    for archive_path in tqdm(archive_paths):
        archive = zipfile.ZipFile(archive_path, 'r')
        with archive:
            archive.extractall(target_dir)
        # The archive is removed only after extraction finished without error.
        os.remove(archive_path)

# files = glob('*.zip')
# if len(files):
#     multiprocessing(files, loop, cores = min(len(files), 20), returned = False)

In [3]:
from transformers import AutoProcessor



In [4]:
# Load the processor for Qwen2-Audio-7B-Instruct; its chat template is what
# renders the conversations further down in this notebook.
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-Audio-7B-Instruct")
# Handle to the processor's tokenizer (not referenced by the cells visible here).
tokenizer = processor.tokenizer

In [5]:
from glob import glob
import pandas as pd

# Collect every record from the downloaded parquet files into one flat list,
# leaving out any file whose path contains both 'ami' and 'test'.
rows = []

files = [
    f for f in glob('Speaker-Diarization-Instructions/data/*.parquet')
    if 'ami' not in f or 'test' not in f
]
for f in files:
    # Each parquet file becomes a list of per-row dicts.
    df = pd.read_parquet(f).to_dict(orient = 'records')
    rows += df

In [6]:
# Total number of records collected from the parquet files.
len(rows)

98568

In [7]:
# Peek at one record to see which fields are available.
rows[0]

{'question': 'diarize the audio using whisper format',
 'answer': '<|0.00|> speaker A<|1.14|><|4.76|> speaker A<|6.66|><|7.32|> speaker B<|7.92|><|8.52|> speaker A<|11.72|><|15.44|> speaker A<|15.92|><|16.30|> speaker B<|18.88|><|20.22|> speaker C<|20.74|><|20.32|> speaker A<|20.78|><|20.42|> speaker B<|22.96|><|22.92|> speaker A<|24.34|><|23.06|> speaker C<|24.24|><|25.90|> speaker C<|27.54|>',
 'audio_filename': 'ami-ihm/train-0-0.mp3'}

In [8]:
import json
import pandas as pd

def loop(rows):
    """
    Render one chunk of dataset rows into chat-template text examples.

    Note: this reuses the name `loop`, replacing the zip-extraction helper
    of the same name defined earlier in the notebook.

    Parameters
    ----------
    rows : tuple
        `(chunk, chunk_index)` pair; the index is not used. Each item of
        the chunk is read through the keys 'question', 'answer' and
        'audio_filename'.

    Returns
    -------
    list of dict
        One `{'text': ..., 'audio': ...}` entry per row that could be
        rendered. Rows whose audio file does not exist on disk, or whose
        rendering raised, are left out; a single summary warning per chunk
        reports how many were skipped.
    """
    import warnings

    rows, _ = rows
    data = []
    missing = 0
    failed = 0
    last_error = None
    for r in tqdm(rows):
        f = r['audio_filename']
        if not os.path.exists(f):
            missing += 1
            continue

        try:
            conversation = [
                {"role": "user", "content": [
                    # "audio.wav" is a fixed placeholder; the real path is
                    # stored under 'audio' in the appended entry below.
                    {"type": "audio", "audio_url": "audio.wav"},
                    {"type": "text", "text": r['question']},
                ]},
                {"role": "assistant", "content": r['answer']},
            ]
            text = processor.apply_chat_template(conversation, tokenize=False)
        except Exception as e:
            # Stay best-effort (skip the row) but remember that it happened.
            failed += 1
            last_error = e
            continue

        data.append({
            'text': text,
            'audio': f,
        })

    # Report skips once per chunk instead of dropping rows silently.
    if missing or failed:
        message = (
            f'loop: skipped {missing} row(s) with missing audio and '
            f'{failed} row(s) that failed to render'
        )
        if last_error is not None:
            message += f' (last error: {last_error!r})'
        warnings.warn(message)
    return data

In [9]:
# Render all rows using a pool of 30 worker processes; the per-chunk
# results come back flattened into a single list.
processed = multiprocessing(rows, loop, cores = 30)

100%|█████████████████████████████████████████████████████████████████████████████████| 3285/3285 [00:00<00:00, 5504.12it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 3285/3285 [00:00<00:00, 5517.82it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 3285/3285 [00:00<00:00, 5520.21it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 3285/3285 [00:00<00:00, 5500.50it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 3285/3285 [00:00<00:00, 5519.52it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 3285/3285 [00:00<00:00, 5504.68it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 3285/3285 [00:00<00:00, 5387.79it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 3285/3285 [00:00<00:00, 5484.69it/s]


In [10]:
# Number of examples returned by the workers.
len(processed)

98568

In [11]:
# Persist the rendered examples as a single JSON file.
with open('prepare-Speaker-Diarization-Instructions.json', 'w') as fopen:
    json.dump(processed, fopen)

In [12]:
# Inspect one rendered example.
processed[0]

{'text': '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\ndiarize the audio using whisper format<|im_end|>\n<|im_start|>assistant\n<|0.00|> speaker A<|1.14|><|4.76|> speaker A<|6.66|><|7.32|> speaker B<|7.92|><|8.52|> speaker A<|11.72|><|15.44|> speaker A<|15.92|><|16.30|> speaker B<|18.88|><|20.22|> speaker C<|20.74|><|20.32|> speaker A<|20.78|><|20.42|> speaker B<|22.96|><|22.92|> speaker A<|24.34|><|23.06|> speaker C<|24.24|><|25.90|> speaker C<|27.54|><|im_end|>\n',
 'audio': 'ami-ihm/train-0-0.mp3'}