In [1]:
from multiprocess import Pool
import itertools
import numpy as np

def chunks(l, n):
    """
    Lazily cut `l` into consecutive slices of at most `n` items.

    Yields `(slice, slice_number)` tuples, where `slice_number` counts the
    slices from 0. The last slice may be shorter than `n`.
    """
    for part_no, offset in enumerate(range(0, len(l), n)):
        yield l[offset: offset + n], part_no

def multiprocessing(strings, function, cores=6, returned=True):
    """
    Split `strings` into slices and map `function` over them in a process pool.

    Parameters
    ----------
    strings : sequence
        Items to process; must support `len()` and slicing.
    function : callable
        Called once per slice with the `(slice, slice_number)` tuple produced
        by `chunks`. Must return an iterable when `returned` is true.
    cores : int
        Number of worker processes; also used to derive the slice size
        (`len(strings) // cores`, at least 1).
    returned : bool
        When true, the per-slice results are concatenated into one list and
        returned; otherwise nothing is returned.
    """
    # Floor division yields 0 when there are fewer items than cores, and
    # `chunks` cannot step by 0, so fall back to one item per slice.
    chunk_size = max(1, len(strings) // cores)
    df_split = chunks(strings, chunk_size)
    pool = Pool(cores)
    try:
        pooled = pool.map(function, df_split)
    finally:
        # Always release the worker processes, even if a worker raised.
        pool.close()
        pool.join()

    if returned:
        return list(itertools.chain.from_iterable(pooled))

In [2]:
from datasets import load_dataset
# Load the VoxConverse diarization dataset in the parent process; later cells
# use it to count the rows of each split.
ds = load_dataset("diarizers-community/voxconverse")

# Show the available splits, their columns and row counts.
print(ds)

DatasetDict({
    dev: Dataset({
        features: ['audio', 'timestamps_start', 'timestamps_end', 'speakers'],
        num_rows: 216
    })
    test: Dataset({
        features: ['audio', 'timestamps_start', 'timestamps_end', 'speakers'],
        num_rows: 232
    })
})




In [5]:
import string
import soundfile as sf
import numpy as np
from collections import defaultdict

def convert_rttm(chunk, filename = 'audio'):
    """
    Render segments as newline-joined `SPEAKER ...` lines.

    `chunk` is an iterable of `(start, end, speaker)` triples. Each line
    carries `filename`, the start time and the duration (`end - start`), both
    with four decimals, followed by the speaker label. An empty `chunk` gives
    an empty string.
    """
    return '\n'.join(
        f"SPEAKER {filename} 1 {begin:.4f} {finish - begin:.4f} <NA> <NA> <NA> <NA> {who}"
        for begin, finish, who in chunk
    )

def convert_textgrid(segments):
    """
    Render segments as TextGrid-style text with one interval tier per speaker.

    `segments` is a non-empty sequence of `(start, end, speaker)` triples.
    Tiers appear in order of each speaker's first segment; every tier spans
    the overall minimum start to the overall maximum end, and each interval's
    text is the speaker label. Times are written with two decimals.
    """
    by_speaker = defaultdict(list)
    for seg_start, seg_end, name in segments:
        by_speaker[name].append((seg_start, seg_end))

    earliest = min(seg_start for seg_start, _, _ in segments)
    latest = max(seg_end for _, seg_end, _ in segments)

    # File header.
    lines = [
        'File type = "ooTextFile"',
        'Object class = "TextGrid"',
        "",
        f"xmin = {earliest:.2f}",
        f"xmax = {latest:.2f}",
        "tiers? <exists>",
        f"size = {len(by_speaker)}",
        "item []:",
    ]

    for tier_no, (name, spans) in enumerate(by_speaker.items(), start=1):
        # Tier header.
        lines += [
            f"    item [{tier_no}]:",
            '        class = "IntervalTier"',
            f'        name = "{name}"',
            f"        xmin = {earliest:.2f}",
            f"        xmax = {latest:.2f}",
            f"        intervals: size = {len(spans)}",
        ]
        # One entry per segment spoken by this speaker.
        for span_no, (span_start, span_end) in enumerate(spans, start=1):
            lines += [
                f"        intervals [{span_no}]:",
                f"            xmin = {span_start:.2f}",
                f"            xmax = {span_end:.2f}",
                f'            text = "{name}"',
            ]

    return '\n'.join(lines)

# Grid of 0.02-second steps (1501 values, starting at 0) that `loop` snaps
# segment boundaries onto.
timestamps = [0.02 * tick for tick in range(1501)]

In [6]:
# !rm -rf voxconverse
# !mkdir voxconverse

In [7]:
from tqdm import tqdm
import os

def loop(indices):
    """
    Worker: cut recordings into short clips and build diarization instruction rows.

    Parameters
    ----------
    indices : tuple
        `(items, slice_number)` as produced by `chunks`; `items` is a list of
        `(row_number, split_name)` pairs. The slice number is ignored.

    Returns
    -------
    list of dict
        Three rows per written clip (whisper-style, rttm and textgrid answers),
        each with the keys `question`, `answer` and `audio_filename`.

    Segments of a recording are sorted by start time and grouped greedily: a
    new group starts as soon as a segment's end lies `max_len` seconds or more
    after the current group's first start. Groups whose real span still
    exceeds `max_len` (e.g. because of overlapping speech) are dropped.
    Segment times are made relative to the clip start and snapped to the
    nearest value in `timestamps`; speakers are relabelled `speaker A`,
    `speaker B`, ... in order of first appearance inside the clip.
    """
    indices, _ = indices
    ds = load_dataset("diarizers-community/voxconverse")

    max_len = 30           # maximum clip length in seconds
    sample_rate = 16000    # samples per second used for slicing and writing
    out_dir = 'voxconverse'
    # Clips are written into this folder; create it so a fresh run does not
    # depend on a manual `mkdir`.
    os.makedirs(out_dir, exist_ok=True)

    def snap(value):
        # Nearest grid point in `timestamps`.
        return min(timestamps, key=lambda t: abs(t - value))

    data = []
    for k, key in tqdm(indices):
        row = ds[key][k]
        audio = row['audio']['array']

        # Sort the parallel annotation lists by segment start time.
        argsort = np.argsort(row['timestamps_start'])
        timestamps_start = [row['timestamps_start'][i] for i in argsort]
        timestamps_end = [row['timestamps_end'][i] for i in argsort]
        speakers = [row['speakers'][i] for i in argsort]
        if not timestamps_start:
            # Nothing annotated for this recording.
            continue

        # Greedy grouping into windows shorter than `max_len`.
        groups, current = [], []
        window_start = timestamps_start[0]
        for seg_start, seg_end, speaker in zip(timestamps_start, timestamps_end, speakers):
            if seg_end - window_start >= max_len:
                groups.append(current)
                current = []
                window_start = seg_start
            current.append([seg_start, seg_end, speaker])
        if current:
            groups.append(current)

        for no, chunk in enumerate(groups):
            if not chunk:
                # Empty placeholder: the first segment alone reached
                # `max_len`. It stays in `groups` so that `no`, and therefore
                # the clip file names, keep their numbering.
                continue

            start_time = chunk[0][0]
            end_time = max(seg[1] for seg in chunk)
            if round(end_time - start_time, 2) > max_len:
                continue

            # Speakers in order of first appearance within this clip.
            chunk_speakers = []
            for seg in chunk:
                if seg[-1] not in chunk_speakers:
                    chunk_speakers.append(seg[-1])

            y = audio[int(sample_rate * start_time): int(sample_rate * end_time)]
            audio_filename = f'{out_dir}/{key}-{k}-{no}.mp3'
            if not os.path.exists(audio_filename):
                sf.write(audio_filename, y, sample_rate)

            ts = []
            for seg in chunk:
                index = chunk_speakers.index(seg[-1])
                snapped_start = snap(seg[0] - start_time)
                snapped_end = snap(seg[1] - start_time)
                speaker_name = f'speaker {string.ascii_uppercase[index]}'
                # Rewrite the segment in place so the rttm/textgrid renderers
                # below see clip-relative times and the new labels.
                seg[0] = snapped_start
                seg[1] = snapped_end
                seg[-1] = speaker_name
                ts.append(f"<|{snapped_start:.2f}|> {speaker_name}<|{snapped_end:.2f}|>")

            answers = (
                ('whisper', ''.join(ts)),
                ('rttm', convert_rttm(chunk)),
                ('textgrid', convert_textgrid(chunk)),
            )
            for format_name, answer in answers:
                data.append({
                    'question': f'diarize the audio using {format_name} format',
                    'answer': answer,
                    'audio_filename': audio_filename,
                })

    return data

In [8]:
# Pair every row number of the dev split with its split name, then hand the
# list to the worker pool (at most 20 processes).
indices = [(row_no, 'dev') for row_no in range(len(ds['dev']))]
prepared_validation = multiprocessing(indices, loop, cores = min(len(indices), 20))

100%|███████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:01<00:00,  6.36it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:01<00:00,  5.72it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:01<00:00,  6.08it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:01<00:00,  5.65it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:01<00:00,  6.72it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:01<00:00,  5.64it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:01<00:00,  6.16it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:01<00:00,  7.38it/s]


In [9]:
# Pair every row number of the test split with its split name, then hand the
# list to the worker pool (at most 20 processes).
indices = [(row_no, 'test') for row_no in range(len(ds['test']))]
prepared_test = multiprocessing(indices, loop, cores = min(len(indices), 20))

100%|███████████████████████████████████████████████████████████████████████████████████████| 11/11 [00:01<00:00,  5.71it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████| 11/11 [00:02<00:00,  4.75it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████| 11/11 [00:02<00:00,  4.83it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████| 11/11 [00:02<00:00,  5.30it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████| 11/11 [00:02<00:00,  4.81it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████| 11/11 [00:02<00:00,  4.88it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████| 11/11 [00:02<00:00,  4.69it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████| 11/11 [00:02<00:00,  5.12it/s]


In [10]:
# Number of instruction rows produced for each split (validation, test).
len(prepared_validation), len(prepared_test)

(6564, 13959)

In [11]:
# Spot-check one generated row: the second to last validation entry.
prepared_validation[-2]

{'question': 'diarize the audio using rttm format',
 'answer': 'SPEAKER audio 1 0.0000 9.9200 <NA> <NA> <NA> <NA> speaker A\nSPEAKER audio 1 9.9600 1.1200 <NA> <NA> <NA> <NA> speaker B\nSPEAKER audio 1 11.7200 12.1200 <NA> <NA> <NA> <NA> speaker B',
 'audio_filename': 'voxconverse/dev-215-4.mp3'}

In [12]:
import pandas as pd

# Write each split's rows (question, answer, audio_filename) to its own
# parquet file in the working directory.
pd.DataFrame(prepared_validation).to_parquet('voxconverse-validation.parquet')
pd.DataFrame(prepared_test).to_parquet('voxconverse-test.parquet')

In [13]:
# Upload the validation parquet file to the dataset repository.
# NOTE(review): the destination starts with '/', which shows up as '//' in the
# URL printed by the CLI — confirm whether 'data/...' was intended.
!huggingface-cli upload mesolitica/Speaker-Diarization-Instructions \
voxconverse-validation.parquet /data/voxconverse_validation-00000-of-00001.parquet \
--repo-type=dataset

Uploading files using Xet Storage..
Uploading...: 100%|██████████████████████████| 424k/424k [00:06<00:00, 68.2kB/s]
https://huggingface.co/datasets/mesolitica/Speaker-Diarization-Instructions/blob/main//data/voxconverse_validation-00000-of-00001.parquet


In [14]:
# Upload the test parquet file to the dataset repository.
# NOTE(review): the destination starts with '/', which shows up as '//' in the
# URL printed by the CLI — confirm whether 'data/...' was intended.
!huggingface-cli upload mesolitica/Speaker-Diarization-Instructions \
voxconverse-test.parquet /data/voxconverse_test-00000-of-00001.parquet \
--repo-type=dataset

Uploading files using Xet Storage..
Uploading...: 100%|█████████████████████████| 1.02M/1.02M [00:05<00:00, 203kB/s]
https://huggingface.co/datasets/mesolitica/Speaker-Diarization-Instructions/blob/main//data/voxconverse_test-00000-of-00001.parquet
