In [1]:
import torch
import json
import numpy as np
from tqdm import tqdm
from datasets import load_dataset
from multiprocess import Pool
import itertools

def chunks(l, n):
    for i in range(0, len(l), n):
        yield (l[i: i + n], i // n)

def multiprocessing(strings, function, cores=6, returned=True):
    df_split = chunks(strings, len(strings) // cores)
    pool = Pool(cores)
    pooled = pool.map(function, df_split)
    pool.close()
    pool.join()

    if returned:
        return list(itertools.chain(*pooled))
    
def new_path(f):
    return f.replace('_processed/', '_processed_trim_dac/').replace('.mp3', '.dac')

def new_path_audio(f):
    return f.replace('_processed/', '_processed_trim/')

with open('config.json') as fopen:
    config = json.load(fopen)
    
text_length = config['data']['text_length']
audio_length = config['data']['audio_length']
codebook_size = config['data']['channels']

max_text = config['data']['text_length']
pad_tok = config['data']['text_pad_value']
max_audio = config['data']['audio_length']

In [2]:
dataset = load_dataset('mesolitica/Malaysian-Emilia-Sesame')['train']



In [3]:
def loop(indices):
    indices, _ = indices
    lengths = []
    
    dataset = load_dataset('mesolitica/Malaysian-Emilia-Sesame')['train']
    for i in tqdm(indices):
        data = dataset[i]
        reference_audio = data['reference_audio'] 
        reference_text = data['reference_text']
        target_audio = data['target_audio']
        target_text = data['target_text']
        text = f'[S1] {reference_text}[S1] {target_text}'
        encoder_l = len(list(text.encode('utf-8')))
        files = [reference_audio, target_audio]
        decoder_l = 0
        for f in files:
            new_f = new_path(f)
            with open(new_f) as fopen:
                d = json.load(fopen)
            d = np.array(d)
            if d.shape[1] != codebook_size:
                d = d.T
            decoder_l += d.shape[0]
    
        lengths.append({
            'i': i,
            'encoder_l': encoder_l,
            'decoder_l': decoder_l
        })
    return lengths

In [4]:
lengths = loop((range(10), 0))

100%|██████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 991.66it/s]


In [5]:
lengths = multiprocessing(range(len(dataset)), loop, cores = 20)

100%|██████████████████████████████████████████████████████████████████████████████| 201343/201343 [03:47<00:00, 883.82it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 201343/201343 [03:51<00:00, 870.41it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 201343/201343 [03:51<00:00, 867.94it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 201343/201343 [03:51<00:00, 870.44it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 201343/201343 [03:50<00:00, 872.05it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 201343/201343 [03:52<00:00, 866.14it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 201343/201343 [03:50<00:00, 872.46it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 201343/201343 [03:53<00:00, 863.46it/s]


In [6]:
rows = sorted(lengths, key = lambda x: x['decoder_l'])

In [7]:
rows[-1]

{'i': 4022448, 'encoder_l': 940, 'decoder_l': 5168}

In [8]:
maxlen = 4096
maxlen_encoder = 0
data, temp, l, l_encoder = [], [], 0, 0
for r in tqdm(rows):
    if r['decoder_l'] > maxlen:
        continue
        
    if l + r['decoder_l'] >= maxlen:
        data.append(temp)
        temp = [r['i']]
        maxlen_encoder = max(maxlen_encoder, l_encoder)
        l = r['decoder_l']
        l_encoder = r['encoder_l']
    else:
        l += r['decoder_l']
        l_encoder += r['encoder_l']
        temp.append(r['i'])

100%|█████████████████████████████████████████████████████████████████████████| 4026870/4026870 [00:10<00:00, 382400.75it/s]


In [9]:
maxlen_encoder

1576

In [10]:
len(data)

1983156

In [11]:
dataset[data[-1][0]]

{'reference_audio': 'dialects_processed/14 NOVEMBER 2024 - BERITA PAGI SARAWAK [AWYuJFkJm5Q]/14 NOVEMBER 2024 - BERITA PAGI SARAWAK [AWYuJFkJm5Q]_9.mp3',
 'reference_text': 'pelajar mengikut kaum bagi kedua YSISS Petra Jaya dan YSISS Kuching adalah terdiri daripada Melayu 36.2%, Bidayah 19.1%, Iban 18.7%, Cina 12.2%, Melanau 7.9%, Orang Ulu 3.7%, dan 2.2% daripada pelbagai kaum yang lain. Empat lagi YSISS akan dibuka, iaitu di Sibu, Betong, Miri, dan Bintulu',
 'target_audio': 'dialects_processed/14 NOVEMBER 2024 - BERITA PAGI SARAWAK [AWYuJFkJm5Q]/14 NOVEMBER 2024 - BERITA PAGI SARAWAK [AWYuJFkJm5Q]_12.mp3',
 'target_text': 'Agensi Anti Dadah Kebangsaan AADK Sarawak dan Lembaga Sumber Asli dan Alam Sekitar NREB telah mengadakan operasi gerak kebangsaan semalam. Problem diadakan di stesen penguatkuasaan JPJ Jalan Kucing Serian itu melibatkan sekatan jalan raya, pemeriksaan pelepasan asap,'}

In [15]:
import json

with open('merged-dia-4096.json', 'w') as fopen:
    json.dump(data, fopen)

In [16]:
from huggingface_hub import HfApi
api = HfApi()
api.upload_file(
    path_or_fileobj="merged-dia-4096.json",
    path_in_repo="merged-dia-4096.json",
    repo_id="mesolitica/Malaysian-Emilia-Audio-Tokens",
    repo_type="dataset",
)

merged-dia-4096.json:   0%|          | 0.00/38.9M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/mesolitica/Malaysian-Emilia-Audio-Tokens/commit/e9df12dc61fa266982f2856c0ac359ae71963732', commit_message='Upload merged-dia-4096.json with huggingface_hub', commit_description='', oid='e9df12dc61fa266982f2856c0ac359ae71963732', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/mesolitica/Malaysian-Emilia-Audio-Tokens', endpoint='https://huggingface.co', repo_type='dataset', repo_id='mesolitica/Malaysian-Emilia-Audio-Tokens'), pr_revision=None, pr_num=None)