In [2]:
import json
import os
import numpy as np
from collections import defaultdict
from glob import glob
from tqdm import tqdm
import soundfile as sf
import re
from multiprocess import Pool
import itertools

def chunks(l, n):
    """Lazily cut ``l`` into consecutive slices of at most ``n`` items.

    Yields ``(slice, index)`` tuples, where ``index`` is the zero-based
    position of the slice. The final slice may hold fewer than ``n`` items.
    """
    for batch_no, offset in enumerate(range(0, len(l), n)):
        yield (l[offset: offset + n], batch_no)

def multiprocessing(strings, function, cores=6, returned=True):
    """Run ``function`` over batches of ``strings`` in a pool of worker processes.

    ``strings`` is cut into consecutive batches by ``chunks``; each worker
    call receives one ``(batch, batch_index)`` tuple.

    Note: this function's name shadows the standard-library ``multiprocessing``
    module inside this notebook.

    Args:
        strings: sized, sliceable sequence of work items.
        function: callable accepting a single ``(batch, batch_index)`` tuple.
            When ``returned`` is true it must return an iterable, because the
            per-batch results are flattened into one list.
        cores: number of worker processes; also the divisor used to size
            the batches.
        returned: when true, return the flattened results; otherwise the
            function returns ``None``.

    Returns:
        A flat list of every item produced by the workers, in batch order,
        or ``None`` when ``returned`` is false.
    """
    # Floor division yields 0 when there are fewer items than cores, and a
    # zero step makes range() inside chunks() raise ValueError. Clamping to 1
    # keeps small (and empty) inputs working without changing larger ones.
    batch_size = max(1, len(strings) // cores)
    df_split = chunks(strings, batch_size)
    pool = Pool(cores)
    try:
        pooled = pool.map(function, df_split)
    except BaseException:
        # Do not leave worker processes behind when a batch fails or the
        # run is interrupted.
        pool.terminate()
        raise
    else:
        pool.close()
    finally:
        pool.join()

    if returned:
        return list(itertools.chain.from_iterable(pooled))

# 1501 evenly spaced values: 0.0, 0.02, 0.04, ... up to 1500 * 0.02 (about 30.0).
timestamps = [step * 0.02 for step in range(1501)]

In [3]:
from huggingface_hub import hf_hub_download
import fasttext

# Download `fasttext.ftz` from the Hugging Face Hub repo below and load it
# with fastText. The repo name suggests a Malay/English language-detection
# model -- confirm against the model card before relying on its labels.
# `filename` is the value returned by hf_hub_download, which is passed
# straight to fasttext.load_model.
filename = hf_hub_download(
    repo_id="mesolitica/fasttext-language-detection-bahasa-en", 
    filename="fasttext.ftz"
)
lang_model = fasttext.load_model(filename)

In [4]:
def chunk(alignment, reject = -6.5, minimum_length = 1.0, padding = 0.1):
    """Split an alignment into runs of consecutive accepted entries.

    An entry whose ``'score'`` is at or below ``reject`` acts as a separator:
    it is dropped and closes the current run. A run is kept only if the span
    from its first ``'start'`` to its last ``'end'`` is at least
    ``minimum_length``. The last entry of every kept run has ``padding``
    added to its ``'end'``.

    The input is not modified: the padded last entry is a shallow copy, so
    calling this repeatedly on the same alignment gives the same result.
    (Assumes entries are plain dicts with ``'start'``, ``'end'`` and
    ``'score'`` keys -- confirm against the producer of the alignment.)

    Args:
        alignment: iterable of entries with ``'start'``, ``'end'``, ``'score'``.
        reject: scores less than or equal to this value split the runs.
        minimum_length: minimum ``end - start`` span for a run to be kept.
        padding: amount added to the ``'end'`` of the last entry of each run.

    Returns:
        List of runs, each a list of entries in their original order.
    """
    def _flush(group, out):
        # Keep the run only when it is non-empty and spans enough time.
        if not group:
            return
        if (group[-1]['end'] - group[0]['start']) < minimum_length:
            return
        # Pad a copy so the caller's entry keeps its original 'end'.
        last = dict(group[-1])
        last['end'] = float(last['end']) + padding
        out.append(group[:-1] + [last])

    alls, temp = [], []
    for a in alignment:
        if a['score'] <= reject:
            _flush(temp, alls)
            temp = []
        else:
            temp.append(a)

    # The alignment may end in the middle of a run.
    _flush(temp, alls)
    return alls

In [8]:
# One-off download of the input file `prepared-imda.jsonl` (uncomment to fetch it):
# !wget https://huggingface.co/datasets/mesolitica/pseudolabel-imda-large-v3-timestamp/resolve/main/prepared-imda.jsonl

In [9]:
# Group the JSONL records by their 'audio_filename': each value is a list of
# (line number, parsed record) pairs in file order.
data = defaultdict(list)
# Pass the encoding explicitly so decoding does not depend on the platform's
# locale default (JSON text is UTF-8).
with open('prepared-imda.jsonl', encoding='utf-8') as fopen:
    for line_no, raw_line in tqdm(enumerate(fopen)):
        record = json.loads(raw_line)
        data[record['audio_filename']].append((line_no, record))

len(data)

1861125it [00:08, 229570.83it/s]


1861082

In [10]:
# One element per audio file: the list of (line number, record) pairs
# collected for that file, in the dict's insertion order.
rows = [*data.values()]
len(rows)

1861082

In [11]:
# WARNING: destructive. Any existing `prepared-imda-chunks` directory and
# everything in it is deleted, then an empty directory is created in its place.
!rm -rf prepared-imda-chunks
!mkdir prepared-imda-chunks