In [1]:
from glob import glob
from tqdm import tqdm
import json
import numpy as np
import os
from transformers import AutoTokenizer, WhisperConfig
from sklearn.feature_extraction.text import CountVectorizer

config = WhisperConfig.from_pretrained('openai/whisper-large-v3')
maxlen = config.max_length - 3

In [2]:
tokenizer = AutoTokenizer.from_pretrained('openai/whisper-large-v3')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
files = sorted(glob('output/*.json'), key = lambda x: int(x.split('-')[1].replace('.json', '')))
len(files)

52908

In [13]:
import mp
import copy

minimum_score = 6

def loop(files):
    files, _ = files
    results = []
    for f in tqdm(files):
        try:
            with open(f) as fopen:
                data = json.load(fopen)
        except:
            continue
        f_split = os.path.split(f)[-1].replace('.json', '')
        for i in range(len(data)):
            
            audio_filename = os.path.join('output-audio', f'{f_split}-{i}.mp3')
            if not os.path.exists(audio_filename):
                continue
            
            results.append((audio_filename, data[i]))

    return results

In [14]:
results = mp.multiprocessing(files, loop, cores = 50)

100%|██████████| 1058/1058 [00:06<00:00, 161.00it/s]
100%|██████████| 1058/1058 [00:06<00:00, 161.29it/s]
100%|██████████| 1058/1058 [00:06<00:00, 159.74it/s]
100%|██████████| 1058/1058 [00:06<00:00, 160.49it/s]
100%|██████████| 1058/1058 [00:06<00:00, 160.19it/s]
100%|██████████| 1058/1058 [00:06<00:00, 159.44it/s]
100%|██████████| 1058/1058 [00:06<00:00, 158.95it/s]
100%|██████████| 1058/1058 [00:06<00:00, 159.70it/s]
100%|██████████| 1058/1058 [00:06<00:00, 158.35it/s]
100%|██████████| 1058/1058 [00:06<00:00, 159.70it/s]
100%|██████████| 1058/1058 [00:06<00:00, 158.65it/s]
100%|██████████| 1058/1058 [00:06<00:00, 158.34it/s]
100%|██████████| 1058/1058 [00:06<00:00, 159.34it/s]
100%|██████████| 1058/1058 [00:06<00:00, 157.60it/s]
100%|██████████| 1058/1058 [00:06<00:00, 158.88it/s]
100%|██████████| 1058/1058 [00:06<00:00, 157.79it/s]
100%|██████████| 1058/1058 [00:06<00:00, 158.17it/s]
100%|██████████| 1058/1058 [00:06<00:00, 159.12it/s]
100%|██████████| 1058/1058 [00:06<00:00, 157.3

In [15]:
len(results)

2222052

In [16]:
results[0]

('output-audio/2-0-0.mp3',
 {'predict_ms': [50258,
   50282,
   50360,
   50365,
   4586,
   455,
   6801,
   991,
   25835,
   19377,
   2938,
   50409,
   50409,
   9118,
   25835,
   50434,
   50434,
   591,
   4118,
   3913,
   388,
   5581,
   1606,
   1508,
   9568,
   16434,
   50502,
   50502,
   10715,
   1508,
   9568,
   514,
   50539,
   50539,
   479,
   545,
   335,
   283,
   545,
   335,
   283,
   545,
   335,
   50597,
   50597,
   5707,
   335,
   2604,
   50638,
   50638,
   407,
   290,
   29319,
   5948,
   650,
   2394,
   50708,
   50708,
   460,
   29319,
   5948,
   650,
   2394,
   50790,
   50790,
   5130,
   2419,
   50803,
   50803,
   19158,
   296,
   12,
   7124,
   296,
   3867,
   50853,
   50853,
   5637,
   650,
   2394,
   50894,
   50894,
   33468,
   4714,
   9568,
   514,
   26532,
   50994,
   50994,
   508,
   1459,
   71,
   50996,
   50996,
   508,
   1459,
   71,
   312,
   2394,
   6801,
   51034,
   51034,
   879,
   2394,
   816,
   897,

In [17]:
with open('prepared-pseudolabel-original.jsonl', 'w') as fopen:
    for r in tqdm(results):
        fopen.write(f'{json.dumps(r)}\n')

100%|██████████| 2222052/2222052 [02:13<00:00, 16627.71it/s]


In [None]:
!zip -r -s 10000M output-audio.zip output-audio

In [18]:
from huggingface_hub import HfApi
api = HfApi()

In [19]:
api.upload_file(
    path_or_fileobj='prepared-pseudolabel-original.jsonl',
    path_in_repo='prepared-pseudolabel-original.jsonl',
    repo_id='mesolitica/pseudolabel-malaysian-youtube-whisper-large-v3-timestamp',
    repo_type='dataset',
)

prepared-pseudolabel-original.jsonl:   0%|          | 0.00/7.53G [00:00<?, ?B/s]

'https://huggingface.co/datasets/mesolitica/pseudolabel-malaysian-youtube-whisper-large-v3-timestamp/blob/main/prepared-pseudolabel-original.jsonl'

In [4]:
files = glob('output-audio.z*')
for f in files:
    print(f)
    api.upload_file(
        path_or_fileobj=f,
        path_in_repo=f,
        repo_id='mesolitica/pseudolabel-malaysian-youtube-whisper-large-v3-timestamp',
        repo_type='dataset',
    )

output-audio.z12


output-audio.z12:   0%|          | 0.00/10.5G [00:00<?, ?B/s]

output-audio.z09


output-audio.z09:   0%|          | 0.00/10.5G [00:00<?, ?B/s]

output-audio.z20


output-audio.z20:   0%|          | 0.00/10.5G [00:00<?, ?B/s]

output-audio.z23


output-audio.z23:   0%|          | 0.00/10.5G [00:00<?, ?B/s]

output-audio.z10


output-audio.z10:   0%|          | 0.00/10.5G [00:00<?, ?B/s]

output-audio.z11


output-audio.z11:   0%|          | 0.00/10.5G [00:00<?, ?B/s]

output-audio.z08


output-audio.z08:   0%|          | 0.00/10.5G [00:00<?, ?B/s]

output-audio.z07


output-audio.z07:   0%|          | 0.00/10.5G [00:00<?, ?B/s]

output-audio.z30


output-audio.z30:   0%|          | 0.00/10.5G [00:00<?, ?B/s]

output-audio.z19


output-audio.z19:   0%|          | 0.00/10.5G [00:00<?, ?B/s]

output-audio.z18


output-audio.z18:   0%|          | 0.00/10.5G [00:00<?, ?B/s]

output-audio.z14


output-audio.z14:   0%|          | 0.00/10.5G [00:00<?, ?B/s]

output-audio.z15


output-audio.z15:   0%|          | 0.00/10.5G [00:00<?, ?B/s]

output-audio.z29


output-audio.z29:   0%|          | 0.00/10.5G [00:00<?, ?B/s]

output-audio.z04


output-audio.z04:   0%|          | 0.00/10.5G [00:00<?, ?B/s]

output-audio.z24


output-audio.z24:   0%|          | 0.00/10.5G [00:00<?, ?B/s]

output-audio.z13


output-audio.z13:   0%|          | 0.00/10.5G [00:00<?, ?B/s]

output-audio.z28


output-audio.z28:   0%|          | 0.00/10.5G [00:00<?, ?B/s]

output-audio.z25


output-audio.z25:   0%|          | 0.00/10.5G [00:00<?, ?B/s]

output-audio.z03


output-audio.z03:   0%|          | 0.00/10.5G [00:00<?, ?B/s]

output-audio.z26


output-audio.z26:   0%|          | 0.00/10.5G [00:00<?, ?B/s]

output-audio.z22


output-audio.z22:   0%|          | 0.00/10.5G [00:00<?, ?B/s]

output-audio.z32


output-audio.z32:   0%|          | 0.00/10.5G [00:00<?, ?B/s]

output-audio.z05


output-audio.z05:   0%|          | 0.00/10.5G [00:00<?, ?B/s]

output-audio.z16


output-audio.z16:   0%|          | 0.00/10.5G [00:00<?, ?B/s]

output-audio.z27


output-audio.z27:   0%|          | 0.00/10.5G [00:00<?, ?B/s]

output-audio.z21


output-audio.z21:   0%|          | 0.00/10.5G [00:00<?, ?B/s]

output-audio.z02


output-audio.z02:   0%|          | 0.00/10.5G [00:00<?, ?B/s]

output-audio.z01


output-audio.z01:   0%|          | 0.00/10.5G [00:00<?, ?B/s]

output-audio.z31


output-audio.z31:   0%|          | 0.00/10.5G [00:00<?, ?B/s]

output-audio.z17


output-audio.z17:   0%|          | 0.00/10.5G [00:00<?, ?B/s]

output-audio.zip


output-audio.zip:   0%|          | 0.00/8.03G [00:00<?, ?B/s]

output-audio.z06


output-audio.z06:   0%|          | 0.00/10.5G [00:00<?, ?B/s]