In [1]:
from glob import glob
from tqdm import tqdm
import json
import numpy as np
import os
from transformers import AutoTokenizer
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
tokenizer = AutoTokenizer.from_pretrained('openai/whisper-large-v3')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
files = sorted(glob('output/*.json'), key = lambda x: int(x.split('-')[1].replace('.json', '')))
len(files)

100721

In [4]:
!du -hs output-audio

62G	output-audio


In [5]:
import mp
import copy

def loop(files):
    files, _ = files
    results = []
    for f in tqdm(files):
        try:
            with open(f) as fopen:
                data = json.load(fopen)
        except:
            continue
        f_split = os.path.split(f)[-1].replace('.json', '')
        for i in range(len(data)):
            
            audio_filename = os.path.join('output-audio', f'{f_split}-{i}.mp3')
            if not os.path.exists(audio_filename):
                continue
            
            data[i]['audio_filename'] = audio_filename
            data[i]['filename'] = f
            data[i]['i'] = i
            a = np.array(tokenizer.encode(data[i]['predict_ms'], add_special_tokens = False))
            a = a[a != 50257]
            data[i]['predict_ms'] = tokenizer.decode(a.tolist() + [50257])
            
            dense = CountVectorizer(ngram_range = (3,3)).fit_transform([data[i]['predict_ms']]).todense()
            repeat_ms = (dense > 3).sum() > 1
            data[i]['repeat_ms'] = repeat_ms
            
            results.append(data[i])
    return results

In [6]:
r = loop((files[:1], 0))

100%|██████████| 1/1 [00:00<00:00, 17.45it/s]


In [8]:
results = mp.multiprocessing(files, loop, cores = 30)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [9]:
len(results)

1611536

In [10]:
with open('pseudolabel.jsonl', 'w') as fopen:
    for r in tqdm(results):
        r['repeat_ms'] = bool(r['repeat_ms'])
        fopen.write(f'{json.dumps(r)}\n')

100%|██████████| 1611536/1611536 [00:08<00:00, 188591.48it/s]


In [12]:
!head -n 1 pseudolabel.jsonl

{"predict_ms": "<|startoftranscript|><|ms|><|transcribe|> Insurans perjalanan adalah penting ketika anda ingin berjudi di luar negara.<|endoftext|>", "score_ms": 7.3125, "audio_filename": "output-audio/0-0-0.mp3", "filename": "output/0-0.json", "i": 0, "repeat_ms": false}


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [14]:
# import IPython.display as ipd
# ipd.Audio('output-audio/0-0-0.mp3')

In [2]:
from huggingface_hub import HfApi
api = HfApi()

In [16]:
api.upload_file(
    path_or_fileobj='pseudolabel.jsonl',
    path_in_repo='pseudolabel.jsonl',
    repo_id='mesolitica/pseudolabel-malaya-speech-stt-train-whisper-large-v3',
    repo_type='dataset',
)

pseudolabel.jsonl:   0%|          | 0.00/454M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/mesolitica/pseudolabel-malaya-speech-stt-train-whisper-large-v3/commit/c73339ac2ecdcf1d632400236027ef31b1466a86', commit_message='Upload pseudolabel.jsonl with huggingface_hub', commit_description='', oid='c73339ac2ecdcf1d632400236027ef31b1466a86', pr_url=None, pr_revision=None, pr_num=None)

In [19]:
# !sudo apt install p7zip-full p7zip-rar -y

In [20]:
!7z -v10g a output-audio.7z output-audio > /dev/null

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [3]:
files = sorted(glob('*.7z*'))
files

['output-audio.7z.001',
 'output-audio.7z.002',
 'output-audio.7z.003',
 'output-audio.7z.004',
 'output-audio.7z.005']

In [4]:
for f in files:
    print(f)
    api.upload_file(
        path_or_fileobj=f,
        path_in_repo=f,
        repo_id='mesolitica/pseudolabel-malaya-speech-stt-train-whisper-large-v3',
        repo_type='dataset',
    )

output-audio.7z.001
output-audio.7z.002


output-audio.7z.002:   0%|          | 0.00/10.7G [00:00<?, ?B/s]

output-audio.7z.003


output-audio.7z.003:   0%|          | 0.00/10.7G [00:00<?, ?B/s]

output-audio.7z.004


output-audio.7z.004:   0%|          | 0.00/10.7G [00:00<?, ?B/s]

output-audio.7z.005


output-audio.7z.005:   0%|          | 0.00/8.23G [00:00<?, ?B/s]