In [1]:
from glob import glob
from tqdm import tqdm
import json
import numpy as np
import os
from transformers import AutoTokenizer
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Load the Whisper large-v3 tokenizer; `loop` below uses its encode/decode
# to rewrite the pseudo-label transcripts.
tokenizer = AutoTokenizer.from_pretrained('openai/whisper-large-v3')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
def _shard_sort_key(path):
    """Return the integer found after the first '-' in `path`, ignoring '.json'."""
    after_dash = path.split('-')[1]
    return int(after_dash.replace('.json', ''))


# Every JSON shard under output/, ordered numerically by the number after the dash.
files = sorted(glob('output/*.json'), key = _shard_sort_key)
len(files)

42729

In [4]:
!du -hs output-audio

328G	output-audio


In [5]:
# import IPython.display as ipd
# ipd.Audio('output-audio/3-5833-0.mp3')

In [6]:
# for f in tqdm(files):
#     with open(f) as fopen:
#         data = json.load(fopen)

In [21]:
import mp
import copy

# Token id that decodes to '<|endoftext|>' with the Whisper tokenizer loaded above.
_EOT_TOKEN_ID = 50257


def _normalize_transcript(text):
    """Return `text` re-decoded so that it ends with exactly one end-of-text token.

    The text is encoded without adding special tokens, every occurrence of
    `_EOT_TOKEN_ID` is removed, and a single `_EOT_TOKEN_ID` is appended
    before decoding back to a string.
    """
    token_ids = np.array(tokenizer.encode(text, add_special_tokens = False))
    token_ids = token_ids[token_ids != _EOT_TOKEN_ID]
    return tokenizer.decode(token_ids.tolist() + [_EOT_TOKEN_ID])


def _has_repeated_trigrams(text):
    """Return True when more than one word trigram occurs more than 3 times in `text`.

    Texts too short to contain any trigram are reported as not repeated:
    CountVectorizer raises ValueError (empty vocabulary) for them, and one
    degenerate transcript should not abort the whole worker.
    """
    try:
        counts = CountVectorizer(ngram_range = (3, 3)).fit_transform([text])
    except ValueError:
        return False
    # Only stored (non-zero) counts can exceed the threshold, so the sparse
    # matrix does not need to be densified.
    # Cast to a plain bool so the record stays JSON-serialisable.
    return bool((counts.data > 3).sum() > 1)


def loop(files):
    """Build pseudo-label records from a chunk of JSON shard files.

    Parameters
    ----------
    files : tuple
        `(paths, _)`; `paths` is an iterable of JSON file paths and the second
        element is ignored (presumably the worker index supplied by
        `mp.multiprocessing` -- TODO confirm against that helper).

    Returns
    -------
    list of dict
        One record per shard entry whose audio file
        `output-audio/<shard name>-<i>.mp3` exists. Each entry is updated in
        place with 'audio_filename', 'filename', 'i', the normalised
        'predict_ms' / 'predict_en' transcripts and the boolean
        'repeat_ms' / 'repeat_en' flags.

    Shards that cannot be opened or parsed are skipped; any other exception
    propagates to the caller.
    """
    files, _ = files
    results = []
    for f in tqdm(files):
        try:
            with open(f) as fopen:
                data = json.load(fopen)
        except (OSError, ValueError):
            # Best-effort: skip unreadable or malformed shards. JSONDecodeError
            # and UnicodeDecodeError are both ValueError subclasses. Unlike a
            # bare `except:`, this lets KeyboardInterrupt and real bugs surface.
            continue
        f_split = os.path.split(f)[-1].replace('.json', '')
        for i, row in enumerate(data):

            audio_filename = os.path.join('output-audio', f'{f_split}-{i}.mp3')
            if not os.path.exists(audio_filename):
                # No audio segment for this entry, so it cannot be used.
                continue

            row['audio_filename'] = audio_filename
            row['filename'] = f
            row['i'] = i

            # Normalise first: the repetition flags are computed on the
            # normalised text, as before.
            for key in ('predict_ms', 'predict_en'):
                row[key] = _normalize_transcript(row[key])

            row['repeat_ms'] = _has_repeated_trigrams(row['predict_ms'])
            row['repeat_en'] = _has_repeated_trigrams(row['predict_en'])

            results.append(row)
    return results

In [22]:
# Run `loop` over all shard paths through the local `mp` helper with cores = 30.
# NOTE(review): `mp.multiprocessing` is not defined in this notebook. Presumably it
# splits `files` into chunks, calls `loop((chunk, index))` in each worker and
# concatenates the returned lists -- confirm against the `mp` module.
results = mp.multiprocessing(files, loop, cores = 30)

100%|██████████| 1424/1424 [03:38<00:00,  6.51it/s]
100%|██████████| 1424/1424 [03:40<00:00,  6.45it/s]
100%|██████████| 1424/1424 [03:40<00:00,  6.45it/s]
 98%|█████████▊| 1401/1424 [03:40<00:03,  6.13it/s]
100%|██████████| 1424/1424 [03:41<00:00,  6.44it/s]
100%|██████████| 1424/1424 [03:41<00:00,  6.43it/s]
 98%|█████████▊| 1395/1424 [03:41<00:05,  5.52it/s]
100%|██████████| 1424/1424 [03:41<00:00,  6.43it/s]
100%|██████████| 1424/1424 [03:41<00:00,  6.43it/s]
100%|██████████| 1424/1424 [03:41<00:00,  6.42it/s]
100%|██████████| 1424/1424 [03:42<00:00,  6.41it/s]
100%|██████████| 1424/1424 [03:42<00:00,  6.40it/s]
100%|██████████| 1424/1424 [03:42<00:00,  6.39it/s]
 99%|█████████▉| 1410/1424 [03:42<00:02,  6.06it/s]
100%|██████████| 9/9 [00:01<00:00,  6.56it/s]7it/s]
100%|██████████| 1424/1424 [03:43<00:00,  6.37it/s]
100%|██████████| 1424/1424 [03:44<00:00,  6.36it/s]
100%|██████████| 1424/1424 [03:44<00:00,  6.34it/s]
100%|██████████| 1424/1424 [03:44<00:00,  6.33it/s]
100%|███████

In [23]:
# Number of records collected (only entries whose audio file exists are kept by `loop`).
len(results)

2221856

In [26]:
# Peek at the first record to eyeball the fields written by `loop`.
results[0]

{'predict_ms': '<|startoftranscript|><|ms|><|transcribe|> anda tahu keuntungan boleh lebih tinggi daripada keuntungan kewangan rumah maka saya tidak akan mencari dalam akaun saya akan mencari ke dalam ethereum atau beberapa crypto punks bergantung pada faktor risiko anda kerana rumah kajang dihantar tidak mengganggu dsr saya sejauh ini jadi sekarang apa posisi saya untuk mendapatkan kewangan ketiga jadi mungkin setelah melihat sekeliling saya menemui seorang penjual yang dapat menutupi perhubungan tetapi bank hanya menerima 70% dari itu saya boleh membayar perbezaan dengan menggunakan wang ini kerana sekali lagi ia menyusahkan saya dan aset tetapi jika anda tidak selesa dengan mencari<|endoftext|>',
 'predict_en': "<|startoftranscript|><|en|><|transcribe|> you know the returns can be higher than the savings of the housing loan interest then i will not put in the account i'll put into ethereum or some crypto punks depending on your risk factor then because of the kajang house being let 

In [29]:
# Show the Malay repetition flag of the first record as a plain Python bool.
bool(results[0]['repeat_ms'])

False

In [27]:
# import IPython.display as ipd
# ipd.Audio('output-audio/3-165-0.mp3')

In [30]:
# Dump every record as one JSON object per line.
# The repetition flags are cast to plain bool in place before serialising.
with open('pseudolabel.jsonl', 'w') as jsonl_out:
    for record in tqdm(results):
        for flag in ('repeat_ms', 'repeat_en'):
            record[flag] = bool(record[flag])
        jsonl_out.write(json.dumps(record) + '\n')

100%|██████████| 2221856/2221856 [00:19<00:00, 115335.75it/s]


In [31]:
!ls -lh pseudolabel.jsonl

-rwxrwxrwx 1 ubuntu ubuntu 2.0G Dec 29 08:11 pseudolabel.jsonl


In [32]:
from huggingface_hub import HfApi
# Hugging Face Hub client; used below to upload the generated JSONL file.
api = HfApi()

In [33]:
# Publish the JSONL file to the dataset repository under the same file name.
jsonl_name = 'pseudolabel.jsonl'
api.upload_file(
    repo_type='dataset',
    repo_id='mesolitica/pseudolabel-malaysian-youtube-whisper-large-v3',
    path_in_repo=jsonl_name,
    path_or_fileobj=jsonl_name,
)

pseudolabel.jsonl:   0%|          | 0.00/2.14G [00:00<?, ?B/s]

'https://huggingface.co/datasets/mesolitica/pseudolabel-malaysian-youtube-whisper-large-v3/blob/main/pseudolabel.jsonl'