In [1]:
from glob import glob
from tqdm import tqdm
import json
import numpy as np
import os
from transformers import AutoTokenizer, WhisperConfig
from sklearn.feature_extraction.text import CountVectorizer

# Load the Whisper large-v3 configuration so its `max_length` can be read.
config = WhisperConfig.from_pretrained('openai/whisper-large-v3')
# Token budget: the configured `max_length` minus 3.
# NOTE(review): the 3 presumably reserves room for special prefix tokens, and
# `maxlen` is not referenced in the cells visible here - confirm it is still needed.
maxlen = config.max_length - 3

In [2]:
# Whisper large-v3 tokenizer; the text-extraction `loop` reads it as a module-level
# global to decode `predict_zh` token ids.
tokenizer = AutoTokenizer.from_pretrained('openai/whisper-large-v3')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
# Gather every JSON file under output-mandarin/ and order the paths numerically
# by the integer that follows the last '-' in each path (before '.json').
files = glob('output-mandarin/*.json')
files.sort(key = lambda path: int(path.rsplit('-', 1)[-1].replace('.json', '')))
len(files)

21056

In [8]:
import mp
import copy

# NOTE(review): presumably a lower bound for `score_zh`, but nothing in the cells
# visible here reads `minimum_score` - confirm whether score filtering was intended.
minimum_score = 6

def loop(files):
    """
    Load every JSON file in one chunk and flatten their records into one list.

    Parameters
    ----------
    files : tuple
        A 2-tuple whose first item is an iterable of JSON file paths. The second
        item is ignored (presumably the chunk index supplied by
        `mp.multiprocessing` - TODO confirm).

    Returns
    -------
    list
        The records of every readable file, in input order. Files that cannot
        be opened or parsed are skipped.
    """
    paths, _ = files
    results = []
    for path in tqdm(paths):
        try:
            with open(path) as fopen:
                data = json.load(fopen)
        except (OSError, ValueError):
            # Best-effort: skip missing/unreadable files and malformed JSON
            # (json.JSONDecodeError and UnicodeDecodeError are ValueError
            # subclasses), but let KeyboardInterrupt/SystemExit and genuine
            # bugs propagate instead of being silently swallowed.
            continue
        # Each file is expected to hold a JSON array of records.
        results.extend(data)

    return results

In [9]:
# Run `loop` over `files` through the project-local `mp` helper with cores = 50.
# NOTE(review): `loop` unpacks its argument as a 2-tuple, so `mp.multiprocessing`
# presumably passes (chunk_of_files, chunk_index) and concatenates the returned
# lists - confirm against the `mp` module.
results = mp.multiprocessing(files, loop, cores = 50)

100%|██████████| 421/421 [01:09<00:00,  6.05it/s]
100%|██████████| 421/421 [01:11<00:00,  5.93it/s]
 88%|████████▊ | 370/421 [01:11<00:09,  5.10it/s]
 97%|█████████▋| 408/421 [01:11<00:02,  4.70it/s]
 98%|█████████▊| 412/421 [01:11<00:01,  5.65it/s]
100%|██████████| 421/421 [01:11<00:00,  5.86it/s]
 91%|█████████ | 382/421 [01:12<00:06,  5.70it/s]
100%|██████████| 421/421 [01:12<00:00,  5.80it/s]
 98%|█████████▊| 413/421 [01:12<00:01,  5.14it/s]
 98%|█████████▊| 411/421 [01:12<00:01,  6.23it/s]
 98%|█████████▊| 414/421 [01:12<00:01,  5.75it/s]
100%|██████████| 421/421 [01:12<00:00,  5.77it/s]
100%|██████████| 421/421 [01:12<00:00,  5.77it/s]
100%|██████████| 421/421 [01:13<00:00,  5.77it/s]
100%|██████████| 421/421 [01:13<00:00,  5.76it/s]
100%|██████████| 421/421 [01:13<00:00,  5.75it/s]
100%|██████████| 421/421 [01:13<00:00,  5.75it/s]
100%|██████████| 421/421 [01:13<00:00,  5.75it/s]

 98%|█████████▊| 414/421 [01:13<00:01,  6.53it/s]
100%|██████████| 421/421 [01:13<00:00,  5.73it/s]

In [10]:
len(results)

884352

In [11]:
results[0]

{'predict_zh': [50258,
  50260,
  50360,
  50365,
  21209,
  8225,
  1787,
  253,
  15106,
  8713,
  24302,
  10673,
  250,
  1369,
  100,
  30246,
  1546,
  18464,
  101,
  26748,
  50545,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257,
  50257],
 'score_zh': 9.75,
 'filename': 'data_aishell/wav/train/S0002/BAC009S0002W0207.wav'}

In [12]:
# Persist the collected records as JSON Lines: one serialized record per line.
with open('prepared-mandarin-original.jsonl', 'w') as fopen:
    fopen.writelines(json.dumps(record) + '\n' for record in tqdm(results))

100%|██████████| 884352/884352 [00:11<00:00, 75087.92it/s] 


In [12]:
from huggingface_hub import HfApi
# Hugging Face Hub client; the upload cells call `api.upload_file` on it.
api = HfApi()

In [14]:
# Upload the local JSONL file to the dataset repo under the same file name.
# NOTE(review): this targets the `huseinzol05/` namespace, while the
# `mandarin-text.texts` upload in this notebook targets `mesolitica/` with the
# same repo name - confirm which namespace is intended.
api.upload_file(
    path_or_fileobj='prepared-mandarin-original.jsonl',
    path_in_repo='prepared-mandarin-original.jsonl',
    repo_id='huseinzol05/pseudolabel-mandarin-large-v3-timestamp',
    repo_type='dataset',
)

prepared-mandarin-original.jsonl:   0%|          | 0.00/587M [00:00<?, ?B/s]

'https://huggingface.co/datasets/huseinzol05/pseudolabel-mandarin-large-v3-timestamp/blob/main/prepared-mandarin-original.jsonl'

In [7]:
import mp
import copy
import re

# Regex for `<|N.N|>text<|N.N|>`: a decimal number in `<|...|>` markers, lazily
# captured text, then a closing decimal number (presumably Whisper timestamp tokens).
# NOTE(review): `pattern_pair` is not referenced in the cells visible here.
pattern_pair = r'<\|(\d+\.\d+)\|>(.*?)<\|(\d+\.\d+)\|>'

# NOTE(review): presumably a lower bound for `score_zh`, but nothing in the cells
# visible here reads `minimum_score` - confirm whether score filtering was intended.
minimum_score = 6

def loop(files):
    """
    Decode the `predict_zh` token ids of every record into text chunks.

    Relies on the module-level `tokenizer` being available in the process that
    runs this function.

    Parameters
    ----------
    files : tuple
        A 2-tuple whose first item is an iterable of JSON file paths. The second
        item is ignored (presumably the chunk index supplied by
        `mp.multiprocessing` - TODO confirm).

    Returns
    -------
    list
        The `text` field of every chunk returned by `tokenizer._decode_asr`,
        across all records of all readable files, in input order. Files that
        cannot be opened or parsed are skipped.
    """
    paths, _ = files
    results = []
    for path in tqdm(paths):
        try:
            with open(path) as fopen:
                data = json.load(fopen)
        except (OSError, ValueError):
            # Best-effort: skip missing/unreadable files and malformed JSON
            # (json.JSONDecodeError and UnicodeDecodeError are ValueError
            # subclasses), but let KeyboardInterrupt/SystemExit and genuine
            # bugs propagate instead of being silently swallowed.
            continue
        # Each file is expected to hold a JSON array of records.
        for record in data:
            # `_decode_asr` returns a pair; its second item carries the chunks.
            decoded = tokenizer._decode_asr(
                [{'tokens': np.array([record['predict_zh']])}],
                return_timestamps = True, return_language = 'zh',
                time_precision = 0.02)
            texts = [chunk['text'] for chunk in decoded[1]['chunks']]
            results.extend(texts)

    return results

In [14]:
# Run the text-extraction `loop` over `files` through the project-local `mp`
# helper with cores = 50.
# NOTE(review): `loop` unpacks its argument as a 2-tuple, so `mp.multiprocessing`
# presumably passes (chunk_of_files, chunk_index) and concatenates the returned
# lists - confirm against the `mp` module.
results = mp.multiprocessing(files, loop, cores = 50)

In [9]:
len(results)

2353369

In [10]:
# Write each distinct, non-empty decoded text as one JSON-encoded string per line.
# Iterating a bare `set` of strings yields an order that changes between
# interpreter runs (string hashing is randomized), so the texts are sorted to
# make the output file reproducible.
unique_texts = sorted(text for text in set(results) if text)
with open('mandarin-text.texts', 'w') as fopen:
    for text in tqdm(unique_texts):
        fopen.write(f'{json.dumps(text)}\n')

100%|██████████| 1354211/1354211 [00:01<00:00, 929874.76it/s]


In [11]:
!ls -lh mandarin-text.texts

-rw-r--r-- 1 ubuntu ubuntu 76M Apr 13 04:33 mandarin-text.texts


In [13]:
# Upload the deduplicated text file to the dataset repo under the same file name.
# NOTE(review): this targets the `mesolitica/` namespace, while the
# `prepared-mandarin-original.jsonl` upload in this notebook targets
# `huseinzol05/` with the same repo name - confirm which namespace is intended.
api.upload_file(
    path_or_fileobj='mandarin-text.texts',
    path_in_repo='mandarin-text.texts',
    repo_id='mesolitica/pseudolabel-mandarin-large-v3-timestamp',
    repo_type='dataset',
)

mandarin-text.texts:   0%|          | 0.00/79.5M [00:00<?, ?B/s]

'https://huggingface.co/datasets/mesolitica/pseudolabel-mandarin-large-v3-timestamp/blob/main/mandarin-text.texts'