In [18]:
from glob import glob
from transformers import AutoTokenizer, WhisperConfig
from tqdm import tqdm
import json
import os
import re

def parse_lang(s):
    ori_s = s
    try:
        s = s.split('>')[1].split('<')[0].split(':')[0]
    except:
        pass
    if not len(s):
        s = ori_s
    return s

def cleaning(s):
    return re.sub(r'[ ]+', ' ', s).strip()

replaces = [
    '</P1>',
    '</P2>',
    '</mandarin>',
    '</tamil>',
    '<>',
    '<B>',
    '<EX2>',
    '<EX2><tamil>சனியன்:saniyan</tamil></EX2>',
    '<EX2><tamil>சப்பிட்டு:sapittu</tamil></EX2>',
    '<EX2><tamil>சூத்து:sutthu</tamil></EX2>',
    '<EX2><tamil>சூத்து:suuthu</tamil></EX2>',
    '<EX2><tamil>சூத்தை:suuthai</tamil></EX2>',
    '<FIL/>',
    '<INK>',
    '<NEN>',
    '<NON/>',
    '<NPS/>',
    '<P1>',
    '<P2>',
    '<P2><P2>#Aisyah#</P2></P2>',
    '<S>',
    '<SPK/>',
    '<UKN>',
    '<UNK/>',
    '<UNK>',
    '<UNk>',
    '<YNK>',
    '<Z>',
    '<Z></tamil>',
    '<c/>',
    '<kopi>',
    '<kopis>',
    '<mandarin>',
    '<mandarin>:</mandarin>',
    '<opp>',
    '<ppl>',
    '<s/>',
    '<s>',
    '<tami>',
    '<tamil>',
    '<tamil>:இதுனாலேதான்ithunaalaethaan</tamil>',
    '<tamil>:வெச்சுட்டு:vechuttu</tamil>',
    '<tamil></tamil>',
    '<unk/>',
    '<unk>',
    '<way>',
    '<we>'
    '<malay>',
    '</malay>',
    '_',
    '<UNK>',
    '(ppl)',
    '(ppo)',
    '(mm)',
    '(um)',
    '(ppb)',
    '(mmhmm)',
    '<mandarin>',
    '</mandarin>',
    '(!ee!)',
    '((m)',
    '((mmhmm)m)',
    '((mmhmm)mm)',
    '()',
    '(E)',
    '(Er)',
    '(Erm)',
    '(Err)',
    '(Hm)',
    '(M)',
    '(MM)',
    '(Oh)',
    '(PPO)',
    '(UH)',
    '(UM)',
    '(UNK)',
    '(Uh)',
    '(a)',
    '(ah)',
    '(aha)',
    '(ahh)',
    '(aiya)',
    '(aiyo)',
    '(compassionate)',
    '(d)',
    '(duh)',
    '(ee)',
    '(eer)',
    '(eh)',
    '(ehe)',
    '(emm)',
    '(er)',
    '(erm)',
    '(erp)',
    '(err)',
    '(errr)',
    '(euh)',
    '(ew)',
    '(h)',
    '(ha)',
    '(haa)',
    '(hah)',
    '(har)',
    '(hm)',
    '(hmm)',
    '(hmm)(hmm)',
    '(ho)',
    '(hoo)',
    '(hor)',
    '(hu)',
    '(huh)',
    '(hur)',
    '(ih)',
    '(la)',
    '(lah)',
    '(leh)',
    '(lor)',
    '(m)',
    '(mM)',
    '(mah)',
    '(mhm)',
    '(mm)',
    '(mmhmm)',
    '(mmhmm)(mmhmm)',
    '(mmhmmm)',
    '(mmhmn)',
    '(mmhnn)',
    '(mmm)',
    '(mmmhmmm)',
    '(nah)',
    '(no)',
    '(oh)',
    '(ohoh)',
    '(oi)',
    '(one)',
    '(oo)',
    '(oob)',
    '(ooh)',
    '(or)',
    '(orh)',
    '(ow)',
    '(pare)',
    '(pb)',
    '(pbb)',
    '(pbo)',
    '(pbpb)',
    '(pl)',
    '(pop)',
    '(popb)',
    '(pp0)',
    '(pp;)',
    '(ppO)',
    '(ppb)',
    '(ppc)',
    '(ppl)',
    '(ppll)',
    '(ppo)',
    '(pppo)',
    '(u)',
    '(ugh)',
    '(uh)',
    '(uh-huh)',
    '(uh-oh)',
    '(uhh)',
    '(uhhh)',
    '(uhm)',
    '(uhmmm)',
    '(uhn)',
    '(uhuh)',
    '(um)',
    '(umm)',
    '(urm)',
    '(us)',
    '(uyh)',
    '(wah)',
    '(woah)',
    '(woo)',
    '(wow)',
    '(ya)',
    '(yeah)',
    '(yup)',
    '</P1>',
    '</P2>',
    '</mandarin>',
    '</tamil>',
    '<>',
    '<B>',
    '<EX2>',
    '<EX2><tamil>சனியன்:saniyan</tamil></EX2>',
    '<EX2><tamil>சப்பிட்டு:sapittu</tamil></EX2>',
    '<EX2><tamil>சூத்து:sutthu</tamil></EX2>',
    '<EX2><tamil>சூத்து:suuthu</tamil></EX2>',
    '<EX2><tamil>சூத்தை:suuthai</tamil></EX2>',
    '<FIL/>',
    '<INK>',
    '<NEN>',
    '<NON/>',
    '<NPS/>',
    '<P1>',
    '<P2>',
    '<P2><P2>#Aisyah#</P2></P2>',
    '<S>',
    '<SPK/>',
    '<UKN>',
    '<UNK/>',
    '<UNK>',
    '<UNk>',
    '<YNK>',
    '<Z>',
    '<Z></tamil>',
    '<c/>',
    '<kopi>',
    '<kopis>',
    '<mandarin>',
    '<mandarin>:</mandarin>',
    '<opp>',
    '<ppl>',
    '<s/>',
    '<s>',
    '<tami>',
    '<tamil>',
    '<tamil>:இதுனாலேதான்ithunaalaethaan</tamil>',
    '<tamil>:வெச்சுட்டு:vechuttu</tamil>',
    '<tamil></tamil>',
    '<unk/>',
    '<unk>',
    '<way>',
    '<we>',
    '<c>',
    '(Um)',
    '(Z)',
    "(a'ah)",
    '(aa)',
    '(am)',
    '(em)',
    '(euhr)',
    '(hmmhmm)',
    '(mhmm)',
    '(mmhm)',
    '(pcc)',
    '(poo)',
    '(rr)',
    '(rre)',
    '(ua)',
    '(uerr)',
    '(uhhuh)',
    '(um\\h)',
    '(umh)',
    '(un)',
    '(urr)',
    '(uu)',
    '(woh)',
    '(yh)',
    '(B>', 
    '(uh]', 
    '</mandarin)', 
    '<mmhmm)', 
    '<ppb)'
]

replaces = set(replaces)

replaces_with = {
    '[lah]': 'lah',
    '[ah]': 'ah',
    '[sia]': 'sia',
    '[eh]': 'eh',
    '(uh)': 'uh',
    '[what]': 'what',
    '[oh]': 'oh',
    '(err)': 'err',
    '[lor]': 'lor',
    '[ha]': 'ha',
    '[meh]': 'meh',
    '[one]': 'one',
    "[a'ah]": "a'ah",
    '[hor]': 'hor',
    '[leh]': 'leh',
    '[mah]': 'mah',
    '[nah]': 'nah',
    '[tau]': 'tau',
    '[uh]': 'uh',
    '[wah]': 'wah',
    '[;ah]': 'lah',
     '[;lah]' : 'lah',
     '[AH]': 'lah',
     '[Ah]': 'lah',
     '[Iah]': 'lah',
     '[Oh]': 'oh',
     '[Sia]': 'sia',
     '[Wah]': 'wah',
     '[[h]': '',
     "[a'ah]": "a'ah",
     "[a'ha]": "a'ha",
     '[a-nah]': 'nah',
     '[a]': '',
     '[aah]': 'ah',
     '[ag]': 'ah',
     '[agh]': 'ah',
     '[ah]': 'ah',
     '[aha]': 'aha',
     '[aiya]': 'aiya',
     '[aiyo]': 'aiyo',
     '[aj]': 'ah',
     '[ajh]': 'ah',
     '[alah]': 'alah',
     '[anah]': 'alah',
     '[anor]': '',
     '[arh]': 'arh',
     '[aw]': 'aw',
     '[aww]': 'aw',
     '[ay]': 'ay',
     '[bah]': 'bah',
     '[chey]': 'chey',
     '[da]': 'dah',
     '[dah]': 'dah',
     '[deh]': 'deh',
     '[dey]': 'dey',
     '[do]': 'do',
     '[duh]': 'duh',
     '[e]': '',
     '[ee]': 'ee',
     '[eh]': 'eh',
     '[ehe]': 'ehe',
     '[ehh]': 'eh',
     '[er]': 'er',
     '[err]': 'er',
     '[eww]': 'eww',
     '[h]': '',
     '[ha]': 'ha',
     '[hah]': 'hah',
     '[haha]': 'haha',
     '[har]': 'har',
     '[heh]': 'heh',
     '[hey]': 'hey',
     '[hm]': 'hmm',
     '[hmm]': 'hmm',
     '[ho]': 'hor',
     '[hor]': 'hor',
     '[horh]': 'hor',
     '[hpr]': 'hor',
     '[huh]': 'huh',
     '[hur]': 'huh',
     '[hurhur]': 'huh',
     '[je]': 'jer',
     '[jer]': 'jer',
     '[kan]': 'kan',
     '[l]': 'lah',
     '[la]': 'lah',
     '[lah]': 'lah',
     '[lahh]': 'lah',
     '[le]': 'leh',
     '[leh]': 'leh',
     '[liao]': 'liao',
     '[like]': 'like',
     '[loh]': 'loh',
     '[lor]': 'lor',
     '[lorh]': 'lor',
     '[ma]': 'ma',
     '[mah]': 'mah',
     '[man]': 'man',
     '[meh]': 'meh',
     '[mhm]': 'mhm',
     '[mm]': 'mhm',
     '[mmhmm]': 'mhm',
     '[neh]': 'neh',
     '[ng]': 'ng',
     '[nia]': 'nia',
     '[oh]': 'oh',
     '[oho]': 'oh',
     '[oi]': 'oi',
     '[ok]': 'okay',
     '[okay]': 'okay',
     "[one's]": 'one',
     '[one]': 'one',
     '[ones]': 'one',
     '[oo]': 'oo',
     '[ooh]': 'ooh',
     '[or\\h]': 'orh',
     '[orh]': 'orh',
     '[orh][orh][orh]': 'orh',
     '[oth]': 'orh',
     '[pe]': '',
     '[poh]': 'poh',
     '[ppl]': '',
     '[prh]': '',
     '[pun]': 'pun',
     '[right]': 'right',
     '[roh]': 'roh',
     '[roti]': 'roti',
     '[run]': 'run',
     '[seh]': 'seh',
     '[shiok]': 'shiok',
     '[sia]': 'sia',
     '[siah]': 'sia',
     '[sial]': 'sial',
     '[sialah]': 'sialah',
     '[sian]': 'sian',
     '[sio]': 'siol',
     '[siol]': 'siol',
     '[tau]': 'tau',
     '[that]': 'that',
     '[ugh]': 'ugh',
     '[uh)]': 'ugh',
     '[uh-oh]': 'uh-oh',
     '[uhn]': 'uh',
     '[um]': 'um',
     '[what]': 'what',
     '[woah]': 'woah',
     '[wor]': 'wor',
     '[wow]': 'wow',
     '[ya]': 'ya',
     '[出国]': '出国',
     '[ah[]': 'ah',
     '[ai]': 'ai',
     '[alh]': 'lah',
     '[kah]': '',
     '[laj]': 'lah',
     '[og]': '',
     '[or]': '',
     '[ow]': '',
     '[uhm]': 'uhm',
     '[uhum]': 'uhm',
}

In [2]:
config = WhisperConfig.from_pretrained('openai/whisper-large-v3')
maxlen = config.max_length - 3

In [3]:
tokenizer_small = AutoTokenizer.from_pretrained('openai/whisper-small')
tokenizer = AutoTokenizer.from_pretrained('openai/whisper-large-v3')

files = sorted(glob('imda*.jsonl'))
files

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


['imda-part1.jsonl',
 'imda-part2.jsonl',
 'imda-part6-1.jsonl',
 'imda-part6-2.jsonl',
 'imda-part6-3.jsonl',
 'imda-same-part3.jsonl',
 'imda-same-part4.jsonl',
 'imda-same-part5.jsonl',
 'imda-separate-part3.jsonl',
 'imda-separate-part4.jsonl',
 'imda-separate-part5.jsonl']

In [19]:
filtered_words = []
for f in files:
    with open(f) as fopen:
        for l in tqdm(fopen):
            l = json.loads(l)
            t = l['text'].strip()
            
            splitted = t.split()
            filtered = []
            for s in splitted:
                filtered.append(parse_lang(s))
            t = ' '.join(filtered)
            
            for r in replaces:
                t = t.replace(r, '')
            for r, v in replaces_with.items():
                t = t.replace(r, v)
            t = cleaning(t)
            
            splitted = t.split()
            for s in splitted:
                after = parse_lang(s)
                if after[0] in {'(', '[', '<'} and after[-1] in {')', ']', '>'}:
                    filtered_words.append(after)
set(filtered_words)

755724it [00:41, 17996.46it/s]
826469it [00:39, 20853.30it/s]
117890it [00:09, 12237.31it/s]
108538it [00:08, 12352.59it/s]
107820it [00:08, 12181.03it/s]
185448it [00:13, 13344.17it/s]
172166it [00:13, 13179.57it/s]
51982it [00:05, 10344.15it/s]
169876it [00:12, 13783.33it/s]
163431it [00:12, 13320.73it/s]
215412it [00:18, 11351.56it/s]


set()

In [35]:
def loop(files):
    files, _ = files
    results = []
    for f in files:
        with open(f) as fopen:
            for l in tqdm(fopen):
                l = json.loads(l)
                audio_filename = l.get('new_filename')
                if audio_filename is None:
                    audio_filename = l['filename']
                if not os.path.exists(audio_filename):
                    continue

                t = l['text'].strip()
                if len(t) < 2:
                    continue

                splitted = t.split()
                filtered = []
                for s in splitted:
                    filtered.append(parse_lang(s))
                t = ' '.join(filtered)

                for r in replaces:
                    t = t.replace(r, '')
                for r, v in replaces_with.items():
                    t = t.replace(r, v)
                t = cleaning(t)
                
                if len(t) < 2:
                    continue
                
                if t[0] in {',', '.'}:
                    t = t[1:]

                if len(t) < 2:
                    continue

                new_text = f'<|startoftranscript|><|en|><|transcribe|> {t}<|endoftext|>'
                input_ids = tokenizer(new_text, add_special_tokens = False).input_ids
                if len(input_ids) > maxlen:
                    continue

                l['new_text'] = new_text
                l['audio_filename'] = audio_filename
                results.append(l)
    
    return results

In [36]:
import mp

In [37]:
results = mp.multiprocessing(files, loop, cores = len(files))

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [38]:
len(results)

2840417

In [39]:
with open('prepared-imda.jsonl', 'w') as fopen:
    for r in results:
        fopen.write(f'{json.dumps(r)}\n')

In [52]:
from streaming import MDSWriter, LocalDataset

In [44]:
columns = {
    'new_text': 'str',
    'audio_filename': 'str'
}

hashes = 'sha1', 'xxh64'

In [45]:
r

{'filename': 'part5-separate-audio-mp3/app_3773_5546_phnd_pos-59.mp3',
 'text': 'ya [lor] ya [lor] not bad not bad, can can you one week (uh) you one week go how many times maybe can find you, two three times [ah] [oh] okay okay okay',
 'new_text': '<|startoftranscript|><|en|><|transcribe|> ya lor ya lor not bad not bad, can can you one week you one week go how many times maybe can find you, two three times ah oh okay okay okay<|endoftext|>',
 'audio_filename': 'part5-separate-audio-mp3/app_3773_5546_phnd_pos-59.mp3'}

In [49]:
!rm -rf mosaic-imda

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [50]:
with MDSWriter(out='mosaic-imda', columns=columns, compression=None, hashes=hashes) as out:
    with open('prepared-imda.jsonl') as fopen:
        for l in tqdm(fopen):
            l = json.loads(l)
            sample = {
                'new_text': l['new_text'],
                'audio_filename': l['audio_filename'],
            }
            out.write(sample)

2840417it [00:22, 128350.45it/s]


In [53]:
dataset = LocalDataset('mosaic-imda')

In [54]:
dataset[0]

{'audio_filename': 'part1-mp3/000010001.mp3',
 'new_text': '<|startoftranscript|><|en|><|transcribe|> There were barrels of wine in the huge cellar.<|endoftext|>'}

In [55]:
!git clone https://huggingface.co/datasets/malaysia-ai/mosaic-imda-stt
!cp mosaic-imda/* mosaic-imda-stt

Cloning into 'mosaic-imda-stt'...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


remote: Enumerating objects: 3, done.[K
remote: Total 3 (delta 0), reused 0 (delta 0), pack-reused 3[K
Unpacking objects: 100% (3/3), 517 bytes | 517.00 KiB/s, done.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [57]:
!du -hs mosaic-imda-stt

537M	mosaic-imda-stt


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
