In [1]:
import json
import os
import mp
import numpy as np
from collections import defaultdict
from glob import glob
from tqdm import tqdm
import soundfile as sf
import re

# Grid of candidate timestamp values: 1501 multiples of 0.02, starting at 0.
# loop() snaps every word boundary to the closest value in this list.
timestamps = [step * 0.02 for step in range(1501)]

In [2]:
from huggingface_hub import hf_hub_download
import fasttext

# Download the fastText language-detection weights from the Hugging Face Hub
# and load them; `lang_model` is used by loop() to tag each sample's language.
filename = hf_hub_download(
    filename="fasttext.ftz",
    repo_id="mesolitica/fasttext-language-detection-bahasa-en",
)
lang_model = fasttext.load_model(filename)

In [3]:
def chunk(alignment, reject = -7, minimum_length = 2.0):
    """
    Split a word-level alignment into runs of consecutive accepted words.

    Parameters
    ----------
    alignment : iterable of dict
        Word entries, read in order. Each entry must provide 'score',
        'start' and 'end'.
    reject : float, default -7
        An entry whose 'score' is <= this value is dropped and closes the
        run collected so far.
    minimum_length : float, default 2.0
        A run is kept only when its last 'end' minus its first 'start' is
        >= this value (same unit as 'start'/'end').

    Returns
    -------
    list of list of dict
        The kept runs, in order. When a run is closed by a rejected entry,
        the last word of that run has its 'end' set to the rejected entry's
        'start'.

    Notes
    -----
    The dicts passed in are never modified: the stretched last word is a
    shallow copy, every other word is returned as the original object.
    """
    def long_enough(run):
        # Span covered by the run, from its first start to its last end.
        return (run[-1]['end'] - run[0]['start']) >= minimum_length

    kept, run = [], []
    for word in alignment:
        if word['score'] <= reject:
            if run:
                # Stretch the run up to where the rejected word begins, on a
                # copy so the caller's alignment is left untouched.
                run[-1] = {**run[-1], 'end': word['start']}
                if long_enough(run):
                    kept.append(run)
                run = []
        else:
            run.append(word)

    # Trailing run that was not closed by a rejected word.
    if run and long_enough(run):
        kept.append(run)
    return kept

In [4]:
# Group the transcript records by audio file, keeping each record's 0-based
# line number in the JSONL file next to it.
data = defaultdict(list)
with open('prepared-imda.jsonl') as fopen:
    for no, line in tqdm(enumerate(fopen)):
        record = json.loads(line)
        data[record['audio_filename']].append((no, record))

len(data)

1861125it [00:05, 331305.90it/s]


1861082

In [5]:
# One entry per audio file: the list of (line number, record) pairs grouped under it.
rows = [*data.values()]
len(rows)

1861082

In [6]:
# Peek at the first group to check the (line number, record) structure.
rows[0]

[(0,
  {'new_text': '<|startoftranscript|><|en|><|transcribe|><|0.00|> Households with target sets were encouraged to try keeping their water consumption below these designated levels.<|7.00|><|endoftext|>',
   'audio_filename': 'IMDA-STT/part1-mp3/000010101.mp3'})]

In [7]:
# Recreate the output directory from scratch. loop() skips writing a chunk whose
# file already exists, so files left over from an earlier run would be reused.
!rm -rf prepared-imda-chunks
!mkdir prepared-imda-chunks

In [10]:
def loop(data):
    """
    Turn groups of transcript candidates into audio chunks with word timestamps.

    For every group (all candidates sharing one audio file) this:

    1. loads `prepared-imda_alignment/{line_no}.alignment` for each candidate
       and keeps the candidate with the highest summed word score (a candidate
       whose alignment cannot be loaded scores -9999);
    2. tags the language with `lang_model`: 'en' when the top label is
       '__label__english', otherwise 'ms';
    3. splits the alignment with `chunk()` and drops any chunk that contains
       a word longer than 2 (same unit as 'start'/'end');
    4. writes the chunk audio to `prepared-imda-chunks/{line_no}-{k}.mp3`
       unless that file already exists;
    5. builds the chunk transcript, with word boundaries made relative to the
       chunk's earliest word start and snapped to the nearest value in the
       module-level `timestamps` grid.

    Parameters
    ----------
    data : tuple
        `(groups, _)`; the second element is ignored. Each group is a list of
        `(line_no, record)` pairs, where `record` has the keys 'new_text'
        and 'audio_filename'.

    Returns
    -------
    list of dict
        One `{'audio_filename': ..., 'new_text': ...}` entry per kept chunk.
    """
    def nearest(value):
        # Closest entry of the timestamp grid to `value`.
        return min(timestamps, key=lambda t: abs(t - value))

    data, _ = data
    new_data = []
    for d in tqdm(data):

        # Load and score the alignment of every candidate in this group.
        aligns, scores = [], []
        for i in d:
            f = f'prepared-imda_alignment/{i[0]}.alignment'
            try:
                with open(f) as fopen:
                    align = json.load(fopen)
                score = np.sum([s['score'] for s in align])
            except Exception:
                # Best effort: an alignment that is missing or malformed must
                # never win the argmax below. `Exception` (rather than a bare
                # except) still lets KeyboardInterrupt / SystemExit through.
                align, score = [], -9999
            aligns.append(align)
            scores.append(score)

        argmax = np.argmax(scores)
        no = d[argmax][0]
        text = d[argmax][1]['new_text']

        # Strip every <|...|> token before language identification.
        cleaned_text = re.sub(r"<\|.*?\|>", "", text).strip()
        if lang_model.predict(cleaned_text)[0][0] == '__label__english':
            predict_lang = 'en'
        else:
            predict_lang = 'ms'

        chunks = chunk(aligns[argmax])
        if not len(chunks):
            continue

        # The audio is read from a path without the 'IMDA-STT/' prefix.
        audio_filename = d[argmax][1]['audio_filename'].replace('IMDA-STT/', '')
        y, sr = sf.read(audio_filename)
        for k, c in enumerate(chunks):

            # Drop the whole chunk when any single word spans more than 2.
            if any((c_['end'] - c_['start']) > 2 for c_ in c):
                continue

            # `k` counts all chunks, including skipped ones, so file names
            # stay stable regardless of which chunks are dropped.
            new_f = os.path.join('prepared-imda-chunks', f'{no}-{k}.mp3')
            if not os.path.exists(new_f):
                sf.write(new_f, y[int(sr * c[0]['start']): int(sr * c[-1]['end'])], sr)

            # Word times become relative to the earliest word start in the chunk.
            min_t = min([c_['start'] for c_ in c])

            ts = []
            for c_ in c:
                start = nearest(c_['start'] - min_t)
                end = nearest(c_['end'] - min_t)
                w = c_['text']
                ts.append(f"<|{start:.2f}|> {w}<|{end:.2f}|>")
            ts = ''.join(ts)

            new_text = f"<|startoftranscript|><|{predict_lang}|><|transcribeprecise|>{ts}<|endoftext|>"
            new_data.append({
                'audio_filename': new_f,
                'new_text': new_text,
            })

    return new_data

In [11]:
# Smoke-test loop() on the first 100 groups before the full parallel run.
# The second tuple element (0) is ignored by loop().
r = loop((rows[:100], 0))
len(r)

100%|███████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:01<00:00, 81.48it/s]


70

In [12]:
# Inspect the last sample produced by the smoke test.
r[-1]

{'audio_filename': 'prepared-imda-chunks/99-0.mp3',
 'new_text': '<|startoftranscript|><|en|><|transcribeprecise|><|0.00|> that<|0.10|><|0.16|> the<|0.20|><|0.28|> investigations<|1.26|><|1.34|> have<|1.46|><|1.56|> reached<|1.80|><|1.86|> a<|1.86|><|1.98|> critical<|2.40|><|2.52|> stage<|2.80|><|3.10|> with<|3.22|><|3.28|> a<|3.28|><|3.34|> higher<|3.60|><|3.66|> level<|3.96|><|4.04|> of<|4.10|><|4.22|> urgency<|4.62|><|4.76|> and<|4.84|><|4.92|> sensitivity.<|6.16|><|endoftext|>'}

In [13]:
# Full run over every group through the local `mp` helper, using 20 cores.
# NOTE(review): `mp` is not shown in this notebook. Presumably it splits `rows`
# into slices, calls loop((slice, index)) per worker and concatenates the
# returned lists; confirm against the helper.
r = mp.multiprocessing(rows, loop, cores = 20)

100%|███████████████████████████████████████████████████████████████████████████████████| 93054/93054 [25:54<00:00, 59.85it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 15.66it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 93054/93054 [27:12<00:00, 57.00it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 93054/93054 [27:33<00:00, 56.29it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 93054/93054 [29:26<00:00, 52.68it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 93054/93054 [30:05<00:00, 51.53it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 93054/93054 [32:02<00:00, 48.39it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 93054/93054 [38:35<00

In [14]:
# Number of samples returned by the full run.
len(r)

1633511

In [15]:
# Spot-check one sample near the end of the results.
r[-10]

{'audio_filename': 'prepared-imda-chunks/1861119-1.mp3',
 'new_text': '<|startoftranscript|><|en|><|transcribeprecise|><|0.00|> has<|0.16|><|0.24|> an<|0.30|><|0.72|> EduSafe<|1.14|><|1.18|> account.<|1.50|><|2.30|> So<|2.38|><|2.52|> would<|2.68|><|endoftext|>'}

In [16]:
import IPython.display as ipd
# Play the same sample's audio chunk to compare it with its transcript by ear.
ipd.Audio(r[-10]['audio_filename'])

In [17]:
# Persist the samples as JSON Lines: one JSON object per line.
# No per-record flush is needed; leaving the `with` block flushes and closes
# the file, and flushing on each record only adds overhead.
with open('imda-whisper-word-timestamp.jsonl', 'w') as fopen:
    for r_ in r:
        fopen.write(f'{json.dumps(r_)}\n')

In [18]:
# Check the size of the written JSONL file.
!ls -lh imda-whisper-word-timestamp.jsonl

-rw-rw-r-- 1 husein husein 507M Jan  10 06:10 imda-whisper-word-timestamp.jsonl


In [19]:
from huggingface_hub import HfApi
# Upload the JSONL manifest to the dataset repository under the same file name.
api = HfApi()
api.upload_file(
    repo_type="dataset",
    repo_id="mesolitica/Malaysian-STT-Whisper",
    path_in_repo="imda-whisper-word-timestamp.jsonl",
    path_or_fileobj="imda-whisper-word-timestamp.jsonl",
)

imda-whisper-word-timestamp.jsonl:   0%|          | 0.00/531M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/mesolitica/Malaysian-STT-Whisper/commit/3105e6b4403253aaf48cb5fed9f65bcf3233fcb7', commit_message='Upload imda-whisper-word-timestamp.jsonl with huggingface_hub', commit_description='', oid='3105e6b4403253aaf48cb5fed9f65bcf3233fcb7', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/mesolitica/Malaysian-STT-Whisper', endpoint='https://huggingface.co', repo_type='dataset', repo_id='mesolitica/Malaysian-STT-Whisper'), pr_revision=None, pr_num=None)

In [20]:
# Total on-disk size of the generated audio chunks.
!du -hs prepared-imda-chunks

36G	prepared-imda-chunks


In [None]:
# Archive the chunk directory quietly (-q) and recursively (-r) as a split
# archive with a split size of 10000m (-s), producing .z01, .z02, ... plus a final .zip.
!zip -q -r -s 10000m prepared-imda-chunks.zip prepared-imda-chunks

In [4]:
from huggingface_hub import HfApi
from glob import glob

api = HfApi()

# Upload every archive part matched by the glob (the .z01/.z02/... volumes and
# the final .zip) to the dataset repository.
# NOTE(review): the parts are stored under `prepared-imda_alignment/`, although
# they contain the audio chunks. Confirm this destination folder is intended.
for f in glob('prepared-imda-chunks.z*'):
    remote_path = f'prepared-imda_alignment/{f}'
    print(f, remote_path)
    api.upload_file(
        repo_type="dataset",
        repo_id="mesolitica/Malaysian-STT-Whisper",
        path_in_repo=remote_path,
        path_or_fileobj=f,
    )

prepared-imda-chunks.z02 prepared-imda_alignment/prepared-imda-chunks.z02


prepared-imda-chunks.z02:   0%|          | 0.00/10.5G [00:00<?, ?B/s]

prepared-imda-chunks.z03 prepared-imda_alignment/prepared-imda-chunks.z03


prepared-imda-chunks.z03:   0%|          | 0.00/10.5G [00:00<?, ?B/s]

prepared-imda-chunks.z01 prepared-imda_alignment/prepared-imda-chunks.z01


prepared-imda-chunks.z01:   0%|          | 0.00/10.5G [00:00<?, ?B/s]

prepared-imda-chunks.zip prepared-imda_alignment/prepared-imda-chunks.zip


prepared-imda-chunks.zip:   0%|          | 0.00/3.04G [00:00<?, ?B/s]