In [1]:
import soundfile as sf
from glob import glob
from transformers import AutoTokenizer

# Load the tokenizer published under the 'openai/whisper-large-v3' checkpoint name.
# NOTE(review): `tokenizer` is not referenced by any other cell visible in this
# notebook — confirm whether it is still needed.
tokenizer = AutoTokenizer.from_pretrained('openai/whisper-large-v3')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [2]:
# Grid of 1501 candidate timestamps spaced 0.02 apart (0.00 up to 30.00);
# a later cell snaps each clip's start/end (in seconds) to the nearest entry.
timestamps = list(map(lambda step: step * 0.02, range(1501)))

In [3]:
from glob import glob

# Walk the iban/ tree recursively and collect every path whose file name
# ends in "_text"; the bare expression at the end displays the matches.
files = glob(pathname='iban/**/*_text', recursive=True)
files

['iban/data/train/train_text', 'iban/data/test/test_text']

In [8]:
# Print the first line of the training transcript file to inspect its layout
# (the output shows an utterance id followed by the lower-cased text).
!head -n 1 iban/data/train/train_text

ibf_002_001  masjid al takwa di miri udah nerima agih belanja seratus ribu ringgit ari opis menteri anegi teknologi hijau enggau ai kena ngitu pemanah masjid nya  


In [39]:
# Start from an empty output directory: delete any previous iban-wav/ tree,
# then recreate it so the copy step writes into a clean folder.
!rm -rf iban-wav
!mkdir iban-wav

In [16]:
import json

# Read the true-cased transcripts; each record carries 'id', 'original'
# and 'true_case' keys (see the preview of the first record below).
with open('true-case.json') as true_case_file:
    data = json.loads(true_case_file.read())

data[0]

{'id': 'ibf_002_001',
 'original': 'masjid al takwa di miri udah nerima agih belanja seratus ribu ringgit ari opis menteri anegi teknologi hijau enggau ai kena ngitu pemanah masjid nya  ',
 'true_case': 'Masjid Al-Takwa di Miri udah nerima agih belanja seratus ribu ringgit ari opis Menteri Anegi Teknologi Hijau Enggau Ai kena ngitu pemanah masjid-nya.'}

In [44]:
import os
import shutil
from tqdm import tqdm

# Build the Whisper-format manifest.
#
# For every record in `data`:
#   * locate its source wav under iban/data/wav/<id without last "_" part>/<id>.wav;
#   * if the wav is missing, put the record in `rejected`;
#   * if the clip lasts MAX_DURATION_SECONDS or more, drop it (it is NOT added
#     to `rejected`, so len(filtered) + len(rejected) can be < len(data));
#   * otherwise copy the wav into OUTPUT_DIR and append
#     {'filename': <copied path>, 'Y': <token-wrapped transcript>} to `filtered`.

MAX_DURATION_SECONDS = 12  # clips this long or longer are left out
SOURCE_WAV_ROOT = 'iban/data/wav/'
OUTPUT_DIR = 'iban-wav'


def _snap_to_grid(seconds):
    """Return the entry of `timestamps` that is closest to `seconds`."""
    return min(timestamps, key=lambda t: abs(t - seconds))


# Create the destination up front so the copy below does not depend on the
# shell cell that makes the directory having been run in this session.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Every clip is transcribed from its very beginning, so the snapped start
# timestamp is the same for all records; compute it once instead of per clip.
start = _snap_to_grid(0)

filtered, rejected = [], []
for record in tqdm(data):
    record_id = record['id']
    transcript = record['true_case']

    # The wav folder is the id with its last "_"-separated component removed,
    # e.g. 'ibf_002_001' -> 'ibf_002'.
    folder = record_id.rpartition('_')[0]
    old_filename = os.path.join(SOURCE_WAV_ROOT, folder, record_id + '.wav')

    if not os.path.exists(old_filename):
        rejected.append(record)
        continue

    # Only the duration is needed, so read the header metadata rather than
    # decoding the whole audio file into memory.
    info = sf.info(old_filename)
    duration = info.frames / info.samplerate
    if duration >= MAX_DURATION_SECONDS:
        continue

    end = _snap_to_grid(duration)
    t = f"<|{start:.2f}|> {transcript}<|{end:.2f}|>"
    # NOTE(review): the language token is hard-coded to <|ms|>; presumably a
    # deliberate stand-in for this dataset — confirm.
    text = f"<|startoftranscript|><|ms|><|transcribe|>{t}<|endoftext|>"

    filename = os.path.join(OUTPUT_DIR, record_id + '.wav')
    shutil.copyfile(old_filename, filename)
    filtered.append({
        'filename': filename,
        'Y': text
    })

100%|██████████████████████████████████████| 3132/3132 [00:08<00:00, 350.34it/s]


In [45]:
# Number of records whose source wav file was not found on disk.
len(rejected)

0

In [46]:
# Number of clips kept: the source wav exists and the clip is shorter than 12 s.
len(filtered)

2200

In [52]:
# Spot-check the last manifest entry: copied wav path plus the token-wrapped transcript.
filtered[-1]

{'filename': 'iban-wav/ibm_008_099.wav',
 'Y': '<|startoftranscript|><|ms|><|transcribe|><|0.00|> Salam Satu Malaysia.<|1.98|><|endoftext|>'}

In [53]:
# Bundle the copied wavs into a single archive (-r: recurse, -q: quiet) for upload.
!zip -rq iban-wav.zip iban-wav

In [54]:
from huggingface_hub import HfApi
# Push the zipped audio to the dataset repository, storing it under the same
# file name it has locally. The call's return value is the cell output.
api = HfApi()
api.upload_file(
    repo_type="dataset",
    repo_id="malaysia-ai/iban-whisper-format",
    path_in_repo="iban-wav.zip",
    path_or_fileobj="iban-wav.zip",
)

iban-wav.zip:   0%|          | 0.00/483M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/malaysia-ai/iban-whisper-format/commit/7a15c5cf4d0cb1820713bc42ff5907b13bddb5d0', commit_message='Upload iban-wav.zip with huggingface_hub', commit_description='', oid='7a15c5cf4d0cb1820713bc42ff5907b13bddb5d0', pr_url=None, pr_revision=None, pr_num=None)

In [55]:
# Persist the manifest (list of {'filename', 'Y'} entries) as JSON.
with open('iban-dataset.json', 'w') as manifest_file:
    manifest_file.write(json.dumps(filtered))