In [1]:
import json
import os
import shutil
from glob import glob

import polars as pl
from huggingface_hub import snapshot_download
from streaming import MDSWriter, LocalDataset

In [2]:
# Fetch only the files matching data/*.parquet from the Hugging Face dataset
# repo into ./malaysian-stt, using up to 10 download workers. `folder` holds
# whatever snapshot_download returns for the local snapshot.
folder = snapshot_download(
    repo_type='dataset',
    repo_id='mesolitica/Malaysian-STT-Whisper',
    allow_patterns='data/*.parquet',
    local_dir='./malaysian-stt',
    max_workers=10,
)

Fetching 10 files:   0%|          | 0/10 [00:00<?, ?it/s]

In [3]:
# Start from an empty output location before new shards are written to
# 'mosaic-stt'. Pure-Python equivalent of `rm -rf`: a missing directory is not
# an error, and it works without a POSIX shell.
# NOTE(review): MDSWriter is expected to refuse a non-empty output directory
# unless told otherwise — confirm against the streaming docs.
shutil.rmtree('mosaic-stt', ignore_errors=True)

In [4]:
from tqdm import tqdm

# Every MDS sample has two string fields: the audio file path and its text.
columns = {
    'audio_filename': 'str',
    'text': 'str',
}
hashes = 'sha1', 'xxh64'

# Maps a language token to a parquet file. Rows of that file are written with
# '<|transcribe|>' replaced by '<|translate|>', and any row whose text contains
# the mapped token is dropped.
# NOTE(review): presumably the token marks rows where the pseudolabel kept the
# source language instead of translating — confirm.
reject_language = {
    '<|id|>': 'malaysian-stt/data/indonesian-00000-of-00001.parquet',
    '<|ta|>': 'malaysian-stt/data/tamil-00000-of-00001.parquet',
    '<|zh|>': 'malaysian-stt/data/mandarin-00000-of-00001.parquet',
    '<|en|>': 'malaysian-stt/data/science_ms-00000-of-00001.parquet',
}

# Parquet files whose rows are written as-is, once per column in `columns_stt`.
accept = [
    'malaysian-stt/data/extra-00000-of-00001.parquet',
    'malaysian-stt/data/imda-00000-of-00001.parquet',
    'malaysian-stt/data/malaysian_context-00000-of-00001.parquet',
    'malaysian-stt/data/malaysian_context_v2-00000-of-00001.parquet',
    'malaysian-stt/data/science_context-00000-of-00001.parquet',
]

columns_stt = ['segment_timestamp', 'word_timestamp']

# Texts shorter than this many characters are skipped.
MIN_TEXT_LEN = 10
# Directory prepended to the audio paths of the 'imda-' and
# 'malaysian_context_v2' files.
# NOTE(review): machine-specific absolute path; adjust when running elsewhere.
SSD_ROOT = '/home/husein/ssd3'
# Directory prepended to the audio paths of the 'science_ms' file.
SCIENCE_AUDIO_ROOT = 'pseudolabel-science-large-v3-timestamp'


def _prepare_text(text, as_translation=False, reject_token=None):
    """Return the text to store for one cell, or None when it must be skipped.

    Args:
        text: Raw cell value; may be None for a null cell.
        as_translation: When true, '<|transcribe|>' is replaced by
            '<|translate|>' before the other checks.
        reject_token: When given, texts containing this substring are skipped.

    Returns:
        The (possibly rewritten) text, or None if the cell is null, shorter
        than MIN_TEXT_LEN, or contains `reject_token`.
    """
    if text is None:
        # Null cells would otherwise crash len() / str.replace().
        return None
    if as_translation:
        text = text.replace('<|transcribe|>', '<|translate|>')
    if len(text) < MIN_TEXT_LEN:
        return None
    if reject_token is not None and reject_token in text:
        return None
    return text


def iter_samples(df, text_columns, audio_root=None, as_translation=False,
                 reject_token=None):
    """Yield one MDS sample dict per usable (row, text column) pair of `df`.

    Rows are visited in order with a tqdm progress bar; within a row the
    samples follow the order of `text_columns`. A sample is produced only when
    the text passes `_prepare_text` and the audio file exists on disk.

    Args:
        df: polars DataFrame with an 'audio_filename' column and every column
            named in `text_columns`.
        text_columns: Names of the columns holding the text to emit.
        audio_root: Optional directory joined in front of each audio path.
        as_translation: Forwarded to `_prepare_text`.
        reject_token: Forwarded to `_prepare_text`.

    Yields:
        Dicts with the keys 'audio_filename' and 'text'.
    """
    rows = df.select(['audio_filename', *text_columns]).iter_rows()
    for audio_filename, *raw_texts in tqdm(rows, total=len(df)):
        prepared = (
            _prepare_text(raw, as_translation=as_translation, reject_token=reject_token)
            for raw in raw_texts
        )
        texts = [text for text in prepared if text is not None]
        if not texts or audio_filename is None:
            continue
        if audio_root is not None:
            audio_filename = os.path.join(audio_root, audio_filename)
        # The path is the same for every text column of the row, so the
        # filesystem is checked once per row, and only when some text survived
        # the cheap filters above.
        if not os.path.exists(audio_filename):
            continue
        for text in texts:
            yield {
                'audio_filename': audio_filename,
                'text': text,
            }


with MDSWriter(out='mosaic-stt', columns=columns, compression=None, hashes=hashes) as out:

    # 1) Synthetic context-switching set: word-level timestamps only.
    df = pl.read_parquet('synthetic-context-switching-word-timestamp.parquet')
    for sample in iter_samples(df, ['word_timestamp']):
        out.write(sample)

    # 2) Accepted sets: one sample per timestamp column for every row.
    for f in accept:
        df = pl.read_parquet(f)
        needs_ssd_root = 'malaysian_context_v2' in f or 'imda-' in f
        audio_root = SSD_ROOT if needs_ssd_root else None
        for sample in iter_samples(df, columns_stt, audio_root=audio_root):
            out.write(sample)

    # 3) Translation sets: segment timestamps, task token rewritten, rows that
    #    contain the mapped language token dropped.
    for k, f in reject_language.items():
        df = pl.read_parquet(f)
        audio_root = SCIENCE_AUDIO_ROOT if 'science_ms' in f else None
        for sample in iter_samples(
            df,
            ['segment_timestamp'],
            audio_root=audio_root,
            as_translation=True,
            reject_token=k,
        ):
            out.write(sample)

100%|██████████████████████████████████████████████████████████████████████████████████████| 100000/100000 [00:00<00:00, 140141.47it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████| 7746/7746 [00:00<00:00, 160734.38it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████| 1009338/1009338 [00:11<00:00, 89995.38it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████| 299359/299359 [00:02<00:00, 99790.99it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████| 3305115/3305115 [00:37<00:00, 88527.96it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████| 1960115/1960115 [00:20<00:00, 95700.60it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████| 943282/943282 [00:04<00:00, 197980.06it/s]
100%|███████████████████████████████████████████

In [5]:
# Open the 'mosaic-stt' directory as a LocalDataset; the cell's last
# expression displays how many samples it holds.
dataset = LocalDataset('mosaic-stt')
len(dataset)

16405005

In [7]:
# Spot check: display the last sample (a dict with 'audio_filename' and 'text').
dataset[-1]

{'audio_filename': 'pseudolabel-science-large-v3-timestamp/chunk/mp3-16k-0-406_030.mp3',
 'text': '<|startoftranscript|><|ms|><|translate|><|0.00|> Tetapi ia mempunyai bahagian saintifik dengan janji sebenar, jadi jangan buang bayi itu dengan<|5.76|><|5.76|> air mandian. Para saintis telah bereksperimen dengan menanam elektrod ke dalam otak selama lebih daripada<|12.00|><|12.00|> 50 tahun. Ia dipanggil rangsangan otak mendalam dan berfungsi dengan menghantar impuls elektrik<|18.66|><|18.66|> ke kawasan tertentu otak. Ia telah berjaya digunakan untuk merawat sakit, kemurungan, Parkinson,<|25.44|><|endoftext|>'}

In [8]:
# Spot check: build an IPython Audio display object from the first sample's
# audio file path so the notebook can render it.
import IPython.display as ipd
ipd.Audio(dataset[0]['audio_filename'])

In [9]:
# AddedToken is imported here for the vocabulary extension in the next cell.
from transformers import AutoTokenizer, AddedToken

# Load the tokenizer published under 'openai/whisper-small'.
tokenizer = AutoTokenizer.from_pretrained('openai/whisper-small')

In [10]:
# Timestamp marker tokens "<|0.00|>", "<|0.02|>", ... "<|30.00|>": 1501 tokens
# spaced 0.02 apart, each created with lstrip/rstrip disabled.
timestamps = [
    AddedToken("<|%.2f|>" % (tick * 0.02), lstrip=False, rstrip=False)
    for tick in range(1500 + 1)
]
# One extra, non-timestamp token is registered through the same list.
timestamps += [AddedToken('<|transcribeprecise|>')]
# Last expression of the cell: the notebook displays add_tokens' return value.
tokenizer.add_tokens(timestamps)

1