In [1]:
# !wget https://huggingface.co/mesolitica/conformer-medium-malay-whisper/resolve/main/malay-stt.model
# !wget https://gist.githubusercontent.com/huseinzol05/98974ae8c6c7a65d4bc0af9f5003786a/raw/2e06e71ef7349a57bc58cc9913ae6bae1f9f8447/mp.py

In [2]:
from glob import glob
import json
import soundfile as sf
from tqdm import tqdm
import mp
import os
import sentencepiece as spm
import malaya_speech
import numpy as np

sp_model = spm.SentencePieceProcessor(model_file='malay-stt.model')
sr = 16000

  def backtrace(trace: np.ndarray):
`pyaudio` is not available, `malaya_speech.streaming.pyaudio` is not able to use.


In [3]:
filtered = []

with open('pseudolabel.jsonl') as fopen:
    for l in fopen:
        l = json.loads(l)
        f = l['audio_filename']
        if not os.path.exists(f):
            continue
        t_ms = l['predict_ms'][41:-13].strip()
        t_en = l['predict_en'][41:-13].strip()
        filtered.append((f, t_ms, t_en))
        
len(filtered)

2221856

In [4]:
filtered = [(no, f) for no, f in enumerate(filtered)]

In [5]:
!mkdir processed-audio processed
!rm processed-audio/* processed/*

mkdir: cannot create directory ‘processed-audio’: File exists
mkdir: cannot create directory ‘processed’: File exists
/bin/bash: /usr/bin/rm: Argument list too long


In [6]:
def loop(rows):
    rows, index = rows
    for row in tqdm(rows):
        i, row = row
        filename = os.path.join('force-alignment', f'{i}.json')
        if not os.path.exists(filename):
            continue
        try:
            with open(filename) as fopen:
                data = json.load(fopen)
                
            if 'fatihah' in data['p_ms']:
                continue
                
            f_json = os.path.join('processed', f'{index}-{i}.json')
            if os.path.exists(f_json):
                continue
                
            y, sr = malaya_speech.load(row[0])
            
            split, temp = [], []
            if data['diag_ms'] is not None:
                diag = data['diag_ms']
                for no, r in enumerate(data['subwords_alignment_ms']):
                    if r['score'] >= 0.05 or diag[no] > 0.2:
                        temp.append(r)

                    else:
                        if len(temp):
                            split.append(temp)
                            temp = []

                if len(temp):
                    split.append(temp)
            
            if data['diag_en'] is not None:
                diag = data['diag_en']
                for no, r in enumerate(data['subwords_alignment_en']):
                    if r['score'] >= 0.05 or diag[no] > 0.2:
                        temp.append(r)

                    else:
                        if len(temp):
                            split.append(temp)
                            temp = []

                if len(temp):
                    split.append(temp)

            selected = []
            for s in split:
                start = s[0]['start']
                end = s[-1]['end'] + 0.1
                if end - start >= 1.0:
                    seq = [s__['text'] for s__ in s]
                    merged = sp_model.Decode(sp_model.PieceToId(seq))
                    selected.append((merged, start, end))
            
            if not len(selected):
                continue
                
            filtered = []
            for no, s in enumerate(selected):
                f = os.path.join('processed-audio', f'{index}-{i}-{no}.mp3')
                start = int(s[1] * sr)
                end = int(s[2] * sr)
                sf.write(f, y[start: end], sr)
                filtered.append(
                    {
                        'text': s[0],
                        'start': s[1],
                        'end': s[2],
                        'audio_filename': f,
                        'original_audio_filename': row[0],
                    }
                )
                
            with open(f_json, 'w') as fopen:
                json.dump(filtered, fopen)
            
        except Exception as e:
            print(e)
            pass

In [7]:
loop((filtered[:20], 0))

100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 202.72it/s]


In [8]:
files = glob('processed-audio/*.mp3')
len(files)

391143

In [9]:
files

['processed-audio/4-1779205-4.mp3',
 'processed-audio/6-1349719-2.mp3',
 'processed-audio/5-1115858-4.mp3',
 'processed-audio/3-696992-4.mp3',
 'processed-audio/3-673782-7.mp3',
 'processed-audio/3-669664-2.mp3',
 'processed-audio/3-674626-2.mp3',
 'processed-audio/3-681258-3.mp3',
 'processed-audio/6-1346184-0.mp3',
 'processed-audio/1-230642-1.mp3',
 'processed-audio/9-2019598-0.mp3',
 'processed-audio/4-1788075-2.mp3',
 'processed-audio/9-2008250-2.mp3',
 'processed-audio/6-1335604-1.mp3',
 'processed-audio/4-891318-4.mp3',
 'processed-audio/6-1337210-0.mp3',
 'processed-audio/3-690569-0.mp3',
 'processed-audio/3-699860-0.mp3',
 'processed-audio/7-1556205-2.mp3',
 'processed-audio/0-28096-0.mp3',
 'processed-audio/3-673140-1.mp3',
 'processed-audio/5-1124165-0.mp3',
 'processed-audio/5-1137588-0.mp3',
 'processed-audio/6-1345855-0.mp3',
 'processed-audio/0-7975-1.mp3',
 'processed-audio/9-2018090-1.mp3',
 'processed-audio/1-226828-2.mp3',
 'processed-audio/9-2016076-2.mp3',
 'proces

In [10]:
import IPython.display as ipd
ipd.Audio('processed-audio/0-13-1.mp3')

In [11]:
glob('processed/*.json')

['processed/1-225586.json',
 'processed/5-1112412.json',
 'processed/9-2012594.json',
 'processed/3-1337521.json',
 'processed/6-1349733.json',
 'processed/6-1358199.json',
 'processed/3-677734.json',
 'processed/3-673704.json',
 'processed/1-235185.json',
 'processed/8-1805701.json',
 'processed/3-1338126.json',
 'processed/7-1576922.json',
 'processed/0-8643.json',
 'processed/4-913341.json',
 'processed/0-34873.json',
 'processed/2-458642.json',
 'processed/7-1574638.json',
 'processed/1-452192.json',
 'processed/2-895997.json',
 'processed/7-1558601.json',
 'processed/0-5303.json',
 'processed/5-1125353.json',
 'processed/5-1127980.json',
 'processed/8-1784099.json',
 'processed/9-2002200.json',
 'processed/3-673148.json',
 'processed/4-905636.json',
 'processed/2-463300.json',
 'processed/9-2015489.json',
 'processed/2-896785.json',
 'processed/6-1348719.json',
 'processed/9-2002857.json',
 'processed/3-679831.json',
 'processed/0-25332.json',
 'processed/1-455237.json',
 'process

In [12]:
with open('processed/0-13.json') as fopen:
    data = json.load(fopen)

In [13]:
data

[{'text': 'dengan kata kata mutiara ni kan adakah itu berdasarkan pengalaman hidup ataupun dia',
  'start': 2.48,
  'end': 7.859999999999999,
  'audio_filename': 'processed-audio/0-13-0.mp3',
  'original_audio_filename': 'output-audio/1-0-13.mp3'},
 {'text': 'bar main create satu ayat dan sampaikan kepada orang ramai pula',
  'start': 7.96,
  'end': 11.459999999999999,
  'audio_filename': 'processed-audio/0-13-1.mp3',
  'original_audio_filename': 'output-audio/1-0-13.mp3'},
 {'text': 'k yang dah ada dia punya t',
  'start': 11.68,
  'end': 13.259999999999998,
  'audio_filename': 'processed-audio/0-13-2.mp3',
  'original_audio_filename': 'output-audio/1-0-13.mp3'},
 {'text': 'demark dah kalau kita nak buat lawak lawak saya okey',
  'start': 13.24,
  'end': 15.54,
  'audio_filename': 'processed-audio/0-13-3.mp3',
  'original_audio_filename': 'output-audio/1-0-13.mp3'},
 {'text': 'buat lawak okey tapi kadang kadang saya rasa ramai penonton ataupun',
  'start': 15.84,
  'end': 19.02,
  'au

In [14]:
mp.multiprocessing(filtered, loop, cores = 20, returned = False)

100%|█████████████████████████████████████████████████████████████████████████████████████████| 111092/111092 [1:28:34<00:00, 20.90it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:01<00:00, 11.48it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████| 111092/111092 [1:47:39<00:00, 17.20it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████| 111092/111092 [1:51:15<00:00, 16.64it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████| 111092/111092 [1:53:22<00:00, 16.33it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████| 111092/111092 [1:53:43<00:00, 16.28it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████| 111092/111092 [1:53:52<00:00, 16.26it/s]
100%|████████████████████████████████████

In [15]:
processed = glob('processed/*.json')
len(processed)

1054141

In [16]:
with open(processed[-1]) as fopen:
    d = json.load(fopen)
d

[{'text': 'tapi rakyat happy ke tak baik kami nak',
  'start': 1.36,
  'end': 3.9,
  'audio_filename': 'processed-audio/17-1964297-0.mp3',
  'original_audio_filename': 'output-audio/0-9443-49.mp3'},
 {'text': 'bahawa dua daripada majlis daerah',
  'start': 4.64,
  'end': 5.859999999999999,
  'audio_filename': 'processed-audio/17-1964297-1.mp3',
  'original_audio_filename': 'output-audio/0-9443-49.mp3'},
 {'text': 'mana di',
  'start': 14.24,
  'end': 15.299999999999999,
  'audio_filename': 'processed-audio/17-1964297-2.mp3',
  'original_audio_filename': 'output-audio/0-9443-49.mp3'},
 {'text': 'harkan juga',
  'start': 19.0,
  'end': 20.580000000000002,
  'audio_filename': 'processed-audio/17-1964297-3.mp3',
  'original_audio_filename': 'output-audio/0-9443-49.mp3'}]

In [18]:
import IPython.display as ipd

ipd.Audio('processed-audio/17-1964297-1.mp3')

In [19]:
!du -hs processed-audio

70G	processed-audio


In [20]:
!~/7zz -v10g a processed-audio.7z processed-audio > /dev/null

In [21]:
with open('processed.jsonl', 'w') as fopen_l:
    for f in tqdm(processed):
        try:
            with open(f) as fopen:
                data = json.load(fopen)
            fopen_l.write(f'{json.dumps(data)}\n')
        except:
            pass

100%|███████████████████████████████████████████████████████████████████████████████████████| 1054141/1054141 [03:16<00:00, 5363.46it/s]


In [22]:
from huggingface_hub import HfApi
api = HfApi()

In [28]:
api.upload_file(
    path_or_fileobj='processed.jsonl',
    path_in_repo='processed.jsonl',
    repo_id='mesolitica/pseudostreaming-malaysian-youtube-whisper-large-v3',
    repo_type='dataset',
)

processed.jsonl:   0%|          | 0.00/604M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/mesolitica/pseudostreaming-malaysian-youtube-whisper-large-v3/commit/66eed57011381779feff8de8b6f107d075c73b20', commit_message='Upload processed.jsonl with huggingface_hub', commit_description='', oid='66eed57011381779feff8de8b6f107d075c73b20', pr_url=None, pr_revision=None, pr_num=None)

In [24]:
files = glob('processed-audio/*.mp3')
len(files)

2761432

In [25]:
def loop(files):
    files, _ = files
    total = 0
    for f in tqdm(files):
        y, sr = malaya_speech.load(f)
        total += len(y) / sr
    return [total]

In [26]:
totals = mp.multiprocessing(files, loop)

100%|██████████████████████████████████████████████████████████████████████████████████████████| 460238/460238 [11:30<00:00, 666.57it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 878.66it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████| 460238/460238 [11:31<00:00, 665.99it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████| 460238/460238 [11:30<00:00, 667.00it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████| 460238/460238 [11:29<00:00, 667.37it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████| 460238/460238 [11:31<00:00, 665.21it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████| 460238/460238 [11:31<00:00, 665.37it/s]


In [27]:
sum(totals) / 60 / 60

3407.871277396369

In [29]:
for f in glob('*7z*'):
    print(f)
    api.upload_file(
        path_or_fileobj=f,
        path_in_repo=f,
        repo_id='mesolitica/pseudostreaming-malaysian-youtube-whisper-large-v3',
        repo_type='dataset',
    )

processed-audio.7z.002


processed-audio.7z.002:   0%|          | 0.00/10.7G [00:00<?, ?B/s]

processed-audio.7z.007


processed-audio.7z.007:   0%|          | 0.00/2.35G [00:00<?, ?B/s]

processed-audio.7z.005


processed-audio.7z.005:   0%|          | 0.00/10.7G [00:00<?, ?B/s]

processed-audio.7z.006


processed-audio.7z.006:   0%|          | 0.00/10.7G [00:00<?, ?B/s]

processed-audio.7z.004


processed-audio.7z.004:   0%|          | 0.00/10.7G [00:00<?, ?B/s]

processed-audio.7z.003


processed-audio.7z.003:   0%|          | 0.00/10.7G [00:00<?, ?B/s]

processed-audio.7z.001


processed-audio.7z.001:   0%|          | 0.00/10.7G [00:00<?, ?B/s]