In [1]:
from glob import glob
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer
import json
import os
import mp
import re

In [2]:
# Gather every transcript JSON from the processed dataset directories.
# Patterns are expanded in this fixed order, so `files` keeps the datasets
# grouped one after another.
files = [
    path
    for pattern in (
        'malaysian-podcast_processed/**/*/*.json',
        'sg-podcast_processed/**/*/*.json',
        'filtered-24k_processed/**/*/*.json',
        'parlimen-24k-chunk_processed/**/*/*.json',
    )
    for path in glob(pattern, recursive = True)
]

len(files)

129198

In [3]:
# Phrases that disqualify a segment. `loop` compares them against the
# transcription after lowercasing and removing every character other than
# a-z and space, and only an exact match of the whole text is rejected.
# NOTE(review): presumably boilerplate sign-offs emitted by the transcriber - confirm.
rejected = ['terima kasih kerana menonton', 'terima kasih', 'thank you for watching']

In [4]:
def loop(files):
    """Build audio/transcription records from a chunk of transcript JSON files.

    Args:
        files: a 2-tuple whose first element is an iterable of JSON file
            paths; the second element is ignored.
            NOTE(review): presumably the chunk index supplied by
            `mp.multiprocessing` - confirm against that helper.

    Returns:
        A list of dicts with keys 'audio' (path to the segment's mp3) and
        'transcription' (the stripped segment text).

    A segment is skipped when:
      * its text, lowercased and reduced to a-z and spaces, equals one of the
        phrases in the module-level `rejected` list;
      * any word trigram occurs more than 3 times in the text;
      * CountVectorizer raises ValueError for the text (e.g. no trigram
        vocabulary can be built because the text has too few tokens).
    Files that cannot be opened or parsed as JSON are skipped entirely.
    """
    files, _ = files
    data = []
    for file in tqdm(files):
        folder = os.path.split(file)[0]
        folder_folder = os.path.split(folder)[1]

        try:
            with open(file) as fopen:
                d = json.load(fopen)
        except (OSError, ValueError):
            # Unreadable file, malformed JSON (json.JSONDecodeError) or bad
            # encoding (UnicodeDecodeError); the latter two are ValueError
            # subclasses. Best effort: skip the file instead of aborting.
            continue

        # `no` is the segment's position in the JSON and is part of the audio
        # file name, so it must keep counting even across skipped segments.
        for no, obj in enumerate(d):
            text = obj["text"].strip()

            rt_ = re.sub('[^a-z ]+', '', text.lower()).strip()
            if rt_ in rejected:
                continue

            try:
                dense = CountVectorizer(ngram_range = (3,3)).fit_transform([text]).todense()
            except ValueError:
                # Raised by CountVectorizer when it ends up with an empty
                # vocabulary; such segments are dropped.
                continue
            repeat = (dense > 3).sum() >= 1
            if repeat:
                continue

            audio_path = os.path.join(folder, f'{folder_folder}_{no}.mp3')
            # Prefer the copy under the matching "processed_24k/" directory
            # when it exists on disk.
            new_audio_path = audio_path.replace('processed/', 'processed_24k/')
            if os.path.exists(new_audio_path):
                audio_path = new_audio_path

            data.append({
                'audio': audio_path,
                'transcription': text,
            })

    return data

In [5]:
# Run `loop` over `files` through the `mp` helper with cores = 10 and keep the
# combined result in `data`.
# NOTE(review): `mp` is not shown in this notebook. `loop` unpacks its argument
# as `(chunk, _)`, so the helper presumably hands each worker a
# (file-chunk, index) pair and concatenates the returned lists - confirm.
data = mp.multiprocessing(files, loop, cores = 10)

100%|████████████████████████████████████| 12919/12919 [00:53<00:00, 242.41it/s]
100%|████████████████████████████████████| 12919/12919 [00:53<00:00, 242.35it/s]
100%|████████████████████████████████████| 12919/12919 [00:53<00:00, 241.10it/s]
100%|████████████████████████████████████| 12919/12919 [00:53<00:00, 241.38it/s]
100%|████████████████████████████████████| 12919/12919 [00:53<00:00, 239.81it/s]
100%|████████████████████████████████████| 12919/12919 [00:54<00:00, 237.35it/s]
100%|████████████████████████████████████| 12919/12919 [00:54<00:00, 236.72it/s]
100%|████████████████████████████████████| 12919/12919 [00:54<00:00, 236.41it/s]
100%|█████████████████████████████████████████████| 8/8 [00:00<00:00, 10.23it/s]
100%|████████████████████████████████████| 12919/12919 [00:58<00:00, 221.15it/s]
100%|█████████████████████████████████████| 12919/12919 [03:36<00:00, 59.62it/s]


In [6]:
# Number of audio/transcription records that survived the filters in `loop`.
len(data)

2438225

In [10]:
# Spot-check the last record: a dict with 'audio' and 'transcription' keys.
data[-1]

{'audio': 'parlimen-24k-chunk_processed/parlimen-24k-LANGSUNG ： Persidangan Dewan Negara 21 MAC 2022 ｜ Sesi Petang [72fMZM9Rrek]_000/parlimen-24k-LANGSUNG ： Persidangan Dewan Negara 21 MAC 2022 ｜ Sesi Petang [72fMZM9Rrek]_000_435.mp3',
 'transcription': 'oleh Kementerian Sumber Manusia selari dengan realiti sebenar yang dihadapi oleh rakyat berbagai segmen.'}

In [11]:
# Render an inline audio player for the last record's audio file so it can be
# spot-checked alongside the transcription displayed in the previous cell.
import IPython.display as ipd
ipd.Audio(data[-1]['audio'])

In [12]:
# Write the records as JSON Lines: one JSON object per line.
# There is no per-record flush: the file object buffers writes and the `with`
# block flushes and closes it on exit, so flushing after each of the millions
# of records only added system-call overhead.
with open('verify-text.jsonl', 'w') as fopen:
    for d in data:
        fopen.write(f'{json.dumps(d)}\n')

In [13]:
# Upload the local verify-text.jsonl to the Hugging Face dataset repository
# "mesolitica/Malaysian-Voice-Conversion", stored under the same file name.
# NOTE(review): no token is passed here, so this presumably relies on
# credentials already configured in the environment - confirm before re-running.
from huggingface_hub import HfApi
api = HfApi()
api.upload_file(
    path_or_fileobj="verify-text.jsonl",
    path_in_repo="verify-text.jsonl",
    repo_id="mesolitica/Malaysian-Voice-Conversion",
    repo_type="dataset",
)

verify-text.jsonl:   0%|          | 0.00/862M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/mesolitica/Malaysian-Voice-Conversion/commit/43c1e706c60dc209b23b9e4d3bf8543296e976a3', commit_message='Upload verify-text.jsonl with huggingface_hub', commit_description='', oid='43c1e706c60dc209b23b9e4d3bf8543296e976a3', pr_url=None, pr_revision=None, pr_num=None)