In [1]:
import json
import os
from tqdm import tqdm
from glob import glob
from streaming import MDSWriter, LocalDataset

# Column schema handed to MDSWriter: every sample carries the path of an
# audio file and its text, both declared with the 'str' encoding.
columns = dict(audio_filename='str', text='str')

# Hash algorithm names handed to MDSWriter through its `hashes=` argument.
hashes = ('sha1', 'xxh64')

In [4]:
# Gather every 'prepared*.jsonl' manifest in the working directory, drop the
# ones whose name contains 'original', and keep the rest in sorted order.
files = [name for name in glob('prepared*.jsonl') if 'original' not in name]
files.sort()
files

['prepared-imda-ms.jsonl',
 'prepared-imda.jsonl',
 'prepared-indonesian-en.jsonl',
 'prepared-indonesian-ms.jsonl',
 'prepared-mandarin-en.jsonl',
 'prepared-mandarin-ms.jsonl',
 'prepared-mandarin.jsonl',
 'prepared-nusantara.jsonl',
 'prepared-pseudolabel-malaya.jsonl',
 'prepared-pseudolabel.jsonl',
 'prepared-tamil-en.jsonl',
 'prepared-tamil-ms.jsonl',
 'prepared-tamil.jsonl']

In [5]:
# Peek at the first record of one manifest to inspect its keys.
# The manifest holds non-ASCII text (Tamil), so decode it explicitly as UTF-8
# instead of relying on the platform's locale-dependent default encoding.
with open('prepared-tamil.jsonl', encoding='utf-8') as fopen:
    for l in fopen:
        # Only the first line is needed; `l` is rebound to the parsed dict.
        l = json.loads(l)
        break

In [6]:
# Display the record parsed from the first line of 'prepared-tamil.jsonl'.
l

{'new_text': '<|startoftranscript|><|ta|><|transcribe|><|0.00|>விமான போக்குவரத்து தொரையில் பாதுகாப்பு அம்சங்கள் உலகத்தரத்தில் உள்ள போதும் அவற்றை மேலும் மேம்படுத்த வேண்டியது அவசியம் இன்று குறிப்பிட்டார்.<|8.20|><|endoftext|>',
 'audio_filename': '/home/ubuntu/newsonair_v5/tamil/NSD-Tamil-Tamil-0715-0725-201810118533/sent_13.wav'}

In [7]:
# Convert every JSONL manifest in `files` into a single MDS dataset written to
# 'mosaic-stt'. Each input line is a JSON object with 'new_text' and
# 'audio_filename'; only the audio file path is stored, not the audio bytes.
with MDSWriter(out='mosaic-stt', columns=columns, compression=None, hashes=hashes) as out:
    for f in files:
        # The manifests contain non-ASCII transcripts; decode explicitly as
        # UTF-8 rather than depending on the locale's default encoding.
        with open(f, encoding='utf-8') as fopen:
            for line in tqdm(fopen):
                record = json.loads(line)
                out.write({
                    # Surrounding whitespace on the transcript is trimmed.
                    'text': record['new_text'].strip(),
                    'audio_filename': record['audio_filename'],
                })

1217190it [00:08, 140776.97it/s]
1861125it [00:17, 108155.99it/s]
301763it [00:04, 65894.29it/s]
320655it [00:05, 63604.28it/s]
558988it [00:04, 115797.50it/s]
537902it [00:12, 43100.50it/s] 
595542it [00:04, 137801.93it/s]
10984it [00:00, 76319.20it/s]
1089630it [00:08, 122636.31it/s]
3085595it [01:04, 47982.27it/s]
480967it [00:03, 153163.98it/s]
441858it [00:02, 159498.63it/s]
532262it [00:05, 105136.42it/s]


In [8]:
# Open the 'mosaic-stt' directory written above as a local dataset and
# report how many samples it holds.
data = LocalDataset('mosaic-stt')
len(data)

11034461

In [9]:
# Spot-check the first stored sample.
data[0]

{'audio_filename': 'IMDA-STT/part1-mp3/008781545.mp3',
 'text': '<|startoftranscript|><|ms|><|transcribe|><|0.00|> Pemilikan kereta juga dilarang kecuali jika anda membeli salah satu daripada beberapa permit mahal terlebih dahulu.<|6.56|><|endoftext|>'}