In [41]:
import soundfile as sf
from tqdm import tqdm
import os
import json
from datasets import load_dataset
from multiprocess import Pool
import itertools

def chunks(l, n):
    """Split ``l`` into consecutive slices of at most ``n`` items.

    Yields ``(slice, position)`` tuples, where ``position`` is the 0-based
    ordinal of the slice. The last slice may hold fewer than ``n`` items.
    """
    starts = range(0, len(l), n)
    for position, start in enumerate(starts):
        stop = start + n
        yield (l[start:stop], position)

def multiprocessing(strings, function, cores=6, returned=True):
    """Run ``function`` over batches of ``strings`` in a pool of worker processes.

    ``strings`` is cut into batches of ``len(strings) // cores`` items (at least
    one item per batch) by ``chunks``; each worker call receives one
    ``(batch, batch_index)`` tuple.

    Args:
        strings: Sized, sliceable collection of work items.
        function: Callable taking a single ``(batch, batch_index)`` tuple. When
            ``returned`` is true it must return an iterable.
        cores: Number of worker processes to start.
        returned: If true, flatten the per-batch results into one list and
            return it; otherwise return ``None``.

    Returns:
        A flat list of all per-batch results when ``returned`` is true,
        else ``None``.

    Note:
        This function's name shadows the standard-library ``multiprocessing``
        module inside this notebook.
    """
    # Nothing to distribute: skip starting any worker process.
    if len(strings) == 0:
        return [] if returned else None

    # Floor division gives 0 when there are fewer items than cores, and a zero
    # step makes range() inside chunks() raise ValueError; clamp to 1 instead.
    chunk_size = max(1, len(strings) // cores)
    df_split = chunks(strings, chunk_size)

    pool = Pool(cores)
    try:
        pooled = pool.map(function, df_split)
    except BaseException:
        # Do not leave worker processes running when a batch fails or the
        # run is interrupted.
        pool.terminate()
        raise
    else:
        pool.close()
    finally:
        pool.join()

    if returned:
        return list(itertools.chain.from_iterable(pooled))

# Language codes to export; each entry is passed to load_dataset() as the
# config name of "mozilla-foundation/common_voice_17_0" inside loop().
langs = ['ab', 'af', 'am', 'ar', 'as', 'ast', 'az', 'ba', 'bas', 'be', 'bg', 'bn', 'br', 'ca', 'ckb', 'cnh', 'cs', 'cv', 'cy', 'da', 'de', 'dv', 'dyu', 'el', 'en', 'eo', 'es', 'et', 'eu', 'fa', 'fi', 'fr', 'fy-NL', 'ga-IE', 'gl', 'gn', 'ha', 'he', 'hi', 'hsb', 'ht', 'hu', 'hy-AM', 'ia', 'id', 'ig', 'is', 'it', 'ja', 'ka', 'kab', 'kk', 'kmr', 'ko', 'ky', 'lg', 'lij', 'lo', 'lt', 'ltg', 'lv', 'mdf', 'mhr', 'mk', 'ml', 'mn', 'mr', 'mrj', 'mt', 'myv', 'nan-tw', 'ne-NP', 'nhi', 'nl', 'nn-NO', 'nso', 'oc', 'or', 'os', 'pa-IN', 'pl', 'ps', 'pt', 'quy', 'rm-sursilv', 'rm-vallader', 'ro', 'ru', 'rw', 'sah', 'sat', 'sc', 'sk', 'skr', 'sl', 'sq', 'sr', 'sv-SE', 'sw', 'ta', 'te', 'th', 'ti', 'tig', 'tk', 'tok', 'tr', 'tt', 'tw', 'ug', 'uk', 'ur', 'uz', 'vi', 'vot', 'yi', 'yo', 'yue', 'zgh', 'zh-CN', 'zh-HK', 'zh-TW', 'zu', 'zza']

In [26]:
# Upper bound on the number of clips written: loop() stops at 500 per language.
len(langs) * 500

62000

In [49]:
# WARNING: deletes all exported audio and the per-language JSON markers that
# loop() checks before skipping a language, so the next run starts from scratch.
!rm -rf common-voice common-voice-done

In [50]:
# common-voice holds the exported audio files; common-voice-done holds one
# JSON file per finished language (see loop()).
!mkdir common-voice
!mkdir common-voice-done

In [51]:
def loop(langs, max_clips=500):
    """Export Common Voice 17 training clips, with metadata, for a batch of languages.

    For every language in the batch, the ``train`` split of
    ``mozilla-foundation/common_voice_17_0`` is streamed. Rows whose ``age`` or
    ``gender`` value is shorter than two characters are skipped (presumably
    unlabelled speakers; confirm against the dataset card). Each kept clip is
    written to ``common-voice/<lang>_<index>.mp3``, and the remaining row fields
    plus a ``filename`` key are collected into
    ``common-voice-done/<lang>.json``.

    A language whose JSON file already exists is skipped, so the function can be
    re-run to resume. A language that fails is reported and left without a JSON
    file, so a later run retries it.

    Args:
        langs: ``(language_codes, batch_index)`` tuple as produced by
            ``chunks``; the batch index is ignored.
        max_clips: Stop after this many clips have been saved for a language.
            The check runs after each save, so at least one clip is kept
            whenever any row qualifies.
    """
    langs, _ = langs
    for lang in tqdm(langs):
        filename_done = os.path.join('common-voice-done', f'{lang}.json')
        if os.path.exists(filename_done):
            continue
        try:
            cv_17 = load_dataset("mozilla-foundation/common_voice_17_0", lang, split="train", streaming=True)
            data = []
            for row in cv_17:
                # `or ''` keeps a None label from raising and aborting the
                # whole language; string values are filtered as before.
                if len(row['age'] or '') < 2 or len(row['gender'] or '') < 2:
                    continue
                audio = row.pop('audio')
                clip_path = os.path.join('common-voice', f'{lang}_{len(data)}.mp3')
                sf.write(clip_path, audio['array'], audio['sampling_rate'])
                row['filename'] = clip_path
                data.append(row)
                if len(data) >= max_clips:
                    break
            # Write to a temp file and rename, so an interrupted dump never
            # leaves a truncated JSON that later runs would treat as "done".
            filename_tmp = filename_done + '.tmp'
            with open(filename_tmp, 'w') as fopen:
                json.dump(data, fopen)
            os.replace(filename_tmp, filename_done)
        except Exception as exc:
            # Best effort: keep going with the remaining languages, but report
            # the failure instead of hiding it. KeyboardInterrupt is no longer
            # swallowed.
            tqdm.write(f'[{lang}] export failed, no done-file written: {exc!r}')

In [52]:
# Smoke test: run the export in this process for the first language only.
# The second tuple element mimics the batch index from chunks(); loop() ignores it.
loop((langs[:1], 0))

  0%|                                                                                                   | 0/1 [00:00<?, ?it/s]
Reading metadata...: 0it [00:00, ?it/s][A
Reading metadata...: 1it [00:01,  1.80s/it][A
Reading metadata...: 21027it [00:03, 5576.27it/s][A
100%|███████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:36<00:00, 36.15s/it]


In [53]:
# Full run: spread all languages over 15 worker processes. loop() returns
# nothing, so returned=False skips flattening the per-batch results.
multiprocessing(langs, loop, cores = 15, returned = False)

  0%|                                                                                                   | 0/8 [00:00<?, ?it/s]
Reading metadata...: 0it [00:00, ?it/s][A
Reading metadata...: 0it [00:00, ?it/s][A
Reading metadata...: 0it [00:00, ?it/s][A
Reading metadata...: 0it [00:00, ?it/s][A
Reading metadata...: 1it [00:00,  3.28it/s][A
Reading metadata...: 776it [00:00, 2501.66it/s]

Reading metadata...: 0it [00:00, ?it/s][A
Reading metadata...: 0it [00:00, ?it/s][A
Reading metadata...: 0it [00:00, ?it/s][A
Reading metadata...: 0it [00:00, ?it/s][A
Reading metadata...: 1879it [00:00, 3321.72it/s]

Reading metadata...: 0it [00:00, ?it/s][A
Reading metadata...: 0it [00:00, ?it/s][A
Reading metadata...: 82it [00:00, 273.42it/s]A

Reading metadata...: 0it [00:00, ?it/s][A
Reading metadata...: 0it [00:00, ?it/s][A
Reading metadata...: 14it [00:00, 48.35it/s][A

Reading metadata...: 1259it [00:00, 1595.91it/s]

Reading metadata...: 1it [00:00,  1.44it/s][A
Reading metadata..

 12%|███████████▍                                                                               | 1/8 [00:40<04:40, 40.12s/it]
Reading metadata...: 2616it [00:00, 4943.83it/s]

 12%|███████████▍                                                                               | 1/8 [00:40<04:44, 40.60s/it]
Reading metadata...: 671862it [00:36, 23796.06it/s][A
Reading metadata...: 686549it [00:36, 32715.45it/s][A
Reading metadata...: 0it [00:00, ?it/s][A
Reading metadata...: 693622it [00:37, 21988.08it/s][A
Reading metadata...: 1it [00:00,  1.26it/s][A
Reading metadata...: 703053it [00:37, 27421.01it/s][A
Reading metadata...: 0it [00:00, ?it/s][A
Reading metadata...: 0it [00:00, ?it/s][A
Reading metadata...: 709535it [00:37, 20178.33it/s][A
Reading metadata...: 11975it [00:01, 8963.79it/s][A
Reading metadata...: 7253it [00:00, 8950.37it/s]

Reading metadata...: 2298it [00:00, 5458.32it/s]

Reading metadata...: 723850it [00:38, 19544.20it/s][A
Reading metadata...: 738966it [00:38,

 38%|██████████████████████████████████▏                                                        | 3/8 [01:14<02:09, 25.88s/it]
Reading metadata...: 1022787it [01:10, 22286.90it/s][A
Reading metadata...: 25159it [00:02, 11372.75it/s][A

Reading metadata...: 1038466it [01:10, 24054.32it/s][A
Reading metadata...: 0it [00:00, ?it/s][A
Reading metadata...: 1053922it [01:11, 21542.05it/s][A
Reading metadata...: 96it [00:00, 332.45it/s]A

 25%|██████████████████████▊                                                                    | 2/8 [01:17<03:49, 38.31s/it]
Reading metadata...: 0it [00:00, ?it/s][A
 25%|██████████████████████▊                                                                    | 2/8 [01:17<03:52, 38.77s/it]
Reading metadata...: 2215it [00:00, 2812.41it/s]

Reading metadata...: 1083483it [01:13, 19389.70it/s][A
Reading metadata...: 1101170it [01:13, 14908.70it/s][A

Reading metadata...: 3681it [00:00, 3760.80it/s]
 25%|██████████████████████▊                       

 62%|████████████████████████████████████████████████████████▉                                  | 5/8 [02:39<01:28, 29.43s/it]
Reading metadata...: 137806it [00:07, 17418.52it/s][A
Reading metadata...: 0it [00:00, ?it/s][A
Reading metadata...: 1it [00:00,  2.56it/s][A
Reading metadata...: 5148it [00:00, 12059.86it/s]s][A

Reading metadata...: 162348it [00:08, 22948.14it/s][A'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: f2fc0d4a-8761-4e04-a1b5-9e4ffd0e891c)')' thrown while requesting GET https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0/resolve/main/audio/ast/train/ast_train_0.tar
Retrying in 1s [Retry 1/5].

Reading metadata...: 0it [00:00, ?it/s][A
Reading metadata...: 0it [00:00, ?it/s][A
Reading metadata...: 315it [00:00, 1149.74it/s]

Reading metadata...: 169887it [00:09, 16358.84it/s][A
Reading metadata...: 1it [00:00,  1.11it/s][A
Reading metadata...: 178556it [00:10, 12217.

 75%|████████████████████████████████████████████████████████████████████▎                      | 6/8 [03:27<01:07, 33.78s/it]
 50%|█████████████████████████████████████████████▌                                             | 2/4 [00:04<00:04,  2.07s/it]
Reading metadata...: 1it [00:00,  1.38it/s][A
 75%|████████████████████████████████████████████████████████████████████▎                      | 6/8 [03:30<01:15, 37.99s/it]
Reading metadata...: 0it [00:00, ?it/s][A
Reading metadata...: 4689it [00:01, 3156.06it/s]
 75%|████████████████████████████████████████████████████████████████████▎                      | 3/4 [00:07<00:02,  2.73s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:09<00:00,  2.39s/it]

Reading metadata...: 5141it [00:01, 4422.90it/s]█████████████▉                                  | 5/8 [03:34<02:30, 50.29s/it]

Reading metadata...: 0it [00:00, ?it/s][A
Reading metadata...: 9it [00:00, 33.19it/s][A
 75%|██

Reading metadata...: 422923it [00:16, 29412.47it/s][A
Reading metadata...: 429366it [00:17, 24166.10it/s][A
Reading metadata...: 442057it [00:17, 23896.56it/s][A
Reading metadata...: 458820it [00:18, 24117.43it/s][A
Reading metadata...: 475458it [00:18, 26191.93it/s][A
Reading metadata...: 490897it [00:18, 35376.08it/s][A
Reading metadata...: 497960it [00:19, 28110.77it/s][A
 38%|██████████████████████████████████▏                                                        | 3/8 [04:42<07:14, 86.94s/it]
Reading metadata...: 525211it [00:20, 21520.51it/s][A
Reading metadata...: 538272it [00:21, 28581.86it/s][A
100%|███████████████████████████████████████████████████████████████████████████████████████████| 8/8 [04:44<00:00, 35.51s/it]

Reading metadata...: 558265it [00:22, 22442.65it/s][A
Reading metadata...: 574792it [00:23, 19054.03it/s][A
Reading metadata...: 591367it [00:23, 22115.11it/s][A
Reading metadata...: 0it [00:00, ?it/s][A
Reading metadata...: 607964it [00:24, 2155

100%|███████████████████████████████████████████████████████████████████████████████████████████| 8/8 [08:43<00:00, 65.40s/it]
