In [16]:
from glob import glob
from tqdm import tqdm
import soundfile as sf
import librosa
import os
import json
from multiprocess import Pool
import itertools

def chunks(l, n):
    for i in range(0, len(l), n):
        yield (l[i: i + n], i // n)

def multiprocessing(strings, function, cores=6, returned=True):
    df_split = chunks(strings, len(strings) // cores)
    pool = Pool(cores)
    pooled = pool.map(function, df_split)
    pool.close()
    pool.join()

    if returned:
        return list(itertools.chain(*pooled))

In [2]:
from datasets import load_dataset

ds = load_dataset("malaysia-ai/common_voice_17_0")

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
train = ds['train'].to_pandas()

In [33]:
from tqdm import tqdm
import string
import re
import pandas as pd

punct = set('!"#$%&()*+,./:;<=>?@[\\]^_`{|}~')
digits = set(string.digits)

mapping = {
    '‘': '\'',
    '“': '"',
    '”': '"',
    '–': '-',
    '—': '-',
    '’': '\'',
    '\t': '',
    '\n': '',
    '…': ' ',
}

def loop(indices):
    indices, _ = indices

    rows = pd.read_parquet('train-common-voice.parquet')
    
    chars = set()
    for i in tqdm(indices):
        s = rows.iloc[i]['sentence']
        if not isinstance(s, str):
            continue
        if len(set(s) & digits):
            continue
        t = s.lower()
        for k, v in mapping.items():
            t = t.replace(k, v)
        t = [c for c in t if c not in punct]
        t = re.sub(r'[ ]+', ' ', ''.join(t)).strip()
        chars.update(t)
    return list(chars)

In [29]:
chars = loop((range(1000), 0))

100%|██████████| 1000/1000 [00:00<00:00, 7145.94it/s]


In [34]:
chars = multiprocessing(range(len(train)), loop, cores = 10)

100%|██████████| 668955/668955 [01:33<00:00, 7129.42it/s] 
100%|██████████| 668955/668955 [01:34<00:00, 7095.53it/s] 
100%|██████████| 668955/668955 [01:36<00:00, 6938.11it/s]
100%|██████████| 668955/668955 [01:38<00:00, 6767.53it/s] 
100%|██████████| 668955/668955 [01:36<00:00, 6901.58it/s] 
100%|██████████| 668955/668955 [01:37<00:00, 6882.52it/s] 
100%|██████████| 668955/668955 [01:36<00:00, 6944.10it/s] 
100%|██████████| 668955/668955 [01:36<00:00, 6965.13it/s]
100%|██████████| 668955/668955 [01:39<00:00, 6691.20it/s]
100%|██████████| 668955/668955 [01:44<00:00, 6382.85it/s]


In [35]:
len(chars)

14585

In [38]:
with open('cv-17-vocab.json', 'w') as fopen:
    json.dump(list(set(chars)), fopen)

In [43]:
vocab = list(set(chars))

In [44]:
vocab = ['BLANK', 'PAD', 'UNK'] + vocab
vocab = {c: no for no, c in enumerate(vocab)}
rev_vocab = {v: k for k, v in vocab.items()}
pad_id = 1

In [47]:
data = train.iloc[0]
# y, sr = librosa.load(data['audio_filename'], sr = 16000)
s = data['sentence']
t = s.lower()
for k, v in mapping.items():
    t = t.replace(k, v)
t = [c for c in t if c not in punct]
t = re.sub(r'[ ]+', ' ', ''.join(t)).strip()
label = [vocab[c] for c in t]

In [51]:
import torch

torch.tensor(label)

tensor([7265, 5108, 4997, 5086, 7474, 5108, 6455, 2015, 5108, 6314, 3123, 3123,
        5108, 4997, 5086, 7474, 5108, 4997, 7474, 5108, 4997, 2015, 4997, 3123,
        6455, 7474, 5108, 5237, 1867, 5656, 7474, 4575, 5037, 1421, 5108, 7474,
        5108, 4162, 3123, 4079, 3123, 5108,  614, 7474])

In [57]:
def loop(indices):
    indices, _ = indices

    rows = pd.read_parquet('train-common-voice.parquet')
    
    filtered = []
    for i in tqdm(indices):
        s = rows.iloc[i]['sentence']
        if not isinstance(s, str):
            continue
        if len(s) < 3:
            continue
        if len(set(s) & digits):
            continue
        filtered.append(rows.iloc[i].to_dict())
    return filtered

In [55]:
filtered = loop((range(100), 0))

100%|██████████| 100/100 [00:00<00:00, 3162.10it/s]


In [58]:
filtered = multiprocessing(range(len(train)), loop, cores = 20)

100%|██████████| 334477/334477 [00:58<00:00, 5740.76it/s] 
100%|██████████| 334477/334477 [00:53<00:00, 6205.09it/s] 
100%|██████████| 334477/334477 [00:57<00:00, 5816.36it/s] 
100%|██████████| 334477/334477 [00:55<00:00, 6000.63it/s] 
100%|██████████| 334477/334477 [00:54<00:00, 6134.66it/s] 
100%|██████████| 334477/334477 [00:54<00:00, 6193.13it/s] 
100%|██████████| 334477/334477 [00:55<00:00, 6049.01it/s] 
100%|██████████| 334477/334477 [00:53<00:00, 6204.85it/s] 
100%|██████████| 334477/334477 [00:54<00:00, 6136.69it/s] 
100%|██████████| 334477/334477 [00:53<00:00, 6293.79it/s] 
100%|██████████| 334477/334477 [00:53<00:00, 6269.93it/s] 
100%|██████████| 334477/334477 [00:54<00:00, 6191.85it/s] 
100%|██████████| 334477/334477 [00:55<00:00, 6046.37it/s] 
100%|██████████| 334477/334477 [00:52<00:00, 6358.25it/s] 
100%|██████████| 334477/334477 [00:53<00:00, 6310.41it/s] 
100%|██████████| 334477/334477 [00:52<00:00, 6393.52it/s] 
100%|██████████| 334477/334477 [00:53<00:00, 6200.35it/s

In [59]:
len(filtered)

6688299

In [62]:
pd.DataFrame(filtered).to_parquet('filtered-train-cv-17.parquet')