In [1]:
from glob import glob
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer
import json
import os
import mp
import re
import subprocess
import soundfile as sf

In [2]:
# Collect every transcript JSON under the processed corpus.
# glob returns matches in arbitrary (filesystem-dependent) order, so sort them to
# make `files[-100:]` and the multiprocessing split reproducible across runs.
files = sorted(glob('filtered-24k_processed/**/*/*.json', recursive = True))
# files.extend(glob('/home/husein/ssd3/dialects_processed/**/*/*.json', recursive = True))

len(files)

103307

In [3]:
# Boilerplate transcripts to drop. `loop` lowercases each transcript, strips every
# character other than a-z and space, and skips it when the result exactly equals
# one of these phrases.
rejected = [
    'terima kasih kerana menonton',
    'terima kasih',
    'thank you for watching',
]

In [4]:
# Alignment words with `score <= threshold` are treated as rejected: `chunk` splits
# on them, and `loop` only processes utterances that contain at least one such word.
threshold = -18

def new_path(f):
    """Return the alignment path for an audio path.

    Every `.mp3` becomes `.alignment` and every `_processed/` becomes
    `_processed_alignment/`; paths containing neither are returned unchanged.
    """
    return f.replace('.mp3', '.alignment').replace('_processed/', '_processed_alignment/')

def new_path_lang(f):
    """Return the language-file path for an audio path.

    Every `.mp3` becomes `.language` and every `_processed/` becomes
    `_processed_language/`; paths containing neither are returned unchanged.
    """
    return f.replace('.mp3', '.language').replace('_processed/', '_processed_language/')

def chunk(alignment, reject = None, minimum_length = 2.0):
    """
    Split a word-level alignment into runs of consecutive accepted words.

    A word is rejected when its `score` is `<= reject`. A rejected word closes the
    current run, and the run's last word gets its `end` stretched to the rejected
    word's `start`. Runs whose span (`last end - first start`) is shorter than
    `minimum_length` are dropped.

    Parameters
    ----------
    alignment : list of dict
        Words in order; each dict is read for its 'start', 'end' and 'score' keys.
    reject : number, optional
        Rejection cut-off. When None (the default) the module-level `threshold`
        is looked up at call time, so later changes to `threshold` are honoured.
    minimum_length : number
        Minimum span of a run, in the same units as 'start'/'end'
        (presumably seconds - confirm against the aligner output).

    Returns
    -------
    list of list of dict
        The kept runs. Entries are the caller's own dicts, except for a stretched
        last word, which is a shallow copy so `alignment` itself is never modified.
    """
    if reject is None:
        reject = threshold

    def long_enough(run):
        return (run[-1]['end'] - run[0]['start']) >= minimum_length

    alls, temp = [], []
    for a in alignment:
        if a['score'] <= reject:
            if len(temp):
                # Copy before stretching so the caller's alignment stays intact.
                temp[-1] = {**temp[-1], 'end': a['start']}
                if long_enough(temp):
                    alls.append(temp)
                temp = []
        else:
            temp.append(a)

    # Trailing run that was not closed by a rejected word.
    if len(temp) and long_enough(temp):
        alls.append(temp)
    return alls
        
def clean(string):
    """Normalise text to lowercase ASCII letters separated by single spaces.

    Every run of characters other than A-Z, a-z and space is replaced by a space,
    then surrounding and repeated spaces are removed.
    """
    letters_only = re.sub('[^A-Za-z ]+', ' ', string)
    # Only letters and spaces remain, so split/join collapses the space runs and trims.
    return ' '.join(letters_only.split()).lower()

def detect_extra(word):
    """Return True when `word`, after `clean`, is one of the listed filler tokens."""
    fillers = {
        'eh', 'ehh', 'oh', 'ohh', 'uhm', 'uhmm',
        'um', 'em', 'emm', 'ah', 'ha', 'ok', 'okay',
        'so', 'yes', 'no', 'aa', 'uh', 'ye', 'haa',
        'oi', 'ya', 'leh', 'lah', 'haiya', 'hoi', 'haha', 'hahaha',
        'then', 'it s',
    }
    return clean(word) in fillers

from collections import defaultdict

def generate_trigrams(text):
    """Return all consecutive word triples of `text` (split on whitespace) as a list
    of 3-tuples; texts with fewer than three words give an empty list."""
    tokens = text.split()
    return [
        (tokens[i], tokens[i + 1], tokens[i + 2])
        for i in range(len(tokens) - 2)
    ]

def skip_trigrams(text):
    """
    Decide whether `text` looks degenerate based on its word trigrams.

    Returns True when the text has fewer than three distinct trigrams, or when any
    single trigram accounts for more than 20% of all trigrams; otherwise False.

    Trigrams are counted as tuples. Concatenating the words without a separator
    would make distinct trigrams such as ('ab', 'c', 'd') and ('a', 'bc', 'd')
    share one key and inflate their counts.
    """
    from collections import Counter

    trigrams = generate_trigrams(text)
    count = Counter(trigrams)
    total = len(trigrams)
    # Also covers total == 0, so the division below never sees a zero denominator.
    if len(count) < 3:
        return True
    return any((v / total) > 0.2 for v in count.values())

In [5]:
# Start from an empty output directory. This deletes any chunks written by a
# previous run, so the `os.path.exists` check in `loop` only skips files written
# after this cell was last executed.
!rm -rf filtered-24k-chunk
!mkdir filtered-24k-chunk

In [6]:
def loop(files):
    """
    Worker: cut well-aligned audio chunks out of utterances that contain at least
    one poorly aligned word.

    Parameters
    ----------
    files : tuple
        `(paths, _)`: a list of transcript JSON paths plus a second element that is
        ignored (the shape `mp.multiprocessing` is called with in this notebook).

    For every JSON file, each entry `obj` at index `no` is processed as follows:

    1. Skip it when its normalised text equals an entry of `rejected`.
    2. Skip it when any word 3-gram found by `CountVectorizer` occurs more than
       3 times, or when the vectorizer raises (presumably for texts too short to
       yield a 3-gram - confirm).
    3. Require `<folder>/<folder name>_<no>.mp3` and its `new_path` alignment file.
    4. Set the score of filler words (`detect_extra`) to 0.0, then skip the
       utterance unless some word still has `score <= threshold`.
    5. Split the alignment with `chunk`. Drop a chunk when it covers every word,
       when more than half of its words have text of length <= 1 (these are
       printed), when any word lasts >= 2, or when any gap between neighbouring
       words is >= 2.
    6. Write the audio slice to `filtered-24k-chunk/` unless that file already
       exists, and record it.

    Returns
    -------
    list of dict
        One `{'audio': output path, 'transcription': joined chunk words}` per
        kept chunk.
    """
    files, _ = files
    data = []
    for file in tqdm(files):
        folder = os.path.split(file)[0]
        folder_folder = os.path.split(folder)[1]

        try:
            with open(file) as fopen:
                d = json.load(fopen)
        except Exception:
            # Unreadable or malformed transcript: skip the file. `Exception` rather
            # than a bare except so KeyboardInterrupt/SystemExit still stop the worker.
            continue

        for no, obj in enumerate(d):
            text = obj["text"].strip()

            rt_ = re.sub('[^a-z ]+', '', text.lower()).strip()
            if rt_ in rejected:
                continue

            try:
                dense = CountVectorizer(ngram_range = (3,3)).fit_transform([text]).todense()
                repeat = (dense > 3).sum() >= 1
                if repeat:
                    continue
            except Exception:
                # Best-effort: texts the vectorizer cannot handle are skipped.
                continue

            audio_path = os.path.join(folder, f'{folder_folder}_{no}.mp3')
            if not os.path.exists(audio_path):
                continue

            align_path = new_path(audio_path)
            if not os.path.exists(align_path):
                continue

            with open(align_path) as fopen:
                align = json.load(fopen)

            # Filler words get a neutral score of 0.0.
            for word in align:
                if detect_extra(word['text']):
                    word['score'] = 0.0

            # Only utterances with at least one rejected word are chunked.
            if not any(word['score'] <= threshold for word in align):
                continue

            chunks = chunk(align)
            if not len(chunks):
                continue

            y, sr = sf.read(audio_path)
            # `chunk_no` is kept distinct from the utterance index `no` above.
            for chunk_no, c in enumerate(chunks):
                if len(c) == len(align):
                    continue
                try:
                    # Mostly single-character "words" (spelled-out letters, stray
                    # symbols): print for inspection and drop.
                    words = [c_['text'] for c_ in c if len(c_['text']) <= 1]
                    if (len(words) / len(c)) > 0.5:
                        print(c)
                        continue

                    # A single word stretched over >= 2 is dropped.
                    if any((c_['end'] - c_['start']) >= 2 for c_ in c):
                        continue

                    # A gap of >= 2 between neighbouring words is dropped.
                    if any((c[i]['start'] - c[i - 1]['end']) >= 2 for i in range(1, len(c))):
                        continue

                    t = ' '.join([c_['text'] for c_ in c])
                    start = c[0]['start']
                    end = c[-1]['end']

                    out_path = audio_path.replace('/', '_').replace('.mp3', '')
                    out_path = os.path.join('filtered-24k-chunk', f'{out_path}_{chunk_no}.mp3')
                    if not os.path.exists(out_path):
                        sf.write(out_path, y[int(sr * start): int(sr * end)], sr)

                    data.append({
                        'audio': out_path,
                        'transcription': t,
                    })
                except Exception:
                    # Best-effort: a chunk that fails to slice or write is skipped.
                    continue

    return data

In [7]:
# Try the worker on the last 100 files before the full run; `loop` expects a
# 2-tuple and ignores the second element.
d = loop((files[-100:], 0))

100%|███████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:45<00:00,  2.18it/s]


In [8]:
# Number of chunk records returned for the 100-file sample.
len(d)

757

In [9]:
# Inspect one sample record (output audio path and joined transcription).
d[-3]

{'audio': 'filtered-24k-chunk/filtered-24k_processed_01850-8_01850-8_76_1.mp3',
 'transcription': 'ini, uh, isyrak yang terakhir saya akan bagi, uh,'}

In [10]:
import IPython.display as ipd
# Render an audio player for the chunk file of the same sample record, d[-3].
ipd.Audio(d[-3]['audio'])

In [11]:
# Full run of `loop` over all files with cores = 5, via the local `mp` helper.
# NOTE(review): `mp.multiprocessing` is not visible in this notebook; presumably it
# splits `files`, calls `loop((part, index))` per worker and concatenates the
# returned lists - confirm.
data = mp.multiprocessing(files, loop, cores = 5)

  1%|▍                                                                                  | 116/20661 [02:04<6:54:02,  1.21s/it]

[{'start': 3.48, 'end': 3.52, 'text': '颜', 'score': -13.02734375}, {'start': 3.56, 'end': 3.62, 'text': '颜', 'score': -8.6419677734375}, {'start': 3.66, 'end': 3.7, 'text': '颜', 'score': -14.27734375}, {'start': 3.74, 'end': 3.78, 'text': '颜', 'score': -12.08984375}, {'start': 3.82, 'end': 3.86, 'text': '颜', 'score': -17.04296875}, {'start': 3.92, 'end': 4.02, 'text': '颜', 'score': -9.484573364257812}, {'start': 4.06, 'end': 4.1, 'text': '颜', 'score': -5.61328125}, {'start': 4.14, 'end': 4.22, 'text': '颜', 'score': -8.966827392578125}, {'start': 4.26, 'end': 4.36, 'text': '颜', 'score': -8.90606689453125}, {'start': 4.4, 'end': 4.46, 'text': '颜', 'score': -11.209228515625}, {'start': 4.52, 'end': 4.6, 'text': '颜', 'score': -9.453086853027344}, {'start': 4.66, 'end': 4.74, 'text': '颜', 'score': -9.673635482788086}, {'start': 4.78, 'end': 4.82, 'text': '颜', 'score': -16.50390625}, {'start': 4.86, 'end': 4.94, 'text': '颜', 'score': -6.964752197265625}, {'start': 4.98, 'end': 5.02, 'text': 

 21%|█████████████████▍                                                                | 4397/20661 [44:33<2:42:52,  1.66it/s]

[{'start': 0.32, 'end': 0.32, 'text': 'S', 'score': 0.0}, {'start': 0.94, 'end': 0.94, 'text': 'O', 'score': 0.0}, {'start': 1.32, 'end': 1.32, 'text': 'U', 'score': 0.0}, {'start': 2.02, 'end': 2.02, 'text': 'F', 'score': 0.0}, {'start': 2.42, 'end': 2.42, 'text': 'F', 'score': 0.0}, {'start': 2.82, 'end': 2.82, 'text': 'L', 'score': 0.0}, {'start': 2.96, 'end': 2.96, 'text': 'E.', 'score': 0.0}, {'start': 3.34, 'end': 3.76, 'text': 'Pinky.', 'score': -8.088783264160156}, {'start': 4.86, 'end': 4.98, 'text': 'Dia', 'score': -1.1874074935913086}]


 53%|█████████████████████████████████████████▋                                     | 10888/20661 [1:31:33<1:30:10,  1.81it/s]

[{'start': 0.08, 'end': 0.2, 'text': 'And', 'score': -8.619319915771484}, {'start': 0.32, 'end': 0.76, 'text': 'business,', 'score': -11.76527214050293}, {'start': 1.04, 'end': 1.04, 'text': 'M', 'score': 0.0}, {'start': 1.18, 'end': 1.18, 'text': 'G', 'score': 0.0}, {'start': 1.72, 'end': 1.72, 'text': 'S', 'score': 0.0}, {'start': 1.9, 'end': 1.9, 'text': 'E', 'score': 0.0}, {'start': 2.08, 'end': 2.4, 'text': 'B,', 'score': 0.0}]


 63%|█████████████████████████████████████████████████▋                             | 12987/20661 [1:40:17<1:00:23,  2.12it/s]

[{'start': 0.08, 'end': 0.08, 'text': 'I', 'score': 0.0}, {'start': 0.14, 'end': 0.14, 'text': 'P', 'score': 0.0}, {'start': 0.3, 'end': 0.3, 'text': 'T', 'score': 0.0}, {'start': 0.52, 'end': 0.52, 'text': 'A', 'score': 0.0}, {'start': 0.7, 'end': 0.88, 'text': 'atau', 'score': -0.19950592517852783}, {'start': 1.02, 'end': 1.02, 'text': 'I', 'score': 0.0}, {'start': 1.1, 'end': 1.1, 'text': 'P', 'score': 0.0}, {'start': 1.22, 'end': 1.22, 'text': 'T', 'score': 0.0}, {'start': 1.56, 'end': 1.56, 'text': 'S', 'score': 0.0}, {'start': 1.8, 'end': 1.96, 'text': 'untuk', 'score': -0.46672284603118896}, {'start': 2.02, 'end': 2.22, 'text': 'pulang', 'score': -0.0760965347290039}, {'start': 2.26, 'end': 2.3, 'text': 'ke', 'score': -0.02320098876953125}, {'start': 2.38, 'end': 2.76, 'text': 'kampung', 'score': -0.3110448122024536}, {'start': 2.86, 'end': 2.96, 'text': 'dan', 'score': -0.013359904289245605}, {'start': 3.14, 'end': 3.94, 'text': 'berkemungkinan', 'score': -2.1175222396850586}]


 64%|██████████████████████████████████████████████████▌                            | 13238/20661 [1:48:36<1:08:13,  1.81it/s]

[{'start': 5.8, 'end': 8.48, 'text': '.', 'score': 0.0}]


 72%|██████████████████████████████████████████████████████████                       | 14812/20661 [1:51:38<30:13,  3.23it/s]

[{'start': 4.0, 'end': 4.94, 'text': 'rupiah.', 'score': -1.654960036277771}, {'start': 5.02, 'end': 5.02, 'text': 'R', 'score': 0.0}, {'start': 5.22, 'end': 5.22, 'text': 'U', 'score': 0.0}, {'start': 5.38, 'end': 5.38, 'text': 'P', 'score': 0.0}, {'start': 5.46, 'end': 5.46, 'text': 'E', 'score': 0.0}, {'start': 5.64, 'end': 5.64, 'text': 'E.', 'score': 0.0}, {'start': 6.54, 'end': 7.18, 'text': 'India.', 'score': -0.18845820426940918}]


100%|█████████████████████████████████████████████████████████████████████████████████| 20661/20661 [2:27:55<00:00,  2.33it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 992.85it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 20661/20661 [2:31:09<00:00,  2.28it/s]
 97%|██████████████████████████████████████████████████████████████████████████████▎  | 19986/20661 [2:32:14<05:12,  2.16it/s]

[{'start': 2.2, 'end': 2.24, 'text': 'got', 'score': -17.3125}, {'start': 2.28, 'end': 2.34, 'text': 'any', 'score': -7.719001770019531}, {'start': 2.38, 'end': 2.38, 'text': 'k', 'score': 0.0}, {'start': 2.42, 'end': 2.42, 'text': 'k', 'score': 0.0}, {'start': 2.46, 'end': 2.46, 'text': 'k', 'score': 0.0}, {'start': 2.5, 'end': 2.5, 'text': 'k', 'score': 0.0}, {'start': 2.54, 'end': 2.54, 'text': 'k', 'score': 0.0}, {'start': 2.58, 'end': 2.58, 'text': 'k', 'score': 0.0}, {'start': 2.62, 'end': 2.62, 'text': 'k', 'score': 0.0}, {'start': 2.66, 'end': 2.66, 'text': 'k', 'score': 0.0}, {'start': 2.7, 'end': 2.7, 'text': 'k', 'score': 0.0}, {'start': 2.74, 'end': 2.74, 'text': 'k', 'score': 0.0}, {'start': 2.78, 'end': 2.78, 'text': 'k', 'score': 0.0}, {'start': 2.82, 'end': 2.82, 'text': 'k', 'score': 0.0}, {'start': 2.86, 'end': 2.86, 'text': 'k', 'score': 0.0}, {'start': 2.9, 'end': 2.9, 'text': 'k', 'score': 0.0}, {'start': 2.94, 'end': 2.94, 'text': 'k', 'score': 0.0}, {'start': 2.9




100%|█████████████████████████████████████████████████████████████████████████████████| 20661/20661 [2:34:06<00:00,  2.23it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 20661/20661 [2:34:41<00:00,  2.23it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 20661/20661 [2:36:30<00:00,  2.20it/s]


In [12]:
# Total number of chunk records produced by the full run.
len(data)

912616

In [13]:
from collections import defaultdict

# Count chunks per source prefix: the part of each output path before '_processed'.
# The loop variable is deliberately not `d`, because that name holds the sample
# results of `loop((files[-100:], 0))` which other cells index (`d[-3]`).
uniques = defaultdict(int)
for row in tqdm(data):
    uniques[row['audio'].split('_processed')[0]] += 1

uniques

100%|████████████████████████████████████████████████████████████████████████████| 912616/912616 [00:00<00:00, 4131758.26it/s]


defaultdict(int, {'filtered-24k-chunk/filtered-24k': 912616})

In [14]:
import pandas as pd

# One row per chunk record, with the 'audio' and 'transcription' keys that `loop`
# emits as columns.
df = pd.DataFrame(data)
df.head()

Unnamed: 0,audio,transcription
0,filtered-24k-chunk/filtered-24k_processed_0029...,that Mei Bang manage to make positive profit a...
1,filtered-24k-chunk/filtered-24k_processed_0029...,"Uh, lost making for like ten, ten years"
2,filtered-24k-chunk/filtered-24k_processed_0029...,a good time for them to exit. But back then I
3,filtered-24k-chunk/filtered-24k_processed_0029...,"read the signs, ah. I was thinking that, okay ..."
4,filtered-24k-chunk/filtered-24k_processed_0029...,"Yeah. So, they were exiting while I was enteri..."


In [15]:
# Persist the chunk table; this file is what the upload cell pushes to the Hub.
df.to_parquet('verify-text-chunk-filtered-24k.parquet')

In [16]:
from huggingface_hub import HfApi
# Upload the parquet file into the dataset repo under data/.
# NOTE(review): no token is passed to HfApi() here, so authentication presumably
# comes from a cached login or an environment variable - confirm before re-running.
api = HfApi()
api.upload_file(
    path_or_fileobj="verify-text-chunk-filtered-24k.parquet",
    path_in_repo="data/filtered_24k-00000-of-00001.parquet",
    repo_id="mesolitica/Malaysian-Voice-Conversion",
    repo_type="dataset",
)

verify-text-chunk-filtered-24k.parquet:   0%|          | 0.00/69.4M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/mesolitica/Malaysian-Voice-Conversion/commit/660b6e1fb0adb90ef05ae511eee284e621d1761c', commit_message='Upload data/filtered_24k-00000-of-00001.parquet with huggingface_hub', commit_description='', oid='660b6e1fb0adb90ef05ae511eee284e621d1761c', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/mesolitica/Malaysian-Voice-Conversion', endpoint='https://huggingface.co', repo_type='dataset', repo_id='mesolitica/Malaysian-Voice-Conversion'), pr_revision=None, pr_num=None)