In [1]:
from glob import glob
from sklearn.feature_extraction.text import CountVectorizer
import json
import os
import re
import itertools
import random
from tqdm import tqdm
from multiprocess import Pool
from collections import defaultdict

def generate_trigrams(text):
    words = text.split()
    return list(zip(words, words[1:], words[2:]))

def skip_trigrams(text):
    trigrams = generate_trigrams(text)
    count = defaultdict(int)
    total = 0
    for t in trigrams:
        count[''.join(t)] += 1
        total += 1
    if len(count.keys()) < 3:
        return True
    for k, v in count.items():
        if (v / total) > 0.2:
            return True
    return False

def chunks(l, n):
    for i in range(0, len(l), n):
        yield (l[i: i + n], i // n)

def multiprocessing(strings, function, cores=6, returned=True):
    df_split = chunks(strings, len(strings) // cores)
    pool = Pool(cores)
    pooled = pool.map(function, df_split)
    pool.close()
    pool.join()

    if returned:
        return list(itertools.chain(*pooled))



In [2]:
files = glob('*/**/*.json', recursive = True)
len(files)

311725

In [16]:
rejected = [
    'terima kasih kerana menonton',
    'terima kasih',
    'thank you for watching',
]

def new_path(f):
    return f.replace('_processed/', '_processed_trim_moshi/').replace('.mp3', '.moshi')

def loop(files):
    files, _ = files
    data = []
    for file in tqdm(files):
        folder = os.path.split(file)[0]
        folder_folder = os.path.split(folder)[1]

        try:
            with open(file) as fopen:
                d = json.load(fopen)
        except:
            continue
            
        speakers = defaultdict(dict)

        for no, obj in enumerate(d):
            text = obj["text"].strip()
            
            rt_ = re.sub('[^a-z ]+', '', text.lower()).strip()
            if any([s == rt_ for s in rejected]):
                continue
                
            split = text.split()
            ones = [w for w in split if len(w) <= 1]
            if (len(ones) / len(split)) >= 0.5:
                continue
                
            if any([(len(set(w)) / len(w)) < 0.3 for w in split]):
                continue
            
            try:
                dense = CountVectorizer(ngram_range = (3,3)).fit_transform([text]).todense()
                repeat = (dense > 3).sum() >= 1
                if repeat:
                    continue
            except:
                continue
            
            audio_path = os.path.join(folder, f'{folder_folder}_{no}.mp3')
            
            if not os.path.exists(audio_path):
                continue
                
            if not os.path.exists(new_path(audio_path)):
                continue
            
            speakers[obj['speaker']][no] = {
                'audio': audio_path,
                'transcription': text,
            }
        
        for speaker in speakers.keys():
            data_ = []
            for row in speakers[speaker]:
                for row_ in speakers[speaker]:
                    if row == row_:
                        continue

                    data_.append({
                        'reference_audio': speakers[speaker][row]['audio'],
                        'reference_text': speakers[speaker][row]['transcription'],
                        'target_audio': speakers[speaker][row_]['audio'],
                        'target_text': speakers[speaker][row_]['transcription'],
                    })

            data.extend(random.sample(data_, min(len(data_), 30)))
        
    return data

In [17]:
r = loop((files[:10], 0))
len(r)

100%|██████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 173.78it/s]


66

In [6]:
import IPython.display as ipd
ipd.Audio(r[-1]['reference_audio'])

In [None]:
data = multiprocessing(files, loop, cores = 20)

100%|████████████████████████████████████████████████████████████████████████████████| 15586/15586 [01:04<00:00, 242.13it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 15586/15586 [01:04<00:00, 240.16it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 15586/15586 [01:06<00:00, 235.91it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 15586/15586 [01:06<00:00, 235.55it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 15586/15586 [01:06<00:00, 233.14it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 703.22it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 15586/15586 [01:07<00:00, 230.24it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 15586/15586 [01:07<00:00, 230.02it/s]


In [None]:
len(data)

In [None]:
import pandas as pd

pd.DataFrame(data).to_parquet('permutate.parquet')

In [None]:
from datasets import load_dataset

dataset = load_dataset("parquet", data_files={'train': 'permutate.parquet'})

In [None]:
dataset.push_to_hub('mesolitica/Malaysian-Emilia-Sesame', private = True)