In [1]:
from pydub import AudioSegment
from pydub.silence import split_on_silence
from glob import glob
import soundfile as sf
import numpy as np
import os
from multiprocess import Pool
import itertools
from tqdm import tqdm

def chunks(l, n):
    """Yield consecutive slices of ``l`` together with their ordinal position.

    Each yielded item is a tuple ``(piece, index)`` where ``piece`` is
    ``l[start:start + n]`` and ``index`` counts the pieces from zero.
    The last piece is shorter than ``n`` when ``len(l)`` is not a multiple of ``n``.
    """
    starts = range(0, len(l), n)
    for index, start in enumerate(starts):
        yield l[start:start + n], index

def multiprocessing(strings, function, cores=6, returned=True):
    """Apply ``function`` to slices of ``strings`` using a pool of worker processes.

    ``strings`` is cut into consecutive slices of ``len(strings) // cores`` items
    (never fewer than one item per slice). Each worker call receives one
    ``(slice, slice_index)`` tuple as produced by ``chunks``.

    Parameters
    ----------
    strings : sequence
        Items to distribute; must support ``len`` and slicing.
    function : callable
        Called once per ``(slice, slice_index)`` tuple. When ``returned`` is true
        it must return an iterable, because the per-slice results are chained.
    cores : int
        Number of worker processes in the pool.
    returned : bool
        If true, return the per-slice results flattened into one list;
        otherwise return ``None``.
    """
    if len(strings) == 0:
        # Nothing to distribute: do not spawn a pool at all.
        return [] if returned else None

    # Floor division yields 0 when there are fewer items than cores, which would
    # make ``range`` inside ``chunks`` fail with a zero step; clamp to 1.
    chunk_size = max(1, len(strings) // cores)
    df_split = chunks(strings, chunk_size)

    pool = Pool(cores)
    try:
        pooled = pool.map(function, df_split)
    finally:
        # Release the worker processes even when a worker raised.
        pool.close()
        pool.join()

    if returned:
        return list(itertools.chain(*pooled))
    
def new_path(f):
    """Return ``f`` with ``_trim`` appended to its first ``/``-separated component.

    Everything after the first ``/`` is kept as is, e.g. ``a/b/c.mp3`` becomes
    ``a_trim/b/c.mp3`` and a path without any ``/`` simply gets the suffix.
    """
    head, sep, tail = f.partition('/')
    return head + '_trim' + sep + tail

In [3]:
# !wget https://huggingface.co/datasets/mesolitica/Malaysian-Emilia-Audio-Tokens/resolve/main/data/malaysian_podcast-00000-of-00001.parquet

In [4]:
import pandas as pd

# Load the table whose `reference_audio` / `target_audio` columns are used below.
# The parquet file must already be in the working directory; the commented-out
# wget command in the cell above shows where it can be downloaded from.
df = pd.read_parquet('malaysian_podcast-00000-of-00001.parquet')

In [7]:
# Preview the `target_audio` column of the loaded table.
df['target_audio']

Unnamed: 0,reference_audio,reference_text,target_audio,target_text
0,malaysian-podcast_processed/Akan Tiba Gerhana ...,"Gerahannya, penduduk dekat kawasan tersebut da...",malaysian-podcast_processed/Akan Tiba Gerhana ...,Hanya sebahagian sahaja kawasan cakra matahari...
1,malaysian-podcast_processed/Akan Tiba Gerhana ...,Hanya sebahagian sahaja kawasan cakra matahari...,malaysian-podcast_processed/Akan Tiba Gerhana ...,"Gerahannya, penduduk dekat kawasan tersebut da..."
2,malaysian-podcast_processed/Akan Tiba Gerhana ...,"Gerahannya, penduduk dekat kawasan tersebut da...",malaysian-podcast_processed/Akan Tiba Gerhana ...,"Macam mana, matahari, yang besar di tengah har..."
3,malaysian-podcast_processed/Akan Tiba Gerhana ...,"Macam mana, matahari, yang besar di tengah har...",malaysian-podcast_processed/Akan Tiba Gerhana ...,"Gerahannya, penduduk dekat kawasan tersebut da..."
4,malaysian-podcast_processed/Review Series - Th...,"kira macam dia punya musuh ni, unik dan daripa...",malaysian-podcast_processed/Review Series - Th...,dengan elemen zaman sekarang. Kalau macam teng...
...,...,...,...,...
359231,malaysian-podcast_processed/Tolong Pak Cik Pen...,"Tak sempat explain. Waktu tu, saya balik umrah...",malaysian-podcast_processed/Tolong Pak Cik Pen...,"Okay, dengan travel agency. Tapi, saya pergi s..."
359232,malaysian-podcast_processed/Tolong Pak Cik Pen...,"sampai kat sana, akan ada motorway sendiri. So...",malaysian-podcast_processed/Tolong Pak Cik Pen...,"Tak sempat explain. Waktu tu, saya balik umrah..."
359233,malaysian-podcast_processed/Tolong Pak Cik Pen...,"Tak sempat explain. Waktu tu, saya balik umrah...",malaysian-podcast_processed/Tolong Pak Cik Pen...,"Paramilik datang, dia check darah, tekanan dar..."
359234,malaysian-podcast_processed/Tolong Pak Cik Pen...,"sampai kat sana, akan ada motorway sendiri. So...",malaysian-podcast_processed/Tolong Pak Cik Pen...,"Okay, dengan travel agency. Tapi, saya pergi s..."


In [10]:
# Collect every audio path referenced by either column, without duplicates.
# dict.fromkeys keeps first-seen order, so the list (and therefore `audio[:1000]`
# and the slices handed to the workers) is identical on every run. Iterating a
# set of strings is not, because string hashing is randomized per process.
audio = df['reference_audio'].tolist() + df['target_audio'].tolist()
audio = list(dict.fromkeys(audio))
len(audio)

238534

In [11]:
import malaya_speech

# Sample rate (Hz) used to turn the durations below into sample counts.
sr = 24000
# A silence between two voiced stretches is only shortened by `loop` when it
# lasts at least this many seconds.
min_length = 0.4
# Seconds of silence kept at the very beginning and very end of a clip.
start_silent_length = 0.3
# Samples of leading / trailing silence to keep.
start_silent_trail = int(start_silent_length * sr)
# Samples kept on EACH side of a long middle silence, so that such a silence
# is cut down to `min_length` seconds in total.
middle_silent_trail = int(min_length * sr / 2)

def loop(files):
    """Shorten the silent stretches of each audio file and save it under ``new_path``.

    Parameters
    ----------
    files : tuple
        ``(paths, index)`` as produced by ``chunks``; only ``paths`` is used.

    For every path the audio is labelled frame by frame with the WebRTC VAD,
    consecutive frames with the same label are grouped, and the groups are
    re-assembled so that:

    * voiced groups are kept whole;
    * a silent group at the very start keeps only its last ``start_silent_trail``
      samples, and one at the very end only its first ``start_silent_trail``;
    * any other silent group lasting at least ``min_length`` seconds keeps only
      ``middle_silent_trail`` samples from each of its two ends.

    ``start_silent_trail`` and ``middle_silent_trail`` are sample counts at the
    module-level ``sr``; they are rescaled to the sample rate of each file so
    that they stand for the same duration whatever rate the file has.

    Paths whose output file already exists are skipped. A failure on one file is
    printed together with its path and processing continues with the next file.
    Nothing is returned.
    """
    files, _ = files

    for f in tqdm(files):

        f_new = new_path(f)
        # Skip files whose output already exists.
        if os.path.exists(f_new):
            continue

        try:
            vad = malaya_speech.vad.webrtc(minimum_amplitude = 0)
            # Keep the file's own rate under its own name so the module-level
            # `sr` (the rate the trail constants were computed for) stays visible.
            y, file_sr = sf.read(f)
            edge_keep = int(start_silent_trail * file_sr / sr)
            middle_keep = int(middle_silent_trail * file_sr / sr)

            # The VAD is fed a 16 kHz integer copy, while the output is rebuilt from
            # frames of the original signal at its native rate.
            y_ = malaya_speech.resample(y, file_sr, 16000)
            y_ = malaya_speech.astype.float_to_int(y_)
            # NOTE(review): the `30` is presumably the frame length in milliseconds,
            # and both frame lists are assumed to line up index for index — confirm
            # against the malaya_speech documentation.
            frames = malaya_speech.generator.frames(y, 30, file_sr)
            frames_ = list(malaya_speech.generator.frames(y_, 30, 16000, append_ending_trail = False))
            frames_webrtc = [(frames[no], vad(frame)) for no, frame in enumerate(frames_)]
            grouped_deep = malaya_speech.group.group_frames(frames_webrtc)
            last = len(grouped_deep) - 1
            r = []
            for no, g in enumerate(grouped_deep):
                samples = g[0].array
                if g[1]:
                    # Voiced: keep everything.
                    kept = samples
                elif no == 0:
                    # Leading silence: keep only the part right before the speech.
                    # An explicit start index is used because `samples[-0:]` would
                    # return the whole array instead of nothing.
                    kept = samples[max(len(samples) - edge_keep, 0):]
                elif no == last:
                    # Trailing silence: keep only the part right after the speech.
                    kept = samples[:edge_keep]
                elif g[0].duration >= min_length:
                    # Long pause: keep a short piece from both ends, drop the middle.
                    kept = np.concatenate([
                        samples[:middle_keep],
                        samples[max(len(samples) - middle_keep, 0):],
                    ])
                else:
                    # Short pause: leave untouched.
                    kept = samples

                r.append(kept)
            y_after = np.concatenate(r)

            os.makedirs(os.path.split(f_new)[0], exist_ok = True)
            sf.write(f_new, y_after, file_sr)

        except Exception as e:
            # Best effort: report which file failed and carry on with the rest.
            print('failed to process', f, '->', repr(e))



[2025-05-18 19:10:16,149] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status


In [13]:
# Try the pipeline on the first 1000 paths in the current process before the parallel run.
# NOTE(review): `loop` has no return statement, so `data` is always None here.
data = loop((audio[:1000], 0))

100%|██████████████████████████████████████████████████████████████████████████████████| 1000/1000 [02:14<00:00,  7.44it/s]


In [14]:
# Process every path with 20 worker processes. `returned = False` because `loop`
# returns nothing to collect; its only effect is the files it writes.
multiprocessing(audio, loop, cores = 20, returned = False)

100%|████████████████████████████████████████████████████████████████████████████████| 11926/11926 [24:19<00:00,  8.17it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████| 14/14 [00:01<00:00,  7.80it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 11926/11926 [26:43<00:00,  7.44it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 11926/11926 [26:43<00:00,  7.44it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 11926/11926 [26:43<00:00,  7.44it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 11926/11926 [26:44<00:00,  7.44it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 11926/11926 [26:44<00:00,  7.43it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 11926/11926 [26:45<00:00,  7.43it/s]
100%|███