In [1]:
from pydub import AudioSegment
from pydub.silence import split_on_silence
from glob import glob
import soundfile as sf
import numpy as np
import os
from multiprocess import Pool
import itertools
from tqdm import tqdm

def chunks(l, n):
    """Lazily yield ``(piece, piece_index)`` pairs covering ``l`` in order.

    Every piece is the slice ``l[start:start + n]``; the last one may be
    shorter than ``n``. ``piece_index`` counts pieces from 0.
    """
    for piece_index, start in enumerate(range(0, len(l), n)):
        stop = start + n
        yield (l[start:stop], piece_index)

def multiprocessing(strings, function, cores=6, returned=True):
    """Split ``strings`` into chunks and map ``function`` over them in a process pool.

    Each call to ``function`` receives one ``(chunk, chunk_index)`` tuple as
    produced by ``chunks``. The chunk size is ``len(strings) // cores``, but
    never less than 1; when the length is not a multiple of ``cores`` an
    additional, shorter chunk is produced.

    Args:
        strings: sliceable sequence of work items.
        function: callable taking a single ``(chunk, chunk_index)`` tuple.
        cores: number of worker processes in the pool.
        returned: when true, the per-chunk results (each expected to be an
            iterable) are flattened into one list and returned.

    Returns:
        The flattened list of results when ``returned`` is true, else ``None``.
    """
    # Nothing to do: avoid spawning workers (and a zero chunk size) for empty input.
    if len(strings) == 0:
        return [] if returned else None

    # Floor division gives 0 when there are fewer items than cores, and a
    # zero step makes range() inside ``chunks`` raise ValueError.
    chunk_size = max(1, len(strings) // cores)
    df_split = chunks(strings, chunk_size)

    pool = Pool(cores)
    try:
        pooled = pool.map(function, df_split)
    finally:
        # Release the worker processes even if a worker raised.
        pool.close()
        pool.join()

    if returned:
        return list(itertools.chain(*pooled))
    
def new_path(f):
    """Return ``f`` with ``'_trim'`` appended to its first ``/``-separated component.

    Everything after the first ``/`` is left untouched, e.g.
    ``'a/b/c.mp3'`` becomes ``'a_trim/b/c.mp3'``.
    """
    top_level, separator, remainder = f.partition('/')
    return top_level + '_trim' + separator + remainder

In [2]:
from glob import glob

# Gather every .mp3 matched by the recursive pattern, leaving out any path
# that contains 'trim/' (the output trees created via `new_path` end in '_trim').
files = [
    path
    for path in glob('*/**/*.mp3', recursive = True)
    if not 'trim/' in path
]
len(files)

5603411

In [3]:
import pandas as pd

# Load the parquet table; only its 'audio' column is used in this cell.
df = pd.read_parquet('verify_text-00000-of-00001.parquet')
# Unique values of the 'audio' column (presumably file paths -- confirm),
# held in a set so duplicates collapse.
filtered_audio = set(df['audio'].tolist())

In [4]:
# Rewrite each entry of `filtered_audio` so that everything up to and including
# an 'ssd3/' or 'ssd4/' component is dropped (presumably mount-point prefixes
# that the globbed relative paths do not carry -- confirm). Entries containing
# neither prefix are kept unchanged.
filtered_audio_fix = []
for f in filtered_audio:
    if 'ssd3/' in f:
        f = f.split('ssd3/')[1]
    elif 'ssd4/' in f:
        # Split on the prefix that was actually matched: this branch is only
        # reached when 'ssd3/' is absent, so splitting on 'ssd3/' here would
        # yield a single element and indexing [1] would raise IndexError.
        f = f.split('ssd4/')[1]
    filtered_audio_fix.append(f)

# De-duplicate and allow fast membership tests.
filtered_audio_fix = set(filtered_audio_fix)

In [5]:
# Keep only the globbed files whose path appears in the filtered set;
# `filtered_audio_fix` is a set, so each membership test is a hash lookup.
files_ = [f for f in files if f in filtered_audio_fix]

In [6]:
# Sanity check: equal counts indicate every filtered path was found among the
# globbed files (assuming the glob result holds no duplicate paths).
len(files_), len(filtered_audio_fix)

(3164815, 3164815)

In [7]:
# !rm -rf *_trim

In [8]:
import malaya_speech

# Sample rate (Hz) used to convert the durations below into sample counts.
# NOTE(review): `loop` reads each file's own rate into a local `sr`, so these
# counts only correspond to the stated durations for 24 kHz input -- confirm.
sr = 24000
# Duration threshold in seconds; `loop` compares it with a non-voiced group's `duration`.
min_length = 0.4
# Sample count for 0.3 s at `sr`; `loop` keeps at most this much of a leading/trailing non-voiced group.
start_silent_trail = int(0.3 * sr)
# Sample count for half of `min_length` at `sr`; `loop` keeps this much on each side of a long inner non-voiced group.
middle_silent_trail = int(min_length * sr / 2)
# Show both sample counts as the cell output.
middle_silent_trail, start_silent_trail

def loop(files):
    """Shorten non-voiced stretches of each audio file and write the result to ``new_path(file)``.

    Args:
        files: a ``(paths, chunk_index)`` tuple, i.e. the shape yielded by
            ``chunks``; the index is discarded.

    For every path whose destination does not exist yet, the audio is read,
    labelled frame by frame with a WebRTC VAD, grouped, and re-assembled:

    * groups the VAD flagged (presumably speech) are kept whole;
    * a non-flagged first group keeps only its last ``start_silent_trail`` samples;
    * a non-flagged last group keeps only its first ``start_silent_trail`` samples;
    * any other non-flagged group whose ``duration`` is at least ``min_length``
      is reduced to its first and last ``middle_silent_trail`` samples, while
      shorter ones are kept whole.

    Any exception raised while processing one file is printed together with
    the file path and processing continues with the next file.

    Returns:
        None.
    """
    files, _ = files

    for f in tqdm(files):

        f_new = new_path(f)
        # Already produced by an earlier run: skip.
        if os.path.exists(f_new):
            continue

        try:
            vad = malaya_speech.vad.webrtc(minimum_amplitude = 0)
            # Named `file_sr` so it does not shadow the module-level `sr`
            # from which the *_silent_trail sample counts were derived.
            y, file_sr = sf.read(f)
            # The VAD is fed a 16 kHz copy passed through float_to_int; the
            # samples that are written out come from `y` at its own rate.
            y_ = malaya_speech.resample(y, file_sr, 16000)
            y_ = malaya_speech.astype.float_to_int(y_)
            frames = malaya_speech.generator.frames(y, 30, file_sr)
            frames_ = list(malaya_speech.generator.frames(y_, 30, 16000, append_ending_trail = False))
            # Pair each native-rate frame with the VAD decision of the 16 kHz
            # frame at the same index.
            # NOTE(review): `frames` uses the default `append_ending_trail`
            # while `frames_` disables it, so a trailing native-rate frame may
            # have no VAD decision and be left out -- confirm.
            frames_webrtc = [(frames[no], vad(frame)) for no, frame in enumerate(frames_)]
            grouped_deep = malaya_speech.group.group_frames(frames_webrtc)
            r = []
            for no, g in enumerate(grouped_deep):
                if g[1]:
                    g = g[0].array
                else:
                    if no == 0:
                        # Leading non-voiced group: keep only its tail.
                        g = g[0].array[-start_silent_trail:]
                    elif no == (len(grouped_deep) - 1):
                        # Trailing non-voiced group: keep only its head.
                        g = g[0].array[:start_silent_trail]
                    else:
                        if g[0].duration >= min_length:
                            # Long inner pause: keep a margin on each side and drop the middle.
                            g = [g[0].array[:middle_silent_trail], g[0].array[-middle_silent_trail:]]
                            g = np.concatenate(g)
                        else:
                            g = g[0].array

                r.append(g)
            y_after = np.concatenate(r)

            os.makedirs(os.path.split(f_new)[0], exist_ok = True)
            sf.write(f_new, y_after, file_sr)

        except Exception as e:
            # Best effort: report which file failed and carry on with the rest.
            print(f'failed to trim {f}: {e!r}')



[2025-05-14 13:32:40,468] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status


In [9]:
# Run `loop` in-process on the first 1000 files; the second tuple element
# stands in for the chunk index, which `loop` discards.
# NOTE(review): `loop` has no return statement, so `data` is None.
data = loop((files_[:1000], 0))

100%|███████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 295477.56it/s]


In [13]:
# Process the first 50,000 files with 50 worker processes (50000 // 50 = 1000
# files per chunk); returned = False because `loop` writes files and returns nothing.
multiprocessing(files_[:50000], loop, cores = 50, returned = False)

100%|███████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 215114.58it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 146204.13it/s]
  0%|                                                                                              | 0/1000 [00:00<?, ?it/s]
  0%|                                                                                              | 0/1000 [00:00<?, ?it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 126205.21it/s]
  0%|                                                                                              | 0/1000 [00:00<?, ?it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 110688.10it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 97451.30it/s]


In [14]:
# Collect (original, trimmed) path pairs whose trimmed file is smaller than
# 80% of the original's size on disk, for manual inspection.
selected = []

for f in tqdm(files_):
    left_size = os.path.getsize(f)
    trimmed = new_path(f)
    try:
        right_size = os.path.getsize(trimmed)
    except OSError:
        # No readable trimmed counterpart (e.g. not processed yet): skip it.
        continue
    # A zero-byte original has no meaningful size ratio; skip it explicitly
    # rather than relying on a swallowed ZeroDivisionError.
    if left_size > 0 and (right_size / left_size) < 0.8:
        selected.append((f, trimmed))

100%|█████████████████████████████████████████████████████████████████████████| 3164815/3164815 [00:15<00:00, 208466.15it/s]


In [15]:
# Number of (original, trimmed) pairs flagged by the size comparison.
len(selected)

3

In [16]:
# Inspect the first flagged pair: (original path, trimmed path).
selected[0]

('klasik_processed/kampong pisang berbuah dua kali - belanja besar [_aawWQslNXE]/kampong pisang berbuah dua kali - belanja besar [_aawWQslNXE]_3.mp3',
 'klasik_processed_trim/kampong pisang berbuah dua kali - belanja besar [_aawWQslNXE]/kampong pisang berbuah dua kali - belanja besar [_aawWQslNXE]_3.mp3')

In [21]:
import IPython.display as ipd
# Play the original file of the third flagged pair.
# NOTE(review): `rate` is presumably only needed for raw sample arrays, not
# for a file path -- confirm against the IPython docs.
ipd.Audio(selected[2][0], rate = sr)

In [22]:
# Play the trimmed counterpart of the same pair for comparison.
ipd.Audio(selected[2][1], rate = sr)