In [2]:
import webrtcvad
import wave
import contextlib

def read_wave(path):
    with contextlib.closing(wave.open(path, 'rb')) as wf:
        num_channels = wf.getnchannels()
        sample_width = wf.getsampwidth()
        sample_rate = wf.getframerate()
        frames = wf.readframes(wf.getnframes())
        return frames, sample_rate
    

def write_wave(path, audio, sample_rate):
    with wave.open(path, 'wb') as wf:
        wf.setnchannels(1)
        wf.setsampwidth(2)
        wf.setframerate(sample_rate)
        wf.writeframes(audio)

def find_best_chunk(audio_path, chunk_duration_ms=30000, aggressiveness=3):
    vad = webrtcvad.Vad(aggressiveness)
    frames, sample_rate = read_wave(audio_path)

    frame_duration = 30
    frame_size = int(sample_rate * frame_duration / 1000) * 2


    speech_frames = []
    current_speech = b""
    max_speech = b""

    for i in range(0, len(frames), frame_size):
        frame = frames[i:i + frame_size]
        if len(frame) < frame_size:
            break
        if vad.is_speech(frame, sample_rate):
            current_speech += frame
        else:
            if len(current_speech) > len(max_speech):
                max_speech = current_speech
            current_speech = b""


    if len(current_speech) > len(max_speech):
        max_speech = current_speech


    max_duration_bytes = (sample_rate * chunk_duration_ms // 1000) * 2
    best_chunk = max_speech[:max_duration_bytes]


    output_path = audio_path.replace(".wav", "_best_chunk.wav")
    write_wave(output_path, best_chunk, sample_rate)
    print(f"Best chunk saved to {output_path}")

audio_path = " "
find_best_chunk(audio_path, chunk_duration_ms=30000)

In [1]:
import webrtcvad
import wave
import contextlib
import os

def read_wave(path):
    with contextlib.closing(wave.open(path, 'rb')) as wf:
        num_channels = wf.getnchannels()
        sample_width = wf.getsampwidth()
        sample_rate = wf.getframerate()
        frames = wf.readframes(wf.getnframes())
        return frames, sample_rate

def write_wave(path, audio, sample_rate):
    with wave.open(path, 'wb') as wf:
        wf.setnchannels(1)
        wf.setsampwidth(2)
        wf.setframerate(sample_rate)
        wf.writeframes(audio)

def find_best_chunk(audio_path, chunk_duration_ms=30000, aggressiveness=3):
    vad = webrtcvad.Vad(aggressiveness)
    frames, sample_rate = read_wave(audio_path)

    frame_duration = 30
    frame_size = int(sample_rate * frame_duration / 1000) * 2

    speech_frames = []
    current_speech = b""
    max_speech = b""

    for i in range(0, len(frames), frame_size):
        frame = frames[i:i + frame_size]
        if len(frame) < frame_size:
            break
        if vad.is_speech(frame, sample_rate):
            current_speech += frame
        else:
            if len(current_speech) > len(max_speech):
                max_speech = current_speech
            current_speech = b""

    if len(current_speech) > len(max_speech):
        max_speech = current_speech

    max_duration_bytes = (sample_rate * chunk_duration_ms // 1000) * 2
    best_chunk = max_speech[:max_duration_bytes]

    output_path = audio_path.replace(".wav", "_best_chunk.wav")
    write_wave(output_path, best_chunk, sample_rate)
    print(f"Best chunk saved to {output_path}")

def process_folder(folder_path, chunk_duration_ms=30000, aggressiveness=3):
    for filename in os.listdir(folder_path):
        if filename.endswith(".wav"):
            audio_path = os.path.join(folder_path, filename)
            find_best_chunk(audio_path, chunk_duration_ms, aggressiveness)

# Folder path containing the audio files
folder_path = "train_pfstar"
process_folder(folder_path, chunk_duration_ms=30000)


Best chunk saved to train_pfstar/001m14bh_w_list1_best_chunk.wav
Best chunk saved to train_pfstar/004m10bh_list_ph09_best_chunk.wav
Best chunk saved to train_pfstar/007m10nl_w_list2_best_chunk.wav
Best chunk saved to train_pfstar/045m08nl_list_ph010_best_chunk.wav
Best chunk saved to train_pfstar/123f07nl_w_list3b_best_chunk.wav
Best chunk saved to train_pfstar/186m10sp_sentences1or6b_best_chunk.wav
Best chunk saved to train_pfstar/002f12bh_w_list2_best_chunk.wav
Best chunk saved to train_pfstar/204m08sp_list_ph04_best_chunk.wav
Best chunk saved to train_pfstar/022m11bh_w_list2_best_chunk.wav
Best chunk saved to train_pfstar/082f07nl_list_ph07_best_chunk.wav
Best chunk saved to train_pfstar/020m12bh_digits5_best_chunk.wav
Best chunk saved to train_pfstar/208f08sp_digits3_best_chunk.wav
Best chunk saved to train_pfstar/207f08sp_w_list2a_best_chunk.wav
Best chunk saved to train_pfstar/044m06nl_digits9_best_chunk.wav
Best chunk saved to train_pfstar/248f08sp_list_3_best_chunk.wav
Best chu

In [2]:
import webrtcvad
import wave
import contextlib
import os

def read_wave(path):
    with contextlib.closing(wave.open(path, 'rb')) as wf:
        sample_rate = wf.getframerate()
        frames = wf.readframes(wf.getnframes())
        return frames, sample_rate

def write_wave(path, audio, sample_rate):
    with wave.open(path, 'wb') as wf:
        wf.setnchannels(1)
        wf.setsampwidth(2)
        wf.setframerate(sample_rate)
        wf.writeframes(audio)

def remove_silence(audio_path, output_folder, chunk_duration_ms=2000, aggressiveness=3):
    vad = webrtcvad.Vad(aggressiveness)
    frames, sample_rate = read_wave(audio_path)

    frame_duration = 20  # Duration of each frame in ms
    frame_size = int(sample_rate * frame_duration / 1000) * 2

    speech_audio = b""

    # Process each frame and keep only the ones with detected speech
    for i in range(0, len(frames), frame_size):
        frame = frames[i:i + frame_size]
        if len(frame) < frame_size:
            break
        if vad.is_speech(frame, sample_rate):
            speech_audio += frame

    # Truncate to 3 seconds if necessary
    max_duration_bytes = (sample_rate * chunk_duration_ms // 1000) * 2
    best_chunk = speech_audio[:max_duration_bytes]

    # Ensure the output folder exists
    os.makedirs(output_folder, exist_ok=True)

    # Create the output file path in the `no_silence` folder
    output_path = os.path.join(output_folder, os.path.basename(audio_path).replace(".wav", ".wav"))
    write_wave(output_path, best_chunk, sample_rate)
    print(f"Processed audio saved to {output_path}")

def process_folder(folder_path, chunk_duration_ms=2000, aggressiveness=3):
    output_folder = os.path.join(folder_path, "chunk_2s")
    for filename in os.listdir(folder_path):
        if filename.endswith(".wav"):
            audio_path = os.path.join(folder_path, filename)
            remove_silence(audio_path, output_folder, chunk_duration_ms, aggressiveness)

# Folder path containing the audio files
folder_path = "test_pfstar"
process_folder(folder_path, chunk_duration_ms=2000)


Processed audio saved to test_pfstar/chunk_2s/005m08bh_w_list5b.wav
Processed audio saved to test_pfstar/chunk_2s/021f11nl_digits6.wav
Processed audio saved to test_pfstar/chunk_2s/004m10bh_list_ph04.wav
Processed audio saved to test_pfstar/chunk_2s/082f06nl_digits7.wav
Processed audio saved to test_pfstar/chunk_2s/029m10nl_list_4.wav
Processed audio saved to test_pfstar/chunk_2s/042f06nl_list_ph02.wav
Processed audio saved to test_pfstar/chunk_2s/196f08sp_list_1.wav
Processed audio saved to test_pfstar/chunk_2s/020m12bh_w_list5b.wav
Processed audio saved to test_pfstar/chunk_2s/025m10nl_w_list5.wav
Processed audio saved to test_pfstar/chunk_2s/042f07nl_list_ph02.wav
Processed audio saved to test_pfstar/chunk_2s/210f08sp_w_list5a.wav
Processed audio saved to test_pfstar/chunk_2s/190f08sp_w_list5a.wav
Processed audio saved to test_pfstar/chunk_2s/177f09sp_sentences2or7b.wav
Processed audio saved to test_pfstar/chunk_2s/210f08sp_list_5.wav
Processed audio saved to test_pfstar/chunk_2s/00