In [10]:
import os
import io
import wave
import librosa
import soundfile as sf
import noisereduce as nr
import webrtcvad
from pydub import AudioSegment, silence

In [12]:
def preprocess_flac_files(
    input_folder: str,
    output_folder: str,
    min_silence_len: int = 400,
    silence_thresh: int = -50,
    keep_silence: int = 200,
    vad_aggressiveness: int = 3,
    target_sr: int = 16000
) -> None:
    """
    Cleans every FLAC file found in a two-level folder structure
    (input_folder/<user_dir>/<file>.flac) and writes the result as WAV chunks.

    For each FLAC file the pipeline is:
      1. Split on silence (pydub),
      2. Convert each chunk to mono / 16-bit / target_sr,
      3. Drop non-speech frames with WebRTC VAD (always applied),
      4. Noise reduction (noisereduce),
      5. Peak normalization (librosa.util.normalize),
      6. Save as "<name>_chunk_<i>_cleaned.wav" under
         output_folder/<user_dir>/ (same folder structure as the input).

    Chunk numbers refer to the position in the silence split, so numbers of
    chunks that turn out empty after VAD are simply skipped.

    Args:
        input_folder (str): Root directory containing one sub-folder per user.
        output_folder (str): Where to save processed files (mirroring structure).
        min_silence_len (int): Minimum length of silence (ms) for pydub splitting.
        silence_thresh (int): Silence threshold in dBFS for pydub.
        keep_silence (int): How many ms of silence to keep at each split edge.
        vad_aggressiveness (int): 0-3 (webrtcvad aggressiveness; higher = more aggressive).
        target_sr (int): Sample rate of the saved audio. Must be 8000, 16000,
            32000 or 48000 because the chunks are fed to WebRTC VAD at this rate.

    Raises:
        ValueError: If target_sr is not a sample rate WebRTC VAD accepts.
    """
    # Fail fast, before touching the file system: an unsupported rate would
    # otherwise only surface inside the VAD call, after part of the output
    # tree has already been written.
    supported_vad_rates = (8000, 16000, 32000, 48000)
    if target_sr not in supported_vad_rates:
        raise ValueError(
            f"target_sr must be one of {supported_vad_rates} for WebRTC VAD, got {target_sr}"
        )

    os.makedirs(output_folder, exist_ok=True)

    # os.listdir() returns entries in arbitrary order; sorting makes the
    # processing order (and the printed log) reproducible between runs.
    for user_dir in sorted(os.listdir(input_folder)):
        user_input_path = os.path.join(input_folder, user_dir)
        if not os.path.isdir(user_input_path):
            continue  # skip anything that's not a directory

        # Create matching directory in the output folder
        user_output_path = os.path.join(output_folder, user_dir)
        os.makedirs(user_output_path, exist_ok=True)

        for filename in sorted(os.listdir(user_input_path)):
            if not filename.lower().endswith(".flac"):
                continue

            file_path = os.path.join(user_input_path, filename)
            base_name = os.path.splitext(filename)[0]  # e.g. audio_1

            # 1) Read the FLAC file as a pydub AudioSegment
            audio_segment = AudioSegment.from_file(file_path, format="flac")
            print(f"Loaded: {file_path} [dBFS={audio_segment.dBFS:.2f}]")

            # 2) Split into chunks separated by >= min_silence_len ms of
            #    audio quieter than silence_thresh dBFS; keep_silence ms of
            #    padding is kept at each edge for continuity.
            chunks = silence.split_on_silence(
                audio_segment,
                min_silence_len=min_silence_len,
                silence_thresh=silence_thresh,
                keep_silence=keep_silence
            )

            # No chunks (e.g. a completely silent file): treat the whole file as one chunk
            if not chunks:
                chunks = [audio_segment]

            # 3) Clean and save each chunk
            for i, chunk in enumerate(chunks, start=1):
                # WebRTC VAD needs mono 16-bit PCM at a supported sample rate
                chunk = chunk.set_frame_rate(target_sr).set_channels(1).set_sample_width(2)

                # Keep only the frames the VAD classifies as speech
                chunk = apply_webrtc_vad(chunk, vad_aggressiveness)

                # len() of an AudioSegment is its duration in ms;
                # skip chunks that are (nearly) empty after VAD
                if len(chunk) < 10:
                    continue

                # Round-trip through an in-memory WAV to get a float32 numpy
                # array for noisereduce / librosa
                buffer = io.BytesIO()
                chunk.export(buffer, format="wav")
                buffer.seek(0)
                audio_data, _ = sf.read(buffer, dtype='float32')

                # 4) Noise reduction
                reduced_audio = nr.reduce_noise(y=audio_data, sr=target_sr)

                # 5) Volume normalization
                normalized_audio = librosa.util.normalize(reduced_audio)

                # The chunk was already converted to target_sr above, so a
                # further resample from target_sr to target_sr would be a
                # no-op and is intentionally omitted.

                # 6) Save the processed chunk
                output_filename = f"{base_name}_chunk_{i}_cleaned.wav"
                output_file_path = os.path.join(user_output_path, output_filename)

                sf.write(output_file_path, normalized_audio, target_sr)
                print(f"  -> Saved chunk {i}: {output_file_path}")

In [13]:
def apply_webrtc_vad(audio_segment: AudioSegment, aggressiveness: int = 3) -> AudioSegment:
    """
    Applies WebRTC VAD to a mono, 16-bit AudioSegment at 8/16/32/48 kHz and
    returns a new AudioSegment containing only the frames flagged as speech.

    The audio is examined in consecutive 30 ms frames. Frames classified as
    non-speech are dropped, the remaining frames are concatenated. A trailing
    partial frame (shorter than 30 ms) is discarded because the VAD only
    accepts full frames.

    Args:
        audio_segment (AudioSegment): Mono, 16-bit audio. The caller is
            responsible for converting it beforehand (set_channels(1),
            set_sample_width(2), and a supported frame rate).
        aggressiveness (int): webrtcvad mode, 0-3 (higher = more aggressive
            at filtering out non-speech).

    Returns:
        AudioSegment: Mono segment with the same sample width and frame rate,
        holding only the voiced frames (possibly empty).

    Raises:
        ValueError: If the segment is not mono 16-bit PCM. Without this check
            the raw bytes would be sliced with the wrong stride and the VAD
            would silently classify garbage.
    """
    sample_rate = audio_segment.frame_rate
    sample_width = audio_segment.sample_width

    if audio_segment.channels != 1 or sample_width != 2:
        raise ValueError(
            "apply_webrtc_vad expects mono 16-bit audio, got "
            f"channels={audio_segment.channels}, sample_width={sample_width}"
        )

    vad = webrtcvad.Vad()
    vad.set_mode(aggressiveness)

    # WebRTC VAD expects frames of 10, 20, or 30 ms
    frame_ms = 30
    # Integer arithmetic avoids float rounding when computing samples per frame
    samples_per_frame = sample_rate * frame_ms // 1000
    frame_bytes = samples_per_frame * sample_width

    raw_data = audio_segment.raw_data
    voiced_frames = bytearray()

    # The upper bound makes the range stop before a trailing partial frame
    # (and yields nothing at all when the audio is shorter than one frame).
    for start in range(0, len(raw_data) - frame_bytes + 1, frame_bytes):
        frame = raw_data[start:start + frame_bytes]
        if vad.is_speech(frame, sample_rate):
            voiced_frames.extend(frame)

    # Build a new AudioSegment from the voiced frames
    return AudioSegment(
        data=bytes(voiced_frames),
        sample_width=sample_width,
        frame_rate=sample_rate,
        channels=1
    )


In [14]:
if __name__ == "__main__":
    # The dataset location is machine-specific. It can be overridden with the
    # VOICE_DATASET_ROOT environment variable so the notebook runs elsewhere
    # without editing this cell; the default is the original location.
    dataset_root = os.environ.get(
        "VOICE_DATASET_ROOT",
        r"C:\Users\VICTUS\Voice\A Dataset for Voice-Based Human Identity Recognition",
    )
    input_folder = os.path.join(dataset_root, "differentPhrase")
    output_folder = os.path.join(dataset_root, "output")

    preprocess_flac_files(
        input_folder=input_folder,
        output_folder=output_folder,
        min_silence_len=400,    # pydub: ms of silence
        silence_thresh=-50,     # pydub: dBFS threshold
        keep_silence=200,       # pydub: keep 200ms
        vad_aggressiveness=3,   # webrtcvad: 0-3
        target_sr=16000         # final sample rate
    )

Loaded: C:\Users\VICTUS\Voice\A Dataset for Voice-Based Human Identity Recognition\differentPhrase\1\1-11.flac [dBFS=-26.00]
  -> Saved chunk 1: C:\Users\VICTUS\Voice\A Dataset for Voice-Based Human Identity Recognition\output\1\1-11_chunk_1_cleaned.wav
  -> Saved chunk 2: C:\Users\VICTUS\Voice\A Dataset for Voice-Based Human Identity Recognition\output\1\1-11_chunk_2_cleaned.wav
Loaded: C:\Users\VICTUS\Voice\A Dataset for Voice-Based Human Identity Recognition\differentPhrase\1\1-12.flac [dBFS=-25.70]
  -> Saved chunk 1: C:\Users\VICTUS\Voice\A Dataset for Voice-Based Human Identity Recognition\output\1\1-12_chunk_1_cleaned.wav
  -> Saved chunk 2: C:\Users\VICTUS\Voice\A Dataset for Voice-Based Human Identity Recognition\output\1\1-12_chunk_2_cleaned.wav
Loaded: C:\Users\VICTUS\Voice\A Dataset for Voice-Based Human Identity Recognition\differentPhrase\1\1-13.flac [dBFS=-25.83]
  -> Saved chunk 1: C:\Users\VICTUS\Voice\A Dataset for Voice-Based Human Identity Recognition\output\1\1-13_