In [None]:
import os
import librosa
import soundfile as sf

In [None]:
# configuration
SOURCE_DIR = "data"        # input root; each sub-directory is treated as one label
OUTPUT_DIR = "data_split"  # output root; one sub-directory per label is created here
SAMPLE_RATE = 16000        # Hz; rate passed to librosa.load and used when writing segments
SEGMENT_DURATION = 2       # seconds (may be fractional, e.g. 0.5)
# Number of samples per segment. Rounded and cast to int so that a fractional
# SEGMENT_DURATION still produces a value usable as a slice bound / range step.
SEGMENT_SAMPLES = int(round(SAMPLE_RATE * SEGMENT_DURATION))

In [None]:
# FUNCTIONS
# make sure a directory is present before files are written into it
def ensure_dir(path):
    """Create the directory ``path`` unless it already exists.

    Missing parent directories are created as well; an existing directory
    is left untouched and does not raise.
    """
    os.makedirs(name=path, exist_ok=True)

# split audio into segments
def split_audio_to_segments(file_path, segment_samples):
    """Load an audio file and cut it into equal-length, non-overlapping segments.

    Args:
        file_path: Path of the audio file; passed to ``librosa.load`` together
            with ``sr=SAMPLE_RATE``.
        segment_samples: Length of each segment, in samples. Must be positive.

    Returns:
        A list of consecutive slices of the loaded audio, each exactly
        ``segment_samples`` long. Samples left over after the last full
        segment are dropped, so audio shorter than one segment yields ``[]``.

    Raises:
        ValueError: If ``segment_samples`` is zero or negative.
    """
    # Fail early with a clear message: 0 would otherwise surface as a
    # ZeroDivisionError and a negative value would silently return [].
    if segment_samples <= 0:
        raise ValueError(f"segment_samples must be positive, got {segment_samples!r}")

    # librosa.load returns (audio, sample_rate); the rate is already known here.
    audio, _ = librosa.load(file_path, sr=SAMPLE_RATE)

    # Only start offsets that leave room for a full segment are visited.
    last_start = len(audio) - segment_samples
    return [
        audio[start:start + segment_samples]
        for start in range(0, last_start + 1, segment_samples)
    ]

# walk the label folders and split every .wav file inside them
def process_directory(source_dir, output_dir):
    """Split all .wav files under ``source_dir`` and write the pieces to ``output_dir``.

    Every sub-directory of ``source_dir`` is treated as a label; a directory
    with the same name is created under ``output_dir``. Each file whose name
    ends in ".wav" (case-insensitive) is split with ``split_audio_to_segments``
    using ``SEGMENT_SAMPLES``, and segment ``i`` is written at ``SAMPLE_RATE``
    as ``<original stem>_seg<i>.wav``.

    A failure while loading or writing one file is printed and does not stop
    the remaining files from being processed.
    """
    for label in os.listdir(source_dir):
        label_src = os.path.join(source_dir, label)
        if not os.path.isdir(label_src):
            continue  # plain files at the top level are not labels

        label_dst = os.path.join(output_dir, label)
        ensure_dir(label_dst)

        wav_names = (
            name for name in os.listdir(label_src)
            if name.lower().endswith(".wav")
        )
        for wav_name in wav_names:
            wav_path = os.path.join(label_src, wav_name)
            stem = os.path.splitext(wav_name)[0]
            try:
                chunks = split_audio_to_segments(wav_path, SEGMENT_SAMPLES)
                for index, chunk in enumerate(chunks):
                    target = os.path.join(label_dst, f"{stem}_seg{index}.wav")
                    sf.write(target, chunk, SAMPLE_RATE)
            except Exception as e:
                # best effort: report the problem and move on to the next file
                print(f"Failed to process {wav_path}: {e}")

In [None]:
# Run the split: .wav files found in the sub-directories of SOURCE_DIR are cut
# into segments and written under a matching sub-directory of OUTPUT_DIR.
# NOTE: process_directory prints per-file failures instead of raising, so the
# message below is printed even if some files could not be processed.
process_directory(SOURCE_DIR, OUTPUT_DIR)
print(f"Done splitting all audio into {SEGMENT_DURATION}-second segments.")

Done splitting all audio into 2-second segments.
