In [1]:
import os
import pandas as pd
import shutil
from sklearn.model_selection import train_test_split

# Split the .wav files that sit directly in `base_dir` into train/valid/test
# sub-directories (70% / 15% / 15%) and print the size of each split.
#
# The cell is safe to re-run: once the files have been moved there is nothing
# left to split, so the existing sub-directories are read back instead of
# handing an empty frame to train_test_split (which raises ValueError).
base_dir = "data"
split_names = ["train", "valid", "test"]

# Create directories if they don't exist
for split in split_names:
    split_dir = os.path.join(base_dir, split)
    os.makedirs(split_dir, exist_ok=True)


def _list_wavs(directory):
    """Return the sorted names of the regular ``.wav`` files directly in ``directory``.

    Sorting matters: ``os.listdir`` returns entries in arbitrary order, so
    without it the same ``random_state`` could produce a different split on
    another machine or filesystem. Directories are skipped so that only
    actual audio files are ever moved.
    """
    return sorted(
        name
        for name in os.listdir(directory)
        if name.endswith(".wav") and os.path.isfile(os.path.join(directory, name))
    )


# Get all wav files that have not been assigned to a split yet
wav_files = _list_wavs(base_dir)

if wav_files:
    # Create DataFrame
    df = pd.DataFrame({"filename": wav_files})

    # Perform train-valid-test split
    # First split into train and temp (test + valid)
    train_df, temp_df = train_test_split(df, test_size=0.3, random_state=42)
    # Split temp into valid and test
    valid_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

    # Move files to their respective directories. Both splits above have
    # already succeeded at this point, so a split failure never leaves the
    # dataset half-moved.
    splits = {"train": train_df, "valid": valid_df, "test": test_df}

    for split_name, split_df in splits.items():
        for filename in split_df["filename"]:
            src = os.path.join(base_dir, filename)
            dst = os.path.join(base_dir, split_name, filename)
            shutil.move(src, dst)
else:
    # Nothing left at the top level: rebuild the frames from the split
    # directories so the names defined by this cell exist on a re-run too.
    splits = {
        split_name: pd.DataFrame(
            {"filename": _list_wavs(os.path.join(base_dir, split_name))}
        )
        for split_name in split_names
    }
    train_df, valid_df, test_df = (splits[name] for name in split_names)
    temp_df = pd.concat([valid_df, test_df], ignore_index=True)
    df = pd.concat([train_df, valid_df, test_df], ignore_index=True)

    if df.empty:
        raise FileNotFoundError(
            f"No .wav files found in '{base_dir}' or its train/valid/test sub-directories"
        )
    print("No unsplit .wav files found; using the existing train/valid/test directories.")

# Print statistics (df is guaranteed non-empty here, so the division is safe)
print("Dataset split statistics:")
print(f"Train set: {len(train_df)} files ({len(train_df) / len(df) * 100:.1f}%)")
print(f"Validation set: {len(valid_df)} files ({len(valid_df) / len(df) * 100:.1f}%)")
print(f"Test set: {len(test_df)} files ({len(test_df) / len(df) * 100:.1f}%)")

Dataset split statistics:
Train set: 821 files (69.9%)
Validation set: 176 files (15.0%)
Test set: 177 files (15.1%)
