In [27]:
import os

In [18]:

# Define paths
base_path = "../data/"
mel_path = os.path.join(base_path, "mels-27class/")
train_path = os.path.join(base_path, "train/")
valid_path = os.path.join(base_path, "valid/")
test_path = os.path.join(base_path, "test/")

In [26]:
import shutil
import os
import random
from sklearn.model_selection import train_test_split

def split_data(base_path):
    mel_path = os.path.join(base_path, "mels-27class/")
    train_path = os.path.join(base_path, "train/")
    valid_path = os.path.join(base_path, "valid/")
    test_path = os.path.join(base_path, "test/")

    # Create train, valid, and test directories if they don't exist
    for path in [train_path, valid_path, test_path]:
        if not os.path.exists(path):
            os.makedirs(path)

    def move_files(file_list, target_directory):
        # Create target directory if it doesn't exist
        if not os.path.exists(target_directory):
            os.makedirs(target_directory)
        for file in file_list:
            src_file = os.path.join(species_mel_path, audio_dir, file)
            dest_file = os.path.join(target_directory, file)
            try:
                shutil.move(src_file, dest_file)
                print("Moved", src_file, "to", dest_file)
            except FileNotFoundError as e:
                print(f"FileNotFoundError: {e}")
                continue

    # Get list of species directories
    species_dirs = [d for d in os.listdir(mel_path) if os.path.isdir(os.path.join(mel_path, d))]

    for species_dir in species_dirs:
        print("Species:", species_dir)
        species_mel_path = os.path.join(mel_path, species_dir)
        
        # Get list of audio directories for each species
        audio_dirs = [d for d in os.listdir(species_mel_path) if os.path.isdir(os.path.join(species_mel_path, d))]

        for audio_dir in audio_dirs:
            print("Audio Directory:", audio_dir)
            audio_files = [f for f in os.listdir(os.path.join(species_mel_path, audio_dir)) if f.endswith('.png')]
            
            # Check if we have enough files to split
            if len(audio_files) < 3:
                print(f"Not enough files in {species_dir}/{audio_dir}. Skipping...")
                continue

            random.shuffle(audio_files)

            # Split data
            train_ratio = 0.6
            valid_ratio = 0.2
            test_ratio = 0.2

            X_train, X_temp = train_test_split(audio_files, test_size=1 - train_ratio, random_state=42)
            X_valid, X_test = train_test_split(X_temp, test_size=test_ratio / (test_ratio + valid_ratio), random_state=42)

            # Copy files to respective directories
            move_files(X_train, os.path.join(train_path, species_dir, audio_dir))
            move_files(X_valid, os.path.join(valid_path, species_dir, audio_dir))
            move_files(X_test, os.path.join(test_path, species_dir, audio_dir))

    print("Dataset splitting completed.")

# Call the function with the base path
base_path = "../data/"
split_data(base_path)


Species: Delichonurbicum
Audio Directory: 652535
Moved ../data/mels-27class/Delichonurbicum/652535/652535_3.png to ../data/train/Delichonurbicum/652535/652535_3.png
Moved ../data/mels-27class/Delichonurbicum/652535/652535_1.png to ../data/train/Delichonurbicum/652535/652535_1.png
Moved ../data/mels-27class/Delichonurbicum/652535/652535_7.png to ../data/train/Delichonurbicum/652535/652535_7.png
Moved ../data/mels-27class/Delichonurbicum/652535/652535_2.png to ../data/train/Delichonurbicum/652535/652535_2.png
Moved ../data/mels-27class/Delichonurbicum/652535/652535_6.png to ../data/valid/Delichonurbicum/652535/652535_6.png
Moved ../data/mels-27class/Delichonurbicum/652535/652535_5.png to ../data/test/Delichonurbicum/652535/652535_5.png
Moved ../data/mels-27class/Delichonurbicum/652535/652535_4.png to ../data/test/Delichonurbicum/652535/652535_4.png
Audio Directory: 344098
Moved ../data/mels-27class/Delichonurbicum/344098/344098_4.png to ../data/train/Delichonurbicum/344098/344098_4.png
M

In [28]:
import os
import random
from sklearn.model_selection import train_test_split
import shutil

def copy_files(file_list, target_directory):
    os.makedirs(target_directory, exist_ok=True)
    for file in file_list:
        shutil.copy(file, os.path.join(target_directory, file.split('/')[-1]))

def split_data(base_path, k_folds=5, random_state=42):
    # List all species directories
    species_dirs = os.listdir(os.path.join(base_path, "mels-27class"))

    for species_dir in species_dirs:
        audio_files = []
        species_path = os.path.join(base_path, "mels-27class", species_dir)

        # Get all audio files for this species
        for subdir, _, files in os.walk(species_path):
            audio_files.extend([os.path.join(subdir, f) for f in files if f.endswith('.png')])

        # Perform k-fold split
        fold_size = len(audio_files) // k_folds
        for fold in range(k_folds):
            fold_start = fold * fold_size
            fold_end = (fold + 1) * fold_size

            fold_test_files = audio_files[fold_start:fold_end]
            fold_train_files = audio_files[:fold_start] + audio_files[fold_end:]

            fold_train_dir = os.path.join(base_path, "kfold_train_{}".format(fold), species_dir)
            fold_test_dir = os.path.join(base_path, "kfold_test_{}".format(fold), species_dir)

            copy_files(fold_train_files, fold_train_dir)
            copy_files(fold_test_files, fold_test_dir)

            print("Fold {} - Train Size: {}, Test Size: {}".format(fold, len(fold_train_files), len(fold_test_files)))

    print("K-Fold Cross Validation completed.")

# Call the function with the base path and number of folds
base_path = "../data/"
split_data(base_path, k_folds=5)


Fold 0 - Train Size: 0, Test Size: 0
Fold 1 - Train Size: 0, Test Size: 0
Fold 2 - Train Size: 0, Test Size: 0
Fold 3 - Train Size: 0, Test Size: 0
Fold 4 - Train Size: 0, Test Size: 0
Fold 0 - Train Size: 8, Test Size: 1
Fold 1 - Train Size: 8, Test Size: 1
Fold 2 - Train Size: 8, Test Size: 1
Fold 3 - Train Size: 8, Test Size: 1
Fold 4 - Train Size: 8, Test Size: 1
Fold 0 - Train Size: 1, Test Size: 0
Fold 1 - Train Size: 1, Test Size: 0
Fold 2 - Train Size: 1, Test Size: 0
Fold 3 - Train Size: 1, Test Size: 0
Fold 4 - Train Size: 1, Test Size: 0
Fold 0 - Train Size: 1, Test Size: 0
Fold 1 - Train Size: 1, Test Size: 0
Fold 2 - Train Size: 1, Test Size: 0
Fold 3 - Train Size: 1, Test Size: 0
Fold 4 - Train Size: 1, Test Size: 0
Fold 0 - Train Size: 7, Test Size: 1
Fold 1 - Train Size: 7, Test Size: 1
Fold 2 - Train Size: 7, Test Size: 1
Fold 3 - Train Size: 7, Test Size: 1
Fold 4 - Train Size: 7, Test Size: 1
Fold 0 - Train Size: 0, Test Size: 0
Fold 1 - Train Size: 0, Test Size: 0
F