In [6]:
import os, sys
import shutil
import pandas as pd
from sklearn.model_selection import train_test_split

# Add the parent of the current working directory to sys.path so that the
# `utils` package imported on the next line can be resolved without running
# into relative-import problems with packages.
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
from utils import util

In [7]:
# Functions
def load_raw_audios(dataframe, source, dest_root, split_name):
    '''Copy the audio files listed in ``dataframe`` into ``dest_root/split_name/``.

    For every row, the file ``source/<primary_label>/<filename>`` is copied
    with ``shutil.copy2`` to ``dest_root/<split_name>/<filename>`` (the
    per-label sub-folder is not reproduced in the destination). The
    destination directory is created if it does not exist yet.

    Rows whose source file does not exist are skipped (best effort), but not
    silently: a single summary warning reports how many were missing.

    Args:
        dataframe: pandas DataFrame providing the ``primary_label`` and
            ``filename`` columns.
        source: Root directory that contains one sub-folder per primary label.
        dest_root: Root directory under which the split folder is created.
        split_name: Name of the split sub-folder (e.g. ``'dev'`` or ``'test'``).

    Returns:
        int: Number of files actually copied.
    '''
    # Local import keeps this cell self-contained; the import cell is untouched.
    import warnings

    dest = os.path.join(dest_root, split_name)
    os.makedirs(dest, exist_ok=True)

    copied = 0
    missing = []
    # Iterating the two needed columns directly avoids building a Series per
    # row, which is what iterrows() does.
    for primary_label, filename in zip(dataframe['primary_label'], dataframe['filename']):
        source_path = os.path.join(source, primary_label, filename)

        if not os.path.exists(source_path):
            missing.append(source_path)
            continue

        shutil.copy2(source_path, os.path.join(dest, filename))
        copied += 1

    if missing:
        # One aggregated warning instead of one per file keeps notebook output short.
        warnings.warn(
            f"{len(missing)} of {copied + len(missing)} audio file(s) for split "
            f"'{split_name}' were not found under '{source}' and were skipped "
            f"(first missing: '{missing[0]}')",
            stacklevel=2,
        )

    return copied

In [8]:
# Directory locations and species metadata table
database_dir = os.path.join('..', 'database')
audio_dir = os.path.join(database_dir, 'audio')  # root that receives the per-split folders
birdclef_dir = os.path.join('..', 'data', 'birdclef-2021', 'train_short_audio')  # raw audio source
df = pd.read_csv(os.path.join(database_dir, 'meta', 'final_species.csv'))

In [9]:
# Stratified dev/test split of the metadata (20% held out as test, fixed seed),
# then persist both partitions as CSV files in the metadata folder.
meta_dir = os.path.join('..', 'database', 'meta')

dev_df, test_df = train_test_split(df, test_size=0.2, random_state=435, stratify=df['class_id'])

for split_frame, csv_name in ((dev_df, 'split_dev.csv'), (test_df, 'split_test.csv')):
    split_frame.to_csv(os.path.join(meta_dir, csv_name), index=False)

In [10]:
# Main: rebuild the audio folder from scratch, one sub-folder per split
print("Resetting raw audio files...")
util.clean_dir(audio_dir)  # NOTE(review): presumably empties audio_dir; confirm in utils.util

for banner, split_df, split in (
    ("Loading Dev Files...", dev_df, 'dev'),
    ("Loading Test Files...", test_df, 'test'),
):
    print(banner)
    load_raw_audios(split_df, birdclef_dir, audio_dir, split_name=split)

Resetting raw audio files...
Loading Dev Files...
Loading Test Files...
