Note: This is a script for preparing the dataset. Running this script requires an animal call database.
Therefore, this script is for internal debugging purposes only. You can create the training / test dataset in your own way.

In [1]:
import os,sys,inspect
sys.path.insert(0, os.path.dirname(os.getcwd()) )
from database_manager import WavDB
import shutil
from tqdm import tqdm
import pandas as pd
import numpy as np
import json
import librosa

In [2]:
# Relative paths to the call database file and to its archive folder.
db_path = "../../data/database/DB.splite"
archive_folder = "../../data/database/archive/"
# Database handle; the cells below query it for audio and label file paths.
wav_db = WavDB( db_path = db_path, archive_folder = archive_folder)

In [3]:
# !rm -r data

# Zebseg

## Separate bird

In [3]:
# Build one train folder and one test folder per bird (and per age, where the
# identifier carries one) from the "zebseg" collection. Each audio file is
# copied unchanged; each label CSV is rewritten with the columns
# onset / offset / cluster, where offset = onset + duration and cluster is 0.
for bird_and_age in ["R3406_035", "R3406_045", "R3406_055",
                     "R3428_039", "R3428_049", "R3428_059",
                     "R3549_043", "R3549_053", "R3549_063",
                     "R3625_045", "R3625_055", "R3625_065",
                     "g17y2", "g4p5", "g19o10", "g19o3"
                    ]:
    # Identifiers without "_" have no age part; padding with "" makes
    # bird_age an empty string for them.
    bird_name, bird_age = (bird_and_age.split("_") + [""])[:2]

    for mode in ["train", "test"]:

        dataset_folder = f"data/dataset/zebseg/{bird_and_age}/{mode}"
        # exist_ok=True keeps the cell re-runnable: without it a second run
        # raises FileExistsError on the first folder that already exists.
        os.makedirs(dataset_folder, exist_ok=True)
        audio_path_list, label_path_list = wav_db.get_audio_and_label_paths(
             ( f"collection=='zebseg' AND participant_name=='{bird_name}' AND participant_age=='{bird_age}' AND train_or_test=='{mode}'", [] )  )

        for audio_path, csv_path in tqdm(zip(audio_path_list, label_path_list), total=len(audio_path_list)):
            assert audio_path.endswith(".wav") and csv_path.endswith(".csv")
            shutil.copy( audio_path, dataset_folder+"/" )
            shutil.copy( csv_path, dataset_folder+"/" )

            # Rewrite the copied label file in place in the target layout.
            csv_basename = os.path.basename( csv_path )
            target_csv_path = dataset_folder + "/" + csv_basename
            label_df = pd.read_csv( target_csv_path )
            onset = np.array(label_df["onset"])
            duration = np.array(label_df["duration"])
            offset = onset + duration
            # Every segment gets the same cluster id, 0.
            cluster = np.zeros(len(onset), dtype=int)
            pd.DataFrame(
                    {"onset":onset,
                     "offset":offset,
                     "cluster":cluster
                    }
            ).to_csv( target_csv_path, index = False )

100%|█████████████████████████████████████████████████████████| 9/9 [00:00<00:00, 233.90it/s]
100%|█████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 287.83it/s]
100%|███████████████████████████████████████████████████████| 45/45 [00:00<00:00, 438.03it/s]
100%|█████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 496.30it/s]
100%|█████████████████████████████████████████████████████| 177/177 [00:00<00:00, 433.79it/s]
100%|███████████████████████████████████████████████████████| 19/19 [00:00<00:00, 512.41it/s]
100%|███████████████████████████████████████████████████████| 45/45 [00:00<00:00, 511.48it/s]
100%|█████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 495.41it/s]
100%|███████████████████████████████████████████████████████| 32/32 [00:00<00:00, 468.56it/s]
100%|█████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 338.14it/s]
100%|█████████████████████████████████████████████████████| 

## All Birds

In [3]:
# Export the training split of the whole "zebseg" collection into a single
# folder. Audio files are copied unchanged; label CSVs are rewritten with the
# columns onset / offset / cluster (offset = onset + duration, cluster = 0).
dataset_folder = "data/dataset/zebseg/all_birds/train"
# exist_ok=True keeps the cell re-runnable instead of raising FileExistsError.
os.makedirs(dataset_folder, exist_ok=True)
audio_path_list, label_path_list = wav_db.get_audio_and_label_paths(
     ( "collection=='zebseg' AND train_or_test=='train'", [] )  )

for audio_path, csv_path in tqdm(zip(audio_path_list, label_path_list), total=len(audio_path_list)):
    assert audio_path.endswith(".wav") and csv_path.endswith(".csv")
    shutil.copy( audio_path, dataset_folder+"/" )
    shutil.copy( csv_path, dataset_folder+"/" )

    # Rewrite the copied label file in place in the target layout.
    csv_basename = os.path.basename( csv_path )
    target_csv_path = dataset_folder + "/" + csv_basename
    label_df = pd.read_csv( target_csv_path )
    onset = np.array(label_df["onset"])
    duration = np.array(label_df["duration"])
    offset = onset + duration
    # Every segment gets the same cluster id, 0.
    cluster = np.zeros(len(onset), dtype=int)
    pd.DataFrame(
            {"onset":onset,
             "offset":offset,
             "cluster":cluster
            }
    ).to_csv( target_csv_path, index = False )

100%|███████████████████████████████████████████████████| 2606/2606 [00:06<00:00, 386.02it/s]


In [4]:
# Export the test split of the whole "zebseg" collection into a single
# folder. Audio files are copied unchanged; label CSVs are rewritten with the
# columns onset / offset / cluster (offset = onset + duration, cluster = 0).
dataset_folder = "data/dataset/zebseg/all_birds/test"
# exist_ok=True keeps the cell re-runnable instead of raising FileExistsError.
os.makedirs(dataset_folder, exist_ok=True)
audio_path_list, label_path_list = wav_db.get_audio_and_label_paths(
     ( "collection=='zebseg' AND train_or_test=='test'", [] )  )

for audio_path, csv_path in tqdm(zip(audio_path_list, label_path_list), total=len(audio_path_list)):
    assert audio_path.endswith(".wav") and csv_path.endswith(".csv")
    shutil.copy( audio_path, dataset_folder+"/" )
    shutil.copy( csv_path, dataset_folder+"/" )

    # Rewrite the copied label file in place in the target layout.
    csv_basename = os.path.basename( csv_path )
    target_csv_path = dataset_folder + "/" + csv_basename
    label_df = pd.read_csv( target_csv_path )
    onset = np.array(label_df["onset"])
    duration = np.array(label_df["duration"])
    offset = onset + duration
    # Every segment gets the same cluster id, 0.
    cluster = np.zeros(len(onset), dtype=int)
    pd.DataFrame(
            {"onset":onset,
             "offset":offset,
             "cluster":cluster
            }
    ).to_csv( target_csv_path, index = False )

100%|█████████████████████████████████████████████████████| 283/283 [00:00<00:00, 393.16it/s]


# Canary

In [7]:
# Pair every ".wav" file in the raw Canary folder with its
# "<stem>_Labels.txt" annotation file; recordings without one are left out.
raw_data_folder = "data/raw/Canary/"
wav_anno_list = []
for fname in os.listdir(raw_data_folder):
    if not fname.endswith(".wav"):
        continue
    wav_name = raw_data_folder + fname
    # Swap the ".wav" suffix for the annotation suffix.
    anno_name = wav_name[:-len(".wav")] + "_Labels.txt"
    if os.path.exists(anno_name):
        wav_anno_list.append((wav_name, anno_name))
wav_anno_list

[('data/raw/Canary/O5P5U-f00035_November_15_2022_07_26_50.wav',
  'data/raw/Canary/O5P5U-f00035_November_15_2022_07_26_50_Labels.txt'),
 ('data/raw/Canary/O5P5U-f00010_November_15_2022_07_15_19.wav',
  'data/raw/Canary/O5P5U-f00010_November_15_2022_07_15_19_Labels.txt'),
 ('data/raw/Canary/O5P5U-f00019_November_15_2022_07_21_37.wav',
  'data/raw/Canary/O5P5U-f00019_November_15_2022_07_21_37_Labels.txt')]

**Note:** For the annotation file 'data/raw/Canary/O5P5U-f00035_November_15_2022_07_26_50_Labels.txt', I deleted the last three annotations because they looked abnormal.

In [8]:
# Order the recordings from longest to shortest, using the length of the
# audio array that librosa returns when loading each file at sr=16000.
wav_anno_list.sort(
    key=lambda wav_and_anno: len(librosa.load(wav_and_anno[0], sr=16000)[0]),
    reverse=True,
)

In [9]:
# The first two entries of the list form the training set; everything after
# them is held out for testing.
n_train_recordings = 2
train_wav_anno_list = wav_anno_list[:n_train_recordings]
test_wav_anno_list = wav_anno_list[n_train_recordings:]

In [10]:
# Root output folder of the Canary dataset; the "train" and "test"
# subfolders used by the following cells live directly beneath it.
save_data_folder = "data/dataset/Canary/"

In [11]:
# Create the Canary output folders. exist_ok=True tolerates folders left by a
# previous run while still surfacing real errors (e.g. permission problems).
# The former bare `except: pass` hid those, and also skipped creating "test"
# whenever "train" already existed.
os.makedirs(save_data_folder + "train", exist_ok=True)
os.makedirs(save_data_folder + "test", exist_ok=True)

In [12]:
# Copy the training recordings and convert each tab-separated annotation file
# into "<wav stem>.csv" with the columns onset / offset / cluster.
save_folder = save_data_folder+"train/"
for wav_name, anno_name in train_wav_anno_list:
    shutil.copy( wav_name, save_folder )
    on_offset_list = []
    with open(anno_name, "r") as f:
        for line in f:
            # Best-effort parsing: a line that does not hold
            # "onset<TAB>offset<TAB>cluster" is skipped. Only the parsing
            # errors are caught, so unrelated failures are not hidden.
            try:
                line_split = line.split("\t")
                onset = float(line_split[0])
                offset = float(line_split[1])
                cluster = line_split[2].strip()
            except (ValueError, IndexError):
                continue
            # Segments with a non-positive duration are dropped as well.
            if not offset > onset:
                continue

            # Warn (but keep the segment) when it starts before the previous
            # kept segment has ended.
            if on_offset_list and onset < on_offset_list[-1][1]:
                print("overlapping between segments!")

            on_offset_list.append((onset, offset, cluster))

    # zip(*[]) yields nothing to unpack, so fall back to empty columns when no
    # line could be parsed; a header-only CSV is written in that case.
    onsets, offsets, clusters = zip(*on_offset_list) if on_offset_list else ((), (), ())
    dataframe = pd.DataFrame({"onset":onsets, "offset":offsets, "cluster":clusters })
    dataframe.to_csv(save_folder + os.path.basename(wav_name)[:-4]+".csv", index = False )

overlapping between segments!
overlapping between segments!


In [13]:
# Copy the test recordings and convert each tab-separated annotation file
# into "<wav stem>.csv" with the columns onset / offset / cluster.
save_folder = save_data_folder + "test/"
for wav_name, anno_name in test_wav_anno_list:
    shutil.copy( wav_name, save_folder )
    on_offset_list = []
    with open(anno_name, "r") as f:
        for line in f:
            # Best-effort parsing: a line that does not hold
            # "onset<TAB>offset<TAB>cluster" is skipped. Only the parsing
            # errors are caught, so unrelated failures are not hidden.
            try:
                line_split = line.split("\t")
                onset = float(line_split[0])
                offset = float(line_split[1])
                cluster = line_split[2].strip()
            except (ValueError, IndexError):
                continue
            # Segments with a non-positive duration are dropped as well.
            if not offset > onset:
                continue

            # Warn (but keep the segment) when it starts before the previous
            # kept segment has ended.
            if on_offset_list and onset < on_offset_list[-1][1]:
                print("overlapping between segments!")

            on_offset_list.append((onset, offset, cluster))

    # zip(*[]) yields nothing to unpack, so fall back to empty columns when no
    # line could be parsed; a header-only CSV is written in that case.
    onsets, offsets, clusters = zip(*on_offset_list) if on_offset_list else ((), (), ())
    dataframe = pd.DataFrame({"onset":onsets, "offset":offsets, "cluster":clusters })
    dataframe.to_csv(save_folder + os.path.basename(wav_name)[:-4]+".csv", index = False )

# DAS dataset

## Zebra finch

In [15]:
# Read the train/test assignment of the DAS zebra finch files. Each non-blank
# line of traintestsplit.txt is split on whitespace: the first field becomes
# the key and the second field the value of csv_train_test.
folder = "data/raw/DAS/zebra_finch/"
csv_train_test = {}
# The context manager closes the file deterministically; the former
# `for line in open(...)` left the handle open.
with open(folder + "/traintestsplit.txt") as split_file:
    for line in split_file:
        line_split = line.split()
        # Tolerate blank lines (e.g. a trailing newline at the end of the file).
        if not line_split:
            continue
        csv_train_test[line_split[0]] = line_split[1]

In [16]:
# Collect (wav path, annotation csv path, split) triples for every recording
# that has an "<stem>_annotations.csv" file and an entry in csv_train_test.
wav_csv_file_list = []
for fname in os.listdir(folder):
    if not fname.endswith(".wav"):
        continue

    wav_name = folder + "/" + fname
    csv_name = wav_name[:-4] + "_annotations.csv"
    # Lookup key in csv_train_test: the annotation file name without ".csv".
    csv_basename = os.path.basename(csv_name)[:-4]

    has_annotation = os.path.exists(csv_name)
    if not (has_annotation and csv_basename in csv_train_test):
        continue

    # Anything that is not explicitly marked "test" is treated as training data.
    train_or_test = "test" if csv_train_test[csv_basename] == "test" else "train"
    wav_csv_file_list.append((wav_name, csv_name, train_or_test))

In [18]:
# Create the DAS zebra finch output folders. exist_ok=True tolerates folders
# left by a previous run while still surfacing real errors. The former bare
# `except: pass` hid those, and also skipped creating "test" whenever "train"
# already existed.
os.makedirs("data/dataset/DAS/zebra_finch/train", exist_ok=True)
os.makedirs("data/dataset/DAS/zebra_finch/test", exist_ok=True)

In [19]:
# Copy each DAS recording into its split folder and store its annotations
# next to it as "<wav stem>.csv" with the columns onset / offset / cluster.
for wav_file_path, csv_file_path, train_or_test in wav_csv_file_list:
    save_path = (
        "data/dataset/DAS/zebra_finch/train/"
        if train_or_test == "train"
        else "data/dataset/DAS/zebra_finch/test/"
    )

    wav_file_basename = os.path.basename(wav_file_path)
    shutil.copy(wav_file_path, save_path + "/" + wav_file_basename)

    # Map the source annotation columns onto the target column names.
    anno_data = pd.read_csv(csv_file_path)
    anno_structured = pd.DataFrame(
        {
            "onset": anno_data["start_seconds"],
            "offset": anno_data["stop_seconds"],
            "cluster": anno_data["name"],
        }
    )
    # Every segment must have a strictly positive duration.
    assert np.all(anno_structured["offset"] - anno_structured["onset"] > 0)

    csv_file_basename = wav_file_basename[:-4] + ".csv"
    anno_structured.to_csv(save_path + "/" + csv_file_basename, index=False)