Note: This is a script for preparing the dataset. Running this script requires an animal call database.
Therefore, this script is only for internal debugging purposes. You can create the training / test datasets in your own way.

In [1]:
import os,sys,inspect
sys.path.insert(0, os.path.dirname(os.getcwd()) )
from database_manager import WavDB
import shutil
from tqdm import tqdm
import pandas as pd
import numpy as np
import json
import librosa

In [2]:
# Paths (relative to the working directory) of the animal-call database file
# and of its archive folder.
db_path = "../../data/database/DB.splite"
archive_folder = "../../data/database/archive/"
# WavDB comes from the project-local database_manager module; the Zebseg cells
# below query it for matching audio / label file paths.
wav_db = WavDB( db_path = db_path, archive_folder = archive_folder)

In [3]:
# !rm -r data

# Zebseg

## Separate bird

In [4]:
# Export one train/ and one test/ folder per bird (and age, where the
# identifier carries one). Each folder receives the audio files plus a label
# CSV rewritten to the columns onset / offset / cluster.
for bird_and_age in ["R3406_035", "R3406_045", "R3406_055",
                     "R3428_039", "R3428_049", "R3428_059",
                     "R3549_043", "R3549_053", "R3549_063",
                     "R3625_045", "R3625_055", "R3625_065",
                     "g17y2", "g4p5", "g19o10", "g19o3"
                    ]:
    # Identifiers without an "_<age>" suffix (e.g. "g17y2") get an empty age.
    bird_name, bird_age = (bird_and_age.split("_")+[""])[:2]

    for mode in ["train", "test"]:

        dataset_folder = f"data/dataset/zebseg/{bird_and_age}/{mode}"
        # exist_ok=True keeps the cell re-runnable: a plain os.makedirs raises
        # FileExistsError as soon as the folder exists from an earlier run.
        os.makedirs(dataset_folder, exist_ok=True)
        audio_path_list, label_path_list = wav_db.get_audio_and_label_paths( 
             ( f"collection=='zebseg' AND participant_name=='{bird_name}' AND participant_age=='{bird_age}' AND train_or_test=='{mode}'", [] )  )    

        for audio_path, csv_path in tqdm(zip(audio_path_list, label_path_list), total=len(audio_path_list)):
            assert audio_path.endswith(".wav") and csv_path.endswith(".csv")
            shutil.copy( audio_path, dataset_folder+"/" )

            # Convert the label file straight from its source location instead
            # of copying it first and then overwriting the copy: the result is
            # the same file, without the redundant copy (which would also make
            # the overwrite fail if the source file is read-only).
            target_csv_path = dataset_folder + "/" + os.path.basename( csv_path )
            label_df = pd.read_csv( csv_path )
            onset = np.asarray(label_df["onset"])
            offset = onset + np.asarray(label_df["duration"])
            pd.DataFrame(
                    {"onset":onset,
                     "offset":offset,
                     # every segment is assigned the same cluster id 0
                     "cluster":np.zeros(len(onset), dtype=int)
                    }
            ).to_csv( target_csv_path, index = False )

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:00<00:00, 320.43it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 334.02it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 45/45 [00:00<00:00, 314.35it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 338.53it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 177/177 [00:00<00:00, 265.10it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 19/19 [00:00<00:00, 335.96it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 45/45 [00:00<00:00, 337.84it/s]
100%|█

## All Birds

In [5]:
# Export the training recordings of all zebseg birds into a single folder,
# rewriting each label CSV to the columns onset / offset / cluster.
dataset_folder = "data/dataset/zebseg/all_birds/train"
# exist_ok=True keeps the cell re-runnable: a plain os.makedirs raises
# FileExistsError as soon as the folder exists from an earlier run.
os.makedirs(dataset_folder, exist_ok=True)
audio_path_list, label_path_list = wav_db.get_audio_and_label_paths( 
     ( "collection=='zebseg' AND train_or_test=='train'", [] )  )

for audio_path, csv_path in tqdm(zip(audio_path_list, label_path_list), total=len(audio_path_list)):
    assert audio_path.endswith(".wav") and csv_path.endswith(".csv")
    shutil.copy( audio_path, dataset_folder+"/" )

    # Convert the label file straight from its source location instead of
    # copying it first and then overwriting the copy (same resulting file,
    # and no failure when the source file is read-only).
    target_csv_path = dataset_folder + "/" + os.path.basename( csv_path )
    label_df = pd.read_csv( csv_path )
    onset = np.asarray(label_df["onset"])
    offset = onset + np.asarray(label_df["duration"])
    pd.DataFrame(
            {"onset":onset,
             "offset":offset,
             # every segment is assigned the same cluster id 0
             "cluster":np.zeros(len(onset), dtype=int)
            }
    ).to_csv( target_csv_path, index = False )

100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 2606/2606 [00:04<00:00, 603.26it/s]


In [6]:
# Export the test recordings of all zebseg birds into a single folder,
# rewriting each label CSV to the columns onset / offset / cluster.
dataset_folder = "data/dataset/zebseg/all_birds/test"
# exist_ok=True keeps the cell re-runnable: a plain os.makedirs raises
# FileExistsError as soon as the folder exists from an earlier run.
os.makedirs(dataset_folder, exist_ok=True)
audio_path_list, label_path_list = wav_db.get_audio_and_label_paths( 
     ( "collection=='zebseg' AND train_or_test=='test'", [] )  )

for audio_path, csv_path in tqdm(zip(audio_path_list, label_path_list), total=len(audio_path_list)):
    assert audio_path.endswith(".wav") and csv_path.endswith(".csv")
    shutil.copy( audio_path, dataset_folder+"/" )

    # Convert the label file straight from its source location instead of
    # copying it first and then overwriting the copy (same resulting file,
    # and no failure when the source file is read-only).
    target_csv_path = dataset_folder + "/" + os.path.basename( csv_path )
    label_df = pd.read_csv( csv_path )
    onset = np.asarray(label_df["onset"])
    offset = onset + np.asarray(label_df["duration"])
    pd.DataFrame(
            {"onset":onset,
             "offset":offset,
             # every segment is assigned the same cluster id 0
             "cluster":np.zeros(len(onset), dtype=int)
            }
    ).to_csv( target_csv_path, index = False )

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 283/283 [00:00<00:00, 613.52it/s]


# Canary

In [7]:
# Pair every Canary recording with its "<name>_Labels.txt" annotation file;
# recordings without such a file are left out.
raw_data_folder = "data/raw/Canary/"
candidate_pairs = [
    (raw_data_folder + entry, raw_data_folder + entry[:-4] + "_Labels.txt")
    for entry in os.listdir(raw_data_folder)
    if entry.endswith(".wav")
]
wav_anno_list = [pair for pair in candidate_pairs if os.path.exists(pair[1])]
wav_anno_list

[('data/raw/Canary/O5P5U-f00035_November_15_2022_07_26_50.wav',
  'data/raw/Canary/O5P5U-f00035_November_15_2022_07_26_50_Labels.txt'),
 ('data/raw/Canary/O5P5U-f00010_November_15_2022_07_15_19.wav',
  'data/raw/Canary/O5P5U-f00010_November_15_2022_07_15_19_Labels.txt'),
 ('data/raw/Canary/O5P5U-f00019_November_15_2022_07_21_37.wav',
  'data/raw/Canary/O5P5U-f00019_November_15_2022_07_21_37_Labels.txt')]

**Note:** For the annotation file 'raw/O5P5U-f00035_November_15_2022_07_26_50_Labels.txt', I deleted the last three annotations because they looked abnormal.

In [8]:
# Order the (wav, annotation) pairs from longest to shortest recording,
# measured in samples after loading at 16 kHz.
wav_anno_list.sort( key = lambda pair: len(librosa.load( pair[0], sr = 16000 )[0]), reverse = True )

In [9]:
# The first two pairs of the list go to training, every remaining pair to testing.
train_wav_anno_list, test_wav_anno_list = wav_anno_list[:2], wav_anno_list[2:]

In [10]:
# Root output folder of the converted Canary dataset; the cells below append
# "train" / "test" to it.
save_data_folder = "data/dataset/Canary/"

In [11]:
# Create the Canary output folders. exist_ok=True makes re-runs a no-op while
# still surfacing real errors (e.g. permissions). The former bare
# `except: pass` hid every failure, and when "train" already existed the
# exception skipped the creation of "test" altogether.
os.makedirs(save_data_folder+"train", exist_ok=True)
os.makedirs(save_data_folder+"test", exist_ok=True)

In [12]:
# Copy each training recording and convert its tab-separated annotation file
# (onset <TAB> offset <TAB> cluster) into a CSV with the columns
# onset / offset / cluster.
save_folder = save_data_folder+"train/"
for wav_name, anno_name in train_wav_anno_list:
    shutil.copy( wav_name, save_folder )
    on_offset_list = []
    with open(anno_name,"r") as f:
        for line in f:
            line_split = line.split("\t")
            # Skip lines that do not parse as "onset, offset, cluster"
            # (headers, blank or truncated lines). Only the parsing errors are
            # caught, so unrelated failures are no longer silently swallowed.
            try:
                onset = float(line_split[0])
                offset = float(line_split[1])
                cluster = line_split[2].strip()
            except (ValueError, IndexError):
                continue
            # Skip segments whose offset is not strictly after the onset.
            if not offset > onset:
                continue

            # Overlapping segments are reported but kept.
            if on_offset_list and onset < on_offset_list[-1][1]:
                print("overlapping between segments!")

            on_offset_list.append((onset,offset,cluster))

    # Building the frame from the row tuples also works when no valid
    # annotation was found (the CSV then contains only the header), whereas
    # unpacking zip(*[]) raised a ValueError.
    dataframe = pd.DataFrame(on_offset_list, columns=["onset", "offset", "cluster"])
    dataframe.to_csv(save_folder + os.path.splitext(os.path.basename(wav_name))[0]+".csv", index = False )

overlapping between segments!
overlapping between segments!


In [13]:
# Copy each test recording and convert its tab-separated annotation file
# (onset <TAB> offset <TAB> cluster) into a CSV with the columns
# onset / offset / cluster.
save_folder = save_data_folder + "test/"
for wav_name, anno_name in test_wav_anno_list:
    shutil.copy( wav_name, save_folder )
    on_offset_list = []
    with open(anno_name,"r") as f:
        for line in f:
            line_split = line.split("\t")
            # Skip lines that do not parse as "onset, offset, cluster"
            # (headers, blank or truncated lines). Only the parsing errors are
            # caught, so unrelated failures are no longer silently swallowed.
            try:
                onset = float(line_split[0])
                offset = float(line_split[1])
                cluster = line_split[2].strip()
            except (ValueError, IndexError):
                continue
            # Skip segments whose offset is not strictly after the onset.
            if not offset > onset:
                continue

            # Overlapping segments are reported but kept.
            if on_offset_list and onset < on_offset_list[-1][1]:
                print("overlapping between segments!")

            on_offset_list.append((onset,offset,cluster ))

    # Building the frame from the row tuples also works when no valid
    # annotation was found (the CSV then contains only the header), whereas
    # unpacking zip(*[]) raised a ValueError.
    dataframe = pd.DataFrame(on_offset_list, columns=["onset", "offset", "cluster"])
    dataframe.to_csv(save_folder + os.path.splitext(os.path.basename(wav_name))[0]+".csv", index = False )

# DAS dataset

## Zebra finch

In [15]:
# Read the train/test assignment: each line of traintestsplit.txt holds a
# name (first token) followed by its split (second token).
folder = "data/raw/DAS/zebra_finch/"
csv_train_test = {}
# The context manager closes the file deterministically; the previous bare
# open() in the for-statement left the handle open.
with open(folder + "/traintestsplit.txt") as split_file:
    for line in split_file:
        line_split = line.split()
        # Blank or incomplete lines (e.g. a trailing newline at the end of the
        # file) used to raise IndexError; skip them instead.
        if len(line_split) < 2:
            continue
        csv_train_test[line_split[0]] = line_split[1]

In [16]:
# Collect (wav path, annotation csv path, split) triples for every recording
# that has an "<name>_annotations.csv" file listed in csv_train_test.
wav_csv_file_list = []
for entry in os.listdir(folder):
    if not entry.endswith(".wav"):
        continue

    wav_name = folder + "/" + entry
    csv_name = wav_name[:-4]+"_annotations.csv"

    # Key used in the split table: the annotation file name without ".csv".
    split_key = os.path.basename( csv_name )[:-4]
    if os.path.exists(csv_name) and split_key in csv_train_test:
        # Anything not explicitly marked "test" is treated as training data.
        train_or_test = "test" if csv_train_test[split_key] == "test" else "train"
        wav_csv_file_list.append( ( wav_name, csv_name, train_or_test ) )

In [18]:
# Create the DAS zebra finch output folders. exist_ok=True makes re-runs a
# no-op while still surfacing real errors (e.g. permissions). The former bare
# `except: pass` hid every failure, and when "train" already existed the
# exception skipped the creation of "test" altogether.
os.makedirs("data/dataset/DAS/zebra_finch/train", exist_ok=True)
os.makedirs("data/dataset/DAS/zebra_finch/test", exist_ok=True)

In [19]:
# Copy every DAS recording into its split folder and store its annotations
# next to it as "<wav name>.csv" with the columns onset / offset / cluster.
for wav_file_path, csv_file_path, train_or_test in wav_csv_file_list:
    save_path = (
        "data/dataset/DAS/zebra_finch/train/"
        if train_or_test == "train"
        else "data/dataset/DAS/zebra_finch/test/"
    )

    wav_file_basename = os.path.basename( wav_file_path )
    shutil.copy( wav_file_path, f"{save_path}/{wav_file_basename}" )

    # Rename the DAS columns to the onset / offset / cluster schema.
    anno_data = pd.read_csv( csv_file_path )
    anno_structured = pd.DataFrame(
        {
            "onset": anno_data["start_seconds"],
            "offset": anno_data["stop_seconds"],
            "cluster": anno_data["name"],
        }
    )

    # Every segment must have a strictly positive duration.
    segment_durations = anno_structured[ "offset" ] - anno_structured[ "onset" ]
    assert np.all(segment_durations > 0)

    csv_file_basename = wav_file_basename[:-4] + ".csv"
    anno_structured.to_csv( f"{save_path}/{csv_file_basename}", index = False )

# Meerkat dataset

In [1]:
import shutil
from tqdm import tqdm
import pandas as pd
import numpy as np
import json
import librosa
from glob import glob
import os
from io import StringIO
import soundfile as sf

In [2]:
# Find all meerkat recordings (lower- and upper-case extension) and, for each,
# the annotation file of the same name ending in "csv" or "CSV".
audio_file_list = [
    wav_path
    for pattern in ("data/raw/Meerkat/meerkat/*.wav", "data/raw/Meerkat/meerkat/*.WAV")
    for wav_path in glob(pattern)
]

csv_file_list = []
for wav_path in audio_file_list:
    lower_csv = wav_path[:-3] + "csv"
    upper_csv = wav_path[:-3] + "CSV"
    if not os.path.exists( lower_csv ):
        # No lower-case annotation file: the upper-case variant must exist.
        assert os.path.exists( upper_csv )
        csv_file_list.append( upper_csv )
    else:
        csv_file_list.append( lower_csv )

assert len(csv_file_list) == len(audio_file_list)
list(zip( audio_file_list, csv_file_list ))

[('data/raw/Meerkat/meerkat/ZIP040_SingleCallTypesMerged_2022_ML_03.wav',
  'data/raw/Meerkat/meerkat/ZIP040_SingleCallTypesMerged_2022_ML_03.csv'),
 ('data/raw/Meerkat/meerkat/ZIP040_2022_ML_01.wav',
  'data/raw/Meerkat/meerkat/ZIP040_2022_ML_01.csv'),
 ('data/raw/Meerkat/meerkat/ZIP040_2022_ML_02.wav',
  'data/raw/Meerkat/meerkat/ZIP040_2022_ML_02.csv'),
 ('data/raw/Meerkat/meerkat/VALP009_AL_5_15DEC2022_MF_ML.WAV',
  'data/raw/Meerkat/meerkat/VALP009_AL_5_15DEC2022_MF_ML.csv'),
 ('data/raw/Meerkat/meerkat/VLM298_L_4_27DEC2022_MF_ML.WAV',
  'data/raw/Meerkat/meerkat/VLM298_L_4_27DEC2022_MF_ML.csv'),
 ('data/raw/Meerkat/meerkat/VALP007_AL_6_15DEC2022_MF_ML.WAV',
  'data/raw/Meerkat/meerkat/VALP007_AL_6_15DEC2022_MF_ML.csv')]

In [3]:
# Load every meerkat recording and keep only its first channel together with
# the sampling rate returned by librosa (no explicit sr is requested here).
clean_audio_list = []
for audio_file in audio_file_list:
    audio, sr = librosa.load( audio_file, mono=False )
    # With mono=False a multi-channel file comes back as (channels, samples),
    # but a single-channel file comes back 1-D. Indexing [0] unconditionally
    # would then pick a single sample instead of a channel, so only index
    # when there is a channel axis.
    if audio.ndim > 1:
        audio = audio[0]
    clean_audio_list.append(
        {
            "audio":audio,
            "sr":sr
        }
    )

In [4]:
# Parse every tab-separated meerkat annotation file into a DataFrame after
# stripping all double quotes from its text.
anno_list = []
for csv_file in csv_file_list:
    # Read through a context manager so the handle is closed (the previous
    # open(...).readlines() leaked it). Replacing the quotes on the whole text
    # also avoids re-joining lines that still carried their "\n" with another
    # "\n", which doubled every line break and only parsed correctly because
    # pandas skips blank lines by default.
    with open(csv_file,"r") as csv_handle:
        csv_data = csv_handle.read().replace('"','')
    df = pd.read_csv(StringIO(csv_data), sep='\t')

    anno_list.append(df)

In [5]:
def decimal_to_seconds( decimal_time: str ) -> float:
    """Convert a colon-separated time stamp into seconds.

    Args:
        decimal_time: Time formatted as "MM:SS[.fff]" or "HH:MM:SS[.fff]".

    Returns:
        The time in seconds as a float.

    Raises:
        ValueError: If the string does not have exactly two or three
            colon-separated fields, or a field is not numeric.
    """
    splits = decimal_time.split(":")
    if len(splits) == 2:
        hours = 0
        minutes, seconds = splits
    elif len(splits) == 3:
        hours, minutes, seconds = splits
    else:
        # An explicit exception instead of `assert False`: assertions are
        # removed under `python -O`, after which the function would fail
        # later with an unrelated UnboundLocalError.
        raise ValueError(
            f"expected 'MM:SS' or 'HH:MM:SS' time format, got {decimal_time!r}"
        )

    return int(hours) * 3600 + int(minutes) * 60 + float(seconds)

In [6]:
# Turn the raw "Start" / "Duration" time stamps of every annotation table
# into onset / offset seconds; all segments get the cluster label "0".
clean_anno_list = []
for anno in anno_list:
    onset_list, offset_list, cluster_list = [], [], []
    for start_text, duration_text in zip( anno["Start"], anno["Duration"] ):
        onset_seconds = decimal_to_seconds( start_text )
        onset_list.append( onset_seconds )
        offset_list.append( onset_seconds + decimal_to_seconds( duration_text ) )
        cluster_list.append( "0" )

    clean_anno = pd.DataFrame(
        {
            "onset":onset_list,
            "offset":offset_list,
            "cluster":cluster_list
        }
    )
    clean_anno_list.append( clean_anno )

In [7]:
# Split every recording (and its annotations) into a 90% training part and a
# 10% test part. Segments that straddle the split point are cut there and
# appear, truncated, in both parts.
np.random.seed(0)

train_corpus = []
test_corpus = []

for idx, clean_audio in enumerate(clean_audio_list):
    waveform = clean_audio["audio"]
    sample_rate = clean_audio["sr"]
    labels = clean_anno_list[idx]

    recording_name = os.path.basename( audio_file_list[idx] )[:-4]

    ## use either the first 10% or the last 10% of the recording for testing
    test_is_head = np.random.choice(2) == 0
    ratio = 0.1 if test_is_head else 0.9

    split_point = int( len( waveform ) * ratio )
    split_time = split_point / sample_rate

    head_audio, tail_audio = waveform[:split_point], waveform[split_point:]

    # Segments starting before the split, clipped to end no later than it.
    head_anno = labels[ labels["onset"] < split_time ].copy()
    head_anno["offset"] = np.minimum( head_anno["offset"], split_time )

    # Segments ending after the split, shifted so that time 0 is the split point.
    tail_anno = labels[ labels["offset"] > split_time ].copy()
    tail_anno["onset"] = np.maximum( tail_anno["onset"] - split_time, 0.0 )
    tail_anno["offset"] = tail_anno["offset"] - split_time

    if test_is_head:
        test_audio, test_anno = head_audio, head_anno
        train_audio, train_anno = tail_audio, tail_anno
    else:
        train_audio, train_anno = head_audio, head_anno
        test_audio, test_anno = tail_audio, tail_anno

    for corpus, part_audio, part_anno in (
        (train_corpus, train_audio, train_anno),
        (test_corpus, test_audio, test_anno),
    ):
        corpus.append(
            {
                "audio":part_audio,
                "annotation":part_anno,
                "sr":sample_rate,
                "name":recording_name
            }
        )

In [8]:
# Make sure both meerkat output folders exist (no error if they already do).
for meerkat_dir in ("data/dataset/meerkat/train", "data/dataset/meerkat/test"):
    os.makedirs(meerkat_dir, exist_ok=True)

In [9]:
# Write every training example as "<name>.wav" (24-bit PCM) plus "<name>.csv".
for item in train_corpus:
    output_stem = "data/dataset/meerkat/train/" + item["name"]

    sf.write(output_stem + ".wav", item["audio"], item["sr"], subtype='PCM_24')
    item["annotation"].to_csv( output_stem + ".csv", index = False )

In [10]:
# Write every test example as "<name>.wav" (24-bit PCM) plus "<name>.csv".
for item in test_corpus:
    output_stem = "data/dataset/meerkat/test/" + item["name"]

    sf.write(output_stem + ".wav", item["audio"], item["sr"], subtype='PCM_24')
    item["annotation"].to_csv( output_stem + ".csv", index = False )