In [19]:
from avgn.downloading.download import download_tqdm
from avgn.utils.paths import DATA_DIR
from avgn.utils.general import unzip_file
from tqdm.autonotebook import tqdm
import librosa

In [2]:
# # where the files are located online (url, filename)
# data_urls = [
#     ('https://ndownloader.figshare.com/articles/3470165/versions/1', 'all_files.zip'),
# ]
# root folder for the raw dataset files used by this notebook
output_loc = DATA_DIR / "raw" / "sparse"

In [3]:
# # download the files locally
# for url, filename in data_urls:
#     # print(url, filename)
#     download_tqdm(url, output_location=output_loc/filename)

In [4]:
# # list the downloaded files
# zip_files = list((output_loc/"zip_contents").glob('*.zip'))
# zip_files[:2]

In [5]:
# # unzip the files
# for zf in tqdm(zip_files):
#     unzip_file(zf, output_loc/"zip_contents")

In [6]:
from datetime import datetime
import avgn.utils
import numpy as np

In [7]:
# folder holding the unzipped dataset; the cells below glob it for
# Bird*/Wave/*.wav recordings and Bird*/Annotation.xml annotation files
RAW_DATASET_LOC = output_loc / "zip_contents"
# RAW_DATASET_LOC  # uncomment to display the resolved path

PosixPath('/mnt/cube/ntansey/avgn_test/avgn_paper/data/raw/sparse/zip_contents')

In [8]:
# name of the dataset; used as a folder name for the processed output
DATASET_ID = "sparse_finch"

# timestamp taken when this cell runs, so the files written by each run of the
# notebook land in their own folder
DT_ID = f"{datetime.now():%Y-%m-%d_%H-%M-%S}"

In [9]:
# collect every raw recording: the wav files inside each bird's Wave/ folder
wav_list = [*RAW_DATASET_LOC.glob("Bird*/Wave/*.wav")]
# sanity check: number of files found and the last two paths in sorted order
len(wav_list), np.sort(wav_list)[-2:]

(2965,
 array([PosixPath('/mnt/cube/ntansey/avgn_test/avgn_paper/data/raw/sparse/zip_contents/Bird9/Wave/98.wav'),
        PosixPath('/mnt/cube/ntansey/avgn_test/avgn_paper/data/raw/sparse/zip_contents/Bird9/Wave/99.wav')],
       dtype=object))

In [10]:
# collect the annotation file of every bird (one Annotation.xml per Bird* folder)
annotation_files = [*RAW_DATASET_LOC.glob("Bird*/Annotation.xml")]
# sanity check: number of files found and the last two paths in sorted order
len(annotation_files), np.sort(annotation_files)[-2:]

(11,
 array([PosixPath('/mnt/cube/ntansey/avgn_test/avgn_paper/data/raw/sparse/zip_contents/Bird8/Annotation.xml'),
        PosixPath('/mnt/cube/ntansey/avgn_test/avgn_paper/data/raw/sparse/zip_contents/Bird9/Annotation.xml')],
       dtype=object))

#### Now, for each wav file, we want to generate a JSON, using information from the XML.

Let's look inside one of the XML files first to see what it contains. It can also help to open the XML file in your web browser to get a better idea of its structure.

In [11]:
import xml.etree.ElementTree
import xml.dom.minidom

In [12]:
# pretty-print the first annotation file so its structure can be inspected
sample_annotation = annotation_files[0].as_posix()
parssed = xml.dom.minidom.parse(sample_annotation)
pretty_xml_as_string = parssed.toprettyxml()
# uncomment to show the beginning of the document
# print(pretty_xml_as_string[:400] + '...')

In [13]:
import pandas as pd

In [14]:
# one row per annotated sequence: the bird, the wav file the sequence is in,
# the sequence-level fields, and one list entry per note for the Note* columns
song_columns = [
    "bird",
    "WaveFileName",
    "Position",
    "Length",
    "NumNote",
    "NotePositions",
    "NoteLengths",
    "NoteLabels",
]
# start from an empty table with that layout
song_df = pd.DataFrame(columns=song_columns)
song_df

Unnamed: 0,bird,WaveFileName,Position,Length,NumNote,NotePositions,NoteLengths,NoteLabels


In [15]:
# Parse every bird's XML annotation file into song_df, one row per "Sequence".
# The records are collected in a list and the dataframe is built once at the
# end: this avoids growing the dataframe row by row, and it means re-running
# this cell rebuilds song_df instead of appending duplicate rows to it.
song_columns = [
    "bird",
    "WaveFileName",
    "Position",
    "Length",
    "NumNote",
    "NotePositions",
    "NoteLengths",
    "NoteLabels",
]
# sequence-level tags that are copied verbatim (as text) into the record
sequence_tags = ("WaveFileName", "Position", "Length", "NumNote")
# tag inside a <Note> element -> per-note list column it is appended to
note_columns = {
    "Label": "NoteLabels",
    "Position": "NotePositions",
    "Length": "NoteLengths",
}

song_records = []
for bird_loc in tqdm(annotation_files):
    # parse the annotation file; the bird is identified by its folder name
    bird_xml = xml.etree.ElementTree.parse(bird_loc).getroot()
    bird = bird_loc.parent.stem
    # loop through each "Sequence" in the dataset (corresponding to a bout);
    # leave=False keeps the per-bird progress bars from piling up in the output
    for element in tqdm(list(bird_xml), leave=False):
        if element.tag != "Sequence":
            continue
        # start from a blank record so that a tag missing from this sequence
        # can never silently inherit the value parsed for the previous one
        record = {
            "bird": bird,
            "WaveFileName": None,
            "Position": None,
            "Length": None,
            "NumNote": None,
            "NotePositions": [],
            "NoteLengths": [],
            "NoteLabels": [],
        }
        for seq_element in element:
            if seq_element.tag in sequence_tags:
                # metadata for the sequence itself
                record[seq_element.tag] = seq_element.text
            elif seq_element.tag == "Note":
                # metadata for one note of the sequence
                for note_element in seq_element:
                    column = note_columns.get(note_element.tag)
                    if column is not None:
                        record[column].append(note_element.text)
        song_records.append(record)

# build the dataframe in one go, with the same column order as before
song_df = pd.DataFrame(song_records, columns=song_columns)

  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/412 [00:00<?, ?it/s]

  0%|          | 0/1351 [00:00<?, ?it/s]

  0%|          | 0/2110 [00:00<?, ?it/s]

  0%|          | 0/1964 [00:00<?, ?it/s]

  0%|          | 0/572 [00:00<?, ?it/s]

  0%|          | 0/1486 [00:00<?, ?it/s]

  0%|          | 0/1495 [00:00<?, ?it/s]

  0%|          | 0/1854 [00:00<?, ?it/s]

  0%|          | 0/419 [00:00<?, ?it/s]

  0%|          | 0/1238 [00:00<?, ?it/s]

  0%|          | 0/2501 [00:00<?, ?it/s]

### Now we can generate a JSON from that pandas dataframe

In [21]:
from avgn.utils.audio import get_samplerate
import librosa
from avgn.utils.json import NoIndent, NoIndentEncoder
import json

In [24]:
# Write one JSON metadata file per wav file: species, wav location, sample
# rate, duration, and the start/end time and label of every annotated note.
for bird in tqdm(np.unique(song_df.bird)):
    # grab that bird's annotations
    bird_df = song_df[song_df.bird == bird]

    # for each wav file produced by that bird
    for wfn in tqdm(bird_df.WaveFileName.unique(), leave=False):
        # all annotated sequences that belong to this wav file
        wfn_df = bird_df[bird_df.WaveFileName == wfn]

        # get the location of the wav
        wav_loc = RAW_DATASET_LOC / bird / "Wave" / wfn

        # get the wav samplerate and duration
        try:
            sr = get_samplerate(wav_loc.as_posix())  # this function often has errors
        except Exception:
            # Fall back to librosa to read the file's own sample rate.
            # librosa.load() without sr=None resamples to 22050 Hz and would
            # report 22050 for every file, which would corrupt all of the
            # note times computed below for files recorded at another rate.
            sr = librosa.get_samplerate(wav_loc.as_posix())
        # `path=` replaces the deprecated `filename=` alias (see the warning
        # this cell used to emit)
        wav_duration = librosa.get_duration(path=wav_loc)

        # Flatten the notes of every sequence in this wav into parallel lists.
        # Note positions are added to their sequence's Position and divided by
        # the sample rate, i.e. sample offsets are converted to seconds.
        sequence_nums, labels, start_times, end_times = [], [], [], []
        for sequence_num, (_, row) in enumerate(wfn_df.iterrows()):
            note_starts = np.array(row.NotePositions).astype("int") + int(row.Position)
            note_ends = note_starts + np.array(row.NoteLengths).astype("int")
            sequence_nums.extend([sequence_num] * len(row.NotePositions))
            labels.extend(row.NoteLabels)
            start_times.extend((note_starts / sr).astype("float64").tolist())
            end_times.extend((note_ends / sr).astype("float64").tolist())

        # make json dictionary
        json_dict = {
            # species
            "species": "Lonchura striata domestica",
            "common_name": "Bengalese finch",
            "wav_loc": wav_loc.as_posix(),
            # rate and length
            "samplerate_hz": sr,
            "length_s": wav_duration,
            # syllable information; NoIndent keeps each list on a single line
            "indvs": {
                bird: {
                    "notes": {
                        "start_times": NoIndent(start_times),
                        "end_times": NoIndent(end_times),
                        "labels": NoIndent(labels),
                        "sequence_num": NoIndent(sequence_nums),
                    }
                }
            },
        }

        # dump dict into json format
        json_txt = json.dumps(json_dict, cls=NoIndentEncoder, indent=2)

        wav_stem = bird + "_" + wfn.split(".")[0]
        json_out = (
            DATA_DIR / "processed" / DATASET_ID / DT_ID / "JSON" / (wav_stem + ".JSON")
        )

        # save json; the context manager closes the file handle after each write
        avgn.utils.paths.ensure_dir(json_out.as_posix())
        with open(json_out.as_posix(), "w") as json_file:
            print(json_txt, file=json_file)
        

  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/135 [00:00<?, ?it/s]

	This alias will be removed in version 1.0.
  wav_duration = librosa.get_duration(filename=wav_loc)


  0%|          | 0/315 [00:00<?, ?it/s]

  0%|          | 0/94 [00:00<?, ?it/s]

  0%|          | 0/339 [00:00<?, ?it/s]

  0%|          | 0/402 [00:00<?, ?it/s]

  0%|          | 0/441 [00:00<?, ?it/s]

  0%|          | 0/335 [00:00<?, ?it/s]

  0%|          | 0/235 [00:00<?, ?it/s]

  0%|          | 0/310 [00:00<?, ?it/s]

  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/217 [00:00<?, ?it/s]

In [25]:
# example of the JSON needed for each song, with start times for each syllable
# (json_txt holds the text generated for the last wav file processed by the loop)
print(json_txt)

{
  "species": "Lonchura striata domestica",
  "common_name": "Bengalese finch",
  "wav_loc": "/mnt/cube/ntansey/avgn_test/avgn_paper/data/raw/sparse/zip_contents/Bird9/Wave/216.wav",
  "samplerate_hz": 22050,
  "length_s": 11.122358276643991,
  "indvs": {
    "Bird9": {
      "notes": {
        "start_times": [1.6805442176870748, 1.8895238095238096, 2.105759637188209, 2.329251700680272, 2.555646258503401, 2.786394557823129, 3.03891156462585, 3.2551473922902496, 3.488798185941043, 3.719546485260771, 3.937233560090703, 4.1302494331065756, 4.311655328798186, 4.4727437641723355, 4.581587301587302, 4.733968253968254, 4.893605442176871, 5.060498866213152, 5.224489795918367, 5.3913832199546485, 5.5626303854875285, 5.730975056689342, 5.903673469387755, 6.011065759637188, 6.1663492063492065, 6.333242630385487, 6.492879818594104, 6.65687074829932, 6.817959183673469, 6.977596371882086, 7.134331065759637, 7.301224489795918, 7.4681179138322, 7.655328798185941, 7.836734693877551, 8.050068027210884,