In [1]:
from avgn.downloading.download import download_tqdm
from avgn.utils.paths import DATA_DIR
from avgn.utils.general import unzip_file
from tqdm.autonotebook import tqdm

  from tqdm.autonotebook import tqdm


In [4]:
# NOTE(review): the download configuration below is commented out in this copy
# of the notebook. Later cells only read from `output_loc`, so the raw data is
# presumably expected to already be on disk -- re-enable these lines (and the
# download / unzip cells that follow) to fetch it from scratch.
# # where the files are located online (url, filename)
# data_urls = [
#     ('https://ndownloader.figshare.com/articles/3470165/versions/1', 'all_files.zip'),
# ]
# where to save the files (this assignment is live: RAW_DATASET_LOC is derived from it)
output_loc = DATA_DIR/"raw/koumura/"

In [3]:
# # download the files locally
# for url, filename in data_urls:
#     # print(url, filename)
#     download_tqdm(url, output_location=output_loc/filename)

In [1]:
# # list the downloaded files
# zip_files = list((output_loc/"zip_contents").glob('*.zip'))
# zip_files[:2]

In [31]:
# # unzip the files
# for zf in tqdm(zip_files):
#     unzip_file(zf, output_loc/"zip_contents")

  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/342 [00:00<?, ?it/s]

  0%|          | 0/346 [00:00<?, ?it/s]

  0%|          | 0/409 [00:00<?, ?it/s]

  0%|          | 0/448 [00:00<?, ?it/s]

  0%|          | 0/101 [00:00<?, ?it/s]

  0%|          | 0/317 [00:00<?, ?it/s]

  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/224 [00:00<?, ?it/s]

  0%|          | 0/149 [00:00<?, ?it/s]

  0%|          | 0/322 [00:00<?, ?it/s]

  0%|          | 0/242 [00:00<?, ?it/s]

### Getting the data into a usable format
Now that the data is saved, we want to get the annotations into the same format as all of the other datasets. 

The format we use is JSON, which just holds a dictionary of information about the dataset. 


For each .WAV file, we will create a JSON that looks something like this:

```
{
    "length_s": 15,
    "samplerate_hz": 30000,
    "wav_location": "/location/of/my/dataset/myfile.wav",
    "indvs": {
        "Bird1": {
            "species": "Bengalese finch",
            "units": {
                "syllables": {
                    "start_times": [1.5, 2.5, 6],
                    "end_times": [2.3, 4.5, 8],
                    "labels": ["a", "b", "c"]
                }
            }
        }
    }
}
```

To get data into this format, you're generally going to have to write a custom parser to convert your data from your own format into the AVGN format. We're going to create a custom parser here for this dataset, as an example. You could also create these JSONs by hand. 

**Note:** If your dataset is more annotated than that, take a look at the readme.md in the github repository for more examples of JSONs. If your dataset is not already segmented for syllables, don't add "units", and you can add them after automatic segmentation.

In [2]:
from datetime import datetime
import avgn.utils
import numpy as np

In [5]:
# Folder holding the unzipped per-bird directories; it is derived from
# `output_loc`, so point `output_loc` elsewhere if the raw data lives in
# another location rather than hard-coding an absolute path here.
RAW_DATASET_LOC = output_loc / "zip_contents"
# display the resolved location as the cell output
RAW_DATASET_LOC

PosixPath('/mnt/cube/ntansey/avgn_test/avgn_paper/data/raw/koumura/zip_contents')

In [6]:
# Name of this dataset; used as a folder name for the processed output.
DATASET_ID = "koumura_bengalese_finch"

# Timestamp of this run (year-month-day_hour-minute-second). It becomes a
# sub-folder of the processed output, so files written by separate runs of the
# notebook do not overwrite each other.
DT_ID = f"{datetime.now():%Y-%m-%d_%H-%M-%S}"

In [7]:
# collect every raw recording: each bird folder keeps its .wav files in Wave/
wav_list = [*RAW_DATASET_LOC.glob('Bird*/Wave/*.wav')]
# sanity check: number of recordings found, plus the last two paths once sorted
len(wav_list), np.sort(wav_list)[-2:]

(2965,
 array([PosixPath('/mnt/cube/ntansey/avgn_test/avgn_paper/data/raw/koumura/zip_contents/Bird9/Wave/98.wav'),
        PosixPath('/mnt/cube/ntansey/avgn_test/avgn_paper/data/raw/koumura/zip_contents/Bird9/Wave/99.wav')],
       dtype=object))

In [8]:
# collect the annotation file of every bird (one Annotation.xml per bird folder)
annotation_files = [*RAW_DATASET_LOC.glob('Bird*/Annotation.xml')]
# sanity check: number of annotation files found, plus the last two once sorted
len(annotation_files), np.sort(annotation_files)[-2:]

(11,
 array([PosixPath('/mnt/cube/ntansey/avgn_test/avgn_paper/data/raw/koumura/zip_contents/Bird8/Annotation.xml'),
        PosixPath('/mnt/cube/ntansey/avgn_test/avgn_paper/data/raw/koumura/zip_contents/Bird9/Annotation.xml')],
       dtype=object))

#### Now, for each wav file, we want to generate a JSON, using information from the XML.

Let's take a look inside an XML file first, to see what's in there. It may also be useful to open this XML file in your web browser to get a better idea of its structure.

In [9]:
import xml.etree.ElementTree
import xml.dom.minidom

In [11]:
# Build a pretty-printed string of one annotation file so its structure can be
# inspected. `annotation_files` comes from an unsorted glob, so which bird's
# file ends up at index 0 is not guaranteed.
# NOTE(review): `parssed` is a misspelling of "parsed"; the name is left as-is
# because notebook variables are global and other cells may refer to it.
parssed  = xml.dom.minidom.parse(annotation_files[0].as_posix()) 
pretty_xml_as_string = parssed.toprettyxml()
# uncomment to print the first 400 characters of the XML
# print(pretty_xml_as_string[:400] + '...')

In [12]:
import pandas as pd

In [14]:
# Columns of the per-sequence annotation table: the bird and wav file a
# sequence belongs to, the sequence-level fields read from the XML, and three
# list-valued columns describing the notes inside the sequence.
song_columns = [
    "bird",
    "WaveFileName",
    "Position",
    "Length",
    "NumNote",
    "NotePositions",
    "NoteLengths",
    "NoteLabels",
]
# start from an empty table with those columns and display it
song_df = pd.DataFrame(columns=song_columns)
song_df

Unnamed: 0,bird,WaveFileName,Position,Length,NumNote,NotePositions,NoteLengths,NoteLabels


In [15]:
# Parse every bird's Annotation.xml into one record per <Sequence> element
# (a song bout) and build `song_df` from those records.
#
# Records are collected in a plain list and the DataFrame is constructed once
# at the end. Compared with appending through `song_df.loc[len(song_df)]` this
# avoids the quadratic cost of growing a DataFrame row by row, and it makes
# the cell idempotent: re-running it rebuilds `song_df` instead of appending a
# second copy of every row.
song_records = []
for bird_loc in tqdm(annotation_files):
    # root element of this bird's annotation file
    bird_xml = xml.etree.ElementTree.parse(bird_loc).getroot()
    # the bird's name is the folder that contains Annotation.xml (e.g. "Bird9")
    bird = bird_loc.parent.stem
    # loop through each "Sequence" in the dataset (corresponding to a bout)
    for element in tqdm(list(bird_xml)):
        if element.tag != "Sequence":
            continue
        # Reset the per-sequence fields so that a sequence lacking one of
        # these tags is recorded as None rather than silently inheriting the
        # value parsed for the previous sequence.
        WaveFileName = position = length = NumNote = None
        notePositions = []
        noteLengths = []
        noteLabels = []
        # get the metadata for that sequence (values are kept as the raw XML text)
        for seq_element in list(element):
            if seq_element.tag == "Position":
                position = seq_element.text
            elif seq_element.tag == "Length":
                length = seq_element.text
            elif seq_element.tag == "WaveFileName":
                WaveFileName = seq_element.text
            elif seq_element.tag == "NumNote":
                NumNote = seq_element.text
            # get the metadata for each note of the sequence
            elif seq_element.tag == "Note":
                for note_element in list(seq_element):
                    if note_element.tag == "Label":
                        noteLabels.append(note_element.text)
                    elif note_element.tag == "Position":
                        notePositions.append(note_element.text)
                    elif note_element.tag == "Length":
                        noteLengths.append(note_element.text)
        # one record per sequence, in the column order used for song_df below
        song_records.append(
            [
                bird,
                WaveFileName,
                position,
                length,
                NumNote,
                notePositions,
                noteLengths,
                noteLabels,
            ]
        )

# build the annotation table in a single step
song_df = pd.DataFrame(
    song_records,
    columns=[
        "bird",
        "WaveFileName",
        "Position",
        "Length",
        "NumNote",
        "NotePositions",
        "NoteLengths",
        "NoteLabels",
    ],
)

  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/1238 [00:00<?, ?it/s]

  0%|          | 0/2501 [00:00<?, ?it/s]

  0%|          | 0/412 [00:00<?, ?it/s]

  0%|          | 0/1495 [00:00<?, ?it/s]

  0%|          | 0/419 [00:00<?, ?it/s]

  0%|          | 0/1854 [00:00<?, ?it/s]

  0%|          | 0/572 [00:00<?, ?it/s]

  0%|          | 0/1486 [00:00<?, ?it/s]

  0%|          | 0/2110 [00:00<?, ?it/s]

  0%|          | 0/1351 [00:00<?, ?it/s]

  0%|          | 0/1964 [00:00<?, ?it/s]

In [17]:
# song_df[:3]

### Now we can generate a JSON from that pandas dataframe

In [18]:
from avgn.utils.audio import get_samplerate
import librosa
from avgn.utils.json import NoIndent, NoIndentEncoder
import json

In [19]:
# Write one AVGN-format JSON file per wav file. Each JSON describes the
# recording (species, location, sample rate, duration) and lists every
# annotated note of every sequence found in that recording.
for bird in tqdm(np.unique(song_df.bird)):
    # grab that bird's annotations
    bird_df = song_df[song_df.bird == bird]

    # for each wav file produced by that bird
    for wfn in tqdm(bird_df.WaveFileName.unique(), leave=False):
        # all annotated sequences that belong to this wav file
        wfn_df = bird_df[bird_df.WaveFileName == wfn]

        # get the location of the wav
        wav_loc = RAW_DATASET_LOC / bird / "Wave" / wfn

        # get the wav samplerate and duration
        sr = get_samplerate(wav_loc.as_posix())
        # `path=` is the current keyword; the `filename=` alias is deprecated
        # and emits a warning announcing its removal in librosa 1.0
        wav_duration = librosa.get_duration(path=wav_loc)

        # make json dictionary
        json_dict = {}
        # add species
        json_dict["species"] = "Lonchura striata domestica"
        json_dict["common_name"] = "Bengalese finch"
        json_dict["wav_loc"] = wav_loc.as_posix()
        # rate and length
        json_dict["samplerate_hz"] = sr
        json_dict["length_s"] = wav_duration

        # Collect the notes of every sequence in this wav file. The XML values
        # are strings holding sample counts, and note positions are stored
        # relative to the start of their sequence, so the sequence's own
        # Position is added before dividing by the sample rate.
        note_starts, note_ends, note_labels, note_seq_nums = [], [], [], []
        for sequence_num, (_, row) in enumerate(wfn_df.iterrows()):
            seq_offset = int(row.Position)
            rel_starts = np.array(row.NotePositions).astype("int")
            rel_lengths = np.array(row.NoteLengths).astype("int")
            note_starts.append(((rel_starts + seq_offset) / sr).astype("float64"))
            note_ends.append(
                ((rel_starts + rel_lengths + seq_offset) / sr).astype("float64")
            )
            note_labels.append(list(row.NoteLabels))
            # tag every note with the index of the sequence it came from
            note_seq_nums.append(
                list(np.repeat(sequence_num, len(row.NotePositions)))
            )

        # add syllable information (NoIndent keeps each list on a single line)
        json_dict["indvs"] = {
            bird: {
                "notes": {
                    "start_times": NoIndent(list(np.concatenate(note_starts))),
                    "end_times": NoIndent(list(np.concatenate(note_ends))),
                    "labels": NoIndent(list(np.concatenate(note_labels))),
                    "sequence_num": NoIndent(
                        [int(i) for i in np.concatenate(note_seq_nums)]
                    ),
                }
            }
        }

        # dump dict into json format
        json_txt = json.dumps(json_dict, cls=NoIndentEncoder, indent=2)

        # output name: <bird>_<wav file name up to its first dot>.JSON
        wav_stem = bird + "_" + wfn.split(".")[0]
        json_out = (
            DATA_DIR / "processed" / DATASET_ID / DT_ID / "JSON" / (wav_stem + ".JSON")
        )

        # save json; the context manager closes the file after each write
        # instead of leaving one open handle per wav file behind
        avgn.utils.paths.ensure_dir(json_out.as_posix())
        with open(json_out.as_posix(), "w") as json_file:
            print(json_txt, file=json_file)
        

  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/135 [00:00<?, ?it/s]

	This alias will be removed in version 1.0.
  wav_duration = librosa.get_duration(filename=wav_loc)


  0%|          | 0/315 [00:00<?, ?it/s]

  0%|          | 0/94 [00:00<?, ?it/s]

  0%|          | 0/339 [00:00<?, ?it/s]

  0%|          | 0/402 [00:00<?, ?it/s]

  0%|          | 0/441 [00:00<?, ?it/s]

  0%|          | 0/335 [00:00<?, ?it/s]

  0%|          | 0/235 [00:00<?, ?it/s]

  0%|          | 0/310 [00:00<?, ?it/s]

  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/217 [00:00<?, ?it/s]

In [20]:
# print an example JSON corresponding to the dataset we just made
# (`json_txt` still holds the text generated in the final iteration of the
# loop above, i.e. the last wav file that was processed)
print(json_txt)

{
  "species": "Lonchura striata domestica",
  "common_name": "Bengalese finch",
  "wav_loc": "/mnt/cube/ntansey/avgn_test/avgn_paper/data/raw/koumura/zip_contents/Bird9/Wave/216.wav",
  "samplerate_hz": 32000,
  "length_s": 11.124,
  "indvs": {
    "Bird9": {
      "notes": {
        "start_times": [1.158, 1.302, 1.451, 1.605, 1.761, 1.92, 2.094, 2.243, 2.404, 2.563, 2.713, 2.846, 2.971, 3.082, 3.157, 3.262, 3.372, 3.487, 3.6, 3.715, 3.833, 3.949, 4.068, 4.142, 4.249, 4.364, 4.474, 4.587, 4.698, 4.808, 4.916, 5.031, 5.146, 5.275, 5.4, 5.547, 5.657, 5.735, 5.844, 5.959, 6.078, 6.195, 6.313, 6.43, 6.5105, 6.6185, 6.7325, 6.8495, 6.9635, 7.0785, 7.1925, 7.3035, 7.4345, 7.5855, 7.7345, 7.8515, 7.9285, 8.0385, 8.1555, 8.2775, 8.3945, 8.5135, 8.6335, 8.7545, 8.8755, 8.9955, 9.0765, 9.1865, 9.3025, 9.4185, 9.5335, 9.6505, 9.7665, 9.8785],
        "end_times": [1.204, 1.372, 1.509, 1.673, 1.837, 1.995, 2.173, 2.327, 2.491, 2.647, 2.791, 2.916, 3.063, 3.138, 3.242, 3.35, 3.461, 3.578, 3.691, 3