In [1]:
import json
import math
import os
import pickle

import IPython
import librosa
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import parselmouth
import scikit_posthocs as sp
import scipy
import sklearn
import statsmodels.api as sa
import statsmodels.formula.api as sfa
from sklearn.decomposition import DictionaryLearning
from sklearn.decomposition import SparseCoder

In [None]:
# training dataset: concatenate every .wav of the listed birds into one waveform,
# then compute its mel spectrogram in dB
wav_chunks = []
temp_sr = None
for bird_num in ['b1053', 'B338']:
    # replace() swaps any '.' in the path for 'p'; it is a no-op for the IDs listed here
    wav_dir = (f'/mnt/cube/Datasets/public_ds_starlings_ts_2019/{bird_num}/wavs').replace('.', 'p')
    # sorted so the concatenation order is the same on every run
    # (os.listdir alone returns entries in arbitrary order)
    for fileName in sorted(os.listdir(wav_dir)):
        if fileName.endswith('.wav'):
            # load by full path instead of os.chdir, so the working directory is left untouched
            temp_wav, temp_sr = librosa.load(os.path.join(wav_dir, fileName))
            wav_chunks.append(temp_wav)

if not wav_chunks:
    # fail with a clear message instead of a NameError on temp_sr further down
    raise FileNotFoundError('no .wav files found for the training birds')

# one concatenation instead of np.append inside the loop (which re-copies the array each time);
# float64 matches the dtype that appending onto the initial empty list used to produce
read_in = np.concatenate(wav_chunks).astype(np.float64)
train_spc = librosa.feature.melspectrogram(y = read_in, sr = temp_sr, n_fft = 2048, hop_length = 512, win_length = 1024)
train_spc_db = librosa.power_to_db(train_spc, ref = np.max, top_db = 60)


In [None]:

# Learn a 15-atom dictionary from the training spectrogram (train_spc_db.T is passed as the data matrix).
# random_state is pinned so that any randomness in the fit is repeatable across kernel restarts.
DICT_RANDOM_STATE = 0
dict_learner = DictionaryLearning(n_components = 15, alpha = 0.1, max_iter = 1000, tol = 1e-3, random_state = DICT_RANDOM_STATE)

train_transform = dict_learner.fit_transform(train_spc_db.T)
# encoder that re-uses the learned atoms to sparse-code other spectrograms
coder = SparseCoder(dictionary = dict_learner.components_, transform_algorithm = 'lasso_lars', transform_alpha = 1e-10)

In [None]:
# Xcomp = coder.transform( train_spc_db.T )
# hist_quant = []
# for i in range(0, Xcomp.shape[1]):
#     hist_quant.append(np.quantile(Xcomp[:,i], (0.01, 0.99)))



## General use, label data for each song

In [None]:
from datetime import datetime
import avgn.utils
import numpy as np

In [None]:
# name under which this dataset's processed output is stored
DATASET_ID = 'koumura_bengalese_finch'

# timestamp that makes the files written by this run unique
DT_ID = f"{datetime.now():%Y-%m-%d_%H-%M-%S}"

# collect every raw waveform file (Bird*/Wave/*.wav under the raw dataset folder)
# NOTE(review): RAW_DATASET_LOC is not defined in the cells shown here — confirm where it is set
wav_list = [*RAW_DATASET_LOC.glob('Bird*/Wave/*.wav')]
(len(wav_list), np.sort(wav_list)[-2:])

In [None]:
# collect the raw annotation file of every bird (Bird*/Annotation.xml)
annotation_files = [*RAW_DATASET_LOC.glob('Bird*/Annotation.xml')]
(len(annotation_files), np.sort(annotation_files)[-2:])

In [None]:
import xml.etree.ElementTree
import xml.dom.minidom

In [None]:
# parse the first annotation file and pretty-print it to preview the XML layout
# (the parsed document is bound to `dom`, the name the next line reads)
dom = xml.dom.minidom.parse(annotation_files[0].as_posix())
pretty_xml_as_string = dom.toprettyxml()
# print(pretty_xml_as_string[:400] + '...')

In [None]:
# Empty annotation table; the column order is fixed here.
song_columns = [
    "bird",
    "WaveFileName",
    "Position",
    "Length",
    "NumNote",
    "NotePositions",
    "NoteLengths",
    "NoteLabels",
]
song_df = pd.DataFrame(columns=song_columns)
song_df

In [None]:
def _parse_sequence(bird, sequence):
    """Return one annotation row (a list in song_df column order) for a <Sequence> element.

    All values are kept as the raw XML text; the three note fields are lists
    with one entry per <Note> child.
    """
    # reset for every sequence, so a missing tag yields None instead of
    # silently re-using the value read from the previous sequence
    position = length = wave_file_name = num_note = None
    note_positions, note_lengths, note_labels = [], [], []
    # iterate the element directly: Element.getchildren() was removed in Python 3.9
    for seq_element in sequence:
        if seq_element.tag == "Position":
            position = seq_element.text
        elif seq_element.tag == "Length":
            length = seq_element.text
        elif seq_element.tag == "WaveFileName":
            wave_file_name = seq_element.text
        elif seq_element.tag == "NumNote":
            num_note = seq_element.text
        elif seq_element.tag == "Note":
            # per-note metadata
            for note_element in seq_element:
                if note_element.tag == "Label":
                    note_labels.append(note_element.text)
                elif note_element.tag == "Position":
                    note_positions.append(note_element.text)
                elif note_element.tag == "Length":
                    note_lengths.append(note_element.text)
    return [
        bird,
        wave_file_name,
        position,
        length,
        num_note,
        note_positions,
        note_lengths,
        note_labels,
    ]


# loop through XML annotation files
# NOTE(review): `tqdm` is not imported in any cell shown here — confirm it is imported
# before this cell runs (e.g. `from tqdm.auto import tqdm`).
song_rows = []
for bird_loc in tqdm(annotation_files):
    # parse this bird's annotation file; the bird name is the name of its folder
    bird_xml = xml.etree.ElementTree.parse(bird_loc).getroot()
    bird = bird_loc.parent.stem
    # each direct <Sequence> child of the root becomes one row
    for element in tqdm(bird_xml.findall("Sequence"), leave=False):
        song_rows.append(_parse_sequence(bird, element))

# build the table once from all rows: no row-by-row growth, and re-running this
# cell no longer appends duplicate rows
song_df = pd.DataFrame(
    song_rows,
    columns=[
        "bird",
        "WaveFileName",
        "Position",
        "Length",
        "NumNote",
        "NotePositions",
        "NoteLengths",
        "NoteLabels",
    ],
)

In [None]:
from avgn.utils.audio import get_samplerate
import librosa
from avgn.utils.json import NoIndent, NoIndentEncoder

In [None]:
# Write one JSON annotation file per wav file, bird by bird.
# NOTE(review): `tqdm`, `RAW_DATASET_LOC` and `DATA_DIR` are not defined in the cells shown
# here — confirm they are imported/defined before this cell runs.
for bird in tqdm(np.unique(song_df.bird)):
    # grab that bird's annotations
    bird_df = song_df[song_df.bird == bird]

    # for each wav file produced by that bird
    for wfn in tqdm(bird_df.WaveFileName.unique(), leave=False):

        # all annotated sequences that point at this wav file
        wfn_df = bird_df[bird_df.WaveFileName == wfn]

        # get the location of the wav
        wav_loc = RAW_DATASET_LOC / bird / "Wave" / wfn

        # get the wav samplerate and duration
        sr = get_samplerate(wav_loc.as_posix())
        wav_duration = librosa.get_duration(filename=wav_loc)

        # make json dictionary
        json_dict = {}
        # add species
        json_dict["species"] = "Lonchura striata domestica"
        json_dict["common_name"] = "Bengalese finch"
        json_dict["wav_loc"] = wav_loc.as_posix()
        # rate and length
        json_dict["samplerate_hz"] = sr
        json_dict["length_s"] = wav_duration

        # one row per sequence: note positions are offset by the sequence's own
        # Position and divided by the sample rate to get start/end times
        seq_rows = []
        for sequence_num, (_, row) in enumerate(wfn_df.iterrows()):
            note_starts = np.array(row.NotePositions).astype("int") + int(row.Position)
            note_ends = note_starts + np.array(row.NoteLengths).astype("int")
            seq_rows.append(
                [
                    list(np.repeat(sequence_num, len(row.NotePositions))),
                    list(row.NoteLabels),
                    np.array(note_starts / sr).astype("float64"),
                    np.array(note_ends / sr).astype("float64"),
                ]
            )
        seq_df = pd.DataFrame(
            seq_rows,
            columns=["sequence_num", "labels", "start_times", "end_times"],
        )

        # add syllable information (all sequences of this wav flattened into one list per field)
        json_dict["indvs"] = {
            bird: {
                "notes": {
                    "start_times": NoIndent(
                        list(np.concatenate(seq_df.start_times.values))
                    ),
                    "end_times": NoIndent(list(np.concatenate(seq_df.end_times.values))),
                    "labels": NoIndent(list(np.concatenate(seq_df.labels.values))),
                    "sequence_num": NoIndent(
                        [int(i) for i in np.concatenate(seq_df.sequence_num.values)]
                    ),
                }
            }
        }

        # dump dict into json format
        json_txt = json.dumps(json_dict, cls=NoIndentEncoder, indent=2)

        wav_stem = bird + "_" + wfn.split(".")[0]
        json_out = (
            DATA_DIR / "processed" / DATASET_ID / DT_ID / "JSON" / (wav_stem + ".JSON")
        )

        # save json; the context manager closes the file handle, which the
        # previous print(..., file=open(...)) never did
        avgn.utils.paths.ensure_dir(json_out.as_posix())
        with open(json_out.as_posix(), "w") as json_file:
            print(json_txt, file=json_file)
        

## Segmentation

In [None]:
# Switch to the vocalization_segmentation folder before importing vocalseg
# (presumably so the package in that folder is importable — confirm).
os.chdir('/mnt/cube/ntansey/vocalization_segmentation/')
from vocalseg.utils import butter_bandpass_filter, spectrogram, int16tofloat32, plot_spec
from vocalseg.continuity_filtering import continuity_segmentation
from vocalseg.continuity_filtering import plot_labelled_elements
# the segmentation cell calls dynamic_threshold_segmentation, which was never imported
from vocalseg.dynamic_thresholding import dynamic_threshold_segmentation


In [None]:
# 3 versions of data
b1, srb1 = librosa.load('example_birdsong.wav')

# computed from the waveform loaded above (the previous temp_song / temp_rate were never defined)
mel_spc = librosa.feature.melspectrogram(y = b1, sr = srb1, n_fft = 2048, hop_length = 512, win_length = 1024)

# NOTE(review): the training spectrogram (train_spc_db) was computed with top_db = 60, while
# this call leaves top_db at its default — confirm the difference is intended.
b1_spec = librosa.power_to_db(mel_spc, ref = np.max)
b1_sparse = coder.transform(b1_spec.T)  # sparse code of the transposed dB spectrogram
b1_srecon = b1_sparse @ coder.dictionary   # reconstructed from sparse elements back to mel spec

In [None]:
### segmentation parameters
# (passed as keyword arguments to dynamic_threshold_segmentation further down)
n_fft=1024
hop_length_ms=2
win_length_ms=4
ref_level_db=20
pre=0.97
min_level_db=-60
min_level_db_floor = -20
db_delta = 5
silence_threshold = 0.05
min_silence_for_spec=0.5
# plain float: a trailing comma here used to turn this value into the tuple (0.5,)
max_vocal_for_spec=0.5
min_syllable_length_s = 0.01
butter_min = 500
butter_max = 15000
spectral_range = [500, 15000]

In [None]:
# segments on each song from waveform, keep differences of clusterability from being related to segmenting

# results = dynamic_threshold_segmentation(
#     b1,
#     srb1,
#     n_fft=n_fft,
#     hop_length_ms=hop_length_ms,
#     win_length_ms=win_length_ms,
#     ref_level_db=ref_level_db,
#     pre=pre,
#     min_level_db=min_level_db,
#     silence_threshold = silence_threshold,
#     verbose=True,
#     min_syllable_length_s = 0.2
# )

# segment the example song loaded earlier (waveform b1, sample rate srb1);
# the previous arguments `data` and `rate` were never defined in this notebook
results = dynamic_threshold_segmentation(
    b1,
    srb1,
    n_fft=n_fft,
    hop_length_ms=hop_length_ms,
    win_length_ms=win_length_ms,
    min_level_db_floor=min_level_db_floor,
    db_delta=db_delta,
    ref_level_db=ref_level_db,
    pre=pre,
    min_silence_for_spec=min_silence_for_spec,
    max_vocal_for_spec=max_vocal_for_spec,
    min_level_db=min_level_db,
    silence_threshold=silence_threshold,
    verbose=True,
    min_syllable_length_s=min_syllable_length_s,
    spectral_range=spectral_range,
)