# In this notebook, we segment the waveform into putative syllables
When a dataset is not pre-segmented into individiual vocal units, we can try to segment it computationally. Here we'll use dynamic thresholding segmentation to segment bouts into syllables.

This works very well with low-noise datasets with clearly defined syllables like Bengalese finch song. You might need to rely on other methods for noisier data. 

You can also try denoising your data first, e.g. with [noisereduce](https://github.com/timsainb/noisereduce) to get better results.

You'll need to install the [vocalseg](https://github.com/timsainb/vocalization-segmentation) package to use this. 

In [3]:
from avgn.utils.hparams import HParams
from avgn.dataset import DataSet
import tensorflow
import json

  from tqdm.autonotebook import tqdm
2024-11-19 16:57:10.941471: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1732064230.967152  865169 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1732064230.975021  865169 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [4]:
DATASET_ID = 'koumura_bengalese_finch'

In [5]:
# create a set of hyperparameters for processing this dataset.  
hparams = HParams(
    num_mel_bins = 64,
    mel_lower_edge_hertz=500,
    mel_upper_edge_hertz=15000,
    butter_lowcut = 500,
    butter_highcut = 15000,
    ref_level_db = 20,
    min_level_db = -30,
    mask_spec = True,
    win_length_ms = 10,
    hop_length_ms = 2,
    nex=-1,
    n_jobs=-1,
    verbosity = 1,
)

### Create a dataset object
The dataset object loads JSONs corresponding to `DATASET_ID` in the data folder. 

In [6]:
# create a dataset object, which
dataset = DataSet(DATASET_ID, hparams = hparams)

loading json:   0%|          | 0/2964 [00:00<?, ?it/s]

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 48 concurrent workers.
[Parallel(n_jobs=-1)]: Done 104 tasks      | elapsed:    3.1s
[Parallel(n_jobs=-1)]: Done 654 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done 2480 tasks      | elapsed:    3.9s
[Parallel(n_jobs=-1)]: Done 2964 out of 2964 | elapsed:    4.3s finished


getting unique individuals:   0%|          | 0/2964 [00:00<?, ?it/s]

2024-11-19 16:57:23.501051: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2024-11-19 16:57:23.501137: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:137] retrieving CUDA diagnostic information for host: txori.ucsd.edu
2024-11-19 16:57:23.501148: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:144] hostname: txori.ucsd.edu
2024-11-19 16:57:23.501281: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:168] libcuda reported version is: 520.61.5
2024-11-19 16:57:23.501317: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:172] kernel reported version is: 520.61.5
2024-11-19 16:57:23.501327: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:259] kernel version seems to match DSO: 520.61.5


In [7]:
# to make sure everything loaded correctly, lets look at a sample JSON
print(json.dumps(dataset.sample_json, indent=4, default=str)[:400] + '...')

{
    "species": "Lonchura striata domestica",
    "common_name": "Bengalese finch",
    "wav_loc": "/mnt/cube/ntansey/avgn_test/avgn_paper/data/raw/koumura/zip_contents/Bird4/Wave/221.wav",
    "samplerate_hz": 32000,
    "length_s": 12.592,
    "indvs": {
        "Bird4": {
            "notes": {
                "start_times": [
                    1.158,
                    1.317,
             ...


In [8]:
# how many wavs are in the dataset?
len(dataset.data_files)

2964

### Perform segmentation

In [9]:
import librosa

In [10]:
### segmentation parameters
n_fft=1024
hop_length_ms=2
win_length_ms=4
ref_level_db=20
pre=0.97
min_level_db=-60
min_level_db_floor = -20
db_delta = 5
silence_threshold = 0.05
min_silence_for_spec=0.5
max_vocal_for_spec=0.5,
min_syllable_length_s = 0.01
butter_min = 500
butter_max = 15000
spectral_range = [500, 15000]

In [11]:
os.getcwd()

NameError: name 'os' is not defined

In [12]:
# import os
# os.getcwd()
# current = os.getcwd()

# os.chdir('/mnt/cube/ntansey/vocalization_segmentation/')

# os.chdir(current)
# # # os.chdir('/mnt/cube/ntansey/vocalization_segmentation/')
# # from vocalization_segmentation.vocalseg.utils import butter_bandpass_filter, spectrogram, int16tofloat32, plot_spec
# # from vocalseg.continuity_filtering import continuity_segmentation
# # from vocalseg.continuity_filtering import plot_labelled_elements

# from ..vocalization_segmentation.vocalseg.utils import butter_bandpass

#### First lets try segmenting an example to make sure the segmentation looks good

In [13]:
from avgn.utils.audio import load_wav, read_wav

# os.getcwd()
# current = os.getcwd()

# os.chdir('/mnt/cube/ntansey/vocalization_segmentation/')

from vocalseg.dynamic_thresholding import dynamic_threshold_segmentation
from vocalseg.dynamic_thresholding import plot_segmented_spec, plot_segmentations
import matplotlib.pyplot as plt
import vocalseg

# os.chdir(current)

In [14]:
rate, data = load_wav(dataset.data_files['Bird8_108'].data["wav_loc"])

In [15]:
# segment
results = dynamic_threshold_segmentation(
    data,
    rate,
    n_fft=n_fft,
    hop_length_ms=hop_length_ms,
    win_length_ms=win_length_ms,
    min_level_db_floor=min_level_db_floor,
    db_delta=db_delta,
    ref_level_db=ref_level_db,
    pre=pre,
    min_silence_for_spec=min_silence_for_spec,
    max_vocal_for_spec=max_vocal_for_spec,
    min_level_db=min_level_db,
    silence_threshold=silence_threshold,
    verbose=True,
    min_syllable_length_s=min_syllable_length_s,
    spectral_range=spectral_range,
)

                                                                               

longest silence 1.0999999999999996
longest vocalization 0.1039999999999992




In [None]:
plot_segmentations(
    results["spec"],
    results["vocal_envelope"],
    results["onsets"],
    results["offsets"],
    hop_length_ms,
    rate,
    figsize=(20,5)
)
plt.show()

### Segment the full dataset 
- for each json, load the wav file - segment the file into start and end times
- plot the segmentation
- add to the JSON

In [17]:
from avgn.utils.json import NoIndent, NoIndentEncoder
import joblib
import warnings
import numpy as np
from tqdm.autonotebook import tqdm

In [18]:
from avgn.signalprocessing.filtering import butter_bandpass_filter

In [19]:
from avgn.utils.paths import DATA_DIR, most_recent_subdirectory, ensure_dir

In [20]:
warnings.filterwarnings("ignore", message = "'tqdm_notebook' object has no attribute 'sp'")

In [21]:
def segment_spec_custom(key, df, save=False, plot=False):
    # load wav
    rate, data = load_wav(df.data["wav_loc"])
    # filter data
    data = butter_bandpass_filter(data, butter_min, butter_max, rate)

    # segment
    results = dynamic_threshold_segmentation(
        data,
        rate,
        n_fft=n_fft,
        hop_length_ms=hop_length_ms,
        win_length_ms=win_length_ms,
        min_level_db_floor=min_level_db_floor,
        db_delta=db_delta,
        ref_level_db=ref_level_db,
        pre=pre,
        min_silence_for_spec=min_silence_for_spec,
        max_vocal_for_spec=max_vocal_for_spec,
        min_level_db=min_level_db,
        silence_threshold=silence_threshold,
        verbose=True,
        min_syllable_length_s=min_syllable_length_s,
        spectral_range=spectral_range,
    )
    if results is None:
        return
    if plot:
        plot_segmentations(
            results["spec"],
            results["vocal_envelope"],
            results["onsets"],
            results["offsets"],
            hop_length_ms,
            rate,
            figsize=(100, 5),
        )
        plt.show()

    # save the results
    json_out = (
        DATA_DIR
        / "processed"
        / (DATASET_ID + "_segmented")
        / DT_ID
        / "JSON"
        / (key + ".JSON")
    )

    json_dict = df.data.copy()

    json_dict["indvs"][list(df.data["indvs"].keys())[0]]["syllables"] = {
        "start_times": NoIndent(list(results["onsets"])),
        "end_times": NoIndent(list(results["offsets"])),
    }

    json_txt = json.dumps(json_dict, cls=NoIndentEncoder, indent=2)
    # save json
    if save:
        ensure_dir(json_out.as_posix())
        print(json_txt, file=open(json_out.as_posix(), "w"))

    print(json_txt)

    return results

In [22]:
indvs = np.array(['_'.join(list(i)) for i in dataset.json_indv])
np.unique(indvs)

array(['Bird0', 'Bird1', 'Bird10', 'Bird2', 'Bird3', 'Bird4', 'Bird5',
       'Bird6', 'Bird7', 'Bird8', 'Bird9'], dtype='<U6')

In [23]:
from datetime import datetime

In [24]:
# create a unique datetime identifier for the files output by this notebook
DT_ID = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
DT_ID

'2024-11-19_16-57-41'

In [None]:
nex = 3
for indv in tqdm(np.unique(indvs), desc="individuals"):
    print(indv)
    indv_keys = np.array(list(dataset.data_files.keys()))[indvs == indv][:nex]

    joblib.Parallel(n_jobs=1, verbose=11)(
            joblib.delayed(segment_spec_custom)(key, dataset.data_files[key], plot=True) 
                 for key in tqdm(indv_keys, desc="files", leave=False)
        )

### Generate segmentations for the full dataset

In [None]:

# os.getcwd()
# current = os.getcwd()

# os.chdir('/mnt/cube/ntansey/vocalization_segmentation/')


nex = -1
for indv in tqdm(np.unique(indvs), desc="individuals"):
    print(indv)
    indv_keys = np.array(list(dataset.data_files.keys()))[indvs == indv]

    joblib.Parallel(n_jobs=-1, verbose=11)(
            joblib.delayed(segment_spec_custom)(key, dataset.data_files[key], save=True) 
                 for key in tqdm(indv_keys, desc="files", leave=False)
        )


# os.chdir(current)
