# Evaluation

In [7]:
import os
import sys

# Put the parent of the current working directory at the front of the import
# path, so modules located one level above the notebook's directory can be
# imported by the cells below.
sys.path.insert(0, os.path.dirname(os.getcwd()))

In [8]:
from model import WhisperSegmenterFast, WhisperSegmenter
import librosa
import numpy as np
from tqdm import tqdm
from copy import deepcopy
from train import evaluate
from datautils import get_audio_and_label_paths
import os
from audio_utils import SpecViewer
import subprocess
from glob import glob
import json

In [3]:
def _name_scores( scores ):
    """Map the six positional entries of a score sequence to descriptive keys."""
    score_names = ( "N-true-positive", "N-positive-in-prediction", "N-positive-in-ground-truth",
                    "precision", "recall", "F1" )
    # Index explicitly (rather than zip) so a too-short sequence still raises IndexError.
    return { name: scores[idx] for idx, name in enumerate( score_names ) }


def evaluate_dataset( dataset_folder, model_path, num_trials, consolidation_method = "clustering",
                      max_length = 448, num_beams = 4, batch_size = 8, tolerance = None, spec_time_step = None, eps = None,
                      device = "cuda" ):
    """Evaluate a WhisperSegmenterFast checkpoint on every audio/label pair of a dataset folder.

    Parameters
    ----------
    dataset_folder : str
        Folder passed to ``get_audio_and_label_paths`` to obtain the paired
        audio and JSON label paths.
    model_path : str
        Checkpoint path handed to ``WhisperSegmenterFast``.
    num_trials, consolidation_method, max_length, num_beams, batch_size
        Forwarded unchanged to ``train.evaluate``.
    tolerance, spec_time_step, eps : optional
        When not ``None``, the value is written into every label dict under the
        key of the same name, replacing whatever the label file contained.
    device : str, optional
        Device string handed to ``WhisperSegmenterFast`` (default ``"cuda"``,
        which was previously hard-coded).

    Returns
    -------
    dict
        ``{"segment_wise_scores": {...}, "frame_wise_scores": {...}}`` where each
        inner dict names the first six entries of ``res["segment_wise"]`` /
        ``res["frame_wise"]`` as N-true-positive, N-positive-in-prediction,
        N-positive-in-ground-truth, precision, recall and F1.
    """
    # Collect only the overrides that were actually supplied; the insertion
    # order matches the order in which the keys were previously assigned.
    overrides = { key: value
                  for key, value in ( ( "tolerance", tolerance ),
                                      ( "spec_time_step", spec_time_step ),
                                      ( "eps", eps ) )
                  if value is not None }

    audio_list, label_list = [], []
    audio_paths, label_paths = get_audio_and_label_paths( dataset_folder )
    for audio_path, label_path in zip( audio_paths, label_paths ):
        # Use a context manager so the label file handle is closed right away
        # instead of being left open for every file in the dataset.
        with open( label_path ) as label_file:
            label = json.load( label_file )
        # Load the audio at the sampling rate recorded in its label file.
        audio, _ = librosa.load( audio_path, sr = label["sr"] )
        label.update( overrides )
        audio_list.append( audio )
        label_list.append( label )

    segmenter = WhisperSegmenterFast( model_path = model_path, device = device )
    res = evaluate( audio_list, label_list, segmenter, batch_size, max_length, num_trials, consolidation_method, num_beams,
                    target_cluster = None
                  )

    return {
        "segment_wise_scores": _name_scores( res["segment_wise"] ),
        "frame_wise_scores": _name_scores( res["frame_wise"] ),
    }

In [4]:
# Multi-species test set, single trial.
evaluate_dataset(
    dataset_folder="../data/datasets/multi_species_data/data/multi-species/test/",
    model_path="../model/whisperseg-large-vad-v0.1/final_checkpoint_ct2/",
    num_trials=1,
    tolerance=0.01,
)

100%|███████████████████████████████████████████████████████████████| 674/674 [03:22<00:00,  3.33it/s]


{'segment_wise_scores': {'N-true-positive': 7725,
  'N-positive-in-prediction': 8003,
  'N-positive-in-ground-truth': 7987,
  'precision': 0.9652630263651131,
  'recall': 0.9671966946287718,
  'F1': 0.9662288930581615},
 'frame_wise_scores': {'N-true-positive': 963293,
  'N-positive-in-prediction': 994371,
  'N-positive-in-ground-truth': 983355,
  'precision': 0.968746071637246,
  'recall': 0.9795984156281302,
  'F1': 0.9741420196730993}}

In [24]:
# Marmoset test set, three trials.
evaluate_dataset(
    dataset_folder="../data/datasets/marmoset/test/",
    model_path="../model/whisperseg-large-vad-v0.1/final_checkpoint_ct2/",
    num_trials=3,
    tolerance=0.01,
)

100%|███████████████████████████████████████████████████████████████████| 7/7 [01:25<00:00, 12.17s/it]


{'segment_wise_scores': {'N-true-positive': 579,
  'N-positive-in-prediction': 709,
  'N-positive-in-ground-truth': 702,
  'precision': 0.8166431593794076,
  'recall': 0.8247863247863247,
  'F1': 0.8206945428773919},
 'frame_wise_scores': {'N-true-positive': 69959,
  'N-positive-in-prediction': 79660,
  'N-positive-in-ground-truth': 75877,
  'precision': 0.8782199347225709,
  'recall': 0.922005350765054,
  'F1': 0.8995801642053016}}

In [23]:
# Mouse test set, three trials.
evaluate_dataset(
    dataset_folder="../data/datasets/mouse/test/",
    model_path="../model/whisperseg-large-vad-v0.1/final_checkpoint_ct2/",
    num_trials=3,
    tolerance=0.01,
)

100%|███████████████████████████████████████████████████████████████████| 3/3 [00:25<00:00,  8.48s/it]


{'segment_wise_scores': {'N-true-positive': 213,
  'N-positive-in-prediction': 216,
  'N-positive-in-ground-truth': 229,
  'precision': 0.9861111111111112,
  'recall': 0.9301310043668122,
  'F1': 0.957303370786517},
 'frame_wise_scores': {'N-true-positive': 10621,
  'N-positive-in-prediction': 10774,
  'N-positive-in-ground-truth': 10852,
  'precision': 0.9857991460924448,
  'recall': 0.9787136011795061,
  'F1': 0.9822435956718765}}

In [20]:
# Human test set: single trial, and a larger tolerance (0.2) than the 0.01
# used for the other datasets in this notebook.
evaluate_dataset(
    dataset_folder="../data/datasets/human/test/",
    model_path="../model/whisperseg-large-vad-v0.1/final_checkpoint_ct2/",
    num_trials=1,
    tolerance=0.2,
)

100%|███████████████████████████████████████████████████████████████████| 2/2 [00:02<00:00,  1.25s/it]


{'segment_wise_scores': {'N-true-positive': 27,
  'N-positive-in-prediction': 74,
  'N-positive-in-ground-truth': 93,
  'precision': 0.36486486486486486,
  'recall': 0.2903225806451613,
  'F1': 0.32335329341317365},
 'frame_wise_scores': {'N-true-positive': 31059,
  'N-positive-in-prediction': 34378,
  'N-positive-in-ground-truth': 32231,
  'precision': 0.903455698411775,
  'recall': 0.9636374918556669,
  'F1': 0.9325766788271854}}

In [22]:
# Zebra finch (juveniles) test set, three trials.
evaluate_dataset(
    dataset_folder="../data/datasets/zebra_finch/test_juveniles/",
    model_path="../model/whisperseg-large-vad-v0.1/final_checkpoint_ct2/",
    num_trials=3,
    tolerance=0.01,
)

100%|█████████████████████████████████████████████████████████████████| 84/84 [01:03<00:00,  1.32it/s]


{'segment_wise_scores': {'N-true-positive': 943,
  'N-positive-in-prediction': 990,
  'N-positive-in-ground-truth': 980,
  'precision': 0.9525252525252526,
  'recall': 0.9622448979591837,
  'F1': 0.9573604060913707},
 'frame_wise_scores': {'N-true-positive': 128640,
  'N-positive-in-prediction': 131043,
  'N-positive-in-ground-truth': 130616,
  'precision': 0.9816625077264715,
  'recall': 0.984871684939058,
  'F1': 0.9832644778127256}}

In [25]:
# Zebra finch (adults) test set, three trials.
evaluate_dataset(
    dataset_folder="../data/datasets/zebra_finch/test_adults/",
    model_path="../model/whisperseg-large-vad-v0.1/final_checkpoint_ct2/",
    num_trials=3,
    tolerance=0.01,
)

100%|███████████████████████████████████████████████████████████████| 199/199 [02:25<00:00,  1.37it/s]


{'segment_wise_scores': {'N-true-positive': 3895,
  'N-positive-in-prediction': 3907,
  'N-positive-in-ground-truth': 3909,
  'precision': 0.9969285897107756,
  'recall': 0.9964185213609619,
  'F1': 0.9966734902763562},
 'frame_wise_scores': {'N-true-positive': 588620,
  'N-positive-in-prediction': 594305,
  'N-positive-in-ground-truth': 594148,
  'precision': 0.9904342046592238,
  'recall': 0.9906959208816658,
  'F1': 0.9905650454834982}}

In [26]:
# Bengalese finch test set, three trials.
evaluate_dataset(
    dataset_folder="../data/datasets/bengalese_finch/test/",
    model_path="../model/whisperseg-large-vad-v0.1/final_checkpoint_ct2/",
    num_trials=3,
    tolerance=0.01,
)

100%|███████████████████████████████████████████████████████████████| 379/379 [01:45<00:00,  3.58it/s]


{'segment_wise_scores': {'N-true-positive': 2035,
  'N-positive-in-prediction': 2077,
  'N-positive-in-ground-truth': 2074,
  'precision': 0.9797785267212326,
  'recall': 0.9811957569913211,
  'F1': 0.9804866297277763},
 'frame_wise_scores': {'N-true-positive': 137038,
  'N-positive-in-prediction': 141645,
  'N-positive-in-ground-truth': 139631,
  'precision': 0.9674750255921494,
  'recall': 0.9814296252264898,
  'F1': 0.974402366359021}}

# Visualization

In [9]:
from model import WhisperSegmenterFast, WhisperSegmenter
from audio_utils import SpecViewer
# Alternative kept for reference: the local CTranslate2 checkpoint that the
# evaluation cells above pass to evaluate_dataset.
# segmenter = WhisperSegmenterFast( "../model/whisperseg-large-vad-v0.1/final_checkpoint_ct2/", device="cuda" )

# NOTE(review): this loads the v1.0 model by name, whereas the evaluation cells
# use the local v0.1 checkpoint -- confirm the mismatch is intended.
segmenter = WhisperSegmenter(
    "nccratliri/whisperseg-large-vad-v1.0",
    device="cuda",
)

# Viewer used below to display the prediction alongside the label.
spec_viewer = SpecViewer()

config.json:   0%|          | 0.00/2.63k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/6.17G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.81k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/805 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/12.2k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.08k [00:00<?, ?B/s]

In [11]:
# Example recording to visualize; its JSON label file has the same path with
# the audio extension replaced by ".json".
audio_file = "../data/datasets/multi_species_data/data/multi-species/test/bengalese_finch_bl26lb16_190412_0721.20144_0.wav"
label_file = os.path.splitext(audio_file)[0] + ".json"

# Read the label file through a context manager so the handle is closed
# instead of being left open for the lifetime of the kernel.
with open(label_file) as label_fp:
    data = json.load(label_fp)

In [12]:
# Segmentation parameters are read from the recording's own label file.
sr = data["sr"]
min_frequency = data["min_frequency"]
spec_time_step = data["spec_time_step"]
min_segment_length = data["min_segment_length"]
eps = data["eps"]
num_trials = 1

# Load the audio at the sampling rate stored in the label file.
audio, _ = librosa.load( audio_file, sr = sr )

# Parse a fresh copy of the labels for the viewer (a separate object from
# `data`); the context manager closes the file handle once it is read.
with open( label_file ) as label_fp:
    label = json.load( label_fp )

prediction = segmenter.segment(  audio, sr = sr, min_frequency = min_frequency, spec_time_step = spec_time_step,
                       min_segment_length = min_segment_length, eps = eps,num_trials = num_trials, batch_size=4 )

# Kept as the last expression so the interactive viewer is rendered as the cell output.
spec_viewer.visualize( audio = audio, sr = sr, min_frequency= min_frequency, prediction = prediction, label=label, 
                       window_size=5, precision_bits=1 )

interactive(children=(FloatSlider(value=0.0, description='offset', max=0.0, step=0.25), Output()), _dom_classe…

<function ipywidgets.widgets.interaction._InteractFactory.__call__.<locals>.<lambda>(*args, **kwargs)>