In [1]:
import os
import sys

# Put the parent of the current working directory at the front of the import
# search path, so modules located there can be imported by the cells below.
parent_dir = os.path.dirname(os.getcwd())
sys.path.insert(0, parent_dir)

In [2]:
from model import WhisperSegmenterFast, WhisperSegmenter
import librosa
import numpy as np
from tqdm import tqdm
from copy import deepcopy
from train import evaluate
from datautils import get_audio_and_label_paths
import os
from audio_utils import SpecViewer
import subprocess
from glob import glob
import json

In [3]:
def evaluate_dataset( dataset_folder, model_path, num_trials, consolidation_method = "clustering",
                      max_length = 448, num_beams = 4, batch_size = 8, tolerance = None, spec_time_step = None, eps = None,
                      device = "cuda" ):
    """Run a segmenter checkpoint over every audio/label pair of a dataset folder.

    Each label JSON file is loaded, its audio is read with librosa at the sample
    rate stored under the label's "sr" key, and all pairs are handed to
    `evaluate` together with a `WhisperSegmenterFast` built from `model_path`.

    Args:
        dataset_folder: Folder passed to `get_audio_and_label_paths` to obtain
            the audio paths and the matching label paths.
        model_path: Checkpoint path given to `WhisperSegmenterFast`.
        num_trials, consolidation_method, max_length, num_beams, batch_size:
            Forwarded unchanged to `evaluate`.
        tolerance, spec_time_step, eps: When not None, the value is written into
            every loaded label dict under the key of the same name, replacing
            whatever the JSON file contained. When None, the label is left as is.
        device: Device string given to `WhisperSegmenterFast`. Defaults to
            "cuda", which was previously hard-coded.

    Returns:
        A dict with the keys "segment_wise_scores" and "frame_wise_scores". Each
        maps to a dict holding the first six entries of the corresponding
        `evaluate` result under the names "N-true-positive",
        "N-positive-in-prediction", "N-positive-in-ground-truth", "precision",
        "recall" and "F1".
    """
    # Collect only the overrides that were actually supplied; insertion order
    # matches the order in which the keys are written into each label.
    overrides = { key: value
                  for key, value in ( ("tolerance", tolerance),
                                      ("spec_time_step", spec_time_step),
                                      ("eps", eps) )
                  if value is not None }

    audio_list, label_list = [], []
    audio_paths, label_paths = get_audio_and_label_paths(dataset_folder)
    for audio_path, label_path in zip(audio_paths, label_paths):
        # Context manager guarantees the label file is closed after parsing.
        with open( label_path ) as label_file:
            label = json.load( label_file )
        # The sample rate is read before overrides are applied, as before.
        audio, _ = librosa.load( audio_path, sr = label["sr"] )
        label.update( overrides )
        audio_list.append(audio)
        label_list.append(label)

    segmenter = WhisperSegmenterFast( model_path = model_path, device = device )
    res = evaluate( audio_list, label_list, segmenter, batch_size, max_length, num_trials, consolidation_method, num_beams,
                    target_cluster = None
                  )

    # Position i of each score sequence returned by `evaluate` is reported
    # under score_names[i].
    score_names = ( "N-true-positive",
                    "N-positive-in-prediction",
                    "N-positive-in-ground-truth",
                    "precision",
                    "recall",
                    "F1" )

    def name_scores( scores ):
        # Explicit indexing keeps the IndexError on a too-short score sequence.
        return { name: scores[position] for position, name in enumerate(score_names) }

    return {
        "segment_wise_scores": name_scores( res["segment_wise"] ),
        "frame_wise_scores": name_scores( res["frame_wise"] )
    }

In [4]:
# Score the checkpoint on the multi-species test split with a single trial,
# overriding the tolerance stored in every label with 0.02.
test_set_dir = "../data/datasets/multi_species_data/data/multi-species/test/"
checkpoint_dir = "../model/whisperseg-large-vad-v2.0/final_checkpoint_ct2/"

# Kept as the last expression so the notebook displays the returned score dict.
evaluate_dataset( test_set_dir, checkpoint_dir, num_trials = 1, tolerance = 0.02 )

100%|███████████████████████████████████████████████████████████████| 674/674 [03:22<00:00,  3.32it/s]


{'segment_wise_scores': {'N-true-positive': 7689,
  'N-positive-in-prediction': 8062,
  'N-positive-in-ground-truth': 7987,
  'precision': 0.9537335648722401,
  'recall': 0.9626893702266183,
  'F1': 0.9581905414667581},
 'frame_wise_scores': {'N-true-positive': 959267,
  'N-positive-in-prediction': 997094,
  'N-positive-in-ground-truth': 983355,
  'precision': 0.9620627543641823,
  'recall': 0.975504268550015,
  'F1': 0.9687368874432009}}

In [54]:
from model import WhisperSegmenterFast, WhisperSegmenter
from audio_utils import SpecViewer

# WhisperSegmenterFast is pointed at the "final_checkpoint_ct2" directory.
# Alternative kept for reference: WhisperSegmenter with the "final_checkpoint" directory.
#   segmenter = WhisperSegmenter( "../model/whisperseg-large-vad-v2.0/final_checkpoint/", device="cuda" )
fast_checkpoint_dir = "../model/whisperseg-large-vad-v2.0/final_checkpoint_ct2/"
segmenter = WhisperSegmenterFast( fast_checkpoint_dir, device = "cuda" )

# Viewer used by a later cell to plot predictions against labels.
spec_viewer = SpecViewer()

In [55]:
# Example recording from the training split; its annotation file has the same
# base name with a ".json" extension.
audio_file = "../data/datasets/multi_species_data/data/multi-species/train/zebra_finch_R3428_40952.58642736_2_13_16_17_22.wav"
# splitext drops the real extension rather than assuming it is 4 characters long.
label_file = os.path.splitext(audio_file)[0] + ".json"
# Context manager closes the annotation file once it has been parsed.
with open(label_file) as label_handle:
    data = json.load(label_handle)

In [56]:
# Segmentation settings are taken from the annotation loaded into `data`.
sr = data["sr"]
min_frequency = data["min_frequency"]
spec_time_step = data["spec_time_step"]
min_segment_length = data["min_segment_length"]
eps = data["eps"]
num_trials = 1

# Load the audio at the sample rate recorded in the annotation.
audio, _ = librosa.load( audio_file, sr = sr )
# Read a separate copy of the annotation for plotting; the context manager
# closes the file handle that the previous bare open() call left dangling.
with open(label_file) as label_handle:
    label = json.load( label_handle )

prediction = segmenter.segment(  audio, sr = sr, min_frequency = min_frequency, spec_time_step = spec_time_step,
                       min_segment_length = min_segment_length, eps = eps,num_trials = num_trials, batch_size=4 )
# Trailing semicolon keeps the notebook from echoing the returned function's
# repr underneath the interactive viewer.
spec_viewer.visualize( audio = audio, sr = sr, min_frequency= min_frequency, prediction = prediction, label=label,
                       window_size=5, precision_bits=1 );

interactive(children=(FloatSlider(value=2.5, description='offset', max=5.0774687499999995, step=0.25), Output(…

<function ipywidgets.widgets.interaction._InteractFactory.__call__.<locals>.<lambda>(*args, **kwargs)>