# Evaluation

In [1]:
import os
import sys

# Put the parent of the current working directory at the front of the import
# search path so that modules living one level above this notebook can be imported.
sys.path.insert(0, os.path.dirname(os.getcwd()))

In [2]:
from model import WhisperSegmenterFast, WhisperSegmenter
import librosa
import numpy as np
from tqdm import tqdm
from copy import deepcopy
from train import evaluate
from datautils import get_audio_and_label_paths
import os
from audio_utils import SpecViewer
import subprocess
from glob import glob
import json

In [3]:
def evaluate_dataset( dataset_folder, model_path, num_trials, consolidation_method = "clustering",
                      max_length = 448, num_beams = 4, batch_size = 8, tolerance = None, spec_time_step = None, eps = None,
                      device = "cuda" ):
    """Evaluate a segmentation model on every audio/label pair of a dataset folder.

    Args:
        dataset_folder: Folder handed to ``get_audio_and_label_paths`` to discover
            the paired audio files and JSON label files.
        model_path: Model identifier or path passed to ``WhisperSegmenterFast``.
        num_trials: Forwarded unchanged to ``evaluate``.
        consolidation_method: Forwarded unchanged to ``evaluate``.
        max_length: Forwarded unchanged to ``evaluate``.
        num_beams: Forwarded unchanged to ``evaluate``.
        batch_size: Forwarded unchanged to ``evaluate``.
        tolerance: When not None, stored under ``"tolerance"`` in every loaded
            label dict, replacing whatever the JSON file contained.
        spec_time_step: When not None, stored under ``"spec_time_step"`` in every
            loaded label dict.
        eps: When not None, stored under ``"eps"`` in every loaded label dict.
        device: Device string passed to ``WhisperSegmenterFast``. Defaults to
            ``"cuda"``, the value that used to be hard-coded.

    Returns:
        A dict with the keys ``"segment_wise_scores"`` and ``"frame_wise_scores"``.
        Each maps to a dict naming the first six entries of the corresponding
        ``evaluate`` result: true-positive count, positives in the prediction,
        positives in the ground truth, precision, recall and F1.
    """
    def _name_scores( scores ):
        # Indexing (rather than zip) keeps the IndexError of a too-short result.
        metric_names = ( "N-true-positive",
                         "N-positive-in-prediction",
                         "N-positive-in-ground-truth",
                         "precision",
                         "recall",
                         "F1" )
        return { name: scores[idx] for idx, name in enumerate( metric_names ) }

    audio_list, label_list = [], []
    audio_paths, label_paths = get_audio_and_label_paths(dataset_folder)
    for audio_path, label_path in zip(audio_paths, label_paths):
        # Context manager so the label file handle is closed right away
        # instead of being left to the garbage collector.
        with open( label_path ) as label_fh:
            label = json.load( label_fh )

        # Optional per-run overrides of the evaluation settings stored in the label file.
        if tolerance is not None:
            label["tolerance"] = tolerance
        if spec_time_step is not None:
            label["spec_time_step"] = spec_time_step
        if eps is not None:
            label["eps"] = eps

        # Load the audio at the sampling rate recorded in its label file.
        audio, _ = librosa.load( audio_path, sr = label["sr"] )
        audio_list.append(audio)
        label_list.append(label)

    segmenter = WhisperSegmenterFast(  model_path = model_path,  device = device )
    res = evaluate( audio_list, label_list, segmenter, batch_size, max_length, num_trials, consolidation_method, num_beams,
                    target_cluster = None
                  )

    return {
        "segment_wise_scores": _name_scores( res["segment_wise"] ),
        "frame_wise_scores": _name_scores( res["frame_wise"] )
    }

In [4]:
# Score the meerkat model on the full test split; every label's tolerance is overridden with 0.02.
evaluate_dataset(
    dataset_folder = "../data/datasets/meerkat/data/test/",
    model_path = "nccratliri/whisperseg-meerkat-vad-ct2",
    num_trials = 3,
    tolerance = 0.02,
)

Fetching 12 files:   0%|          | 0/12 [00:00<?, ?it/s]

.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

hf_model/special_tokens_map.json:   0%|          | 0.00/2.08k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/12.1k [00:00<?, ?B/s]

hf_model/added_tokens.json:   0%|          | 0.00/22.2k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/121 [00:00<?, ?B/s]

hf_model/config.json:   0%|          | 0.00/2.66k [00:00<?, ?B/s]

hf_model/normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

hf_model/merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

hf_model/tokenizer_config.json:   0%|          | 0.00/805 [00:00<?, ?B/s]

hf_model/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

vocabulary.json:   0%|          | 0.00/1.07M [00:00<?, ?B/s]

model.bin:   0%|          | 0.00/3.08G [00:00<?, ?B/s]

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:18<00:00,  3.06s/it]


{'segment_wise_scores': {'N-true-positive': 207,
  'N-positive-in-prediction': 256,
  'N-positive-in-ground-truth': 274,
  'precision': 0.80859375,
  'recall': 0.7554744525547445,
  'F1': 0.7811320754716982},
 'frame_wise_scores': {'N-true-positive': 40393,
  'N-positive-in-prediction': 42145,
  'N-positive-in-ground-truth': 46324,
  'precision': 0.9584292324119112,
  'recall': 0.8719670149382609,
  'F1': 0.9131560207530321}}

In [5]:
# Same evaluation, run against the example-subset copy of the meerkat test data.
evaluate_dataset(
    dataset_folder = "../data/example_subset/Meerkat/test/",
    model_path = "nccratliri/whisperseg-meerkat-vad-ct2",
    num_trials = 3,
    tolerance = 0.02,
)

Fetching 12 files:   0%|          | 0/12 [00:00<?, ?it/s]

model.bin:   0%|          | 0.00/3.08G [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
100%|██████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:11<00:00,  1.92s/it]


{'segment_wise_scores': {'N-true-positive': 207,
  'N-positive-in-prediction': 255,
  'N-positive-in-ground-truth': 274,
  'precision': 0.8117647058823529,
  'recall': 0.7554744525547445,
  'F1': 0.782608695652174},
 'frame_wise_scores': {'N-true-positive': 40264,
  'N-positive-in-prediction': 42009,
  'N-positive-in-ground-truth': 46324,
  'precision': 0.9584612821062153,
  'recall': 0.8691822813228564,
  'F1': 0.9116411760044377}}

In [6]:
import requests,json,base64

## define a function for segmentation
def call_segment_service( service_address, 
                          audio_file_path,
                          channel_id,
                          sr,
                          min_frequency,
                          spec_time_step,
                          min_segment_length,
                          eps,
                          num_trials,
                          timeout = None
                        ):
    """Send an audio file to a segmentation HTTP service and return its JSON reply.

    The raw bytes of the file are base64-encoded and POSTed, together with the
    segmentation parameters, as a JSON document.

    Args:
        service_address: URL of the segmentation endpoint.
        audio_file_path: Path of the audio file whose raw bytes are uploaded.
        channel_id: Sent to the service as ``"channel_id"``.
        sr: Sent to the service as ``"sr"``.
        min_frequency: Sent to the service as ``"min_frequency"``.
        spec_time_step: Sent to the service as ``"spec_time_step"``.
        min_segment_length: Sent to the service as ``"min_segment_length"``.
        eps: Sent to the service as ``"eps"``.
        num_trials: Sent to the service as ``"num_trials"``.
        timeout: Optional timeout forwarded to ``requests.post``. The default
            ``None`` waits indefinitely, as before.

    Returns:
        The decoded JSON body of the service response.

    Raises:
        requests.HTTPError: If the service answers with a 4xx/5xx status.
    """
    # Read inside a context manager so the file handle is always closed.
    with open(audio_file_path, 'rb') as audio_fh:
        audio_file_base64_string = base64.b64encode( audio_fh.read() ).decode('ASCII')

    payload = {
        "audio_file_base64_string":audio_file_base64_string,
        "channel_id":channel_id,
        "sr":sr,
        "min_frequency":min_frequency,
        "spec_time_step":spec_time_step,
        "min_segment_length":min_segment_length,
        "eps":eps,
        "num_trials":num_trials
    }
    response = requests.post( service_address,
                              data = json.dumps( payload ),
                              headers = {"Content-Type": "application/json"},
                              timeout = timeout
                            )
    # Surface HTTP failures explicitly instead of trying to parse an error page as JSON.
    response.raise_for_status()
    return response.json()

# Visualization

In [7]:
from model import WhisperSegmenterFast, WhisperSegmenter
from audio_utils import SpecViewer
# Segmenter and spectrogram viewer shared by the visualization cells below.
segmenter = WhisperSegmenterFast(
    model_path = "nccratliri/whisperseg-meerkat-vad-ct2",
    device = "cuda",
)
# Alternative model, kept for reference:
# segmenter = WhisperSegmenter( "nccratliri/whisperseg-large-vad-v1.0", device="cuda" )
spec_viewer = SpecViewer()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [8]:
audio_file = "../data/example_subset/Meerkat/test/VLM298_L_4_27DEC2022_MF_ML.wav"
# The label file shares the audio file's name, with the 4-character ".wav" suffix swapped for ".json".
label_file = audio_file[:-4] + ".json"
# Context manager so the label file handle is closed as soon as it has been parsed.
with open(label_file) as label_fh:
    data = json.load(label_fh)

In [9]:
# Segmentation parameters are taken from the label file loaded earlier.
sr = data["sr"]  
min_frequency = data["min_frequency"]
spec_time_step = data["spec_time_step"]
min_segment_length = data["min_segment_length"]
eps = data["eps"]
num_trials = 1

audio, _ = librosa.load( audio_file, sr = sr )
# Load a separate copy of the labels for plotting; the context manager closes the file handle.
with open(label_file) as label_fh:
    label = json.load( label_fh )

# Segment with the in-process model, then show prediction and ground truth together.
prediction = segmenter.segment(  audio, sr = sr, min_frequency = min_frequency, spec_time_step = spec_time_step,
                       min_segment_length = min_segment_length, eps = eps,num_trials = num_trials, batch_size=4 )
spec_viewer.visualize( audio = audio, sr = sr, min_frequency= min_frequency, prediction = prediction, label=label, 
                       window_size=5, precision_bits=1 )

interactive(children=(FloatSlider(value=13.5, description='offset', max=27.0345625, step=0.25), Output()), _do…

<function ipywidgets.widgets.interaction._InteractFactory.__call__.<locals>.<lambda>(*args, **kwargs)>

In [25]:
# Segment the same file through the HTTP service on localhost, using the same parameters as the local run.
prediction2 = call_segment_service(
    service_address = "http://localhost:8050/segment",
    audio_file_path = audio_file,
    channel_id = 0,
    sr = sr,
    min_frequency = min_frequency,
    spec_time_step = spec_time_step,
    min_segment_length = min_segment_length,
    eps = eps,
    num_trials = num_trials,
)
# Show the service prediction against the ground-truth label.
spec_viewer.visualize(
    audio = audio,
    sr = sr,
    min_frequency = min_frequency,
    prediction = prediction2,
    label = label,
    window_size = 5,
    precision_bits = 1,
)

interactive(children=(FloatSlider(value=13.5, description='offset', max=27.0345625, step=0.25), Output()), _do…

<function ipywidgets.widgets.interaction._InteractFactory.__call__.<locals>.<lambda>(*args, **kwargs)>

In [23]:
# Consistency check: compare the in-process prediction with the one returned by the HTTP service.
prediction == prediction2

True