In [1]:
import requests, json
import base64
from IPython.display import Audio
import io
from PIL import Image
import base64

In [2]:
def bytes_to_base64_string(f_bytes):
    """Encode raw bytes as Base64 text.

    Args:
        f_bytes: Bytes-like object to encode.

    Returns:
        str: The Base64 encoding of ``f_bytes``, decoded as ASCII.
    """
    encoded = base64.b64encode(f_bytes)
    return str(encoded, 'ASCII')

def base64_string_to_bytes(base64_string):
    """Decode Base64 text back into the raw bytes it represents.

    Args:
        base64_string: Base64-encoded data (ASCII string or bytes-like object).

    Returns:
        bytes: The decoded payload.
    """
    raw_bytes = base64.b64decode(base64_string)
    return raw_bytes

## Upload

### Upload from file

In [3]:
# Read the whole example recording into memory as raw bytes; the upload cell sends `file_data`.
with open("example_audios/BP_2021-10-23_09-08-47_049985_0540000_daq1.wav", mode="rb") as audio_file:
    file_data = audio_file.read()

In [4]:
# Spectrogram settings sent alongside the file. All of them are None in this example;
# the "configurations" entry of the reply shows the values that were actually used.
upload_form = {
    "hop_length": None,
    "num_spec_columns": None,
    "sampling_rate": None,
    "spec_cal_method": None,
    "n_fft": None,
    "bins_per_octave": None,
    "min_frequency": None,
    "max_frequency": None,
}
upload_reply = requests.post(
    "http://localhost:8050/upload",
    files={"newAudioFile": file_data},
    data=upload_form,
)
results = upload_reply.json()
# Keep the id of the first channel; the following cells refer to the audio by this id.
audio_id = results["channels"][0]["audio_id"]

In [5]:
# Duration the server reports for the first channel of the upload (unit not shown here; presumably seconds — confirm).
results["channels"][0]["audio_duration"]

419.4304375

In [6]:
# Spectrogram settings returned with the upload reply; most fields that were sent as None come back with concrete values.
results["configurations"]

{'bins_per_octave': None,
 'hop_length': 6710,
 'max_frequency': 8000,
 'min_frequency': 0,
 'n_fft': 512,
 'num_spec_columns': 1000,
 'sampling_rate': 16000,
 'spec_cal_method': 'log-mel'}

### Upload from URL

In [7]:
import requests, json
# JSON body for the upload-by-url endpoint: the audio is referenced by URL instead of being sent as bytes.
# NOTE(review): min_frequency is -100 here, a negative value — confirm this is intentional.
url_upload_payload = {
    "audio_url": "https://www2.cs.uic.edu/~i101/SoundFiles/BabyElephantWalk60.wav",
    "spec_cal_method": "constant-q",
    "n_fft": None,
    "bins_per_octave": None,
    "hop_length": None,
    "num_spec_columns": None,
    "sampling_rate": None,
    "min_frequency": -100,
    "max_frequency": None,
}
url_upload_reply = requests.post(
    "http://localhost:8050/upload-by-url",
    data=json.dumps(url_upload_payload),
    headers={"content-type": "application/json"},
)
results = url_upload_reply.json()

## Get spectrogram

In [8]:
import requests, json
# Request a spectrogram for the previously uploaded audio, starting at 1.2 on the
# clip's time axis. hop_length, num_spec_columns and min_frequency are set explicitly;
# the remaining settings are sent as None.
spec_payload = {
    "audio_id": audio_id,
    "start_time": 1.2,
    "spec_cal_method": None,
    "n_fft": None,
    "bins_per_octave": None,
    "hop_length": 160,
    "num_spec_columns": 1000,
    "sampling_rate": None,
    "min_frequency": 0,
    "max_frequency": None,
}
spec_reply = requests.post(
    "http://localhost:8050/get-audio-clip-spec",
    data=json.dumps(spec_payload),
    headers={"content-type": "application/json"},
)
results = spec_reply.json()

In [9]:
# Top-level fields of the spectrogram reply.
results.keys()

dict_keys(['configurations', 'freqs', 'spec'])

In [10]:
# Settings reported back for this spectrogram; compare with the values sent in the request.
results["configurations"]

{'bins_per_octave': None,
 'hop_length': 160,
 'max_frequency': 8000,
 'min_frequency': 0,
 'n_fft': 512,
 'num_spec_columns': 1000,
 'sampling_rate': 16000,
 'spec_cal_method': 'log-mel'}

## Get Audio Clip

In [69]:
# Ask for a clip of the uploaded audio (start_time 0, clip_duration 100).
clip_payload = {
    "audio_id": audio_id,
    "start_time": 0,
    "clip_duration": 100,
}
clip_reply = requests.post(
    "http://localhost:8050/get-audio-clip-wav",
    data=json.dumps(clip_payload),
    headers={"content-type": "application/json"},
)
response = clip_reply.json()
# The "wav" field is Base64 text; decode it to bytes (using the helper defined earlier
# in this notebook) and hand it to IPython's Audio for display.
Audio(base64_string_to_bytes(response["wav"]))

## Post Labels

In [12]:
import requests, json

# Annotation records to store. The values below are placeholders to be replaced with real data.
annotation_payload = {
    "annotations": [
        {
            "onset": 0,
            "offset": 0,
            "species": "SPECIES_NAME_HERE",
            "individual": "INDIVIDUAL_NAME_HERE",
            "filename": "FILENAME_HERE",
            "annotation_instance": "ANNOTATION_INSTANCE_HERE",
        },
        # add more annotations here
    ]
}
annotation_reply = requests.post(
    'http://localhost:8050/post-annotations',
    data=json.dumps(annotation_payload),
    headers={
        "Content-Type": "application/json",
        "accept": "application/json",
    },
)
res = annotation_reply.json()
res

{'message': 'Annotations inserted successfully.'}

## Delete Audio Ids

In [13]:
import requests, json

# Ids of the uploaded audios to release on the server (example values).
release_payload = {
    "audio_id_list": [
        '152ca390-7ca7-48a0-b0c4-aa9617639753',
        'ddjjiu3m-huue-efrw-frff-bshah8773ksu',
    ]
}
release_reply = requests.post(
    'http://localhost:8050/release-audio-given-ids',
    data=json.dumps(release_payload),
    headers={
        "Content-Type": "application/json",
        "accept": "application/json",
    },
)
res = release_reply.json()
res

{'status': 'success'}

## List Available Models

### List Models Available for Finetuning

In [14]:
# Query the endpoint that lists the models available for finetuning.
finetunable_reply = requests.post(
    "http://localhost:8050/list-models-available-for-finetuning",
    headers={"content-type": "application/json"},
)
response = finetunable_reply.json()
response

{'response': [{'eta': '--:--:--',
   'model_name': 'whisperseg-base',
   'status': 'ready'},
  {'eta': '--:--:--', 'model_name': 'whisperseg-large', 'status': 'ready'},
  {'eta': '--:--:--',
   'model_name': 'r3428-99dph-whisperseg_base',
   'status': 'ready'},
  {'eta': '--:--:--',
   'model_name': 'r3428-99dph-whisperseg-base-v2.0',
   'status': 'ready'},
  {'eta': '--:--:--',
   'model_name': 'r3428-99dph-whisperseg-large',
   'status': 'ready'},
  {'eta': '--:--:--',
   'model_name': 'new-whisperseg-bengalese-finch',
   'status': 'ready'}]}

### List Models Available for Inference

In [15]:
# Query the endpoint that lists the models available for inference.
inference_reply = requests.post(
    "http://localhost:8050/list-models-available-for-inference",
    headers={"content-type": "application/json"},
)
response = inference_reply.json()
response

{'response': [{'eta': '--:--:--',
   'model_name': 'whisperseg-base',
   'status': 'ready'},
  {'eta': '--:--:--', 'model_name': 'whisperseg-large', 'status': 'ready'},
  {'eta': '--:--:--',
   'model_name': 'r3428-99dph-whisperseg_base',
   'status': 'ready'},
  {'eta': '--:--:--',
   'model_name': 'r3428-99dph-whisperseg-base-v2.0',
   'status': 'ready'},
  {'eta': '--:--:--',
   'model_name': 'r3428-99dph-whisperseg-large',
   'status': 'ready'},
  {'eta': '--:--:--',
   'model_name': 'new-whisperseg-bengalese-finch',
   'status': 'ready'}]}

### List Models Being Trained

In [17]:
# Query the endpoint that lists the models whose training is still in progress.
training_reply = requests.post(
    "http://localhost:8050/list-models-training-in-progress",
    headers={"content-type": "application/json"},
)
response = training_reply.json()
response

{'response': []}

## Get WhisperSeg Segmentation

In [18]:
# Ask the "whisperseg-large" model to segment the uploaded audio.
segmentation_payload = {
    "audio_id": audio_id,
    "model_name": "whisperseg-large",
    "min_frequency": 0,
}
segmentation_reply = requests.post(
    "http://localhost:8050/get-labels",
    data=json.dumps(segmentation_payload),
    headers={"content-type": "application/json"},
)
response = segmentation_reply.json()

In [19]:
# Preview the first five entries of the returned "labels" list.
response["labels"][:5]

[{'clustername': 'Unknown',
  'individual': 'Unknown',
  'offset': 14.595,
  'onset': 14.497,
  'species': 'Unknown'},
 {'clustername': 'Unknown',
  'individual': 'Unknown',
  'offset': 15.108,
  'onset': 15.008,
  'species': 'Unknown'},
 {'clustername': 'Unknown',
  'individual': 'Unknown',
  'offset': 15.695,
  'onset': 15.572,
  'species': 'Unknown'},
 {'clustername': 'Unknown',
  'individual': 'Unknown',
  'offset': 16.772,
  'onset': 16.715,
  'species': 'Unknown'},
 {'clustername': 'Unknown',
  'individual': 'Unknown',
  'offset': 17.028,
  'onset': 16.983,
  'species': 'Unknown'}]

## Human-in-the-loop Training WhisperSeg Pipeline

In [1]:
import requests, json
import pandas as pd

### Upload file

In [10]:
# Read the example recording for the human-in-the-loop walkthrough as raw bytes.
with open("example_audios/human-in-the-loop-training-example/audio.wav", mode="rb") as training_audio_file:
    file_data = training_audio_file.read()

# Upload it; every spectrogram setting is sent as None in this example.
training_upload_form = {
    "hop_length": None,
    "num_spec_columns": None,
    "sampling_rate": None,
    "spec_cal_method": None,
    "n_fft": None,
    "bins_per_octave": None,
    "min_frequency": None,
    "max_frequency": None,
}
training_upload_reply = requests.post(
    "http://localhost:8050/upload",
    files={"newAudioFile": file_data},
    data=training_upload_form,
)
results = training_upload_reply.json()
# The id of the first channel is what the training and segmentation requests below refer to.
audio_id = results["channels"][0]["audio_id"]
audio_id

'604d62fc-c11c-4c98-ae0f-cbe915fc2186'

### Human Annotation 
Suppose a human annotator has annotated the first 15 seconds of the audio.

In [11]:
# Load the human annotations that accompany the example recording and preview the first rows.
df = pd.read_csv(filepath_or_buffer="example_audios/human-in-the-loop-training-example/annotations.csv")
df.head(5)

Unnamed: 0,onset,offset,species,individual,clustername,filename,channelIndex
0,0.0,0.121816,zebra finch,bird1,U,audio.wav,0
1,0.178039,0.271743,zebra finch,bird1,U,audio.wav,0
2,0.290484,0.393559,zebra finch,bird1,U,audio.wav,0
3,0.440411,0.599709,zebra finch,bird1,A,audio.wav,0
4,0.599709,0.862082,zebra finch,bird1,B,audio.wav,0


### Check the models available for finetuning and select one

In [29]:
# Query the endpoint that lists the models available for finetuning, to pick a starting model.
base_model_reply = requests.post(
    "http://localhost:8050/list-models-available-for-finetuning",
    headers={"content-type": "application/json"},
)
response = base_model_reply.json()
response

{'response': [{'eta': '--:--:--',
   'model_name': 'whisperseg-base',
   'status': 'ready'},
  {'eta': '--:--:--', 'model_name': 'whisperseg-large', 'status': 'ready'},
  {'eta': '--:--:--',
   'model_name': 'r3428-99dph-whisperseg_base',
   'status': 'ready'},
  {'eta': '--:--:--',
   'model_name': 'r3428-99dph-whisperseg-base-v2.0',
   'status': 'ready'},
  {'eta': '--:--:--',
   'model_name': 'r3428-99dph-whisperseg-large',
   'status': 'ready'}]}

Suppose the user selects the model "whisperseg-base".

### Give a new name to the finetuned model
To choose a name, the user can view the full list of existing models, and the frontend will check that the given new_model_name is unique.

The list of existing model names can be obtained by the API calls below:

In [30]:
# Concatenate the "response" lists of the three listing endpoints, in this order:
# available for finetuning, available for inference, training in progress.
# A model returned by more than one endpoint appears more than once in the result,
# so a uniqueness check should compare against the "model_name" values.
model_entries = []
for listing_url in (
    "http://localhost:8050/list-models-available-for-finetuning",
    "http://localhost:8050/list-models-available-for-inference",
    "http://localhost:8050/list-models-training-in-progress",
):
    model_entries.extend(requests.post(listing_url).json()["response"])
all_models = model_entries
all_models

[{'eta': '--:--:--', 'model_name': 'whisperseg-base', 'status': 'ready'},
 {'eta': '--:--:--', 'model_name': 'whisperseg-large', 'status': 'ready'},
 {'eta': '--:--:--',
  'model_name': 'r3428-99dph-whisperseg_base',
  'status': 'ready'},
 {'eta': '--:--:--',
  'model_name': 'r3428-99dph-whisperseg-base-v2.0',
  'status': 'ready'},
 {'eta': '--:--:--',
  'model_name': 'r3428-99dph-whisperseg-large',
  'status': 'ready'},
 {'eta': '--:--:--', 'model_name': 'whisperseg-base', 'status': 'ready'},
 {'eta': '--:--:--', 'model_name': 'whisperseg-large', 'status': 'ready'},
 {'eta': '--:--:--',
  'model_name': 'r3428-99dph-whisperseg_base',
  'status': 'ready'},
 {'eta': '--:--:--',
  'model_name': 'r3428-99dph-whisperseg-base-v2.0',
  'status': 'ready'},
 {'eta': '--:--:--',
  'model_name': 'r3428-99dph-whisperseg-large',
  'status': 'ready'}]

### Submit the training request

In [16]:
# Inputs for the finetuning request (the audio itself is referenced by `audio_id` in the request cell).

# Span of the audio that the human annotations cover.
# NOTE(review): the offset here is 40, while the accompanying text speaks of the
# first 15 seconds being annotated — confirm which value is intended.
annotated_areas = [ { "onset":0, "offset":40 } ]

# One dict per annotation row. The numeric fields are cast to built-in float/int
# explicitly so that json.dumps can serialize them regardless of the scalar types
# pandas hands back.
human_labels = [
    {
        **record,
        "onset": float(record["onset"]),
        "offset": float(record["offset"]),
        "channelIndex": int(record["channelIndex"]),
    }
    for record in df.to_dict("records")
]

new_model_name = "whisperseg-base-debug-v3.0"
# NOTE(review): "inital" is misspelled, but the name is kept because the request cell refers to it.
inital_model_name = "whisperseg-base"
min_frequency = 0

In [17]:
# Submit the finetuning job using the inputs prepared in the previous cell.
# NOTE(review): the key "inital_model_name" is misspelled but is sent to the server
# exactly as written; do not correct it here without checking the server side.
finetune_payload = {
    "audio_id": audio_id,
    "annotated_areas": annotated_areas,
    "human_labels": human_labels,
    "new_model_name": new_model_name,
    "inital_model_name": inital_model_name,
    "min_frequency": min_frequency,
}
finetune_reply = requests.post(
    "http://localhost:8050/finetune-whisperseg",
    data=json.dumps(finetune_payload),
    headers={"content-type": "application/json"},
)
response = finetune_reply.json()

response

{'message': 'Training'}

Let's check the status of the model being trained.

In [23]:
# Fetch the in-progress list; an empty list means no model is currently reported as training.
requests.post(  "http://localhost:8050/list-models-training-in-progress" ).json()["response"] 

[]

### Use the finetuned model to segment the rest of the audio

In [24]:
# Segment the audio with the model that was just finetuned.
# The model name and min_frequency come from the variables set in the training-input
# cell (same values as the literals previously repeated here), so renaming the model
# there cannot leave this request pointing at a stale name.
finetuned_labels_payload = {
    "audio_id": audio_id,
    "annotated_areas": annotated_areas,
    "human_labels": human_labels,
    "model_name": new_model_name,
    "min_frequency": min_frequency,
}
response = requests.post(
    "http://localhost:8050/get-labels",
    data=json.dumps(finetuned_labels_payload),
    headers={"content-type": "application/json"},
).json()

In [17]:
# response["labels"]

In [25]:
# Turn the returned labels into a table, add the constant file name and channel index,
# and keep the columns in the same order as the annotation CSV loaded earlier.
out_df = (
    pd.DataFrame(response["labels"])
    .assign(filename="audio.wav", channelIndex=0)
    [["onset", "offset", "species", "individual", "clustername", "filename", "channelIndex"]]
)

In [26]:
# Display the prediction table (the notebook display elides the middle rows).
out_df

Unnamed: 0,onset,offset,species,individual,clustername,filename,channelIndex
0,0.000000,0.121816,zebra finch,bird1,U,audio.wav,0
1,0.178039,0.271743,zebra finch,bird1,U,audio.wav,0
2,0.290484,0.393559,zebra finch,bird1,U,audio.wav,0
3,0.440411,0.599709,zebra finch,bird1,A,audio.wav,0
4,0.599709,0.862082,zebra finch,bird1,B,audio.wav,0
...,...,...,...,...,...,...,...
416,204.103000,204.303000,zebra finch,bird1,C,audio.wav,0
417,204.310000,204.517000,zebra finch,bird1,A,audio.wav,0
418,204.520000,204.770000,zebra finch,bird1,B,audio.wav,0
419,204.853000,205.027000,zebra finch,bird1,A,audio.wav,0


In [28]:
# Write the predictions next to the example audio, without the DataFrame index column.
out_df.to_csv(
    path_or_buf="example_audios/human-in-the-loop-training-example/pred_annotations.csv",
    index=False,
)

This prediction can be rendered on the frontend.