In [None]:
# Peter John Enriquez
# 2019-03086
# ECE 197 DL Z-MWZ

## Project 2 - Zero shot Key Word Spotting (KWS) using ImageBind
`ImageBind` is a large multimodal model which learns a joint embedding across six different modalities - images, text, audio, depth, thermal, and IMU data. It enables novel emergent applications ‘out-of-the-box’ including cross-modal retrieval, composing modalities with arithmetic, cross-modal detection and generation. It has demonstrated competitive zero-shot capabilities and here, we are to test its accuracy in terms of zero-shot keyword spotting.

The `KWS` task here uses the `speech commands v2` dataset, which consists of single-word utterances such as "Yes", "No", "Left", and "Right" in 37 categories (including silence and unknown). It can be downloaded from torchaudio datasets.

In [None]:
%pip install torch
%pip install torchvision
%pip install torchaudio

# add. for ImageBind
%pip install pytorchvideo 
%pip install timm
%pip install ftfy
%pip install regex
%pip install einops
%pip install fvcore
%pip install decord
%pip install iopath
%pip install numpy
%pip install matplotlib
%pip install types-regex
%pip install mayavi
%pip install cartopy

# add. for UI
%pip install gradio
%sudo apt-get install libportaudio2
%pip install sounddevice

# add. for validation run
%pip install torchmetrics



In [1]:
# standard library
import os
import random
import time  # used by the validation run to measure execution time

# third-party
import gradio as gr
import numpy as np
import torch
import torchaudio
import torchaudio.datasets
from torchmetrics import Accuracy

# ImageBind project modules (local)
import data
from models import imagebind_model
from models.imagebind_model import ModalityType




## Directory Arrangement
The following directory arrangement was used to run this code. If errors regarding directories are encountered, you can review the following:
* .assets
  * bird_audio.wav
  ...
* .checkpoints
  * imagebind_huge.pth
* bpe
  * bpe_simple_vocab_16e6.txt.gz
* data
  * speech_commands
    * SpeechCommands
      * speech_commands_v0.02
        * _background_noise_
* models
  * \_\_init\_\_.py
* test_log
  * validation_run1
  * validation_run2
* data.py  
* project2_demo.ipynb
* project2_valtest.py

## ImageBind Zero-Shot Key Word Spotting Demonstration
The following code is a demonstration of ImageBind that takes an input from the user. An audio clip can be picked at random from the KWS test split by ticking the checkbox, or the user can record their own voice by clicking the 'record from microphone' button.

In [None]:
## FOR DEMO
# The demo runs on the CPU; switch to the commented line to use CUDA when available.
device = "cpu"
#device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# DATASET SETUP
# Candidate labels that are offered to the model as text prompts.
CLASSES = [
    'silence', 'unknown', 'backward', 'bed', 'bird', 'cat', 'dog', 'down',
    'eight', 'five', 'follow', 'forward', 'four', 'go', 'happy', 'house',
    'learn', 'left', 'marvin', 'nine', 'no', 'off', 'on', 'one', 'right',
    'seven', 'sheila', 'six', 'stop', 'three', 'tree', 'two', 'up', 'visual',
    'wow', 'yes', 'zero',
]

# Label name -> position of that label in CLASSES.
CLASS_TO_IDX = dict(zip(CLASSES, range(len(CLASSES))))

# Folder that holds (or will receive) the Speech Commands download.
data_path = os.path.join('data', 'speech_commands')
if not os.path.exists(data_path):
    os.makedirs(data_path, exist_ok=True)

kws_test_dataset = torchaudio.datasets.SPEECHCOMMANDS(
    data_path,
    download=True,
    subset='testing',
)

# INSTANTIATE MODEL
# Load the pretrained weights, switch to eval mode, then move the model to `device`.
model = imagebind_model.imagebind_huge(pretrained=True).eval().to(device)

def demo(kws_state=False, audio_input=None):
    """Predict the keyword spoken in one audio clip with ImageBind.

    Args:
        kws_state: When truthy, ignore ``audio_input`` and use a randomly
            chosen clip from ``kws_test_dataset``.
        audio_input: File path of a user-supplied recording. Only used when
            ``kws_state`` is falsy.

    Returns:
        A ``(target, prediction, audio_path)`` tuple. ``target`` is the label
        taken from the dataset metadata, or ``None`` for a user recording.
        ``prediction`` is the entry of ``CLASSES`` with the highest
        audio-to-text score. ``audio_path`` is the file that was classified.

    Raises:
        gr.Error: If the checkbox is not ticked and no audio was provided.
    """
    # INPUT DATA
    if kws_state:  # use random kws test data
        random_index = random.randrange(len(kws_test_dataset))
        audio_rel_path, sample_rate, target, _, _ = kws_test_dataset.get_metadata(random_index)
        audio_path = os.path.join(data_path, 'SpeechCommands', audio_rel_path)
    elif audio_input is not None:
        target = None
        # NOTE(review): 44100 is assumed to be the rate of Gradio microphone
        # recordings; it is also applied to the bundled example files. Confirm
        # how data.load_and_transform_audio_data interprets `sample_rate`.
        sample_rate = 44100
        audio_path = audio_input
    else:
        # Without this branch audio_path/sample_rate would be unbound below and
        # the UI would only show an opaque UnboundLocalError.
        raise gr.Error(
            "No audio to classify: tick 'Use Random KWS Test Data' or record audio first."
        )

    # LOAD DATA
    inputs = {
        ModalityType.TEXT: data.load_and_transform_text(CLASSES, device),
        ModalityType.AUDIO: data.load_and_transform_audio_data(
            [audio_path],
            device,
            target_length=204,  # def. target length=204
            sample_rate=sample_rate,
            num_mel_bins=128,
        ),
    }
    with torch.no_grad():
        embeddings = model(inputs)

    # Score the audio embedding against every class-name embedding.
    text_probs = torch.softmax(
        embeddings[ModalityType.AUDIO] @ embeddings[ModalityType.TEXT].T, dim=-1
    )

    index = np.argmax(text_probs.cpu().numpy())
    prediction = CLASSES[index]

    return target, prediction, audio_path

# Define the input interfaces
# The checkbox value is passed to demo() as `kws_state`; the recording is
# passed as a file path in `audio_input`.
button_input = gr.Checkbox(label="Use Random KWS Test Data")
audio_input = gr.Audio(label="Audio Input", source="microphone", type="filepath")

# Define the output interfaces (same order as the tuple returned by demo())
output_target = gr.Textbox(label="Target")
output_prediction = gr.Textbox(label="Prediction")
output_audio = gr.Audio(label="Audio Output")

# Define the main Gradio interface
iface = gr.Interface(
    fn=demo,
    inputs=[button_input, audio_input],
    outputs=[output_target, output_prediction, output_audio],
    title="Zero-Shot Keyword Spotting using ImageBind",
    # The interface has no text input, so describe the two inputs it really has.
    description="Tick the checkbox to use a random KWS test clip, or record audio from the microphone, to make predictions.",
    # The checkbox slot is None in every example.
    # NOTE(review): presumably so Gradio leaves that column out of the examples table - confirm.
    examples=[[None, ".assets/bird_audio.wav"], [None, ".assets/dog_audio.wav"]],
)

# Launch the interface
iface.launch()

Running on local URL:  http://127.0.0.1:7861

To create a public link, set `share=True` in `launch()`.






## Validation Run
The following code was used to run a validation/test of a `pretrained ImageBind` model on the `KWS test split`. It is the same code as in `project2_valtest.py`; the results of each run and the text file listing the targets and predictions are in the `test_log` folder. Two runs were made, and both resulted in the same accuracy of 0.028. The summary statistics are shown below:
* **Validation Run 1**
  * accuracy: 0.02762380801141262
  * correct predictions: 304
  * number of datapoints: 11005
  * unaccounted data(error):0
  * Manual accuracy calculation: 0.027623807360290777
  * Execution time: 3796.04176735878 seconds
* **Validation Run 2**
  * accuracy: 0.02762380801141262
  * correct predictions: 304
  * number of datapoints: 11005
  * unaccounted data(error):0
  * Manual accuracy calculation: 0.027623807360290777
  * Execution time: 5979.099710226059 seconds

In [None]:
## FOR VALIDATION RUN
# Zero-shot evaluation of the pretrained ImageBind model on the Speech Commands
# test split: every clip is scored against the class names in CLASSES, the
# targets/predictions are written to test_run_data.txt, and a summary is printed.
start_time = time.time()

#device = "cpu"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# SETUP DATASET
class args:
    # Folder that holds (or will receive) the Speech Commands download.
    path = os.path.join('data','speech_commands')

CLASSES = ['silence', 'unknown', 'backward', 'bed', 'bird', 'cat', 'dog', 'down', 'eight', 'five', 'follow',
               'forward', 'four', 'go', 'happy', 'house', 'learn', 'left', 'marvin', 'nine', 'no',
               'off', 'on', 'one', 'right', 'seven', 'sheila', 'six', 'stop', 'three',
               'tree', 'two', 'up', 'visual', 'wow', 'yes', 'zero']

# make a dictionary from CLASSES to integers
CLASS_TO_IDX = {c: i for i, c in enumerate(CLASSES)}
print(CLASS_TO_IDX)

# exist_ok=True already makes this a no-op when the folder is present.
os.makedirs(args.path, exist_ok=True)

kws_test_dataset = torchaudio.datasets.SPEECHCOMMANDS(args.path, download=True, subset='testing')


# INSTANTIATE MODEL
model = imagebind_model.imagebind_huge(pretrained=True)
model.eval()
model.to(device)

# FOR PREDICTION
# The text prompts are identical for every clip, so prepare them once instead
# of once per iteration.
text_inputs = data.load_and_transform_text(CLASSES, device)

list_length = len(kws_test_dataset)
targets = []
predictions = []
correct_count = 0
accounted_count = 0
error_count = 0
for sample_idx in range(list_length):
    try:
        audio_rel_path, sample_rate, label, _, _ = kws_test_dataset.get_metadata(sample_idx)
        audio_path = os.path.join(args.path, 'SpeechCommands', audio_rel_path)
        target_idx = CLASS_TO_IDX[label]

        # Load data
        inputs = {
            ModalityType.TEXT: text_inputs,
            ModalityType.AUDIO: data.load_and_transform_audio_data(
                [audio_path], device, clip_duration=1, target_length=204,  # default target length=204
                sample_rate=sample_rate, num_mel_bins=128),
        }

        with torch.no_grad():
            embeddings = model(inputs)

        text_probs = torch.softmax(embeddings[ModalityType.AUDIO] @ embeddings[ModalityType.TEXT].T, dim=-1)
        predicted_idx = int(np.argmax(text_probs.cpu().numpy()))
    except Exception as err:
        # Best effort: skip the clip but report why. `Exception` (not a bare
        # except) lets Ctrl-C / kernel interrupt stop this long-running loop.
        error_count += 1
        print(f'  skipped sample {sample_idx}: {err!r}')
        continue

    # Record target and prediction together, only after both exist, so the two
    # lists always stay the same length and aligned.
    targets.append(target_idx)
    predictions.append(predicted_idx)
    accounted_count += 1
    if predicted_idx == target_idx:
        correct_count += 1

if accounted_count == 0:
    # Nothing to score; fail with a clear message instead of a ZeroDivisionError below.
    raise RuntimeError('No test samples could be evaluated; see the skipped-sample messages above.')

# Save a Copy of the Test Run Data
# Line 1 holds the targets, line 2 the predictions (space-separated class indices).
test_run_data = [targets, predictions]
with open('test_run_data.txt', 'w') as f:
    for sublist in test_run_data:
        line = ' '.join([str(item) for item in sublist])
        f.write(line + '\n')

# Compute Accuracy
targets = torch.tensor(targets)
predictions = torch.tensor(predictions)
accuracy = Accuracy('multiclass',num_classes=len(CLASSES))
accuracy(predictions, targets)
IMB_accuracy = accuracy.compute()

end_time = time.time()
execution_time = end_time - start_time

print("Run Summary:")
print(f'  accuracy: {IMB_accuracy}')
print(f'  correct predictions: {correct_count}')
print(f'  number of datapoints: {accounted_count}')
print(f'  unaccounted data(error):{error_count}')  
print(f'  Manual accuracy calculation: {correct_count/accounted_count}' )
print(f"  Execution time: {execution_time} seconds")


## Comparison
The following table compares ImageBind's speech-to-text performance on the `speech commands v2` dataset with that of other SOTA models. ImageBind performs noticeably worse in terms of accuracy (0.028 is roughly the 1/37 ≈ 0.027 expected from random guessing over 37 classes). One possible reason is that ImageBind is trained and tested on binding text to sound events rather than to speech. It should therefore perform better when the audio input is a sound event, as with the example .wav files in the demo.

| Models | Training Type | Evaluation | Accuracy | Notes |
|----------|----------|----------|----------|----------|
| ImageBind | Self-Supervised | Zero-shot | 0.028   | None | 
| [M2D](https://paperswithcode.com/paper/masked-modeling-duo-learning-representations) | Self-Supervised | Not ZS | 0.985 | None |
| [wav2vec2-conformer-rel-pos-large ](https://huggingface.co/juliensimon/wav2vec2-conformer-rel-pos-large-finetuned-speech-commands) | Self-Supervised | Not ZS | 0.972 | None |
| [AST-P(Gong et al.)](https://huggingface.co/MIT/ast-finetuned-speech-commands-v2) | Supervised | Not ZS | 0.981 | None |
| [hubert-base-ls960](https://huggingface.co/superb/hubert-base-superb-ks) | Supervised | Zero-Shot | 0.963 | Limited to 10+2 classes |
