In [1]:
from msclap import CLAP

# Load model (Choose between versions '2022' or '2023')
# The model weight will be downloaded automatically if `model_fp` is not specified
clap_model = CLAP(version = '2023', use_cuda=False)

CLAP_weights_2023.pth:   0%|          | 0.00/690M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [3]:
from esc50_dataset import ESC50
import torch.nn.functional as F
import numpy as np
from tqdm import tqdm
from sklearn.metrics import accuracy_score

## Zero-shot Classification

In [4]:
# Load dataset
root_path = "./ESC-50-master" # Folder with ESC-50-master/
dataset = ESC50(root=root_path, download=True) #If download=False code assumes base_folder='ESC-50-master' in esc50_dataset.py
prompt = 'this is the sound of '
y = [prompt + x for x in dataset.classes]

ESC-50-master.zip:  2.45 MB/s


Loading audio files


2000it [00:00, 13724.21it/s]


In [5]:
# Computing text embeddings
text_embeddings = clap_model.get_text_embeddings(y)

In [6]:
# Computing audio embeddings
y_preds, y_labels = [], []
for i in tqdm(range(len(dataset))):
    x, _, one_hot_target = dataset.__getitem__(i)
    audio_embeddings = clap_model.get_audio_embeddings([x], resample=True)
    similarity = clap_model.compute_similarity(audio_embeddings, text_embeddings)
    y_pred = F.softmax(similarity.detach().cpu(), dim=1).numpy()
    y_preds.append(y_pred)
    y_labels.append(one_hot_target.detach().cpu().numpy())


y_labels, y_preds = np.concatenate(y_labels, axis=0), np.concatenate(y_preds, axis=0)
acc = accuracy_score(np.argmax(y_labels, axis=1), np.argmax(y_preds, axis=1))
print('ESC50 Accuracy {}'.format(acc))

100%|██████████| 2000/2000 [08:45<00:00,  3.81it/s]

ESC50 Accuracy 0.9385





## Zero-shot Prediction

In [10]:
# Define classes for zero-shot
# Should be in lower case and can be more than one word
classes = ['coughing','sneezing','drinking sipping', 'breathing', 'brushing teeth']
ground_truth = ['coughing']
# Add prompt
prompt = 'this is a sound of '
class_prompts = [prompt + x for x in classes]
#Load audio files
audio_files = [x for x in dataset.audio_paths[:20]]

In [11]:
# compute text embeddings from natural text
text_embeddings = clap_model.get_text_embeddings(class_prompts)

# compute the audio embeddings from an audio file
audio_embeddings = clap_model.get_audio_embeddings(audio_files, resample=True)

# compute the similarity between audio_embeddings and text_embeddings
similarity = clap_model.compute_similarity(audio_embeddings, text_embeddings)

similarity = F.softmax(similarity, dim=1)
values, indices = similarity[0].topk(5)

In [12]:
# Print the results
print("Ground Truth: {}".format(ground_truth))
print("Top predictions:\n")
for value, index in zip(values, indices):
    print(f"{classes[index]:>16s}: {100 * value.item():.2f}%")

Ground Truth: ['coughing']
Top predictions:

        sneezing: 95.66%
        coughing: 2.85%
  brushing teeth: 0.73%
       breathing: 0.52%
drinking sipping: 0.24%


## Audio Captioning

In [15]:
# Load and initialize CLAP
clap_model = CLAP(version='clapcap', use_cuda=False)

# Load audio files
audio_files = [x for x in dataset.audio_paths[:5]]

# Generate captions with progress bar
captions = []
for path in tqdm(audio_files, desc="Generating captions", ncols=80):
    caption = clap_model.generate_caption(
        [path],
        resample=True,
        beam_size=5,
        entry_length=67,
        temperature=0.01
    )[0]
    captions.append(caption)

# Print results
for audio, caption in zip(audio_files, captions):
    print(f"Audio file: {audio}\nGenerated caption: {caption}\n")

Generating captions: 100%|████████████████████████| 5/5 [00:43<00:00,  8.60s/it]

Audio file: ./ESC-50-master/ESC-50-master/audio/1-100032-A-0.wav
Generated caption: A dog barks and then barks again. 

Audio file: ./ESC-50-master/ESC-50-master/audio/1-100038-A-14.wav
Generated caption: A variety of birds are chirping and chirping in the background. 

Audio file: ./ESC-50-master/ESC-50-master/audio/1-100210-A-36.wav
Generated caption: A machine is running and then it starts to run again. 

Audio file: ./ESC-50-master/ESC-50-master/audio/1-100210-B-36.wav
Generated caption: A machine is running and then starts to whirr. 

Audio file: ./ESC-50-master/ESC-50-master/audio/1-101296-A-19.wav
Generated caption: A thunderstorm is making its way through the forest. 




