In [1]:
# Install the following libraries before running this notebook:
#   CLAP: pip install msclap        (requires Python 3.11)

from msclap import CLAP
import torch.nn.functional as F


In [2]:
# Instantiate the CLAP model, selecting the '2023' version.
# use_cuda=False is passed here; switch it to True to load the model
# on a GPU using CUDA.
clap_model = CLAP(
    version='2023',
    use_cuda=False,
)

In [3]:
# Candidate labels for zero-shot classification.
# Labels should be lower case; multi-word labels are allowed.
classes = [
    'clap', 'clapping', 'snapping', 'snap',
    'smack', 'talking', 'speaking', 'voice',
]
ground_truth = ['clapping']

# Prefix every label with a short natural-language prompt
prompt = 'this is a sound of '
class_prompts = [f"{prompt}{label}" for label in classes]

# Turn the prompted labels into text embeddings
text_embeddings = clap_model.get_text_embeddings(class_prompts)

In [5]:
# Audio clip(s) to classify (path is relative to the working directory)
audio_files = ['tutorials/audio-samples/angelo-clap1.mp3']

# Embed the listed audio files.
# NOTE(review): resample=True presumably resamples the audio to the rate the
# model expects — confirm against the msclap documentation.
audio_embeddings = clap_model.get_audio_embeddings(
    audio_files,
    resample=True,
)

# Score the audio embeddings against the text embeddings
similarity = clap_model.compute_similarity(
    audio_embeddings,
    text_embeddings,
)

In [6]:
# Normalise the similarity scores with a softmax over dim 1.
# The result is bound to a NEW name rather than overwriting `similarity`:
# reassigning `similarity` made this cell non-idempotent, because re-running
# it applied softmax to already-normalised probabilities and silently changed
# the reported scores.
probabilities = F.softmax(similarity, dim=1)

# Keep the highest-scoring entries of the first row. topk raises when k
# exceeds the size of the dimension, so clamp k for short class lists.
top_k = min(5, probabilities.shape[1])
values, indices = probabilities[0].topk(top_k)

In [7]:
# Report the reference label, then each top-ranked class with its score
# expressed as a percentage.
print("Ground Truth: {}".format(ground_truth))
print("Top predictions:\n")
for score, class_idx in zip(values.tolist(), indices.tolist()):
    label = classes[class_idx]
    print(f"{label:>16s}: {100 * score:.2f}%")

Ground Truth: ['clapping']
Top predictions:

           smack: 83.25%
        snapping: 9.13%
            snap: 4.36%
        clapping: 2.11%
            clap: 1.03%
