In [None]:
from msclap import CLAP
import torch.nn.functional as F
import gradio as gr

In [None]:
# Candidate labels for zero-shot classification.
# Labels should be lower case; multi-word labels are allowed.
classes = [
    'clap', 'clapping', 'snapping', 'snap',
    'smack', 'talking', 'speaking', 'voice',
]
# Expected label(s) for the sample clip; only echoed when results are printed.
ground_truth = ['clapping']

# Each label is prefixed with this prompt before text embeddings are computed.
prompt = 'this is a sound of '
class_prompts = [f"{prompt}{label}" for label in classes]

# Audio file(s) to classify, given as a list of paths.
file = ['tutorials/audio-samples/angelo-clap1.mp3']

In [None]:
def result_to_str(values, indices, labels=None):
    """Format scores as right-aligned ``label: percent%`` strings.

    Args:
        values: Iterable of scores. Each one is converted with ``float()``
            (so 0-dim tensors and plain numbers both work) and rendered as
            a percentage with two decimals.
        indices: Iterable of integer positions into ``labels``, paired
            element-wise with ``values``.
        labels: Optional sequence of label names to index into. Defaults to
            the module-level ``classes`` list.

    Returns:
        list[str]: One formatted line per (value, index) pair, in input order.
    """
    if labels is None:
        labels = classes
    # Use distinct loop names: rebinding `values`/`indices` inside the loop
    # and then reading an unrelated name was the original failure.
    return [
        f"{labels[int(index)]:>16s}: {100 * float(value):.2f}%"
        for value, index in zip(values, indices)
    ]


In [None]:
def print_result(strlist):
    """Print the ground truth followed by the predictions, one per line.

    Args:
        strlist: Formatted prediction strings (for example the list returned
            by ``result_to_str``). A single string is treated as one
            prediction.
    """
    if isinstance(strlist, str):
        strlist = [strlist]
    print("Ground Truth: {}".format(ground_truth))
    print("Top predictions:\n")
    # Print line by line so the right-aligned labels line up, instead of
    # dumping the list's repr on a single line.
    for line in strlist:
        print(line)
    

In [None]:
# Loaded CLAP models keyed by (version, use_cuda), so the model is built once
# instead of on every classify() call (e.g. each live Gradio update).
_clap_model_cache = {}


def _get_clap_model(version='2023', use_cuda=False):
    """Return the CLAP model for these settings, creating it on first use."""
    key = (version, use_cuda)
    if key not in _clap_model_cache:
        # Setting use_cuda = True will load the model on a GPU using CUDA
        _clap_model_cache[key] = CLAP(version=version, use_cuda=use_cuda)
    return _clap_model_cache[key]


def classify(audio_files):
    """Zero-shot classify audio against the module-level ``class_prompts``.

    Args:
        audio_files: A list of audio file paths, or a single path string
            (which is wrapped in a list). Only the first file's scores are
            reported.

    Returns:
        list[str]: Up to five formatted ``label: percent%`` lines for the
        first audio file, best match first. An empty list is returned when
        no audio is given (``None``, ``""`` or ``[]``).
    """
    if not audio_files:
        return []
    if isinstance(audio_files, str):
        # A bare path would otherwise be iterated character by character.
        audio_files = [audio_files]

    clap_model = _get_clap_model(version='2023', use_cuda=False)

    # compute text embeddings from natural text
    text_embeddings = clap_model.get_text_embeddings(class_prompts)

    # compute the audio embeddings from the audio file(s)
    audio_embeddings = clap_model.get_audio_embeddings(audio_files, resample=True)

    # compute the similarity between audio_embeddings and text_embeddings
    similarity = clap_model.compute_similarity(audio_embeddings, text_embeddings)
    similarity = F.softmax(similarity, dim=1)

    # topk raises if k exceeds the number of prompts, so cap it.
    top_k = min(5, similarity.shape[-1])
    values, indices = similarity[0].topk(top_k)

    # Format once and reuse: print_result takes the formatted lines, not the
    # raw (values, indices) pair.
    predictions = result_to_str(values, indices)
    print_result(predictions)

    return predictions
    
    

In [None]:
# Run zero-shot classification on the sample clip(s) listed in `file`.
test = classify(file)

In [None]:
# Show the value returned by classify for the sample clip(s).
print(test)

In [None]:
def transcribe(audio):
    """Classify a recording and return the predictions as one text block.

    The candidate labels are the ones configured at module level (``classes``
    / ``class_prompts``); ``classify`` takes no per-call label list.

    Args:
        audio: Path to an audio file, or a list of such paths. ``None`` or an
            empty value means there is nothing to classify.

    Returns:
        str: The prediction lines from ``classify`` joined with newlines, or
        an empty string when no audio was given.
    """
    if not audio:
        return ""
    # classify works on a list of paths; wrap a single path.
    audio_files = [audio] if isinstance(audio, str) else audio
    return "\n".join(classify(audio_files))

In [None]:
# Live microphone demo. With type="filepath" the callback receives a single
# file path (or nothing when there is no recording), while classify is written
# for a list of paths and returns a list of strings. The inline adapter wraps
# the path in a list, joins the predictions into one block of text for the
# textbox, and returns an empty string when there is no recording.
gr.Interface(
    fn=lambda audio_path: "\n".join(classify([audio_path])) if audio_path else "",
    inputs=[gr.Audio(sources="microphone", type="filepath")],
    outputs=["textbox"],
    live=True).launch()