In [13]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
import pandas as pd

In [2]:
from torch.utils.data import Dataset
from tqdm import tqdm
from pathlib import Path
import pandas as pd
import os
import torch.nn as nn
import torch

class AudioDataset(Dataset):
    """Abstract base for audio datasets that live under a root directory.

    Concrete datasets are expected to override ``download``, ``__getitem__``
    and ``__len__``; the versions here only raise ``NotImplementedError``.
    """

    def __init__(self, root: str, download: bool = True):
        """Remember the dataset root and optionally fetch the data.

        Args:
            root: Directory of the dataset; a leading ``~`` is expanded.
            download: When true, ``self.download()`` is invoked right away.
        """
        expanded_root = os.path.expanduser(root)
        self.root = expanded_root
        if not download:
            return
        self.download()

    def __len__(self):
        """Number of items; must be provided by the subclass."""
        raise NotImplementedError

    def __getitem__(self, index):
        """Item lookup; must be provided by the subclass."""
        raise NotImplementedError

    def download(self):
        """Data retrieval hook; must be provided by the subclass."""
        raise NotImplementedError


class ESC50(AudioDataset):
    """ESC-50 dataset exposed as audio file paths with their class labels.

    Items are ``(file_path, class_name, one_hot_target)`` triples; the audio
    itself is not decoded by this class.
    """
    base_folder = 'ESC-50-master'
    url = "https://github.com/karoldvl/ESC-50/archive/master.zip"
    filename = "ESC-50-master.zip"
    num_files_in_dir = 2000
    audio_dir = 'audio'
    label_col = 'category'
    file_col = 'filename'
    meta = {
        'filename': os.path.join('meta','esc50.csv'),
    }

    def __init__(self, root, reading_transformations: nn.Module = None, download: bool = True):
        """Optionally download the archive, then index every clip listed in the metadata CSV.

        Args:
            root: Directory that contains (or will receive) ``ESC-50-master/``.
            reading_transformations: Stored as ``self.pre_transformations``;
                not applied anywhere in this class.
            download: When true, ``download()`` runs before the metadata is read.
        """
        # Forward the flag: without it the base-class default (True) always
        # triggered a download, even when the caller passed download=False.
        super().__init__(root, download=download)
        self._load_meta()

        self.targets, self.audio_paths = [], []
        self.pre_transformations = reading_transformations
        print("Loading audio files")
        # Class names in `self.classes` use spaces, so the label column must match.
        self.df[self.label_col] = self.df[self.label_col].str.replace('_',' ')

        for _, row in tqdm(self.df.iterrows()):
            file_path = os.path.join(self.root, self.base_folder, self.audio_dir, row[self.file_col])
            self.targets.append(row[self.label_col])
            self.audio_paths.append(file_path)

    def _load_meta(self):
        """Read the metadata CSV into ``self.df`` and build the class name <-> index tables."""
        path = os.path.join(self.root, self.base_folder, self.meta['filename'])

        self.df = pd.read_csv(path)
        # Sort on the raw labels first, then swap underscores for spaces, so the
        # class order (and therefore every class index) is unchanged.
        self.classes = [x.replace('_',' ') for x in sorted(self.df[self.label_col].unique())]
        self.class_to_idx = {category: i for i, category in enumerate(self.classes)}

    def __getitem__(self, index):
        """
        Args:
            index (int): Index
        Returns:
            tuple: (file_path, target, one_hot_target) where file_path is the path of
                the audio file, target is the class name and one_hot_target is a
                float tensor of shape (1, number of classes).
        """
        file_path, target = self.audio_paths[index], self.targets[index]
        one_hot_target = torch.zeros(len(self.classes))
        one_hot_target[self.class_to_idx[target]] = 1
        return file_path, target, one_hot_target.reshape(1,-1)

    def __len__(self):
        """Number of indexed audio files."""
        return len(self.audio_paths)

    def download(self):
        """Fetch the ESC-50 archive into ``self.root`` (if missing) and extract it.

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        import requests
        from zipfile import ZipFile

        file = Path(self.root) / self.filename
        extracted_dir = Path(self.root) / self.base_folder
        freshly_downloaded = False

        if not file.is_file():
            # To prevent partial downloads, download to a temp file first
            tmp = file.with_suffix('.tmp')
            tmp.parent.mkdir(parents=True, exist_ok=True)
            with requests.get(self.url, stream=True) as r:
                # Fail early instead of saving an error page as the archive.
                r.raise_for_status()
                with open(tmp, 'wb') as f, \
                        tqdm(unit=" MB", bar_format=f'{file.name}: {{rate_noinv_fmt}}') as pbar:
                    for chunk in r.iter_content(chunk_size=1024):
                        if chunk:
                            pbar.update(len(chunk) / 1024 / 1024)
                            f.write(chunk)

            # move temp file to correct location
            tmp.replace(file)
            freshly_downloaded = True

        # Extract after a fresh download, and also when an archive is already
        # present but its contents were never (fully) unpacked.
        if freshly_downloaded or not extracted_dir.is_dir():
            with ZipFile(file, 'r') as archive:
                archive.extractall(path=self.root)

In [11]:
"""
This is an example using CLAP to perform zero-shot
    classification on ESC50 (https://github.com/karolpiczak/ESC-50).
"""

from msclap import CLAP
from esc50_dataset import ESC50
import torch.nn.functional as F
import numpy as np
from tqdm import tqdm
from sklearn.metrics import accuracy_score

# Dataset: `root_path` is the folder with ESC-50-master/
root_path = "root_path"
# If download=False code assumes base_folder='ESC-50-master' in esc50_dataset.py
dataset = ESC50(root=root_path, download=True)

# One text prompt per class name
prompt = 'this is the sound of '
y = [f"{prompt}{class_name}" for class_name in dataset.classes]

# Load and initialize CLAP (version '2023', no CUDA)
clap_model = CLAP(version='2023', use_cuda=False)

# Text embeddings are computed once, for all prompts together
text_embeddings = clap_model.get_text_embeddings(y)

# Audio embeddings are computed clip by clip and scored against every prompt
y_preds = []
y_labels = []
for i in tqdm(range(len(dataset))):
    x, _, one_hot_target = dataset[i]
    audio_embeddings = clap_model.get_audio_embeddings([x], resample=True)
    similarity = clap_model.compute_similarity(audio_embeddings, text_embeddings)
    # Softmax over the class axis turns the similarities into per-class scores
    y_pred = similarity.detach().cpu().softmax(dim=1).numpy()
    y_labels.append(one_hot_target.detach().cpu().numpy())
    y_preds.append(y_pred)



Loading audio files


2000it [00:00, 56551.40it/s]
100%|██████████| 2000/2000 [01:58<00:00, 16.92it/s]


In [14]:

# Stack the per-sample 2-D rows into (n_samples, n_classes) arrays.
# np.vstack returns the same result when this cell is re-run on the already
# stacked arrays, whereas np.concatenate(..., axis=0) would flatten them and
# break the argmax below.
y_labels, y_preds = np.vstack(y_labels), np.vstack(y_preds)

# Convert one-hot encoded labels and predictions to class indices
true_classes = np.argmax(y_labels, axis=1)
pred_classes = np.argmax(y_preds, axis=1)

# Compute metrics
class_ids = np.arange(len(dataset.classes))
acc = accuracy_score(true_classes, pred_classes)
precision = precision_score(true_classes, pred_classes, average='weighted')
recall = recall_score(true_classes, pred_classes, average='weighted')
# Pass the full label set so the matrix always has one row/column per class,
# matching the DataFrame index/columns built below even if a class never occurs.
conf_matrix = confusion_matrix(true_classes, pred_classes, labels=class_ids)

# Print results
print(f'ESC50 Accuracy: {acc:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')

# Optionally display the confusion matrix as a DataFrame for clarity
conf_matrix_df = pd.DataFrame(conf_matrix,
                              index=[f"True {cls}" for cls in class_ids],
                              columns=[f"Pred {cls}" for cls in class_ids])

print("\nConfusion Matrix:")
print(conf_matrix_df)

ESC50 Accuracy: 0.9385
Precision: 0.9520
Recall: 0.9385

Confusion Matrix:
         Pred 0  Pred 1  Pred 2  Pred 3  Pred 4  Pred 5  Pred 6  Pred 7  \
True 0       36       0       0       0       0       0       0       0   
True 1        0      40       0       0       0       0       0       0   
True 2        0       0      40       0       0       0       0       0   
True 3        0       0       0      40       0       0       0       0   
True 4        0       0       0       0      37       0       0       0   
True 5        0       0       0       0       0      39       0       0   
True 6        0       0       0       0       0       0      37       0   
True 7        0       0       0       0       0       0       0      40   
True 8        0       0       0       0       0       0       0       0   
True 9        0       0       0       0       0       0       0       0   
True 10       0       0       0       0       0       0       0       0   
True 11       0       0  