# Create Embedding Tables

This notebook is used to extract audio embeddings from soundly-80k and save them to disk.

In [3]:
import torch
import torchaudio
import os
import IPython.display as ipd
import pandas as pd
from collections import Counter
from experiments.dataset import DatasetUCS
from utils.seed_everything import seed_everything
from utils.set_torch_device import set_torch_device
import json
import librosa
from msclap import CLAP
import laion_clap
import numpy as np
from tqdm import tqdm

In [4]:
# Utils and Helpers
device, device_name = set_torch_device()
print(f"Torch device set to: {device_name}\n")

# Load dataset paths
with open ('experiments/configs/zero-shot.json', 'r') as f:
    settings = json.load (f)

data = pd.read_csv(settings['dataset_drive'] + settings['dataset_path'])
data = data[data['category'] != 'other'] # Filter out sounds without without a category.
data.reset_index(drop=True)
dataset_paths = DatasetUCS(data, settings, device=device, return_type="path")
print(f"Number of classes: {len(data['category'].unique())}")
print(f"Class names: {data['category'].unique()}")
#dataset_wavs = DatasetUCS(data, settings, device=device, return_type="waveform")

Torch device set to: NVIDIA GeForce RTX 3060

Number of classes: 81
Class names: ['ambience' 'objects' 'rain' 'water' 'doors' 'weather' 'vehicles' 'boats'
 'communications' 'footsteps' 'voices' 'musical' 'crowds' 'bells'
 'animals' 'tools' 'glass' 'human' 'movement' 'metal' 'fight' 'vegetation'
 'wood' 'plastic' 'trains' 'machines' 'alarms' 'designed' 'swooshes'
 'sports' 'guns' 'rubber' 'air' 'fire' 'bullets' 'cloth' 'drawers' 'birds'
 'games' 'farts' 'motors' 'food & drink' 'whistles' 'windows'
 'destruction' 'mechanical' 'electricity' 'creatures' 'aircraft' 'rocks'
 'liquid & mud' 'wings' 'weapons' 'equipment' 'gore' 'paper' 'dirt & sand'
 'user interface' 'computers' 'beeps' 'cartoon' 'fireworks' 'scifi'
 'clocks' 'natural disaster' 'explosions' 'toys' 'ceramics' 'leather'
 'snow' 'rope' 'chains' 'ice' 'chemicals' 'lasers' 'magic' 'geothermal'
 'robots' 'horns' 'archived' 'wind']


## Determinism Test

Some encoders provide deterministic embeddings while others do not. This section explores this to be able to devise a strategy for saving and evaluating the models from embeddings saved to disk.

In [6]:
# Select an arbitrary audio file path from the dataset
file_idx = 10
path = dataset_paths[file_idx][0] # Returns (path, label, class)
path = path.replace(path[:3], settings['dataset_drive']) # Change the dataset drive

# Load Models
laion_clap_model = laion_clap.CLAP_Module(enable_fusion=False)
laion_clap_model.load_ckpt() # Load default checkpoint: 630k-audioset-best.pt
msclap_model = CLAP(version = '2023', use_cuda=True) # version can be 2022 or 2023

# Test MSCLAP
label_embedding_msclap = msclap_model.get_text_embeddings(data['category'].unique().tolist())
print(f"MSCLAP Text Embeddings:\n{label_embedding_msclap}\n") # Deterministic
audio_embedding_msclap = msclap_model.get_audio_embeddings([path])
print(f"MSCLAP Audio Embeddings:\n{audio_embedding_msclap[:5]}\n") # Changes every execution

# Test LAION CLAP
label_embedding_msclap = laion_clap_model.get_text_embedding(data['category'].unique().tolist(), use_tensor=True).detach()
print(f"LAION CLAP Text Embeddings:\n{label_embedding_msclap}\n") # Deterministic
audio_embedding_laion = laion_clap_model.get_audio_embedding_from_filelist([path], use_tensor=True).detach()
print(f"LAION CLAP Audio Embeddings:\n{audio_embedding_laion[:5]}\n") # Changes every execution

Load our best checkpoint in the paper.
The checkpoint is already downloaded
Load Checkpoint...
logit_scale_a 	 Loaded
logit_scale_t 	 Loaded
audio_branch.spectrogram_extractor.stft.conv_real.weight 	 Loaded
audio_branch.spectrogram_extractor.stft.conv_imag.weight 	 Loaded
audio_branch.logmel_extractor.melW 	 Loaded
audio_branch.bn0.weight 	 Loaded
audio_branch.bn0.bias 	 Loaded
audio_branch.patch_embed.proj.weight 	 Loaded
audio_branch.patch_embed.proj.bias 	 Loaded
audio_branch.patch_embed.norm.weight 	 Loaded
audio_branch.patch_embed.norm.bias 	 Loaded
audio_branch.layers.0.blocks.0.norm1.weight 	 Loaded
audio_branch.layers.0.blocks.0.norm1.bias 	 Loaded
audio_branch.layers.0.blocks.0.attn.relative_position_bias_table 	 Loaded
audio_branch.layers.0.blocks.0.attn.qkv.weight 	 Loaded
audio_branch.layers.0.blocks.0.attn.qkv.bias 	 Loaded
audio_branch.layers.0.blocks.0.attn.proj.weight 	 Loaded
audio_branch.layers.0.blocks.0.attn.proj.bias 	 Loaded
audio_branch.layers.0.blocks.0.norm2.we

## Create k Folds

In [10]:
# Determine how many samples can fit in memory at once
from typing import Tuple

def memory_report(x:Tuple,device):
    """
    Args:
    - x: a tuple of objects to check the memory for.
    """

    obj_info = {}
    # Iterate over x
    for obj in x:
        # Check object types
        if (isinstance(obj, torch.Tensor)):
            if ('tensors' not in obj_info.keys() or isinstance(obj_info['tensors'], int) == False): 
                obj_info['tensors'] = 0
            mem = obj.element_size()
            for dim in obj.size():
                if(dim > 0):
                    mem *= dim

            obj_info['tensors'] += mem
        else:
            # Other objects that don't need special handling
            if ('other' not in obj_info.keys() or isinstance(obj_info['other'], int) == False): 
                    obj_info['other'] = 0

    # Sum all categories
    obj_info['bytes_total'] = 0
    for key in obj_info.keys():
        obj_info['bytes_total'] += obj_info[key]

    return obj_info

    total_mem = torch.cuda.get_device_properties(device).total_memory
    reserved_mem = torch.cuda.memory_reserved(device)
    allocated_mem = torch.cuda.memory_allocated(device)
    print(f"Total Memory: {total_mem}")
    print(f"Reserved Memory: {reserved_mem}")
    print(f"Allocated Memory: {allocated_mem}")
    print(f"Free Memory: {total_mem-reserved_mem-allocated_mem}")

tensor = torch.rand((1,81,512))
obj_info = memory_report((tensor, msclap_model), device)
print(obj_info)
print(obj_info['bytes_total'] * 1e-9)

{'tensors': 165888, 'other': 0, 'bytes_total': 331776}
0.00033177600000000004


# Extraction
Due to memory constraints that don't allow all embeddings to be extracted at once, a number of folds are created that samples at least once from each class. If no overlap between folds or duplicate samples is unwanted, the total number of folds will be equal to the class with the least amount of samples.

The 200-fold embeddings use seeds '1337' and '4550'

In [14]:
allow_overlap = False
# number of folds. If no overlap is allowed, the smallest class determines the max folds.
max_folds = data['category'].value_counts().min()
k_folds = 200 if (allow_overlap) else max_folds
# number of samples per category per fold
n_samples = 5 if (allow_overlap) else data['category'].value_counts().min() // k_folds
# n_samples depends on RAM and the model sizes.
#n_samples = 1
seed = 1337
folds = []
seed_everything(seed)
for k in range(k_folds):
    #group = data.groupby('category', group_keys=False).apply(lambda x: x.sample(n=5))
    group = data.groupby('category', group_keys=False)
    fold = group.sample(n=n_samples, replace=allow_overlap)
    # Disable between-fold contamination
    if (allow_overlap == False):
        data.drop(fold.index.to_list(), inplace=True)

    # Used for extracting embeddings
    folds.append(fold) 
    # Save folds to disk for later reference
    fold.to_csv(settings['dataset_drive'] + f"datasets/soundly/embeddings/zs_fold_{k+1}.csv",index=False)

In [15]:
# Save audio embedding folds to disk.
models = ['laion', 'msclap'] # Used to reuse the model variable to save memory
for model_name in tqdm(models):
    # Load models
    if model_name == 'msclap':
        print(f"Loading MSCLAP...")
        model = CLAP(version = '2023', use_cuda=True) # version can be 2022 or 2023
        print(f"MSCLAP loaded.")

    if model_name == 'laion': 
        print(f"Loading LAION CLAP...")
        model = laion_clap.CLAP_Module(enable_fusion=False)
        model.load_ckpt() # Load default checkpoint: 630k-audioset-best.pt
        print(f"LAION CLAP loaded.")

    for i, fold in tqdm(enumerate(folds)):
        paths = []
        for path in fold['path']:
            paths.append(path.replace(path[:3], settings['dataset_drive'])) # Change the dataset drive
        
        print("Computing audio embeddings...")

        if model_name == 'msclap':
            audio_embedding = torch.tensor(model.get_audio_embeddings(paths, resample=True)) # Remove batch 
            
        if model_name == 'laion': 
            audio_embedding = model.get_audio_embedding_from_filelist(paths, use_tensor=True)
        
        print(f"Extracted fold {i+1}. Saving to disk...")
        torch.save(audio_embedding, settings['dataset_drive'] + f"datasets/soundly/soundly_80k-audio_embeddings-{model_name}-fold_{i+1}.pt")
    

  0%|          | 0/2 [00:00<?, ?it/s]

Loading LAION CLAP...
Load our best checkpoint in the paper.
The checkpoint is already downloaded
Load Checkpoint...
logit_scale_a 	 Loaded
logit_scale_t 	 Loaded
audio_branch.spectrogram_extractor.stft.conv_real.weight 	 Loaded
audio_branch.spectrogram_extractor.stft.conv_imag.weight 	 Loaded
audio_branch.logmel_extractor.melW 	 Loaded
audio_branch.bn0.weight 	 Loaded
audio_branch.bn0.bias 	 Loaded
audio_branch.patch_embed.proj.weight 	 Loaded
audio_branch.patch_embed.proj.bias 	 Loaded
audio_branch.patch_embed.norm.weight 	 Loaded
audio_branch.patch_embed.norm.bias 	 Loaded
audio_branch.layers.0.blocks.0.norm1.weight 	 Loaded
audio_branch.layers.0.blocks.0.norm1.bias 	 Loaded
audio_branch.layers.0.blocks.0.attn.relative_position_bias_table 	 Loaded
audio_branch.layers.0.blocks.0.attn.qkv.weight 	 Loaded
audio_branch.layers.0.blocks.0.attn.qkv.bias 	 Loaded
audio_branch.layers.0.blocks.0.attn.proj.weight 	 Loaded
audio_branch.layers.0.blocks.0.attn.proj.bias 	 Loaded
audio_branch.laye



Computing audio embeddings...
Extracted fold 1. Saving to disk...




Computing audio embeddings...


1it [02:33, 153.29s/it]
  0%|          | 0/2 [03:28<?, ?it/s]


RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [3]:
class NpEncoder(json.JSONEncoder):
    """Source: https://stackoverflow.com/questions/50916422/python-typeerror-object-of-type-int64-is-not-json-serializable"""
    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super(NpEncoder, self).default(obj)

In [8]:
audio_embedding_msclap = msclap_model.get_audio_embeddings(paths[:50], resample=True)
torch_audio_embeddings_msclap = torch.tensor(np.array(audio_embedding_msclap.detach().cpu()))
torch.save(torch_audio_embeddings_msclap, "D:/datasets/soundly_80k-audio_embeddings-msclap.pt")

In [10]:
audio_embedding_msclap.shape

torch.Size([50, 1024])

In [42]:
text_data = ["", "steel"] # Bug in CLAP hook.py that has yet to be merged requires two or more text prompts.
text_data
text_embed = laion_clap_model.get_text_embedding(text_data, use_tensor=False)[1]
print(text_embed.shape)

(512,)


In [47]:
cos = torch.nn.CosineSimilarity(dim=1)
sim = cos(torch.tensor(audio_embed), torch.tensor(text_embed))
print(sim)

tensor([-0.0532])


In [3]:
audioModel = ClapAudioModelWithProjection.from_pretrained("laion/clap-htsat-unfused")
textModel = ClapTextModelWithProjection.from_pretrained("laion/clap-htsat-unfused")
tokenizer = AutoTokenizer.from_pretrained("laion/clap-htsat-unfused")

In [4]:
inputs = tokenizer(["steel"], padding=True, return_tensors="pt")

outputs = textModel(**inputs)
text_embeds = outputs.text_embeds

In [26]:
audio_data = torch.tensor(audio_data).unsqueeze(0)

In [24]:
def explore_dataset_ucs(data_path:str='E:/Soundly Library juni 2023/Soundly Pro/', analyze_sample_rate=False):
    """
    Iterates over a UCS dataset.
    """
    files_count_dict = {}
    samples_rates_dict = {}
    channels_dict = {}
    bit_rates_dict = {}
    encodings_dict = {}
    for i, (root, dirnames, filenames) in enumerate(os.walk(data_path, topdown=True)):
        
        if(i == 0):
            all_categories = dirnames

        root = root.replace('\\', '/')
        split_root = root.split('/')

        for category in all_categories:
            if category in split_root:
                if (category in files_count_dict.keys()):
                    files_count_dict[category] += len(filenames)
                else: 
                    files_count_dict[category] = len(filenames)

                if (len(filenames) != 0 and analyze_sample_rate == True):
                    for files in filenames:
                        try:
                            metadata = torchaudio.info(root+'/'+files)
                            if (metadata.sample_rate in samples_rates_dict.keys()):
                                samples_rates_dict[metadata.sample_rate] += 1
                                channels_dict[metadata.num_channels] += 1
                                bit_rates_dict[metadata.bits_per_sample] += 1
                                encodings_dict[metadata.encoding] += 1
                            else:
                                samples_rates_dict[metadata.sample_rate] = 1
                                channels_dict[metadata.num_channels] = 1
                                bit_rates_dict[metadata.bits_per_sample] = 1
                                encodings_dict[metadata.encoding] = 1
                        except:
                            print(f"File {files} could not be opened.")
                        

    return files_count_dict, samples_rates_dict, channels_dict, bit_rates_dict, encodings_dict

data, rates, channels, bit_rates, encodings = explore_dataset_ucs()

File Ambience, Tropical, Amazonas, Night, Crickets, Frogs, Gran Sabana, Venezuela.wav could not be opened.
File .DS_Store could not be opened.
File ._.DS_Store could not be opened.
File .DS_Store could not be opened.
File ._.DS_Store could not be opened.


In [3]:
df = pd.read_csv("D:/datasets/soundly.csv")
df['path'].iloc[0]

In [None]:
# Get audio embeddings from audio data
audio_data, _ = librosa.load('/home/data/test_clap_short.wav', sr=48000) # sample rate should be 48000
audio_data = audio_data.reshape(1, -1) # Make it (1,T) or (N,T)
audio_embed = model.get_audio_embedding_from_data(x = audio_data, use_tensor=False)

# Get text embedings from texts:
text_data = ["Dark metallic"] 
text_embed = model.get_text_embedding(text_data)
print(text_embed)
print(text_embed.shape)

In [2]:
# Load the Soundly Library, to be saved as a .csv file.

df = pd.DataFrame()

audio_path ='E:/Soundly Library juni 2023/Soundly Pro' 
#audio_path = 'E:/Soundly Library juni 2023/Soundly Pro/Bells/Handbell'

audio_files = []
metadata = {
    'path' : [],
    'channels': [],
    'samples' : [],
    'sample_rate' : [],
    'bit_depth' : [],
    'category' : [],
    'sub_category' : []
}
for path, dirs, files in os.walk(audio_path):
    if ( False ):
        if(len(dirs) != 0):
            print(dirs)
        if(len(files) != 0):
            print(files)

    for file in files:
        try:
            file_path = path + "/" + file
            file_path = file_path.replace("\\", "/")
            path_split = file_path.split("/")[3:] # Anything before element 4 is static, the base of the file name.

            # Default categories, given if no other category info is found.
            category = 'other'
            sub_category = 'none'

            if (len(path_split) == 3):
                category = path_split[0]
                sub_category = path_split[1]

            data = torchaudio.info(file_path)
            
            metadata['path'].append(file_path)
            metadata['channels'].append(data.num_channels)
            metadata['samples'].append(data.num_frames)
            metadata['sample_rate'].append(data.sample_rate)
            metadata['bit_depth'].append(data.bits_per_sample)
            metadata['category'].append(category.lower())
            metadata['sub_category'].append(sub_category.lower())
        except:
            print(f"File '{file}' could not be opened.")
        