# Create Embedding Tables

This notebook is used to extract audio embeddings from soundly-80k and save them to disk.

In [1]:
import torch
import torchaudio
import os
import pandas as pd
from experiments.utils.dataset import DatasetUCS
from experiments.utils.seed_everything import seed_everything
from experiments.utils.set_torch_device import set_torch_device
import json
from msclap import CLAP
import laion_clap
import numpy as np
from tqdm import tqdm
import time

In [2]:
# Load Models
#laion_clap_model = laion_clap.CLAP_Module(enable_fusion=False)
#laion_clap_model.load_ckpt() # Load default checkpoint: 630k-audioset-best.pt
msclap_model = CLAP(version = '2023', use_cuda=True) # version can be 2022 or 2023

In [3]:
# Utils and Helpers
device, device_name = set_torch_device()
print(f"Torch device set to: {device_name}\n")

# Load dataset paths
with open ('experiments/configs/kfold17-zero_shot.json', 'r') as f:
    settings = json.load (f)

data = pd.read_csv(settings['dataset_drive'] + settings['dataset_path'])
data = data[data['category'] != 'other'] # Filter out sounds without without a category.
data.reset_index(drop=True)
dataset_paths = DatasetUCS(data, settings, device=device, return_type="path")
print(f"Number of classes: {len(data['category'].unique())}")
print(f"Class names: {data['category'].unique()}")
#dataset_wavs = DatasetUCS(data, settings, device=device, return_type="waveform")

Torch device set to: NVIDIA GeForce RTX 3060

Number of classes: 81
Class names: ['ambience' 'objects' 'rain' 'water' 'doors' 'weather' 'vehicles' 'boats'
 'communications' 'footsteps' 'voices' 'musical' 'crowds' 'bells'
 'animals' 'tools' 'glass' 'human' 'movement' 'metal' 'fight' 'vegetation'
 'wood' 'plastic' 'trains' 'machines' 'alarms' 'designed' 'swooshes'
 'sports' 'guns' 'rubber' 'air' 'fire' 'bullets' 'cloth' 'drawers' 'birds'
 'games' 'farts' 'motors' 'food & drink' 'whistles' 'windows'
 'destruction' 'mechanical' 'electricity' 'creatures' 'aircraft' 'rocks'
 'liquid & mud' 'wings' 'weapons' 'equipment' 'gore' 'paper' 'dirt & sand'
 'user interface' 'computers' 'beeps' 'cartoon' 'fireworks' 'scifi'
 'clocks' 'natural disaster' 'explosions' 'toys' 'ceramics' 'leather'
 'snow' 'rope' 'chains' 'ice' 'chemicals' 'lasers' 'magic' 'geothermal'
 'robots' 'horns' 'archived' 'wind']


## Determinism Test

Some encoders provide deterministic embeddings while others do not. This section explores this to be able to devise a strategy for saving and evaluating the models from embeddings saved to disk.

In [7]:
# Select an arbitrary audio file path from the dataset
file_idx = 10
path = dataset_paths[file_idx][0] # Returns (path, label, class)
path = path.replace(path[:3], settings['dataset_drive']) # Change the dataset drive

# Test MSCLAP
label_embedding_msclap = msclap_model.get_text_embeddings(data['category'].unique().tolist())
print(f"MSCLAP Text Embeddings:\n{label_embedding_msclap}\n") # Deterministic
audio_embedding_msclap = msclap_model.get_audio_embeddings([path])
print(f"MSCLAP Audio Embeddings:\n{audio_embedding_msclap[:5]}\n") # Changes every execution

# Test LAION CLAP
label_embedding_laion = laion_clap_model.get_text_embedding(data['category'].unique().tolist(), use_tensor=True).detach()
print(f"LAION CLAP Text Embeddings:\n{label_embedding_laion}\n") # Deterministic
audio_embedding_laion = laion_clap_model.get_audio_embedding_from_filelist([path], use_tensor=True).detach()
print(f"LAION CLAP Audio Embeddings:\n{audio_embedding_laion[:5]}\n") # Changes every execution

Load our best checkpoint in the paper.
The checkpoint is already downloaded
Load Checkpoint...
logit_scale_a 	 Loaded
logit_scale_t 	 Loaded
audio_branch.spectrogram_extractor.stft.conv_real.weight 	 Loaded
audio_branch.spectrogram_extractor.stft.conv_imag.weight 	 Loaded
audio_branch.logmel_extractor.melW 	 Loaded
audio_branch.bn0.weight 	 Loaded
audio_branch.bn0.bias 	 Loaded
audio_branch.patch_embed.proj.weight 	 Loaded
audio_branch.patch_embed.proj.bias 	 Loaded
audio_branch.patch_embed.norm.weight 	 Loaded
audio_branch.patch_embed.norm.bias 	 Loaded
audio_branch.layers.0.blocks.0.norm1.weight 	 Loaded
audio_branch.layers.0.blocks.0.norm1.bias 	 Loaded
audio_branch.layers.0.blocks.0.attn.relative_position_bias_table 	 Loaded
audio_branch.layers.0.blocks.0.attn.qkv.weight 	 Loaded
audio_branch.layers.0.blocks.0.attn.qkv.bias 	 Loaded
audio_branch.layers.0.blocks.0.attn.proj.weight 	 Loaded
audio_branch.layers.0.blocks.0.attn.proj.bias 	 Loaded
audio_branch.layers.0.blocks.0.norm2.we

## Create k Folds

In [10]:
# Determine how many samples can fit in memory at once
from typing import Tuple

def memory_report(x:Tuple,device):
    """
    Args:
    - x: a tuple of objects to check the memory for.
    """

    obj_info = {}
    # Iterate over x
    for obj in x:
        # Check object types
        if (isinstance(obj, torch.Tensor)):
            if ('tensors' not in obj_info.keys() or isinstance(obj_info['tensors'], int) == False): 
                obj_info['tensors'] = 0
            mem = obj.element_size()
            for dim in obj.size():
                if(dim > 0):
                    mem *= dim

            obj_info['tensors'] += mem
        else:
            # Other objects that don't need special handling
            if ('other' not in obj_info.keys() or isinstance(obj_info['other'], int) == False): 
                    obj_info['other'] = 0

    # Sum all categories
    obj_info['bytes_total'] = 0
    for key in obj_info.keys():
        obj_info['bytes_total'] += obj_info[key]

    return obj_info

    total_mem = torch.cuda.get_device_properties(device).total_memory
    reserved_mem = torch.cuda.memory_reserved(device)
    allocated_mem = torch.cuda.memory_allocated(device)
    print(f"Total Memory: {total_mem}")
    print(f"Reserved Memory: {reserved_mem}")
    print(f"Allocated Memory: {allocated_mem}")
    print(f"Free Memory: {total_mem-reserved_mem-allocated_mem}")

tensor = torch.rand((1,81,512))
obj_info = memory_report((tensor, msclap_model), device)
print(obj_info)
print(obj_info['bytes_total'] * 1e-9)

{'tensors': 165888, 'other': 0, 'bytes_total': 331776}
0.00033177600000000004


# Extraction
Due to memory constraints that don't allow all embeddings to be extracted at once, a number of folds are created that samples at least once from each class. If no overlap between folds or duplicate samples is unwanted, the total number of folds will be equal to the class with the least amount of samples. Folds are saved to disk so the true class for each embedding can be retrieved.

The 17-fold embeddings use seed '1337'
The 200-fold embeddings use no seed, as it will make every fold the same.

In [3]:
allow_overlap = True
# number of folds. If no overlap is allowed, the smallest class determines the max folds.
max_folds = data['category'].value_counts().min()
if (allow_overlap):
    k_folds = 200
    n_samples = 1   # number of samples per category per fold
    seed = None
else:
    k_folds = max_folds
    n_samples = 1
    seed = 1337
    seed_everything(seed)

folds = []
for k in range(k_folds):
    group = data.groupby('category', group_keys=False)
    fold = group.sample(n=n_samples, replace=allow_overlap, random_state=seed) # Remove seed when replace = True
    # Disable between-fold contamination
    if (allow_overlap == False):
        data.drop(fold.index.to_list(), inplace=True)

    # Used for extracting embeddings
    folds.append(fold) 
    # Save folds to disk for later reference
    fold.to_csv(settings['dataset_drive'] + f"datasets/soundly/embeddings/zs_fold_with_overlap_{k+1}.csv",index=False)

In [4]:
experiment_name = "kfold200"
# Save audio embedding folds to disk.
models = ['laion', 'msclap'] # Used to reuse the model variable to save memory
for model_name in tqdm(models, total=len(models)):
    # Load models
    if model_name == 'msclap':
        print(f"Loading MSCLAP...")
        model = CLAP(version = '2023', use_cuda=True) # version can be 2022 or 2023
        print(f"MSCLAP loaded.")

    if model_name == 'laion': 
        print(f"Loading LAION CLAP...")
        model = laion_clap.CLAP_Module(enable_fusion=False)
        model.load_ckpt() # Load default checkpoint: 630k-audioset-best.pt
        model.to(device)
        print(f"LAION CLAP loaded.")

    for i, fold in tqdm(enumerate(folds), leave=False, total=len(folds)):
        paths = []
        for path in fold['path']:
            paths.append(path.replace(path[:3], settings['dataset_drive'])) # Change the dataset drive
        
        print("Computing audio embeddings...")

        if model_name == 'msclap':
            audio_embedding = torch.tensor(model.get_audio_embeddings(paths, resample=True))
            
        if model_name == 'laion': 
            audio_embedding = model.get_audio_embedding_from_filelist(paths, use_tensor=True)
        
        print(f"Extracted fold {i+1}. Saving to disk...")
        torch.save(audio_embedding, settings['dataset_drive'] + f"datasets/soundly/soundly_80k-audio_embeddings-{experiment_name}-{model_name}-fold_{i+1}.pt")
    

  0%|          | 0/2 [00:00<?, ?it/s]

Loading LAION CLAP...
Load our best checkpoint in the paper.
The checkpoint is already downloaded
Load Checkpoint...
logit_scale_a 	 Loaded
logit_scale_t 	 Loaded
audio_branch.spectrogram_extractor.stft.conv_real.weight 	 Loaded
audio_branch.spectrogram_extractor.stft.conv_imag.weight 	 Loaded
audio_branch.logmel_extractor.melW 	 Loaded
audio_branch.bn0.weight 	 Loaded
audio_branch.bn0.bias 	 Loaded
audio_branch.patch_embed.proj.weight 	 Loaded
audio_branch.patch_embed.proj.bias 	 Loaded
audio_branch.patch_embed.norm.weight 	 Loaded
audio_branch.patch_embed.norm.bias 	 Loaded
audio_branch.layers.0.blocks.0.norm1.weight 	 Loaded
audio_branch.layers.0.blocks.0.norm1.bias 	 Loaded
audio_branch.layers.0.blocks.0.attn.relative_position_bias_table 	 Loaded
audio_branch.layers.0.blocks.0.attn.qkv.weight 	 Loaded
audio_branch.layers.0.blocks.0.attn.qkv.bias 	 Loaded
audio_branch.layers.0.blocks.0.attn.proj.weight 	 Loaded
audio_branch.layers.0.blocks.0.attn.proj.bias 	 Loaded
audio_branch.laye



Computing audio embeddings...




Extracted fold 1. Saving to disk...
Computing audio embeddings...




Extracted fold 2. Saving to disk...
Computing audio embeddings...




Extracted fold 3. Saving to disk...
Computing audio embeddings...




Extracted fold 4. Saving to disk...
Computing audio embeddings...




Extracted fold 5. Saving to disk...
Computing audio embeddings...




Extracted fold 6. Saving to disk...
Computing audio embeddings...




Extracted fold 7. Saving to disk...
Computing audio embeddings...




Extracted fold 8. Saving to disk...
Computing audio embeddings...




Extracted fold 9. Saving to disk...
Computing audio embeddings...




Extracted fold 10. Saving to disk...
Computing audio embeddings...




Extracted fold 11. Saving to disk...
Computing audio embeddings...




Extracted fold 12. Saving to disk...
Computing audio embeddings...




Extracted fold 13. Saving to disk...
Computing audio embeddings...




Extracted fold 14. Saving to disk...
Computing audio embeddings...
Extracted fold 15. Saving to disk...




Computing audio embeddings...




Extracted fold 16. Saving to disk...
Computing audio embeddings...




Extracted fold 17. Saving to disk...
Computing audio embeddings...




Extracted fold 18. Saving to disk...
Computing audio embeddings...





Extracted fold 19. Saving to disk...


 10%|▉         | 19/200 [15:52<2:20:28, 46.57s/it][A

Computing audio embeddings...




Extracted fold 20. Saving to disk...
Computing audio embeddings...




Extracted fold 21. Saving to disk...
Computing audio embeddings...




Extracted fold 22. Saving to disk...
Computing audio embeddings...




Extracted fold 23. Saving to disk...
Computing audio embeddings...




Extracted fold 24. Saving to disk...
Computing audio embeddings...
Extracted fold 25. Saving to disk...




Computing audio embeddings...




Extracted fold 26. Saving to disk...
Computing audio embeddings...




Extracted fold 27. Saving to disk...
Computing audio embeddings...




Extracted fold 28. Saving to disk...
Computing audio embeddings...
Extracted fold 29. Saving to disk...




Computing audio embeddings...




Extracted fold 30. Saving to disk...
Computing audio embeddings...




Extracted fold 31. Saving to disk...
Computing audio embeddings...




Extracted fold 32. Saving to disk...
Computing audio embeddings...




Extracted fold 33. Saving to disk...
Computing audio embeddings...




Extracted fold 34. Saving to disk...
Computing audio embeddings...




Extracted fold 35. Saving to disk...
Computing audio embeddings...




Extracted fold 36. Saving to disk...
Computing audio embeddings...
Extracted fold 37. Saving to disk...




Computing audio embeddings...




Extracted fold 38. Saving to disk...
Computing audio embeddings...




Extracted fold 39. Saving to disk...
Computing audio embeddings...




Extracted fold 40. Saving to disk...
Computing audio embeddings...




Extracted fold 41. Saving to disk...
Computing audio embeddings...




Extracted fold 42. Saving to disk...
Computing audio embeddings...




Extracted fold 43. Saving to disk...
Computing audio embeddings...




Extracted fold 44. Saving to disk...
Computing audio embeddings...




Extracted fold 45. Saving to disk...
Computing audio embeddings...




Extracted fold 46. Saving to disk...
Computing audio embeddings...




Extracted fold 47. Saving to disk...
Computing audio embeddings...




Extracted fold 48. Saving to disk...
Computing audio embeddings...




Extracted fold 49. Saving to disk...
Computing audio embeddings...




Extracted fold 50. Saving to disk...
Computing audio embeddings...




Extracted fold 51. Saving to disk...
Computing audio embeddings...




Extracted fold 52. Saving to disk...
Computing audio embeddings...




Extracted fold 53. Saving to disk...
Computing audio embeddings...




Extracted fold 54. Saving to disk...
Computing audio embeddings...




Extracted fold 55. Saving to disk...
Computing audio embeddings...




Extracted fold 56. Saving to disk...
Computing audio embeddings...




Extracted fold 57. Saving to disk...
Computing audio embeddings...
Extracted fold 58. Saving to disk...




Computing audio embeddings...
Extracted fold 59. Saving to disk...




Computing audio embeddings...




Extracted fold 60. Saving to disk...
Computing audio embeddings...





Extracted fold 61. Saving to disk...


 30%|███       | 61/200 [50:21<1:50:11, 47.57s/it][A

Computing audio embeddings...




Extracted fold 62. Saving to disk...
Computing audio embeddings...




Extracted fold 63. Saving to disk...
Computing audio embeddings...




Extracted fold 64. Saving to disk...
Computing audio embeddings...




Extracted fold 65. Saving to disk...
Computing audio embeddings...




Extracted fold 66. Saving to disk...
Computing audio embeddings...




Extracted fold 67. Saving to disk...
Computing audio embeddings...




Extracted fold 68. Saving to disk...
Computing audio embeddings...




Extracted fold 69. Saving to disk...
Computing audio embeddings...




Extracted fold 70. Saving to disk...
Computing audio embeddings...




Extracted fold 71. Saving to disk...
Computing audio embeddings...




Extracted fold 72. Saving to disk...
Computing audio embeddings...




Extracted fold 73. Saving to disk...
Computing audio embeddings...




Extracted fold 74. Saving to disk...
Computing audio embeddings...




Extracted fold 75. Saving to disk...
Computing audio embeddings...




Extracted fold 76. Saving to disk...
Computing audio embeddings...




Extracted fold 77. Saving to disk...
Computing audio embeddings...




Extracted fold 78. Saving to disk...
Computing audio embeddings...
Extracted fold 79. Saving to disk...




Computing audio embeddings...




Extracted fold 80. Saving to disk...
Computing audio embeddings...




Extracted fold 81. Saving to disk...
Computing audio embeddings...




Extracted fold 82. Saving to disk...
Computing audio embeddings...
Extracted fold 83. Saving to disk...




Computing audio embeddings...




Extracted fold 84. Saving to disk...
Computing audio embeddings...




Extracted fold 85. Saving to disk...
Computing audio embeddings...




Extracted fold 86. Saving to disk...
Computing audio embeddings...




Extracted fold 87. Saving to disk...
Computing audio embeddings...




Extracted fold 88. Saving to disk...
Computing audio embeddings...




Extracted fold 89. Saving to disk...
Computing audio embeddings...
Extracted fold 90. Saving to disk...




Computing audio embeddings...




Extracted fold 91. Saving to disk...
Computing audio embeddings...




Extracted fold 92. Saving to disk...
Computing audio embeddings...




Extracted fold 93. Saving to disk...
Computing audio embeddings...




Extracted fold 94. Saving to disk...
Computing audio embeddings...




Extracted fold 95. Saving to disk...
Computing audio embeddings...




Extracted fold 96. Saving to disk...
Computing audio embeddings...




Extracted fold 97. Saving to disk...
Computing audio embeddings...




Extracted fold 98. Saving to disk...
Computing audio embeddings...




Extracted fold 99. Saving to disk...
Computing audio embeddings...





Extracted fold 100. Saving to disk...


 50%|█████     | 100/200 [1:24:15<1:25:15, 51.15s/it][A

Computing audio embeddings...
Extracted fold 101. Saving to disk...




Computing audio embeddings...




Extracted fold 102. Saving to disk...
Computing audio embeddings...
Extracted fold 103. Saving to disk...




Computing audio embeddings...




Extracted fold 104. Saving to disk...
Computing audio embeddings...




Extracted fold 105. Saving to disk...
Computing audio embeddings...




Extracted fold 106. Saving to disk...
Computing audio embeddings...




Extracted fold 107. Saving to disk...
Computing audio embeddings...




Extracted fold 108. Saving to disk...
Computing audio embeddings...




Extracted fold 109. Saving to disk...
Computing audio embeddings...




Extracted fold 110. Saving to disk...
Computing audio embeddings...




Extracted fold 111. Saving to disk...
Computing audio embeddings...




Extracted fold 112. Saving to disk...
Computing audio embeddings...




Extracted fold 113. Saving to disk...
Computing audio embeddings...




Extracted fold 114. Saving to disk...
Computing audio embeddings...




Extracted fold 115. Saving to disk...
Computing audio embeddings...




Extracted fold 116. Saving to disk...
Computing audio embeddings...




Extracted fold 117. Saving to disk...
Computing audio embeddings...




Extracted fold 118. Saving to disk...
Computing audio embeddings...




Extracted fold 119. Saving to disk...
Computing audio embeddings...




Extracted fold 120. Saving to disk...
Computing audio embeddings...




Extracted fold 121. Saving to disk...
Computing audio embeddings...
Extracted fold 122. Saving to disk...




Computing audio embeddings...




Extracted fold 123. Saving to disk...
Computing audio embeddings...




Extracted fold 124. Saving to disk...
Computing audio embeddings...




Extracted fold 125. Saving to disk...
Computing audio embeddings...




Extracted fold 126. Saving to disk...
Computing audio embeddings...




Extracted fold 127. Saving to disk...
Computing audio embeddings...




Extracted fold 128. Saving to disk...
Computing audio embeddings...




Extracted fold 129. Saving to disk...
Computing audio embeddings...




Extracted fold 130. Saving to disk...
Computing audio embeddings...




Extracted fold 131. Saving to disk...
Computing audio embeddings...




Extracted fold 132. Saving to disk...
Computing audio embeddings...




Extracted fold 133. Saving to disk...
Computing audio embeddings...




Extracted fold 134. Saving to disk...
Computing audio embeddings...




Extracted fold 135. Saving to disk...
Computing audio embeddings...




Extracted fold 136. Saving to disk...
Computing audio embeddings...




Extracted fold 137. Saving to disk...
Computing audio embeddings...




Extracted fold 138. Saving to disk...
Computing audio embeddings...




Extracted fold 139. Saving to disk...
Computing audio embeddings...




Extracted fold 140. Saving to disk...
Computing audio embeddings...




Extracted fold 141. Saving to disk...
Computing audio embeddings...




Extracted fold 142. Saving to disk...
Computing audio embeddings...
Extracted fold 143. Saving to disk...




Computing audio embeddings...




Extracted fold 144. Saving to disk...
Computing audio embeddings...




Extracted fold 145. Saving to disk...
Computing audio embeddings...




Extracted fold 146. Saving to disk...
Computing audio embeddings...




Extracted fold 147. Saving to disk...
Computing audio embeddings...




Extracted fold 148. Saving to disk...
Computing audio embeddings...




Extracted fold 149. Saving to disk...
Computing audio embeddings...




Extracted fold 150. Saving to disk...
Computing audio embeddings...




Extracted fold 151. Saving to disk...
Computing audio embeddings...
Extracted fold 152. Saving to disk...




Computing audio embeddings...




Extracted fold 153. Saving to disk...
Computing audio embeddings...



 77%|███████▋  | 154/200 [2:08:18<37:31, 48.95s/it]

Extracted fold 154. Saving to disk...


[A

Computing audio embeddings...




Extracted fold 155. Saving to disk...
Computing audio embeddings...




Extracted fold 156. Saving to disk...
Computing audio embeddings...
Extracted fold 157. Saving to disk...




Computing audio embeddings...




Extracted fold 158. Saving to disk...
Computing audio embeddings...
Extracted fold 159. Saving to disk...




Computing audio embeddings...




Extracted fold 160. Saving to disk...
Computing audio embeddings...




Extracted fold 161. Saving to disk...
Computing audio embeddings...




Extracted fold 162. Saving to disk...
Computing audio embeddings...




Extracted fold 163. Saving to disk...
Computing audio embeddings...




Extracted fold 164. Saving to disk...
Computing audio embeddings...
Extracted fold 165. Saving to disk...




Computing audio embeddings...




Extracted fold 166. Saving to disk...
Computing audio embeddings...




Extracted fold 167. Saving to disk...
Computing audio embeddings...




Extracted fold 168. Saving to disk...
Computing audio embeddings...




Extracted fold 169. Saving to disk...
Computing audio embeddings...




Extracted fold 170. Saving to disk...
Computing audio embeddings...




Extracted fold 171. Saving to disk...
Computing audio embeddings...




Extracted fold 172. Saving to disk...
Computing audio embeddings...




Extracted fold 173. Saving to disk...
Computing audio embeddings...
Extracted fold 174. Saving to disk...




Computing audio embeddings...
Extracted fold 175. Saving to disk...




Computing audio embeddings...
Extracted fold 176. Saving to disk...




Computing audio embeddings...





Extracted fold 177. Saving to disk...


 88%|████████▊ | 177/200 [2:28:03<19:43, 51.48s/it][A

Computing audio embeddings...




Extracted fold 178. Saving to disk...
Computing audio embeddings...




Extracted fold 179. Saving to disk...
Computing audio embeddings...




Extracted fold 180. Saving to disk...
Computing audio embeddings...




Extracted fold 181. Saving to disk...
Computing audio embeddings...




Extracted fold 182. Saving to disk...
Computing audio embeddings...
Extracted fold 183. Saving to disk...




Computing audio embeddings...




Extracted fold 184. Saving to disk...
Computing audio embeddings...





Extracted fold 185. Saving to disk...


 92%|█████████▎| 185/200 [2:34:21<11:22, 45.52s/it][A

Computing audio embeddings...
Extracted fold 186. Saving to disk...




Computing audio embeddings...




Extracted fold 187. Saving to disk...
Computing audio embeddings...




Extracted fold 188. Saving to disk...
Computing audio embeddings...




Extracted fold 189. Saving to disk...
Computing audio embeddings...




Extracted fold 190. Saving to disk...
Computing audio embeddings...




Extracted fold 191. Saving to disk...
Computing audio embeddings...




Extracted fold 192. Saving to disk...
Computing audio embeddings...




Extracted fold 193. Saving to disk...
Computing audio embeddings...




Extracted fold 194. Saving to disk...
Computing audio embeddings...




Extracted fold 195. Saving to disk...
Computing audio embeddings...




Extracted fold 196. Saving to disk...
Computing audio embeddings...




Extracted fold 197. Saving to disk...
Computing audio embeddings...




Extracted fold 198. Saving to disk...
Computing audio embeddings...




Extracted fold 199. Saving to disk...
Computing audio embeddings...


 50%|█████     | 1/2 [2:46:32<2:46:32, 9992.87s/it]

Extracted fold 200. Saving to disk...
Loading MSCLAP...
MSCLAP loaded.




Computing audio embeddings...




Extracted fold 1. Saving to disk...
Computing audio embeddings...




Extracted fold 2. Saving to disk...
Computing audio embeddings...




Extracted fold 3. Saving to disk...
Computing audio embeddings...




Extracted fold 4. Saving to disk...
Computing audio embeddings...




Extracted fold 5. Saving to disk...
Computing audio embeddings...




Extracted fold 6. Saving to disk...
Computing audio embeddings...
Extracted fold 7. Saving to disk...




Computing audio embeddings...




Extracted fold 8. Saving to disk...
Computing audio embeddings...




Extracted fold 9. Saving to disk...
Computing audio embeddings...




Extracted fold 10. Saving to disk...
Computing audio embeddings...




Extracted fold 11. Saving to disk...
Computing audio embeddings...




Extracted fold 12. Saving to disk...
Computing audio embeddings...




Extracted fold 13. Saving to disk...
Computing audio embeddings...




Extracted fold 14. Saving to disk...
Computing audio embeddings...
Extracted fold 15. Saving to disk...




Computing audio embeddings...




Extracted fold 16. Saving to disk...
Computing audio embeddings...




Extracted fold 17. Saving to disk...
Computing audio embeddings...




Extracted fold 18. Saving to disk...
Computing audio embeddings...




Extracted fold 19. Saving to disk...
Computing audio embeddings...




Extracted fold 20. Saving to disk...
Computing audio embeddings...




Extracted fold 21. Saving to disk...
Computing audio embeddings...
Extracted fold 22. Saving to disk...




Computing audio embeddings...




Extracted fold 23. Saving to disk...
Computing audio embeddings...




Extracted fold 24. Saving to disk...
Computing audio embeddings...




Extracted fold 25. Saving to disk...
Computing audio embeddings...




Extracted fold 26. Saving to disk...
Computing audio embeddings...




Extracted fold 27. Saving to disk...
Computing audio embeddings...
Extracted fold 28. Saving to disk...




Computing audio embeddings...
Extracted fold 29. Saving to disk...




Computing audio embeddings...




Extracted fold 30. Saving to disk...
Computing audio embeddings...




Extracted fold 31. Saving to disk...
Computing audio embeddings...




Extracted fold 32. Saving to disk...
Computing audio embeddings...




Extracted fold 33. Saving to disk...
Computing audio embeddings...




Extracted fold 34. Saving to disk...
Computing audio embeddings...




Extracted fold 35. Saving to disk...
Computing audio embeddings...




Extracted fold 36. Saving to disk...
Computing audio embeddings...




Extracted fold 37. Saving to disk...
Computing audio embeddings...




Extracted fold 38. Saving to disk...
Computing audio embeddings...




Extracted fold 39. Saving to disk...
Computing audio embeddings...




Extracted fold 40. Saving to disk...
Computing audio embeddings...




Extracted fold 41. Saving to disk...
Computing audio embeddings...




Extracted fold 42. Saving to disk...
Computing audio embeddings...




Extracted fold 43. Saving to disk...
Computing audio embeddings...




Extracted fold 44. Saving to disk...
Computing audio embeddings...




Extracted fold 45. Saving to disk...
Computing audio embeddings...




Extracted fold 46. Saving to disk...
Computing audio embeddings...




Extracted fold 47. Saving to disk...
Computing audio embeddings...




Extracted fold 48. Saving to disk...
Computing audio embeddings...




Extracted fold 49. Saving to disk...
Computing audio embeddings...
Extracted fold 50. Saving to disk...




Computing audio embeddings...




Extracted fold 51. Saving to disk...
Computing audio embeddings...




Extracted fold 52. Saving to disk...
Computing audio embeddings...




Extracted fold 53. Saving to disk...
Computing audio embeddings...




Extracted fold 54. Saving to disk...
Computing audio embeddings...
Extracted fold 55. Saving to disk...




Computing audio embeddings...




Extracted fold 56. Saving to disk...
Computing audio embeddings...




Extracted fold 57. Saving to disk...
Computing audio embeddings...
Extracted fold 58. Saving to disk...




Computing audio embeddings...




Extracted fold 59. Saving to disk...
Computing audio embeddings...




Extracted fold 60. Saving to disk...
Computing audio embeddings...




Extracted fold 61. Saving to disk...
Computing audio embeddings...




Extracted fold 62. Saving to disk...
Computing audio embeddings...




Extracted fold 63. Saving to disk...
Computing audio embeddings...




Extracted fold 64. Saving to disk...
Computing audio embeddings...




Extracted fold 65. Saving to disk...
Computing audio embeddings...




Extracted fold 66. Saving to disk...
Computing audio embeddings...




Extracted fold 67. Saving to disk...
Computing audio embeddings...




Extracted fold 68. Saving to disk...
Computing audio embeddings...




Extracted fold 69. Saving to disk...
Computing audio embeddings...




Extracted fold 70. Saving to disk...
Computing audio embeddings...




Extracted fold 71. Saving to disk...
Computing audio embeddings...




Extracted fold 72. Saving to disk...
Computing audio embeddings...




Extracted fold 73. Saving to disk...
Computing audio embeddings...




Extracted fold 74. Saving to disk...
Computing audio embeddings...




Extracted fold 75. Saving to disk...
Computing audio embeddings...




Extracted fold 76. Saving to disk...
Computing audio embeddings...




Extracted fold 77. Saving to disk...
Computing audio embeddings...




Extracted fold 78. Saving to disk...
Computing audio embeddings...




Extracted fold 79. Saving to disk...
Computing audio embeddings...




Extracted fold 80. Saving to disk...
Computing audio embeddings...




Extracted fold 81. Saving to disk...
Computing audio embeddings...




Extracted fold 82. Saving to disk...
Computing audio embeddings...




Extracted fold 83. Saving to disk...
Computing audio embeddings...




Extracted fold 84. Saving to disk...
Computing audio embeddings...




Extracted fold 85. Saving to disk...
Computing audio embeddings...




Extracted fold 86. Saving to disk...
Computing audio embeddings...




Extracted fold 87. Saving to disk...
Computing audio embeddings...
Extracted fold 88. Saving to disk...




Computing audio embeddings...
Extracted fold 89. Saving to disk...




Computing audio embeddings...




Extracted fold 90. Saving to disk...
Computing audio embeddings...




Extracted fold 91. Saving to disk...
Computing audio embeddings...




Extracted fold 92. Saving to disk...
Computing audio embeddings...
Extracted fold 93. Saving to disk...




Computing audio embeddings...




Extracted fold 94. Saving to disk...
Computing audio embeddings...




Extracted fold 95. Saving to disk...
Computing audio embeddings...




Extracted fold 96. Saving to disk...
Computing audio embeddings...




Extracted fold 97. Saving to disk...
Computing audio embeddings...




Extracted fold 98. Saving to disk...
Computing audio embeddings...




Extracted fold 99. Saving to disk...
Computing audio embeddings...
Extracted fold 100. Saving to disk...




Computing audio embeddings...




Extracted fold 101. Saving to disk...
Computing audio embeddings...




Extracted fold 102. Saving to disk...
Computing audio embeddings...




Extracted fold 103. Saving to disk...
Computing audio embeddings...




Extracted fold 104. Saving to disk...
Computing audio embeddings...




Extracted fold 105. Saving to disk...
Computing audio embeddings...




Extracted fold 106. Saving to disk...
Computing audio embeddings...




Extracted fold 107. Saving to disk...
Computing audio embeddings...




Extracted fold 108. Saving to disk...
Computing audio embeddings...




Extracted fold 109. Saving to disk...
Computing audio embeddings...




Extracted fold 110. Saving to disk...
Computing audio embeddings...




Extracted fold 111. Saving to disk...
Computing audio embeddings...




Extracted fold 112. Saving to disk...
Computing audio embeddings...




Extracted fold 113. Saving to disk...
Computing audio embeddings...
Extracted fold 114. Saving to disk...




Computing audio embeddings...




Extracted fold 115. Saving to disk...
Computing audio embeddings...




Extracted fold 116. Saving to disk...
Computing audio embeddings...




Extracted fold 117. Saving to disk...
Computing audio embeddings...




Extracted fold 118. Saving to disk...
Computing audio embeddings...




Extracted fold 119. Saving to disk...
Computing audio embeddings...




Extracted fold 120. Saving to disk...
Computing audio embeddings...




Extracted fold 121. Saving to disk...
Computing audio embeddings...




Extracted fold 122. Saving to disk...
Computing audio embeddings...




Extracted fold 123. Saving to disk...
Computing audio embeddings...




Extracted fold 124. Saving to disk...
Computing audio embeddings...




Extracted fold 125. Saving to disk...
Computing audio embeddings...




Extracted fold 126. Saving to disk...
Computing audio embeddings...




Extracted fold 127. Saving to disk...
Computing audio embeddings...




Extracted fold 128. Saving to disk...
Computing audio embeddings...




Extracted fold 129. Saving to disk...
Computing audio embeddings...




Extracted fold 130. Saving to disk...
Computing audio embeddings...




Extracted fold 131. Saving to disk...
Computing audio embeddings...




Extracted fold 132. Saving to disk...
Computing audio embeddings...
Extracted fold 133. Saving to disk...




Computing audio embeddings...




Extracted fold 134. Saving to disk...
Computing audio embeddings...
Extracted fold 135. Saving to disk...




Computing audio embeddings...




Extracted fold 136. Saving to disk...
Computing audio embeddings...




Extracted fold 137. Saving to disk...
Computing audio embeddings...




Extracted fold 138. Saving to disk...
Computing audio embeddings...




Extracted fold 139. Saving to disk...
Computing audio embeddings...




Extracted fold 140. Saving to disk...
Computing audio embeddings...




Extracted fold 141. Saving to disk...
Computing audio embeddings...




Extracted fold 142. Saving to disk...
Computing audio embeddings...




Extracted fold 143. Saving to disk...
Computing audio embeddings...




Extracted fold 144. Saving to disk...
Computing audio embeddings...




Extracted fold 145. Saving to disk...
Computing audio embeddings...




Extracted fold 146. Saving to disk...
Computing audio embeddings...




Extracted fold 147. Saving to disk...
Computing audio embeddings...




Extracted fold 148. Saving to disk...
Computing audio embeddings...




Extracted fold 149. Saving to disk...
Computing audio embeddings...




Extracted fold 150. Saving to disk...
Computing audio embeddings...




Extracted fold 151. Saving to disk...
Computing audio embeddings...




Extracted fold 152. Saving to disk...
Computing audio embeddings...




Extracted fold 153. Saving to disk...
Computing audio embeddings...




Extracted fold 154. Saving to disk...
Computing audio embeddings...




Extracted fold 155. Saving to disk...
Computing audio embeddings...
Extracted fold 156. Saving to disk...




Computing audio embeddings...
Extracted fold 157. Saving to disk...




Computing audio embeddings...




Extracted fold 158. Saving to disk...
Computing audio embeddings...




Extracted fold 159. Saving to disk...
Computing audio embeddings...




Extracted fold 160. Saving to disk...
Computing audio embeddings...




Extracted fold 161. Saving to disk...
Computing audio embeddings...




Extracted fold 162. Saving to disk...
Computing audio embeddings...





Extracted fold 163. Saving to disk...


 82%|████████▏ | 163/200 [2:12:03<30:03, 48.74s/it][A

Computing audio embeddings...




Extracted fold 164. Saving to disk...
Computing audio embeddings...




Extracted fold 165. Saving to disk...
Computing audio embeddings...




Extracted fold 166. Saving to disk...
Computing audio embeddings...




Extracted fold 167. Saving to disk...
Computing audio embeddings...




Extracted fold 168. Saving to disk...
Computing audio embeddings...




Extracted fold 169. Saving to disk...
Computing audio embeddings...




Extracted fold 170. Saving to disk...
Computing audio embeddings...




Extracted fold 171. Saving to disk...
Computing audio embeddings...




Extracted fold 172. Saving to disk...
Computing audio embeddings...




Extracted fold 173. Saving to disk...
Computing audio embeddings...




Extracted fold 174. Saving to disk...
Computing audio embeddings...




Extracted fold 175. Saving to disk...
Computing audio embeddings...




Extracted fold 176. Saving to disk...
Computing audio embeddings...




Extracted fold 177. Saving to disk...
Computing audio embeddings...
Extracted fold 178. Saving to disk...




Computing audio embeddings...





Extracted fold 179. Saving to disk...


 90%|████████▉ | 179/200 [2:25:49<18:08, 51.85s/it][A

Computing audio embeddings...




Extracted fold 180. Saving to disk...
Computing audio embeddings...




Extracted fold 181. Saving to disk...
Computing audio embeddings...




Extracted fold 182. Saving to disk...
Computing audio embeddings...




Extracted fold 183. Saving to disk...
Computing audio embeddings...




Extracted fold 184. Saving to disk...
Computing audio embeddings...




Extracted fold 185. Saving to disk...
Computing audio embeddings...




Extracted fold 186. Saving to disk...
Computing audio embeddings...




Extracted fold 187. Saving to disk...
Computing audio embeddings...




Extracted fold 188. Saving to disk...
Computing audio embeddings...




Extracted fold 189. Saving to disk...
Computing audio embeddings...




Extracted fold 190. Saving to disk...
Computing audio embeddings...




Extracted fold 191. Saving to disk...
Computing audio embeddings...




Extracted fold 192. Saving to disk...
Computing audio embeddings...




Extracted fold 193. Saving to disk...
Computing audio embeddings...




Extracted fold 194. Saving to disk...
Computing audio embeddings...




Extracted fold 195. Saving to disk...
Computing audio embeddings...




Extracted fold 196. Saving to disk...
Computing audio embeddings...




Extracted fold 197. Saving to disk...
Computing audio embeddings...




Extracted fold 198. Saving to disk...
Computing audio embeddings...
Extracted fold 199. Saving to disk...




Computing audio embeddings...


100%|██████████| 2/2 [5:28:48<00:00, 9864.47s/it]  

Extracted fold 200. Saving to disk...





In [3]:
class NpEncoder(json.JSONEncoder):
    """Source: https://stackoverflow.com/questions/50916422/python-typeerror-object-of-type-int64-is-not-json-serializable"""
    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super(NpEncoder, self).default(obj)

In [24]:
def explore_dataset_ucs(data_path:str='E:/Soundly Library juni 2023/Soundly Pro/', analyze_sample_rate=False):
    """
    Iterates over a UCS dataset.
    """
    files_count_dict = {}
    samples_rates_dict = {}
    channels_dict = {}
    bit_rates_dict = {}
    encodings_dict = {}
    for i, (root, dirnames, filenames) in enumerate(os.walk(data_path, topdown=True)):
        
        if(i == 0):
            all_categories = dirnames

        root = root.replace('\\', '/')
        split_root = root.split('/')

        for category in all_categories:
            if category in split_root:
                if (category in files_count_dict.keys()):
                    files_count_dict[category] += len(filenames)
                else: 
                    files_count_dict[category] = len(filenames)

                if (len(filenames) != 0 and analyze_sample_rate == True):
                    for files in filenames:
                        try:
                            metadata = torchaudio.info(root+'/'+files)
                            if (metadata.sample_rate in samples_rates_dict.keys()):
                                samples_rates_dict[metadata.sample_rate] += 1
                                channels_dict[metadata.num_channels] += 1
                                bit_rates_dict[metadata.bits_per_sample] += 1
                                encodings_dict[metadata.encoding] += 1
                            else:
                                samples_rates_dict[metadata.sample_rate] = 1
                                channels_dict[metadata.num_channels] = 1
                                bit_rates_dict[metadata.bits_per_sample] = 1
                                encodings_dict[metadata.encoding] = 1
                        except:
                            print(f"File {files} could not be opened.")
                        

    return files_count_dict, samples_rates_dict, channels_dict, bit_rates_dict, encodings_dict

data, rates, channels, bit_rates, encodings = explore_dataset_ucs()

File Ambience, Tropical, Amazonas, Night, Crickets, Frogs, Gran Sabana, Venezuela.wav could not be opened.
File .DS_Store could not be opened.
File ._.DS_Store could not be opened.
File .DS_Store could not be opened.
File ._.DS_Store could not be opened.


# Embeddings used for Semantic Search

In [4]:
#df = pd.read_csv("D:/datasets/soundly.csv")
path_list = data['path'].to_list() # Does not include class 'other'
for i in range(len(path_list)):
    path_list[i] = path_list[i].replace("E:", "D:")

In [6]:
batch_size = 32
batches = (len(path_list) // batch_size) + 1

In [7]:
ms_embeddings_path = "D:/datasets/soundly_80k-audio_embeddings-msclap-batch_size_512.pt"

embeddings_generator = msclap_model.get_audio_embeddings_per_batch(path_list, batch_size=512)

start = time.time()
for emb in tqdm(embeddings_generator):
    torch.cuda.empty_cache()

    try:
        disk_embs = torch.load(ms_embeddings_path)
        audio_embedding_msclap = torch.cat((emb, disk_embs.to(device)), dim=0)
        torch.save(torch.tensor(np.array(audio_embedding_msclap.detach().cpu())), ms_embeddings_path)
        del disk_embs
        del audio_embedding_msclap
    except:
        torch.save(torch.tensor(np.array(emb.detach().cpu())), ms_embeddings_path)

end = time.time()
elapsed_time = end - start
print(f"Time Elapsed: {elapsed_time}")

165it [16:44:24, 365.24s/it]

Time Elapsed: 60264.1118016243





In [None]:
audio_embedding_msclap = msclap_model.get_audio_embeddings(path_list[:50], resample=True)
torch_audio_embeddings_msclap = torch.tensor(np.array(audio_embedding_msclap.detach().cpu()))
torch.save(torch_audio_embeddings_msclap, "D:/datasets/soundly_80k-audio_embeddings-msclap.pt")

In [8]:
# Extract Text Embeddings from file names

labels = []
for path in data['path']:
    print(path.split("/")[-1][:-4])
    break

Ambience, Hospital, Hallway, Norwegian Walla, Phone Ringing 02


# Soundly to CSV

Iterate over the library directory and store path references in a .csv file.

In [2]:
# Load the Soundly Library, to be saved as a .csv file.

df = pd.DataFrame()

audio_path ='E:/Soundly Library juni 2023/Soundly Pro' 
#audio_path = 'E:/Soundly Library juni 2023/Soundly Pro/Bells/Handbell'

audio_files = []
metadata = {
    'path' : [],
    'channels': [],
    'samples' : [],
    'sample_rate' : [],
    'bit_depth' : [],
    'category' : [],
    'sub_category' : []
}
for path, dirs, files in os.walk(audio_path):
    if ( False ):
        if(len(dirs) != 0):
            print(dirs)
        if(len(files) != 0):
            print(files)

    for file in files:
        try:
            file_path = path + "/" + file
            file_path = file_path.replace("\\", "/")
            path_split = file_path.split("/")[3:] # Anything before element 4 is static, the base of the file name.

            # Default categories, given if no other category info is found.
            category = 'other'
            sub_category = 'none'

            if (len(path_split) == 3):
                category = path_split[0]
                sub_category = path_split[1]

            data = torchaudio.info(file_path)
            
            metadata['path'].append(file_path)
            metadata['channels'].append(data.num_channels)
            metadata['samples'].append(data.num_frames)
            metadata['sample_rate'].append(data.sample_rate)
            metadata['bit_depth'].append(data.bits_per_sample)
            metadata['category'].append(category.lower())
            metadata['sub_category'].append(sub_category.lower())
        except:
            print(f"File '{file}' could not be opened.")
        

# Exctract Gendered Embeddings

In [26]:
data = pd.read_csv(settings['dataset_drive'] + settings['dataset_path'])
for query in data['path']:
    if ("voice" in query.lower()):
        print('hit')

hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
hit
