In [1]:
# To activate the virtual environment in a (Windows) terminal, run: .venv\Scripts\activate
# NOTE: you will also need the following files:
#   -> 'class_labels_indices.csv'
#   -> 'Cnn14_mAP=0.431.pth' (the model weights to be used), available from https://zenodo.org/records/3987831

from pathlib import Path
import numpy as np
import torch
import librosa
from panns_inference import AudioTagging

# Pick the compute device: use the GPU when torch reports CUDA as available,
# otherwise fall back to the CPU. The printed line shows which one was chosen.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Using device:", device)

Using device: cuda


In [2]:
# Extract one CNN14 embedding per pre-processed WAV file and save it as a .npy file.
#
# Inputs : every "*_32k.wav" file directly inside `proc_out_32kHz_dir`
# Outputs: "<wav stem>_embedding2048.npy" inside `emb_out_dir` (one file per WAV)

proc_out_32kHz_dir = Path("proc_out_32kHz")
emb_out_dir = Path("embeddings_out")  # 2048-d vectors go here
emb_out_dir.mkdir(parents=True, exist_ok=True)

# Pretrained CNN14 tagger. checkpoint_path=None leaves the checkpoint location to
# the library; the recorded run reports a 'Cnn14_mAP=0.431.pth' file inside a
# 'panns_data' folder under the user's home directory.
at_model = AudioTagging(checkpoint_path=None, device=device)

# Sorted so the processing order (and therefore the log) is stable between runs.
wav_files = sorted(proc_out_32kHz_dir.glob("*_32k.wav"))
print(f"{len(wav_files)} WAV files found!")

for wav_path in wav_files:
    print(f"\nProcessing: {wav_path.name}")

    # sr=32000 asks librosa to resample anything that is not already 32 kHz;
    # mono=True downmixes to a single channel, so `wav` is a 1-D array.
    wav, sr = librosa.load(str(wav_path), sr=32000, mono=True)

    # A file that decodes to zero samples cannot be fed to the model; skip it
    # instead of letting one bad file abort the rest of the batch.
    # NOTE(review): extremely short (but non-empty) clips may still be rejected
    # by the model -- confirm the minimum usable length if that can occur.
    if wav.size == 0:
        print("Skipped (no audio samples): ", wav_path.name)
        continue

    # Add a leading batch axis: (samples,) -> (1, samples).
    audio_batch = np.expand_dims(wav, axis=0)

    # inference() returns two values; only the second (the embedding, as a
    # numpy array) is used here.
    _, embedding = at_model.inference(audio_batch)

    # The batch holds a single clip, so its embedding is the first row.
    embedding_vec = embedding[0]

    out_path = emb_out_dir / (wav_path.stem + "_embedding2048.npy")
    np.save(str(out_path), embedding_vec)
    print("Embedding saved: ", out_path)

    print(embedding_vec)  # numpy abbreviates long arrays, so this stays short
    print(embedding_vec.shape)

Checkpoint path: C:\Users\mkyod/panns_data/Cnn14_mAP=0.431.pth
GPU number: 1
6 WAV files found!

Processing: trend3vid6_32k.wav
Embedding saved:  embeddings_out\trend3vid6_32k_embedding2048.npy
[0.         0.         0.         ... 0.19923183 0.46746948 0.        ]
(2048,)

Processing: trend3vid7_32k.wav
Embedding saved:  embeddings_out\trend3vid7_32k_embedding2048.npy
[0.        0.        0.        ... 0.        0.8063234 0.       ]
(2048,)

Processing: trend3vid8_32k.wav
Embedding saved:  embeddings_out\trend3vid8_32k_embedding2048.npy
[0.         0.         0.         ... 0.5110518  0.74246013 0.        ]
(2048,)

Processing: trend5vid2_32k.wav
Embedding saved:  embeddings_out\trend5vid2_32k_embedding2048.npy
[0.         0.         0.         ... 0.19942343 0.04068661 0.        ]
(2048,)

Processing: trend5vid3_32k.wav
Embedding saved:  embeddings_out\trend5vid3_32k_embedding2048.npy
[0.         0.         0.         ... 0.22457194 0.02149612 0.        ]
(2048,)

Processing: trend5v