In [15]:
!pip install panns-inference soundfile librosa

Collecting panns-inference
  Downloading panns_inference-0.1.1-py3-none-any.whl.metadata (2.4 kB)
Collecting torchlibrosa (from panns-inference)
  Downloading torchlibrosa-0.1.0-py3-none-any.whl.metadata (3.5 kB)
Downloading panns_inference-0.1.1-py3-none-any.whl (8.3 kB)
Downloading torchlibrosa-0.1.0-py3-none-any.whl (11 kB)
Installing collected packages: torchlibrosa, panns-inference
Successfully installed panns-inference-0.1.1 torchlibrosa-0.1.0


In [19]:
import os
from glob import glob

import numpy as np
import soundfile as sf
from scipy.signal import resample_poly
from sklearn.metrics import roc_auc_score
from tqdm import tqdm
import torch

from panns_inference import AudioTagging

# ─── CONFIG ─────────────────────────────────────────────────────────
# Sample rate every clip is resampled to before it is passed to the model.
TARGET_SR = 32000
# Folders that are globbed for *.wav files (training clips / test clips).
TRAIN_DIR = '/kaggle/input/dcase-aml/dev_data/dev_data/slider/train'
TEST_DIR  = '/kaggle/input/dcase-aml/dev_data/dev_data/slider/test'

# ─── DEVICE & PANNs ─────────────────────────────────────────────────
# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# No explicit checkpoint is supplied; the recorded output of this cell reports
# a Cnn14 checkpoint under /root/panns_data (presumably the library default).
at = AudioTagging(checkpoint_path=None, device=device)

def extract_embedding(wav_path, target_sr=TARGET_SR):
    """Compute the 1-D embedding of a single wav file.

    The file is read with soundfile, averaged over axis 1 when the signal has
    more than one dimension (mono down-mix), resampled to ``target_sr`` when
    its native rate differs, and then passed through the module-level ``at``
    tagger on ``device``.

    Args:
        wav_path: Path of the audio file to read.
        target_sr: Sample rate the signal is resampled to before inference.

    Returns:
        The embedding returned by ``at.inference`` with its leading batch axis
        removed (the original author's notes give its shape as (2048,)).
    """
    samples, native_sr = sf.read(wav_path)

    # Down-mix multi-channel audio by averaging the channels.
    is_multichannel = samples.ndim > 1
    if is_multichannel:
        samples = samples.mean(axis=1)

    # Polyphase resampling, only when the file is not already at the target rate.
    if native_sr != target_sr:
        samples = resample_poly(samples, target_sr, native_sr)

    # float32 tensor with a leading batch axis, moved to the inference device.
    batch = torch.from_numpy(samples.astype(np.float32))
    batch = batch.unsqueeze(0).to(device)

    # inference() returns a pair; only the second element (the embedding) is used.
    _unused, embedding = at.inference(batch)
    return embedding.squeeze(0)

# ─── COLLECT PATHS AND GROUND TRUTH ────────────────────────────────
# glob() returns a list; sorting it in place gives a deterministic file order.
train_paths = glob(os.path.join(TRAIN_DIR, '*.wav'))
train_paths.sort()
test_paths = glob(os.path.join(TEST_DIR, '*.wav'))
test_paths.sort()

# Stop early with an explicit error when either folder contains no wav files.
if len(train_paths) == 0:
    raise FileNotFoundError(f'Cartella di training vuota: {TRAIN_DIR}')
if len(test_paths) == 0:
    raise FileNotFoundError(f'Cartella di test vuota:    {TEST_DIR}')

# Ground truth: 1 when the lower-cased file name starts with 'anomaly', else 0.
y_true = np.array(
    [os.path.basename(path).lower().startswith('anomaly') for path in test_paths],
    dtype=int,
)

# ─── 1) MEAN EMBEDDING OVER THE TRAINING CLIPS ─────────────────────
# One embedding per training file, stacked row-wise into (num_train, D).
embeds_normal = np.stack(
    [
        extract_embedding(path)
        for path in tqdm(train_paths, desc='Estrazione embedding TRAIN')
    ],
    axis=0,
)
mu = embeds_normal.mean(axis=0)                 # per-dimension mean, shape (D,)

# ─── 1b) COVARIANCE MATRIX AND ITS REGULARIZED INVERSE ─────────────
# rowvar=False: rows are observations, columns are the embedding dimensions.
cov = np.cov(embeds_normal, rowvar=False)       # (D, D)
# Small constant added to the diagonal for numerical stability before inverting.
reg = 1e-6
inv_cov = np.linalg.inv(cov + reg * np.identity(cov.shape[0]))

# ─── 2) ANOMALY SCORE ON THE TEST SET (Mahalanobis) ────────────────
# Each test clip is scored by its Mahalanobis distance from the training mean:
#   d = sqrt(diff^T · inv_cov · diff)
scores_maha = []
for p in tqdm(test_paths, desc='Estrazione embedding TEST'):
    emb = extract_embedding(p)                           # shape (D,)
    diff = emb - mu                                      # shape (D,)
    sq_dist = diff.dot(inv_cov).dot(diff)                # squared distance (scalar)
    # inv_cov is a numerical inverse, so rounding error can push the quadratic
    # form slightly below zero; sqrt would then yield NaN and roc_auc_score
    # would reject the scores. Clamp at 0 (non-negative values are unchanged,
    # and a genuine NaN still propagates).
    score = np.sqrt(np.maximum(sq_dist, 0.0))
    scores_maha.append(score)
scores_maha = np.array(scores_maha)

# ─── 3) ROC-AUC ────────────────────────────────────────────────────
# Higher score = more anomalous; y_true uses 1 for anomalous clips.
roc_auc_maha = roc_auc_score(y_true, scores_maha)
print(f'ROC-AUC (Mahalanobis): {roc_auc_maha:.4f}')

Checkpoint path: /root/panns_data/Cnn14_mAP=0.431.pth
GPU number: 1


Estrazione embedding TRAIN: 100%|██████████| 2370/2370 [00:57<00:00, 40.99it/s]
Estrazione embedding TEST: 100%|██████████| 1101/1101 [00:29<00:00, 36.92it/s]

ROC-AUC (Mahalanobis): 0.9311



