In [1]:
import torch
import torch.nn.functional as F
import numpy as np
from tqdm.notebook import tqdm
from sklearn.metrics import accuracy_score
from torch.utils.data import DataLoader

# Import from your codebase
from CLAPWrapper import CLAPWrapper
from datasets.esc50 import ESC50

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: {device}")

Device: cpu


In [2]:
print("\n" + "="*80)
print("Loading ESC50 Dataset")
print("="*80)

root_path = "./data"
dataset = ESC50(root=root_path, download=True)
print(f"‚úÖ Dataset loaded: {len(dataset)} samples")
print(f"   Classes: {len(dataset.classes)} categories")
print(f"   Sample classes: {dataset.classes[:5]}")

# Prepare text prompts
prompt = 'this is the sound of '
text_labels = [prompt + x for x in dataset.classes]
print(f"\nüìù Text prompts: {len(text_labels)} classes")


Loading ESC50 Dataset
Loading audio files


2000it [00:00, 18674.13it/s]

‚úÖ Dataset loaded: 2000 samples
   Classes: 50 categories
   Sample classes: ['airplane', 'breathing', 'brushing teeth', 'can opening', 'car horn']

üìù Text prompts: 50 classes





In [3]:
# ============================================================================
# STEP 3: Initialize Models
# ============================================================================
print("\n" + "="*80)
print("Initializing Models")
print("="*80)

# Residual config con pc_weights = 1.0 (identit√†)
residual_config = {
    'n_components_ratio': .1,
    'reweight_factor': 2.0,
    'target_layers': [0, 1, 2, 3],  # Layers dove applicare reweighting
    'analysis_mode': True
}

print("\nüîß Loading CLAP Standard...")
clap_standard = CLAPWrapper(
    version='2023',  # or '2022'
    use_cuda=torch.cuda.is_available(),
    type='classic'
)

print("\nüîß Loading ResiDualCLAP...")
clap_residual = CLAPWrapper(
    version='2023',
    use_cuda=torch.cuda.is_available(),
    type='residual',
    residual_config=residual_config
)


Initializing Models

üîß Loading CLAP Standard...

üîß Loading ResiDualCLAP...
[2, 2, 6, 2]
üîç Detecting layer dimensions...
torch.Size([1, 1, 256, 256]) 1.5)
torch.Size([1, 4096, 96]) 2)
torch.Size([1, 4096, 96]) 3)
torch.Size([1, 1024, 192]) 4)
torch.Size([1, 256, 384]) 5)
torch.Size([1, 64, 768]) 6)
torch.Size([1, 64, 768]) 7)
  ‚úì 0: torch.Size([1, 1024, 192])
  ‚úì 1: torch.Size([1, 256, 384])
  ‚úì 2: torch.Size([1, 64, 768])
  ‚úì 3: torch.Size([1, 64, 768])
  ‚úì layer_0: 192D ‚Üí 19 PCs
  ‚úì layer_1: 384D ‚Üí 38 PCs
  ‚úì layer_2: 768D ‚Üí 76 PCs
  ‚úì layer_3: 768D ‚Üí 76 PCs


In [4]:
# Prepare audio samples for PCA fitting
print("Collecting samples for PCA fitting (max 200 samples)...", end='')

# Create a simple dataloader wrapper per PCA fitting
class SimpleAudioDataset:
    def __init__(self, wrapper, esc50_dataset, max_samples=1000):
        self.wrapper = wrapper
        self.audio_paths = []
        for i in range(min(max_samples, len(esc50_dataset))):
            audio_path, _, _ = esc50_dataset[i]
            self.audio_paths.append(audio_path)
    
    def __len__(self):
        return len(self.audio_paths)
    
    def __getitem__(self, idx):
        audio_tensor = self.wrapper.load_audio_into_tensor(
            self.audio_paths[idx],
            self.wrapper.args.duration,
            resample=True
        )
        # ‚úÖ Assicurati sia 1D
        if audio_tensor.dim() > 1:
            audio_tensor = audio_tensor.squeeze()
        
        return audio_tensor

# Create dataset and loader
pca_dataset = SimpleAudioDataset(clap_residual, dataset, max_samples=50)
pca_loader = DataLoader(
    pca_dataset, 
    batch_size=16, 
    shuffle=False,
    num_workers=0,  # Start with 0 for debugging
    pin_memory=False
)
print("OK")

Collecting samples for PCA fitting (max 200 samples)...OK


In [5]:
pca_dataset[0].shape

torch.Size([308700])

In [6]:
list(pca_loader)[1].shape

torch.Size([16, 308700])

In [7]:
len(dataset)

2000

In [8]:
print("\n" + "="*80)
print("Fitting PCA Components")
print("="*80)

# Fit PCA
print(f"Fitting PCA on {len(pca_dataset)} samples...")
variance_ratios = clap_residual.clap.audio_encoder.base.htsat.fit_spectral_layers(
    pca_loader,
    max_samples=50
)

print("\nüìä PCA Variance Ratios:")
for layer_name, ratios in variance_ratios.items():
    print(f"   {layer_name}: Top 5 components = {ratios[:5]}")


Fitting PCA Components
Fitting PCA on 50 samples...

üîç PHASE 1: Collecting Hidden States from HTSAT Layers
Target layers: ['layer_0', 'layer_1', 'layer_2', 'layer_3']
Max samples to collect: 50
Batches in dataloader: 4


Collecting samples:   0%|                                                  | 0/4 [00:00<?, ?batch/s]

torch.Size([16, 1, 256, 256]) 1)
torch.Size([16, 1, 256, 256]) 1.5)
torch.Size([16, 4096, 96]) 2)
torch.Size([16, 4096, 96]) 3)
torch.Size([16, 1024, 192]) 4)
torch.Size([16, 256, 384]) 5)
torch.Size([16, 64, 768]) 6)


Collecting samples:   0%|                                                  | 0/4 [00:03<?, ?batch/s]

torch.Size([16, 64, 768]) 7)
torch.Size([16, 1024, 192])

üìä PCA Variance Ratios:





AttributeError: 'int' object has no attribute 'items'

In [None]:
# Dopo aver fatto la collection
for layer_name, outputs in collected_outputs.items():
    if outputs:
        X = torch.cat(outputs, dim=0)
        print(f"\n{'='*60}")
        print(f"Checking {layer_name}")
        quick_rank_check(X)

NameError: name 'collected_outputs' is not defined

In [None]:
# Get text embeddings ONCE for all classes
text_embeddings = clap_standard.get_text_embeddings(text_labels)
print(f"   Text embeddings shape: {text_embeddings.shape}")

# Test on subset first (use full dataset later)
test_size = 100  # Start with 200 samples for speed
print(f"\nüìä Testing on {test_size} samples...")

y_preds_baseline, y_labels = [], []

for i in tqdm(range(test_size), desc="Baseline"):
    # Get audio file path and label
    audio_path, target, one_hot_target = dataset[-(i+1)]
    
    # Get audio embedding
    audio_embedding = clap_standard.get_audio_embeddings([audio_path], resample=True)
    
    # Compute similarity
    similarity = clap_standard.compute_similarity(audio_embedding, text_embeddings)
    
    # Get prediction
    y_pred = F.softmax(similarity.detach().cpu(), dim=1).numpy()
    y_preds_baseline.append(y_pred)
    y_labels.append(one_hot_target.detach().cpu().numpy())

# Calculate accuracy
y_labels_array = np.concatenate(y_labels, axis=0)
y_preds_baseline_array = np.concatenate(y_preds_baseline, axis=0)

baseline_acc = accuracy_score(
    np.argmax(y_labels_array, axis=1), 
    np.argmax(y_preds_baseline_array, axis=1)
)

print(f"\n‚úÖ Baseline Accuracy: {baseline_acc:.3f} ({baseline_acc*100:.1f}%)")

   Text embeddings shape: torch.Size([50, 1024])

üìä Testing on 100 samples...


Baseline:   0%|          | 0/100 [00:00<?, ?it/s]


‚úÖ Baseline Accuracy: 0.920 (92.0%)


In [None]:
# Get text embeddings ONCE for all classes
text_embeddings = clap_residual.get_text_embeddings(text_labels)
print(f"   Text embeddings shape: {text_embeddings.shape}")

# Test on subset first (use full dataset later)
test_size = 100  # Start with 200 samples for speed
print(f"\nüìä Testing on {test_size} samples...")

y_preds_residual, y_labels = [], []

for i in tqdm(range(test_size), desc="residual"):
    # Get audio file path and label
    audio_path, target, one_hot_target = dataset[-(i+1)]
    
    # Get audio embedding
    audio_embedding = clap_residual.get_audio_embeddings([audio_path], resample=True)
    
    # Compute similarity
    similarity = clap_residual.compute_similarity(audio_embedding, text_embeddings)
    
    # Get prediction
    y_pred = F.softmax(similarity.detach().cpu(), dim=1).numpy()
    y_preds_residual.append(y_pred)
    y_labels.append(one_hot_target.detach().cpu().numpy())

# Calculate accuracy
y_labels_array = np.concatenate(y_labels, axis=0)
y_preds_residual_array = np.concatenate(y_preds_residual, axis=0)

baseline_acc = accuracy_score(
    np.argmax(y_labels_array, axis=1), 
    np.argmax(y_preds_baseline_array, axis=1)
)

print(f"\n‚úÖ Residual Accuracy: {baseline_acc:.3f} ({baseline_acc*100:.1f}%)")

   Text embeddings shape: torch.Size([50, 1024])

üìä Testing on 100 samples...


residual:   0%|          | 0/100 [00:00<?, ?it/s]


‚úÖ Residual Accuracy: 0.920 (92.0%)


In [None]:
audio_path, target, one_hot_target = dataset[0]

In [None]:
clap_residual.get_audio_embeddings([audio_path], resample=True)

tensor([[ 0.8840,  0.3185, -0.6708,  ...,  2.1432,  1.0298, -0.1217]])

In [None]:
clap_standard.get_audio_embeddings([audio_path], resample=True)

tensor([[ 0.7623,  0.2343, -0.5101,  ...,  1.8940,  0.9414, -0.0119]])