In [1]:
# STEP 1: Setup
# ============================================================================
import sys
from pathlib import Path

import torch
import torch.nn.functional as F
import numpy as np
from tqdm.notebook import tqdm
from sklearn.metrics import accuracy_score

# Import from your codebase
from CLAPWrapper import CLAPWrapper
from datasets.esc50 import ESC50  # Adjust import path if needed

# Select the compute device: CUDA when PyTorch reports one, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Device: {device}")

Device: cpu


In [2]:
# Section banner (blank line, rule, title, rule).
print("\n" + "=" * 50, "Loading ESC50 Dataset", "=" * 50, sep="\n")

# Build the dataset object rooted at ./data.
# NOTE(review): download=True presumably fetches the archive when it is not
# already on disk - confirm against datasets.esc50.
root_path = "./data"
dataset = ESC50(download=True, root=root_path)

# Summarise what was loaded: sample count, class count, first five class names.
print(
    f"‚úÖ Dataset loaded: {len(dataset)} samples",
    f"   Classes: {len(dataset.classes)} categories",
    f"   Sample classes: {dataset.classes[:5]}",
    sep="\n",
)


Loading ESC50 Dataset
Loading audio files


0it [00:00, ?it/s]

2000it [00:00, 16865.59it/s]

‚úÖ Dataset loaded: 2000 samples
   Classes: 50 categories
   Sample classes: ['airplane', 'breathing', 'brushing teeth', 'can opening', 'car horn']





In [3]:
# One text prompt per class: a fixed prefix followed by the class name.
prompt = 'this is the sound of '
text_labels = [prompt + class_name for class_name in dataset.classes]

# Report how many prompts were built and show the first three.
print(
    f"\nüìù Text prompts prepared: {len(text_labels)} classes",
    f"   Examples: {text_labels[:3]}",
    sep="\n",
)


üìù Text prompts prepared: 50 classes
   Examples: ['this is the sound of airplane', 'this is the sound of breathing', 'this is the sound of brushing teeth']


In [5]:
print("\n" + "="*50)
print("Testing BASELINE CLAP")
print("="*50)

# Initialize BASELINE using CLAPWrapper
baseline_wrapper = CLAPWrapper(version='2023', use_cuda=torch.cuda.is_available())
print("‚úÖ Baseline CLAP loaded")

# Get text embeddings ONCE for all classes. This is pure inference, so autograd
# is switched off to avoid building a graph that is never used.
with torch.no_grad():
    text_embeddings = baseline_wrapper.get_text_embeddings(text_labels)
print(f"   Text embeddings shape: {text_embeddings.shape}")

# Test on subset first (use full dataset later)
test_size = 200  # Start with 200 samples for speed
# Never index past the end of the dataset if it holds fewer clips than requested.
test_size = min(test_size, len(dataset))
print(f"\nüìä Testing on {test_size} samples...")

# Per-clip class probabilities and one-hot ground truth, one entry per clip.
y_preds_baseline, y_labels = [], []

with torch.no_grad():
    for i in tqdm(range(test_size), desc="Baseline"):
        # Each dataset item unpacks to (audio path, target, one-hot target);
        # only the path and the one-hot target are needed here.
        audio_path, _, one_hot_target = dataset[i]

        # Get audio embedding
        audio_embedding = baseline_wrapper.get_audio_embeddings([audio_path], resample=True)

        # Compute similarity between this clip and every class prompt
        similarity = baseline_wrapper.compute_similarity(audio_embedding, text_embeddings)

        # Turn similarities into a probability distribution over classes
        y_pred = F.softmax(similarity.detach().cpu(), dim=1).numpy()
        y_preds_baseline.append(y_pred)
        y_labels.append(one_hot_target.detach().cpu().numpy())

# Stack the per-clip rows and score top-1 accuracy.
# y_labels_array is reused by the later ResiDual evaluation and comparison cells.
y_labels_array = np.concatenate(y_labels, axis=0)
y_preds_baseline_array = np.concatenate(y_preds_baseline, axis=0)

baseline_acc = accuracy_score(
    np.argmax(y_labels_array, axis=1),
    np.argmax(y_preds_baseline_array, axis=1)
)

print(f"\n‚úÖ Baseline Accuracy: {baseline_acc:.3f} ({baseline_acc*100:.1f}%)")


Testing BASELINE CLAP
‚úÖ Baseline CLAP loaded
   Text embeddings shape: torch.Size([50, 1024])

üìä Testing on 200 samples...


Baseline:   0%|          | 0/200 [00:00<?, ?it/s]


‚úÖ Baseline Accuracy: 0.940 (94.0%)


In [6]:
print("\n" + "="*50)
print("Setting up RESIDUAL CLAP")
print("="*50)

# Create residual config
residual_config = {
    'target_layers': [1, 3],        # Which transformer layers to modify
    'n_components_ratio': 1.0,      # Fraction of each layer's dims kept as PCA components (1.0 = all of them)
    'reweight_factor': 1.0          # Amplification factor
}

# The ResiDual model is built directly here rather than through CLAPWrapper.
from models.residual_clap import ResiDualCLAP

# Reuse the baseline's hyper-parameters so both models share one architecture.
args = baseline_wrapper.args

# Create ResiDual model
residual_model = ResiDualCLAP(
    audioenc_name=args.audioenc_name,
    sample_rate=args.sampling_rate,
    window_size=args.window_size,
    hop_size=args.hop_size,
    mel_bins=args.mel_bins,
    fmin=args.fmin,
    fmax=args.fmax,
    classes_num=args.num_classes,
    out_emb=args.out_emb,
    text_model=args.text_model,
    transformer_embed_dim=args.transformer_embed_dim,
    d_proj=args.d_proj,
    residual_config=residual_config
)

# Load pretrained weights from baseline. strict=False tolerates keys that exist
# on only one side, so the returned report is inspected instead of discarded:
# a silent mismatch would otherwise leave parts of the model at their initial values.
load_report = residual_model.load_state_dict(baseline_wrapper.clap.state_dict(), strict=False)
if load_report.unexpected_keys:
    print(f"   WARNING: {len(load_report.unexpected_keys)} baseline weights had no match in ResiDual, "
          f"e.g. {load_report.unexpected_keys[:5]}")
if load_report.missing_keys:
    print(f"   Note: {len(load_report.missing_keys)} ResiDual entries were not in the baseline checkpoint, "
          f"e.g. {load_report.missing_keys[:5]}")

if torch.cuda.is_available():
    residual_model = residual_model.cuda()

# Inference only from here on.
residual_model.eval()
print("‚úÖ ResiDual model initialized")


Setting up RESIDUAL CLAP
[2, 2, 6, 2]
üîç Detecting layer dimensions...
  ‚úì 0: torch.Size([1, 1024, 192])
  ‚úì 1: torch.Size([1, 256, 384])
  ‚úì 2: torch.Size([1, 64, 768])
  ‚úì 3: torch.Size([1, 64, 768])
  ‚úì layer_1: 384D ‚Üí 384 PCs
  ‚úì layer_3: 768D ‚Üí 768 PCs
‚úÖ ResiDual model initialized


In [7]:
print("\n" + "=" * 50, "üîç DIMENSION CHECK", "=" * 50, sep="\n")

# Probe input: two random waveforms of sampling_rate * duration samples each.
test_audio = torch.randn(2, baseline_wrapper.args.sampling_rate * baseline_wrapper.args.duration)
test_audio = test_audio.cuda() if torch.cuda.is_available() else test_audio

# Run the probe through the baseline audio encoder.
print("\n1Ô∏è‚É£ Testing BASELINE:")
with torch.no_grad():
    baseline_emb, baseline_cls = baseline_wrapper.clap.audio_encoder(test_audio)
print(f"   Audio embedding shape: {baseline_emb.shape}")
print(f"   Classification output: {'None' if baseline_cls is None else baseline_cls.shape}")

# Run the same probe through the ResiDual audio encoder.
print("\n2Ô∏è‚É£ Testing RESIDUAL:")
with torch.no_grad():
    residual_emb, residual_cls = residual_model.audio_encoder(test_audio)
print(f"   Audio embedding shape: {residual_emb.shape}")
print(f"   Classification output: {'None' if residual_cls is None else residual_cls.shape}")

print("\n3Ô∏è‚É£ Comparison:")
if baseline_emb.shape != residual_emb.shape:
    # Shapes disagree: report both and stop here.
    print("   ‚ùå DIMENSION MISMATCH!")
    print(f"      Baseline: {baseline_emb.shape}")
    print(f"      ResiDual: {residual_emb.shape}")
    print("   üõë STOP - Fix this before continuing!")
else:
    print("   ‚úÖ Embedding dimensions MATCH!")

    # Cosine similarity (should be high if the weights were loaded correctly)
    cos_sim = F.cosine_similarity(baseline_emb, residual_emb, dim=1).mean().item()
    print(f"   Cosine similarity: {cos_sim:.4f}")

    # Bucket the mean similarity into three verdicts.
    if cos_sim > 0.9:
        similarity_verdict = "   ‚úÖ Embeddings are very similar - weights loaded correctly!"
    elif cos_sim > 0.5:
        similarity_verdict = "   ‚ö†Ô∏è  Embeddings somewhat similar - some weights might be missing"
    else:
        similarity_verdict = "   ‚ùå Embeddings very different - weight loading failed!"
    print(similarity_verdict)


üîç DIMENSION CHECK

1Ô∏è‚É£ Testing BASELINE:
   Audio embedding shape: torch.Size([2, 1024])
   Classification output: torch.Size([2, 527])

2Ô∏è‚É£ Testing RESIDUAL:
   Audio embedding shape: torch.Size([2, 1024])
   Classification output: torch.Size([2, 527])

3Ô∏è‚É£ Comparison:
   ‚úÖ Embedding dimensions MATCH!
   Cosine similarity: 1.0000
   ‚úÖ Embeddings are very similar - weights loaded correctly!


In [7]:
print("\n" + "=" * 50, "Fitting PCA Components", "=" * 50, sep="\n")

# Number of clips fed to the PCA fitting step; a small hand-rolled loader
# (defined next) serves them one at a time.
fit_size = 100
print(f"Using {fit_size} samples for PCA fitting...")

class FitDataLoader:
    """Minimal iterable that serves single-clip batches for PCA fitting.

    Args:
        dataset: Indexable collection whose items unpack to three values,
            the first being the audio file path.
        indices: Dataset positions to serve, in iteration order.
        wrapper: Object exposing ``load_audio_into_tensor(path, duration,
            resample=...)`` and ``args.duration``.

    Iterating yields one ``{'audio': tensor}`` dict per index, where the
    tensor has been reshaped to ``(1, -1)`` and moved to CUDA when available.
    """

    def __init__(self, dataset, indices, wrapper):
        self.dataset, self.indices, self.wrapper = dataset, indices, wrapper

    def __len__(self):
        """Number of clips this loader will serve."""
        return len(self.indices)

    def _as_batch(self, sample_idx):
        """Load the clip at ``sample_idx`` and wrap it as a one-item batch dict."""
        clip_path, _target, _one_hot = self.dataset[sample_idx]
        # Decode through the wrapper so the clip gets the wrapper's own loading path.
        waveform = self.wrapper.load_audio_into_tensor(
            clip_path,
            self.wrapper.args.duration,
            resample=True,
        )
        # Flatten to a single row: (1, num_samples).
        waveform = waveform.reshape(1, -1)
        return {'audio': waveform.cuda() if torch.cuda.is_available() else waveform}

    def __iter__(self):
        """Lazily yield one batch dict per configured index."""
        yield from map(self._as_batch, self.indices)

# Serve the first `fit_size` dataset items to the PCA fitting step.
# NOTE(review): these indices (0..fit_size-1) also fall inside the range the
# evaluation loops iterate over (0..test_size-1), so the PCA is fitted on clips
# that are later scored - confirm this overlap is intended.
fit_loader = FitDataLoader(dataset, [*range(fit_size)], baseline_wrapper)

# Fit spectral components; the call returns a mapping that is iterated below
# as layer name -> variance ratios.
variance_ratios = residual_model.fit_spectral_components(fit_loader, max_samples=fit_size)

print("‚úÖ PCA fitted!")
for fitted_layer, layer_ratios in variance_ratios.items():
    print(f"   {fitted_layer}: top 5 variance ratios = {layer_ratios[:5]}")


Fitting PCA Components
Using 100 samples for PCA fitting...
Collecting head outputs for PCA fitting...
Fitting PCA for layer_1 with 25600 samples...
layer_1: Top 5 PC variance ratios: [0.44309655 0.08553774 0.04176068 0.03511701 0.02265158]
Fitting PCA for layer_3 with 6400 samples...
layer_3: Top 5 PC variance ratios: [0.32341182 0.0953413  0.04937391 0.03558723 0.0263464 ]
‚úÖ PCA fitted!
   layer_1: top 5 variance ratios = [0.44309655 0.08553774 0.04176068 0.03511701 0.02265158]
   layer_3: top 5 variance ratios = [0.32341182 0.0953413  0.04937391 0.03558723 0.0263464 ]


In [7]:
# DEBUG: inspect the dimensions stored on each spectral layer
print("\nüîç Debug: Checking spectral layer dimensions")
for spectral_name, spectral_module in residual_model.audio_base.spectral_layers.items():
    # One report per layer: configured width, component count, PCA buffer
    # shapes, and whether the layer has been fitted.
    print(
        f"{spectral_name}:",
        f"  embed_dim configurato: {spectral_module.embed_dim}",
        f"  n_components: {spectral_module.n_components}",
        f"  pca_mean shape: {spectral_module.pca_mean.shape}",
        f"  pca_components shape: {spectral_module.pca_components.shape}",
        f"  is_fitted: {spectral_module.is_fitted.item()}",
        sep="\n",
    )


üîç Debug: Checking spectral layer dimensions
layer_1:
  embed_dim configurato: 384
  n_components: 96
  pca_mean shape: torch.Size([384])
  pca_components shape: torch.Size([384, 96])
  is_fitted: True
layer_3:
  embed_dim configurato: 768
  n_components: 192
  pca_mean shape: torch.Size([768])
  pca_components shape: torch.Size([768, 192])
  is_fitted: True


In [8]:
print("\n" + "="*50)
print("Testing RESIDUAL CLAP")
print("="*50)

# Get text embeddings (using baseline's text encoder - it's the same).
# Computed once, outside the loop, with autograd off since this is inference only.
with torch.no_grad():
    text_embeddings_res = baseline_wrapper.get_text_embeddings(text_labels)

# Per-clip class probabilities, in the same sample order as the baseline run.
y_preds_residual = []

with torch.no_grad():
    for i in tqdm(range(test_size), desc="ResiDual"):
        # Get audio file path (the label was already collected in the baseline cell)
        audio_path, _, _ = dataset[i]

        # Load the waveform through the baseline wrapper and flatten it to one row
        audio_tensor = baseline_wrapper.load_audio_into_tensor(
            audio_path,
            baseline_wrapper.args.duration,
            resample=True
        )
        audio_tensor = audio_tensor.reshape(1, -1)

        if torch.cuda.is_available():
            audio_tensor = audio_tensor.cuda()

        # Get embedding using ResiDual model
        audio_embedding, _ = residual_model.audio_encoder(audio_tensor)

        # Score with the very same call the baseline loop uses, so both
        # probability arrays go through identical post-processing and the
        # confidence values plotted later are comparable between the two models.
        similarity = baseline_wrapper.compute_similarity(audio_embedding, text_embeddings_res)

        # Get prediction
        y_pred = F.softmax(similarity.detach().cpu(), dim=1).numpy()
        y_preds_residual.append(y_pred)

# Stack the per-clip rows and score top-1 accuracy against the labels
# gathered during the baseline run.
y_preds_residual_array = np.concatenate(y_preds_residual, axis=0)

residual_acc = accuracy_score(
    np.argmax(y_labels_array, axis=1),
    np.argmax(y_preds_residual_array, axis=1)
)

print(f"\n‚úÖ ResiDual Accuracy: {residual_acc:.3f} ({residual_acc*100:.1f}%)")


Testing RESIDUAL CLAP


ResiDual:   0%|          | 0/200 [00:00<?, ?it/s]


‚úÖ ResiDual Accuracy: 0.005 (0.5%)


In [9]:
# ============================================================================
# STEP 8: Compare Results
# ============================================================================
print("\n" + "=" * 50, "RESULTS COMPARISON", "=" * 50, sep="\n")

# Relative change of ResiDual accuracy with respect to the baseline, in percent.
# Reported as 0 when the baseline accuracy is not positive.
if baseline_acc > 0:
    improvement = ((residual_acc - baseline_acc) / baseline_acc) * 100
else:
    improvement = 0

print(f"\nüìä Tested on {test_size} samples from ESC50")
print("\nAccuracy:")
print(f"  Baseline:    {baseline_acc:.3f} ({baseline_acc*100:.1f}%)")
print(f"  ResiDual:    {residual_acc:.3f} ({residual_acc*100:.1f}%)")
print(f"  Improvement: {improvement:+.2f}%")

# Per-sample correctness masks for each model.
true_class_idx = np.argmax(y_labels_array, axis=1)
baseline_correct = true_class_idx == np.argmax(y_preds_baseline_array, axis=1)
residual_correct = true_class_idx == np.argmax(y_preds_residual_array, axis=1)

# Samples whose outcome flipped between the two models.
newly_correct = np.sum(residual_correct & ~baseline_correct)
newly_wrong = np.sum(baseline_correct & ~residual_correct)

print("\nüîç Detailed Analysis:")
print(f"  Samples corrected by ResiDual: {newly_correct}")
print(f"  Samples broken by ResiDual:    {newly_wrong}")
print(f"  Net improvement:               {newly_correct - newly_wrong}")

# One-line verdict chosen from the relative change.
if improvement > 5:
    comparison_verdict = "\nüéØ Excellent! ResiDual significantly improves performance"
elif improvement > 0:
    comparison_verdict = "\n‚úÖ Good! ResiDual shows improvement"
elif improvement > -2:
    comparison_verdict = "\n‚ûñ Marginal difference"
else:
    comparison_verdict = "\n‚ö†Ô∏è  Performance decreased - try adjusting config"
print(comparison_verdict)


RESULTS COMPARISON

üìä Tested on 200 samples from ESC50

Accuracy:
  Baseline:    0.940 (94.0%)
  ResiDual:    0.005 (0.5%)
  Improvement: -99.47%

üîç Detailed Analysis:
  Samples corrected by ResiDual: 0
  Samples broken by ResiDual:    187
  Net improvement:               -187

‚ö†Ô∏è  Performance decreased - try adjusting config


In [None]:
# ============================================================================
# STEP 9: Visualize Results
# ============================================================================
import matplotlib.pyplot as plt

# Three side-by-side panels: accuracy, confidence distribution, error analysis.
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

# Plot 1: Accuracy comparison
ax1 = axes[0]
methods = ['Baseline\nCLAP', 'ResiDual\nCLAP']
accuracies = [baseline_acc, residual_acc]
colors = ['#3498db', '#2ecc71']
bars = ax1.bar(methods, accuracies, color=colors, alpha=0.7, width=0.6)
ax1.set_ylabel('Accuracy', fontsize=12)
ax1.set_title('ESC50 Classification Accuracy', fontsize=14, fontweight='bold')
ax1.set_ylim([0, 1.0])
# Faint reference line at the baseline accuracy.
ax1.axhline(y=baseline_acc, color='gray', linestyle='--', alpha=0.3)

# Annotate each bar with its accuracy as a fraction and a percentage.
for bar, val in zip(bars, accuracies):
    height = bar.get_height()
    ax1.text(bar.get_x() + bar.get_width()/2., height + 0.02,
             f'{val:.3f}\n({val*100:.1f}%)',
             ha='center', va='bottom', fontweight='bold')

# Plot 2: Prediction confidence distribution
# "Confidence" here is the largest class probability of each prediction row.
ax2 = axes[1]
baseline_confidences = np.max(y_preds_baseline_array, axis=1)
residual_confidences = np.max(y_preds_residual_array, axis=1)

ax2.hist(baseline_confidences, alpha=0.5, bins=20, color='#3498db', label='Baseline')
ax2.hist(residual_confidences, alpha=0.5, bins=20, color='#2ecc71', label='ResiDual')
ax2.set_xlabel('Prediction Confidence', fontsize=12)
ax2.set_ylabel('Frequency', fontsize=12)
ax2.set_title('Confidence Distribution', fontsize=14, fontweight='bold')
ax2.legend()
ax2.grid(True, alpha=0.3)

# Plot 3: Error analysis
ax3 = axes[2]
categories = ['Baseline\nCorrect', 'ResiDual\nCorrect', 'Newly\nCorrected']
values = [
    np.sum(baseline_correct),
    np.sum(residual_correct),
    newly_correct
]
colors_bar = ['#3498db', '#2ecc71', '#27ae60']
bars = ax3.bar(categories, values, color=colors_bar, alpha=0.7)
ax3.set_ylabel('Number of Samples', fontsize=12)
ax3.set_title('Error Analysis', fontsize=14, fontweight='bold')
ax3.axhline(y=test_size, color='gray', linestyle='--', alpha=0.3, label=f'Total ({test_size})')
# The reference line carries a label; without a legend it would never be shown.
ax3.legend()

# Annotate each bar with its sample count.
for bar, val in zip(bars, values):
    height = bar.get_height()
    ax3.text(bar.get_x() + bar.get_width()/2., height + 2,
             f'{val}',
             ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.show()

print("\n‚úÖ Test complete!")

In [None]:
# ============================================================================
# OPTIONAL: Test on Full Dataset
# ============================================================================
print("\n" + "=" * 50, "OPTIONAL: Full Dataset Test", "=" * 50, sep="\n")

# Instructions for re-running the comparison on every sample; nothing is
# evaluated in this cell.
print(f"To test on full ESC50 dataset ({len(dataset)} samples),")
print(f"change 'test_size = {test_size}' to 'test_size = len(dataset)'")
print("and re-run from STEP 4.")
print("\nNote: Full dataset will take ~10-15 minutes")