Extracts embeddings from YAMNet's penultimate layer for training multiple classifiers

In [None]:
import os

# TF_CPP_MIN_LOG_LEVEL controls TensorFlow's native (C++) logging. It has to be in the
# environment BEFORE `import tensorflow`; otherwise the messages emitted while the
# library loads are printed anyway. '2' hides INFO and WARNING output.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

# Also quiet the Python-side TensorFlow logger (only errors are shown).
tf.get_logger().setLevel('ERROR')

In [None]:
# Configuration
# --- reproducibility / audio parameters ---
RANDOM_SEED = 42      # shared seed for NumPy, TensorFlow and the sklearn calls below
TARGET_SR = 16000     # sample rate (Hz) used as the default `sr` of the extraction helper
BATCH_SIZE = 32       # NOTE(review): not referenced by any cell visible in this notebook

# --- model ---
YAMNET_MODEL_HANDLE = 'https://tfhub.dev/google/yamnet/1'

# --- input / output locations ---
PROCESSED_ROOT = '../data/processed'
FEATURES_DIR = '../data/approach2/features'
METADATA_PATH = os.path.join(PROCESSED_ROOT, 'processed_frames_metadata.csv')

# Set random seeds (NumPy first, then TensorFlow)
for _seed_fn in (np.random.seed, tf.random.set_seed):
    _seed_fn(RANDOM_SEED)


In [None]:
# 1. Load YAMNet Model
# Fetch the model referenced by YAMNET_MODEL_HANDLE; on failure, report the error and
# re-raise so the notebook stops here instead of failing later with a missing model.
print("\nLoading YAMNet model from TensorFlow Hub...")
try:
    yamnet_model = hub.load(YAMNET_MODEL_HANDLE)
except Exception as load_error:
    print(f"Error loading YAMNet: {load_error}")
    raise
else:
    print("YAMNet model loaded successfully")
    print(f"Model handle: {YAMNET_MODEL_HANDLE}")

In [None]:
# 2. Load Metadata and Prepare Data
print("\nLoading processed frames metadata...")
df = pd.read_csv(METADATA_PATH)
print(f"Loaded {len(df)} frames")

# Label encoding: class ids follow the alphabetical order of the category names,
# so the mapping is deterministic for a given set of categories.
categories = sorted(df['category'].unique())
id_to_category = dict(enumerate(categories))
category_to_id = {name: class_id for class_id, name in id_to_category.items()}

print(f"\nLabel encoding:")
for class_id, name in id_to_category.items():
    print(f"  {class_id}: {name}")

# Attach the numeric class id to every frame row
df['label'] = df['category'].map(category_to_id)

In [None]:
# 3. Create Train/Val/Test Splits (Stratified, grouped by source recording)

print("\n[3] Creating stratified train/val/test splits...")


def _stratified_70_15_15(table):
    """Split `table` into 70% / 15% / 15% parts, stratified on its 'label' column."""
    head, remainder = train_test_split(
        table,
        test_size=0.3,
        stratify=table['label'],
        random_state=RANDOM_SEED
    )
    middle, tail = train_test_split(
        remainder,
        test_size=0.5,
        stratify=remainder['label'],
        random_state=RANDOM_SEED
    )
    return head, middle, tail


# Frames cut from the same recording, and augmented copies of those frames, are
# near-duplicates. Splitting individual frames at random scatters them over
# train/val/test and inflates validation/test scores (data leakage). The split is
# therefore drawn over recordings, and every frame follows its recording.
# NOTE(review): assumes (category, original_file) identifies one source recording and
# that augmented frames keep the original_file of their source - confirm against the
# preprocessing step.
recording_id = df.groupby(['category', 'original_file'], sort=False).ngroup()
recordings = pd.DataFrame(
    {'recording_id': recording_id, 'label': df['label']}
).drop_duplicates('recording_id')

try:
    train_rec, val_rec, test_rec = _stratified_70_15_15(recordings)
except ValueError as split_error:
    # Too few recordings in some class for a stratified recording-level split.
    # Fall back to the frame-level split, but make the leakage risk visible.
    print(f"WARNING: recording-level split not possible ({split_error}); "
          f"falling back to a frame-level split - related frames may leak across splits")
    train_df, val_df, test_df = _stratified_70_15_15(df)
else:
    # Select the frames of each recording subset; shuffle so rows are not grouped
    # by recording (train_test_split would also have returned shuffled rows).
    train_df, val_df, test_df = (
        df[recording_id.isin(part['recording_id'])].sample(frac=1, random_state=RANDOM_SEED)
        for part in (train_rec, val_rec, test_rec)
    )

# Percentages are measured on frames; with a recording-level split they are only
# approximately 70/15/15 because recordings contribute different numbers of frames.
print(f"\nDataset splits:")
print(f"  Training:   {len(train_df):5d} frames ({len(train_df)/len(df)*100:.1f}%)")
print(f"  Validation: {len(val_df):5d} frames ({len(val_df)/len(df)*100:.1f}%)")
print(f"  Test:       {len(test_df):5d} frames ({len(test_df)/len(df)*100:.1f}%)")

# Check class distribution in each split
print("\nClass distribution per split:")
print(f"{'Category':<20s} {'Train':>8s} {'Val':>8s} {'Test':>8s}")
print("-" * 54)
for cat in categories:
    train_count = int((train_df['category'] == cat).sum())
    val_count = int((val_df['category'] == cat).sum())
    test_count = int((test_df['category'] == cat).sum())
    print(f"{cat:<20s} {train_count:>8d} {val_count:>8d} {test_count:>8d}")


In [None]:
# 4. Extract YAMNet Embeddings

print("\nExtracting YAMNet embeddings...")

def extract_yamnet_embeddings(audio_data, sr=TARGET_SR):
    """
    Extract embeddings from YAMNet's penultimate layer.
    
    Args:
        audio_data: numpy array of mono audio samples. Arrays with extra
            length-1 axes (e.g. shape (1, N) or (N, 1)) are flattened to 1-D.
        sr: sample rate of `audio_data`; must equal TARGET_SR (16000) because
            no resampling is performed here.
    
    Returns:
        embeddings: (num_frames, 1024) array
        scores: (num_frames, 521) array (original predictions)
    
    Raises:
        ValueError: if `sr` is not TARGET_SR, or if the audio is not
            one-dimensional after dropping length-1 axes.
    """
    # The model is fed the samples as-is, so a wrong sample rate would silently
    # produce meaningless embeddings; fail loudly instead.
    if sr != TARGET_SR:
        raise ValueError(f"YAMNet expects {TARGET_SR} Hz audio, got sr={sr}")
    
    # Ensure float32 and a 1-D (mono) waveform
    samples = np.squeeze(np.asarray(audio_data, dtype=np.float32))
    if samples.ndim != 1:
        raise ValueError(
            f"Expected mono audio (1-D waveform), got shape {np.shape(audio_data)}"
        )
    waveform = tf.convert_to_tensor(samples, dtype=tf.float32)
    
    # The model returns (scores, embeddings, spectrogram); the spectrogram is not needed
    scores, embeddings, _ = yamnet_model(waveform)
    
    return embeddings.numpy(), scores.numpy()

def process_split(split_df, split_name):
    """
    Process a data split and extract one YAMNet embedding per frame.
    
    Each row's 'frame_path' is loaded with np.load and passed through
    extract_yamnet_embeddings. Rows that fail for any reason are reported and
    skipped, so the outputs may contain fewer entries than `split_df`.
    
    Args:
        split_df: DataFrame with the columns 'frame_path', 'category', 'label',
            'original_file', 'frame_idx', 'aug_idx' and 'is_augmented'.
        split_name: human-readable split name used in progress/log output.
    
    Returns:
        embeddings_array: array with one embedding per successfully processed frame
        labels_array: array of labels, aligned with embeddings_array
        metadata_list: list of dicts (one per embedding, same order) holding the
            metadata columns listed above
    """
    # Columns copied from the input row into each metadata record (order = CSV column order)
    metadata_columns = ('category', 'label', 'original_file', 'frame_idx',
                        'aug_idx', 'is_augmented', 'frame_path')
    
    embeddings_list = []
    labels_list = []
    metadata_list = []
    skipped = 0
    
    print(f"\nProcessing {split_name} set ({len(split_df)} frames)...")
    
    for _, row in tqdm(split_df.iterrows(), total=len(split_df), desc=split_name):
        try:
            # Load audio frame
            frame = np.load(row['frame_path'])
            
            # Extract embeddings (the class scores are not used here)
            embeddings, _ = extract_yamnet_embeddings(frame)
            
            # Collapse to a single vector per frame: average if the model returned
            # several embeddings for this frame, otherwise take the only one.
            if embeddings.shape[0] > 1:
                embedding = np.mean(embeddings, axis=0)
            else:
                embedding = embeddings[0]
            
            label = row['label']
            record = {column: row[column] for column in metadata_columns}
        except Exception as e:
            # Best-effort: one unreadable frame must not abort the whole split
            skipped += 1
            print(f"\n⚠ Error processing {row['frame_path']}: {e}")
            continue
        
        # Append only after everything for this row succeeded, so the three
        # lists can never get out of step with each other.
        embeddings_list.append(embedding)
        labels_list.append(label)
        metadata_list.append(record)
    
    embeddings_array = np.array(embeddings_list)
    labels_array = np.array(labels_list)
    
    print(f"Extracted {len(embeddings_array)} embeddings")
    print(f"  Embedding shape: {embeddings_array.shape}")
    print(f"  Labels shape: {labels_array.shape}")
    if skipped:
        print(f"  Skipped {skipped} of {len(split_df)} frames due to errors")
    
    return embeddings_array, labels_array, metadata_list

# Process each split
train_embeddings, train_labels, train_metadata = process_split(train_df, "Training")
val_embeddings, val_labels, val_metadata = process_split(val_df, "Validation")
test_embeddings, test_labels, test_metadata = process_split(test_df, "Test")


In [None]:
# 5. Save Features

print("\nSaving extracted features...")

os.makedirs(FEATURES_DIR, exist_ok=True)

# Embeddings and labels: one .npy file per array, written in this order
array_files = (
    ('train_embeddings.npy', train_embeddings),
    ('train_labels.npy', train_labels),
    ('val_embeddings.npy', val_embeddings),
    ('val_labels.npy', val_labels),
    ('test_embeddings.npy', test_embeddings),
    ('test_labels.npy', test_labels),
)
for array_filename, array_to_save in array_files:
    np.save(os.path.join(FEATURES_DIR, array_filename), array_to_save)

# Per-embedding metadata: one CSV per split, without the DataFrame index
train_meta_df, val_meta_df, test_meta_df = (
    pd.DataFrame(records) for records in (train_metadata, val_metadata, test_metadata)
)
metadata_files = (
    ('train_metadata.csv', train_meta_df),
    ('val_metadata.csv', val_meta_df),
    ('test_metadata.csv', test_meta_df),
)
for metadata_filename, metadata_frame in metadata_files:
    metadata_frame.to_csv(os.path.join(FEATURES_DIR, metadata_filename), index=False)

# Label mapping. The dict is stored as a pickled object array, so reading it back
# requires np.load(..., allow_pickle=True).
label_mapping = dict(
    category_to_id=category_to_id,
    id_to_category=id_to_category,
    categories=categories,
)
np.save(os.path.join(FEATURES_DIR, 'label_mapping.npy'), label_mapping, allow_pickle=True)

print(f"Features saved to {FEATURES_DIR}")
for array_filename, array_to_save in array_files[::2]:
    # every second entry of array_files is an embeddings array
    print(f"  - {array_filename}: {array_to_save.shape}")


In [None]:
# 6. Feature Analysis & Visualization
print("\nAnalyzing extracted features...")

# Calculate statistics (over all values of the training embedding matrix)
print("\nEmbedding statistics:")
print(f"  Training set:")
print(f"    Mean: {np.mean(train_embeddings):.4f}")
print(f"    Std:  {np.std(train_embeddings):.4f}")
print(f"    Min:  {np.min(train_embeddings):.4f}")
print(f"    Max:  {np.max(train_embeddings):.4f}")

# Dimensionality reduction for visualization (t-SNE)
print("\nGenerating t-SNE visualization...")
from sklearn.manifold import TSNE

# Use a random subset (at most 2000 points, drawn without replacement) for faster computation
sample_size = min(2000, len(train_embeddings))
sample_indices = np.random.choice(len(train_embeddings), sample_size, replace=False)

X_sample = train_embeddings[sample_indices]
y_sample = train_labels[sample_indices]

print(f"Computing t-SNE on {sample_size} samples...")
# scikit-learn requires perplexity < n_samples, so clamp it for very small sets
# (stays at 30 whenever more than 30 samples are available).
tsne_perplexity = min(30, max(1, sample_size - 1))
# The iteration count is left at the library default (1000). The keyword for it was
# renamed from `n_iter` to `max_iter` in scikit-learn 1.5, so passing either name
# explicitly breaks on some versions.
tsne = TSNE(n_components=2, random_state=RANDOM_SEED, perplexity=tsne_perplexity)
X_tsne = tsne.fit_transform(X_sample)

# Plot t-SNE, one scatter series per category so each gets its own legend entry
plt.figure(figsize=(12, 8))
colors = plt.cm.Set3(range(len(categories)))

for idx, cat in enumerate(categories):
    mask = y_sample == idx
    plt.scatter(X_tsne[mask, 0], X_tsne[mask, 1], 
                c=[colors[idx]], label=cat, alpha=0.6, s=20)

plt.title('t-SNE Visualization of YAMNet Embeddings', fontsize=16, fontweight='bold')
plt.xlabel('t-SNE Component 1', fontsize=12)
plt.ylabel('t-SNE Component 2', fontsize=12)
# Legend is placed outside the axes, to the right of the plot
plt.legend(title='Category', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig(os.path.join(FEATURES_DIR, 'tsne_embeddings.png'), dpi=150)
plt.show()

print(f"t-SNE plot saved to {FEATURES_DIR}/tsne_embeddings.png")


In [None]:
# 7. Distribution Plots
# Draws a 2x2 overview of the training embeddings and saves it as feature_analysis.png.
print("\nGenerating feature distribution plots...")

fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Top-left: distribution of the L2 norm of each training embedding
axes[0, 0].hist(np.linalg.norm(train_embeddings, axis=1), bins=50, alpha=0.7, color='steelblue')
axes[0, 0].set_title('Distribution of Embedding Magnitudes', fontweight='bold')
axes[0, 0].set_xlabel('L2 Norm')
axes[0, 0].set_ylabel('Frequency')
axes[0, 0].grid(True, alpha=0.3)

# Top-right: first embedding dimension per class (at most 500 values per class)
first_dim_data = []
for idx, cat in enumerate(categories):
    # Select this class's values once and slice, instead of re-applying the
    # boolean mask to the whole matrix for every sampled element.
    class_values = train_embeddings[train_labels == idx, 0][:500]
    first_dim_data.extend((value, cat) for value in class_values)

first_dim_df = pd.DataFrame(first_dim_data, columns=['value', 'category'])
sns.violinplot(data=first_dim_df, x='category', y='value', ax=axes[0, 1])
axes[0, 1].set_title('First Embedding Dimension per Class', fontweight='bold')
axes[0, 1].tick_params(axis='x', rotation=45)
axes[0, 1].grid(axis='y', alpha=0.3)

# Bottom-left: heatmap with one row per class holding that class's mean embedding
mean_embeddings_array = np.array([
    np.mean(train_embeddings[train_labels == idx], axis=0)
    for idx in range(len(categories))
])
im = axes[1, 0].imshow(mean_embeddings_array, aspect='auto', cmap='viridis')
axes[1, 0].set_yticks(range(len(categories)))
axes[1, 0].set_yticklabels(categories)
axes[1, 0].set_xlabel('Embedding Dimension')
axes[1, 0].set_ylabel('Category')
axes[1, 0].set_title('Mean Embeddings Heatmap', fontweight='bold')
plt.colorbar(im, ax=axes[1, 0])

# Bottom-right: number of training embeddings per class
class_counts = [np.sum(train_labels == i) for i in range(len(categories))]
axes[1, 1].bar(categories, class_counts, color='coral')
axes[1, 1].set_title('Training Set Class Balance', fontweight='bold')
axes[1, 1].set_xlabel('Category')
axes[1, 1].set_ylabel('Count')
axes[1, 1].tick_params(axis='x', rotation=45)
axes[1, 1].grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.savefig(os.path.join(FEATURES_DIR, 'feature_analysis.png'), dpi=150)
plt.show()

print(f"Analysis plots saved to {FEATURES_DIR}/feature_analysis.png")

In [None]:
# 8. Summary
# Recap of what was produced: sample count and array shape per split, the output
# directory, and the class-id -> category table.
print(f"\nSuccessfully extracted YAMNet embeddings")

# Labels are pre-padded so the counts line up in one column
split_summary = (
    ("Training:   ", train_embeddings),
    ("Validation: ", val_embeddings),
    ("Test:       ", test_embeddings),
)
for split_label, split_embeddings in split_summary:
    sample_count = split_embeddings.shape[0]
    print(f"  - {split_label}{sample_count} samples, shape {split_embeddings.shape}")

print(f"\nAll features saved to: {FEATURES_DIR}")
print(f"\nLabel mapping: {len(categories)} classes")
for class_id, class_name in enumerate(categories):
    print(f"    {class_id}: {class_name}")