AUDIO EMBEDDINGS (VGGish)

Audio No preprocessing - Save to NPZ

In [None]:
import os
import torch
import numpy as np
from torchvggish import vggish, vggish_input
from tqdm import tqdm  # Optional: for progress bar

# Initialize VGGish model
audio_model = vggish()
audio_model.eval()  # Set to evaluation mode


def generateEmbeddingsVGGish(path):
    """Embed one WAV file with VGGish and return the result as a flat 1-D array.

    Args:
        path: Path of the WAV file, passed to ``vggish_input.wavfile_to_examples``.

    Returns:
        A 1-D NumPy array holding every value the model produced for the file,
        flattened with ``reshape(-1)``. Its length presumably grows with the
        clip duration (one embedding per ~0.96 s example) - confirm against the
        torchvggish version in use.
    """
    example = vggish_input.wavfile_to_examples(path)
    # Inference only: no_grad avoids building an autograd graph for every file.
    with torch.no_grad():
        embeddings = audio_model(example)
    # .cpu() is a no-op on CPU tensors and is required before .numpy() if the
    # model placed its output on a GPU.
    return embeddings.detach().cpu().numpy().reshape(-1)

# Paths
audio_folder = 'data/synthetic_audio'
output_file = 'vggish_embeddings.npz'

all_embeddings = []
all_labels = []
label_to_index = {}

# Only real class directories receive an index. Enumerating the raw listing made
# the indices depend on stray entries (hidden files, checkpoints), and the stored
# labels (idx-1) disagreed with the saved label_map (idx).
class_names = [
    name for name in sorted(os.listdir(audio_folder))
    if os.path.isdir(os.path.join(audio_folder, name))
]

# Loop through class subfolders
for idx, class_name in enumerate(class_names):
    class_path = os.path.join(audio_folder, class_name)
    label_to_index[class_name] = idx

    # Sorted so the sample order in the saved file is reproducible across runs.
    for file_name in tqdm(sorted(os.listdir(class_path)), desc=f"Processing {class_name}"):
        if not file_name.endswith('.wav'):
            continue
        file_path = os.path.join(class_path, file_name)
        try:
            emb = generateEmbeddingsVGGish(file_path)  # shape [128 * num_chunks]
        except Exception as e:
            # Best effort: report the unreadable file and keep going.
            print(f"Error processing {file_path}: {e}")
            continue
        # Append embedding and label together so the two lists stay aligned.
        all_embeddings.append(emb)
        all_labels.append(idx)  # same index that label_to_index records

# Flattened embeddings only stack into a 2-D array when every clip produced the
# same number of values; fail loudly instead of building a ragged object array.
embedding_lengths = {len(emb) for emb in all_embeddings}
if len(embedding_lengths) > 1:
    raise ValueError(
        f"Embeddings have differing lengths {sorted(embedding_lengths)}; "
        "audio clips must share the same duration to be stacked."
    )

# Convert to numpy arrays
all_embeddings = np.array(all_embeddings)
all_labels = np.array(all_labels)

# Save to .npz
# label_map is a dict, so NumPy stores it as a pickled 0-d object array; read it
# back with np.load(..., allow_pickle=True)['label_map'].item().
np.savez(output_file, embeddings=all_embeddings, labels=all_labels, label_map=label_to_index)

print(f"Saved {len(all_embeddings)} embeddings to {output_file}")

Audio Zero-Meaned - save to NPZ and plot tSNE

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE

# === Load original audio embeddings ===
# Only the 'embeddings' and 'labels' arrays are read; the context manager closes
# the underlying .npz file handle once they are in memory.
with np.load("vggish_embeddings.npz") as data:
    embeddings = data['embeddings']
    labels = data['labels']

# === Zero-mean the data ===
# NOTE: with its default settings StandardScaler also rescales every feature to
# unit variance, so the "zero-meaned" embeddings are standardised, not merely
# centred. Use StandardScaler(with_std=False) if centring alone is wanted.
scaler = StandardScaler()
embeddings_zm = scaler.fit_transform(embeddings)

# === Save zero-meaned embeddings ===
np.savez("vggish_embeddings-zm.npz", embeddings=embeddings_zm, labels=labels)
print("Saved zero-meaned embeddings to vggish_embeddings-zm.npz")

# === t-SNE for both ===
# fit_transform refits from scratch on each call, so one estimator can be reused.
tsne = TSNE(n_components=2, perplexity=30, random_state=42)

tsne_original = tsne.fit_transform(embeddings)
tsne_zm = tsne.fit_transform(embeddings_zm)

# === Plot side-by-side ===
cmap = plt.get_cmap('tab10')
unique_labels = np.unique(labels)

fig, axes = plt.subplots(1, 2, figsize=(14, 6))
panels = [
    (axes[0], tsne_original, "Original Audio Embeddings (No Zero-Mean)"),
    (axes[1], tsne_zm, "Zero-Meaned Audio Embeddings"),
]
for ax, points, title in panels:
    for label in unique_labels:
        mask = labels == label
        # tab10 has ten colours; the modulo wraps larger (or negative) label ids.
        ax.scatter(points[mask, 0], points[mask, 1],
                   color=cmap(label % 10), label=f"Class {label}", alpha=0.6, s=40)
    ax.set_title(title)
    ax.set_xlabel("t-SNE 1")
    ax.set_ylabel("t-SNE 2")
    ax.grid(True)
    ax.legend()  # both panels get a legend (the second one previously had none)

fig.tight_layout()
# Save before show(): the final message used to claim the plot was saved although
# the savefig call was commented out.
fig.savefig("tsne_audio_embeddings_comparison.png")
plt.show()
print("t-SNE comparison plot saved to tsne_audio_embeddings_comparison.png")

IMAGE EMBEDDINGS (VGG16)

Image No preprocessing - save to NPZ

In [None]:
import os
import numpy as np
import torch
import torch.nn as nn
from torchvision import models, transforms as imageT
from PIL import Image
from tqdm import tqdm

# ========== CONFIG ==========
image_folder = 'data/synthetic_shapes'
output_file = 'vgg_image_embeddings.npz'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_classes = 6  # Adjust if needed (no longer doubles as a layer index)
image_ft_size = 128  # Your custom output feature size

# ========== LOAD VGG IMAGE MODEL ==========
# No pretrained weights are requested: every parameter comes from the checkpoint
# loaded below. Calling vgg16() without arguments avoids the deprecated
# `pretrained=` keyword while behaving the same on older torchvision releases.
image_model = models.vgg16()

# Swap the final classifier layer for one that emits image_ft_size features.
# The original indexed the classifier with n_classes, which only hit the last
# layer because n_classes happened to equal 6; changing the class count would
# have replaced the wrong layer or raised an IndexError.
final_layer_idx = len(image_model.classifier) - 1
final_in_features = image_model.classifier[final_layer_idx].in_features
image_model.classifier[final_layer_idx] = nn.Linear(final_in_features, image_ft_size)

# NOTE(review): torch.load unpickles the checkpoint - only load files you trust.
image_model.load_state_dict(torch.load("synthetic-shapes-model.pth", map_location=device))
image_model.to(device)
image_model.eval()  # inference mode: disables dropout in the classifier head
print("VGG image model: loaded successfully\n")

# ========== IMAGE EMBEDDING FUNCTION ==========
# The preprocessing pipeline is stateless, so it is built once here instead of
# being reconstructed for every image.
_image_transform = imageT.Compose([
    imageT.Resize((224, 224)),
    imageT.ToTensor(),
    # Per-channel mean / std; these are the values commonly used with
    # ImageNet-trained torchvision models.
    imageT.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])


def generateEmbeddingsVGG(image_path):
    """Run one image through ``image_model`` and return its output as a flat array.

    Args:
        image_path: Path of the image file to embed.

    Returns:
        A 1-D NumPy array with the model output for the image, moved to the CPU
        and flattened with ``reshape(-1)``.
    """
    # The context manager closes the file handle as soon as the pixels are
    # decoded; convert("RGB") forces three channels regardless of the file mode.
    with Image.open(image_path) as raw_image:
        pixels = _image_transform(raw_image.convert("RGB"))

    # Add the batch dimension the model expects and move to the model's device.
    batch = pixels.unsqueeze(0).to(device)

    with torch.no_grad():
        return image_model(batch).cpu().numpy().reshape(-1)

# ========== PROCESS IMAGES ==========
all_embeddings = []
all_labels = []
label_to_index = {}

# Only real class directories receive an index. Enumerating the raw listing made
# the indices depend on stray entries (hidden files, checkpoints), and the stored
# labels (idx-1) disagreed with the saved label_map (idx).
class_names = [
    name for name in sorted(os.listdir(image_folder))
    if os.path.isdir(os.path.join(image_folder, name))
]

for idx, class_name in enumerate(class_names):
    class_path = os.path.join(image_folder, class_name)
    label_to_index[class_name] = idx

    # Sorted so the sample order in the saved file is reproducible across runs.
    for file_name in tqdm(sorted(os.listdir(class_path)), desc=f"Processing {class_name}"):
        if not file_name.endswith('.png'):
            continue
        file_path = os.path.join(class_path, file_name)
        try:
            emb = generateEmbeddingsVGG(file_path)
        except Exception as e:
            # Best effort: report the unreadable file and keep going.
            print(f"Error processing {file_path}: {e}")
            continue
        # Append embedding and label together so the two lists stay aligned.
        all_embeddings.append(emb)
        all_labels.append(idx)  # same index that label_to_index records

# Convert to numpy arrays
all_embeddings = np.array(all_embeddings)
all_labels = np.array(all_labels)

# Save to .npz
# label_map is a dict, so NumPy stores it as a pickled 0-d object array; read it
# back with np.load(..., allow_pickle=True)['label_map'].item().
np.savez(output_file, embeddings=all_embeddings, labels=all_labels, label_map=label_to_index)

print(f"Saved {len(all_embeddings)} image embeddings to {output_file}")

Image Zero-Meaned - save to NPZ and plot tSNE

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE

# === Load original embeddings ===
# Only the 'embeddings' and 'labels' arrays are read; the context manager closes
# the underlying .npz file handle once they are in memory.
with np.load("vgg_image_embeddings.npz") as data:
    embeddings = data['embeddings']
    labels = data['labels']

# === Zero-mean the data ===
# NOTE: with its default settings StandardScaler also rescales every feature to
# unit variance, so the "zero-meaned" embeddings are standardised, not merely
# centred. Use StandardScaler(with_std=False) if centring alone is wanted.
scaler = StandardScaler()
embeddings_zm = scaler.fit_transform(embeddings)

# === Save zero-meaned embeddings ===
np.savez("vgg_image_embeddings-zm.npz", embeddings=embeddings_zm, labels=labels)
print("Saved zero-meaned embeddings to vgg_image_embeddings-zm.npz")

# === t-SNE for both ===
# fit_transform refits from scratch on each call, so one estimator can be reused.
tsne = TSNE(n_components=2, perplexity=30, random_state=42)

tsne_original = tsne.fit_transform(embeddings)
tsne_zm = tsne.fit_transform(embeddings_zm)

# === Plot side-by-side ===
cmap = plt.get_cmap('tab10')
unique_labels = np.unique(labels)

fig, axes = plt.subplots(1, 2, figsize=(14, 6))
panels = [
    (axes[0], tsne_original, "Original Embeddings (No Zero-Mean)"),
    (axes[1], tsne_zm, "Zero-Meaned Embeddings"),
]
for ax, points, title in panels:
    for label in unique_labels:
        mask = labels == label
        # tab10 has ten colours; the modulo wraps larger (or negative) label ids.
        ax.scatter(points[mask, 0], points[mask, 1],
                   color=cmap(label % 10), label=f"Class {label}", alpha=0.6, s=40)
    ax.set_title(title)
    ax.set_xlabel("t-SNE 1")
    ax.set_ylabel("t-SNE 2")
    ax.grid(True)
    ax.legend()  # both panels get a legend (the second one previously had none)

fig.tight_layout()
# Save before show(): the final message used to claim the plot was saved although
# the savefig call was commented out.
fig.savefig("tsne_image_embeddings_comparison.png")
plt.show()
print("t-SNE comparison plot saved to tsne_image_embeddings_comparison.png")

Message consistency within class experiment - save dual labels to NPZ (e.g., frequency bucket 1, frequency bucket 2, etc.)

Dual-label audio: Frequency, No preprocessing

In [None]:
import os
import torch
import numpy as np
from torchvggish import vggish, vggish_input
from tqdm import tqdm

# Initialize VGGish model
audio_model = vggish()
audio_model.eval()  # Set to evaluation mode


def generateEmbeddingsVGGish(path):
    """Embed one WAV file with VGGish and return the result as a flat 1-D array.

    Args:
        path: Path of the WAV file, passed to ``vggish_input.wavfile_to_examples``.

    Returns:
        A 1-D NumPy array holding every value the model produced for the file,
        flattened with ``reshape(-1)``.
    """
    example = vggish_input.wavfile_to_examples(path)
    # Inference only: no_grad avoids building an autograd graph for every file.
    with torch.no_grad():
        embeddings = audio_model(example)
    # .cpu() is a no-op on CPU tensors and is required before .numpy() if the
    # model placed its output on a GPU.
    return embeddings.detach().cpu().numpy().reshape(-1)

# Paths
audio_folder = 'data/output_dataset'
output_file = 'vggish_dual_labels_embeddings.npz'

all_embeddings = []
shape_labels = []
freq_labels = []

shape_label_map = {}
freq_label_map = {'0': 0, '1': 1, '2': 2}  # Frequency classes are already 0,1,2

# Loop through shape classes
# shape_idx is the position in the sorted listing of audio_folder, so any
# non-directory entry that is skipped still consumes an index; the saved
# shape_label_map records the index actually used for each class.
for shape_idx, shape_class in enumerate(sorted(os.listdir(audio_folder))):
    shape_path = os.path.join(audio_folder, shape_class)
    if not os.path.isdir(shape_path):
        continue

    shape_label_map[shape_class] = shape_idx

    for freq_class in sorted(os.listdir(shape_path)):
        freq_path = os.path.join(shape_path, freq_class)
        if not os.path.isdir(freq_path):
            continue

        # Resolve the label before touching any file. Previously an unknown
        # folder name raised KeyError after the embedding and shape label had
        # already been appended, leaving the three arrays misaligned.
        if freq_class not in freq_label_map:
            print(f"Skipping {freq_path}: unknown frequency class '{freq_class}'")
            continue
        freq_label = freq_label_map[freq_class]

        # Sorted so the sample order in the saved file is reproducible.
        for file_name in tqdm(sorted(os.listdir(freq_path)), desc=f"{shape_class}/{freq_class}"):
            if not file_name.endswith('.wav'):
                continue
            file_path = os.path.join(freq_path, file_name)
            try:
                emb = generateEmbeddingsVGGish(file_path)
            except Exception as e:
                # Best effort: report the unreadable file and keep going.
                print(f"Error processing {file_path}: {e}")
                continue
            # All three lists grow together, so index i always describes one file.
            all_embeddings.append(emb)
            shape_labels.append(shape_idx)
            freq_labels.append(freq_label)

# Convert to numpy arrays
all_embeddings = np.array(all_embeddings)
shape_labels = np.array(shape_labels)
freq_labels = np.array(freq_labels)

# Save everything
# The two label maps are dicts, which NumPy stores as pickled 0-d object arrays;
# reading them back needs np.load(..., allow_pickle=True) and .item().
np.savez(output_file,
         embeddings=all_embeddings,
         shape_labels=shape_labels,
         freq_labels=freq_labels,
         shape_label_map=shape_label_map,
         freq_label_map=freq_label_map)

print(f"Saved {len(all_embeddings)} embeddings with dual labels to {output_file}")

Dual-label audio: Frequency, PCA

In [None]:
import numpy as np
from sklearn.decomposition import PCA

# Load original embeddings
# The label maps are stored as 0-d object arrays (hence .item()), which np.load
# only unpickles with allow_pickle=True - use this on files you created yourself.
# The context manager closes the .npz handle once everything is in memory.
with np.load('vggish_dual_labels_embeddings.npz', allow_pickle=True) as data:
    embeddings = data['embeddings']
    shape_labels = data['shape_labels']
    freq_labels = data['freq_labels']
    shape_label_map = data['shape_label_map'].item()
    freq_label_map = data['freq_label_map'].item()

print(f"Original embedding shape: {embeddings.shape}")

# Apply PCA to reduce to 128 dimensions
# random_state pins the result when scikit-learn selects its randomized SVD
# solver, so repeated runs write identical files. PCA raises ValueError if 128
# exceeds min(n_samples, n_features).
pca = PCA(n_components=128, random_state=42)
reduced_embeddings = pca.fit_transform(embeddings)

print(f"Reduced embedding shape: {reduced_embeddings.shape}")

# Save to new .npz file
# Labels and label maps are passed through unchanged; only the embeddings differ.
np.savez('vggish_dual_labels_embeddings-pca.npz',
         embeddings=reduced_embeddings,
         shape_labels=shape_labels,
         freq_labels=freq_labels,
         shape_label_map=shape_label_map,
         freq_label_map=freq_label_map)

print("Saved PCA-reduced embeddings to 'vggish_dual_labels_embeddings-pca.npz'")


Dual-label audio: Amplitude, No preprocessing

In [None]:
import os
import torch
import numpy as np
from torchvggish import vggish, vggish_input
from tqdm import tqdm

# Initialize VGGish model
audio_model = vggish()
audio_model.eval()  # Set to evaluation mode


def generateEmbeddingsVGGish(path):
    """Embed one WAV file with VGGish and return the result as a flat 1-D array.

    Args:
        path: Path of the WAV file, passed to ``vggish_input.wavfile_to_examples``.

    Returns:
        A 1-D NumPy array holding every value the model produced for the file,
        flattened with ``reshape(-1)``.
    """
    example = vggish_input.wavfile_to_examples(path)
    # Inference only: no_grad avoids building an autograd graph for every file.
    with torch.no_grad():
        embeddings = audio_model(example)
    # .cpu() is a no-op on CPU tensors and is required before .numpy() if the
    # model placed its output on a GPU.
    return embeddings.detach().cpu().numpy().reshape(-1)

# Paths
audio_folder = 'data/output_dataset_amplitude'
output_file = 'vggish_dual_labels_embeddings_amp.npz'

all_embeddings = []
shape_labels = []
freq_labels = []

shape_label_map = {}
# Sub-folder names '0', '1', '2' map to integer labels. The variables and saved
# keys keep the "freq_" prefix (although this dataset is the amplitude one)
# because the amplitude PCA cell reads the file back under those key names.
freq_label_map = {'0': 0, '1': 1, '2': 2}

# Loop through shape classes
# shape_idx is the position in the sorted listing of audio_folder, so any
# non-directory entry that is skipped still consumes an index; the saved
# shape_label_map records the index actually used for each class.
for shape_idx, shape_class in enumerate(sorted(os.listdir(audio_folder))):
    shape_path = os.path.join(audio_folder, shape_class)
    if not os.path.isdir(shape_path):
        continue

    shape_label_map[shape_class] = shape_idx

    for freq_class in sorted(os.listdir(shape_path)):
        freq_path = os.path.join(shape_path, freq_class)
        if not os.path.isdir(freq_path):
            continue

        # Resolve the label before touching any file. Previously an unknown
        # folder name raised KeyError after the embedding and shape label had
        # already been appended, leaving the three arrays misaligned.
        if freq_class not in freq_label_map:
            print(f"Skipping {freq_path}: unknown amplitude class '{freq_class}'")
            continue
        freq_label = freq_label_map[freq_class]

        # Sorted so the sample order in the saved file is reproducible.
        for file_name in tqdm(sorted(os.listdir(freq_path)), desc=f"{shape_class}/{freq_class}"):
            if not file_name.endswith('.wav'):
                continue
            file_path = os.path.join(freq_path, file_name)
            try:
                emb = generateEmbeddingsVGGish(file_path)
            except Exception as e:
                # Best effort: report the unreadable file and keep going.
                print(f"Error processing {file_path}: {e}")
                continue
            # All three lists grow together, so index i always describes one file.
            all_embeddings.append(emb)
            shape_labels.append(shape_idx)
            freq_labels.append(freq_label)

# Convert to numpy arrays
all_embeddings = np.array(all_embeddings)
shape_labels = np.array(shape_labels)
freq_labels = np.array(freq_labels)

# Save everything
# The two label maps are dicts, which NumPy stores as pickled 0-d object arrays;
# reading them back needs np.load(..., allow_pickle=True) and .item().
np.savez(output_file,
         embeddings=all_embeddings,
         shape_labels=shape_labels,
         freq_labels=freq_labels,
         shape_label_map=shape_label_map,
         freq_label_map=freq_label_map)

print(f"Saved {len(all_embeddings)} embeddings with dual labels to {output_file}")

Dual-label audio: Amplitude, PCA

In [None]:
import numpy as np
from sklearn.decomposition import PCA

# Load original embeddings
# The label maps are stored as 0-d object arrays (hence .item()), which np.load
# only unpickles with allow_pickle=True - use this on files you created yourself.
# The context manager closes the .npz handle once everything is in memory.
with np.load('vggish_dual_labels_embeddings_amp.npz', allow_pickle=True) as data:
    embeddings = data['embeddings']
    shape_labels = data['shape_labels']
    freq_labels = data['freq_labels']  # key name kept from the frequency variant
    shape_label_map = data['shape_label_map'].item()
    freq_label_map = data['freq_label_map'].item()

print(f"Original embedding shape: {embeddings.shape}")

# Apply PCA to reduce to 128 dimensions
# random_state pins the result when scikit-learn selects its randomized SVD
# solver, so repeated runs write identical files. PCA raises ValueError if 128
# exceeds min(n_samples, n_features).
pca = PCA(n_components=128, random_state=42)
reduced_embeddings = pca.fit_transform(embeddings)

print(f"Reduced embedding shape: {reduced_embeddings.shape}")

# Save to new .npz file
# Labels and label maps are passed through unchanged; only the embeddings differ.
np.savez('vggish_dual_labels_embeddings_amp-pca.npz',
         embeddings=reduced_embeddings,
         shape_labels=shape_labels,
         freq_labels=freq_labels,
         shape_label_map=shape_label_map,
         freq_label_map=freq_label_map)

print("Saved PCA-reduced embeddings to 'vggish_dual_labels_embeddings_amp-pca.npz'")