# Test the embedding model

The embedding model code was generated by Claude.
Here I'm testing it to see how it works.

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import numpy as np
from typing import List, Tuple, Optional

from embedderv3 import *

In [None]:
# Example usage and data generation
def generate_sample_data(
    num_instances: int = 1000,
    min_R: int = 5,
    max_R: int = 20,
    C: int = 32,
    similarity_dim: int = 16,
    seed: Optional[int] = None,
) -> Tuple[List[np.ndarray], np.ndarray]:
    """
    Generate random variable-length sequences and similarity features for testing.

    Args:
        num_instances: Number of (sequence, similarity feature) pairs to create.
        min_R: Smallest number of rows a sequence may have (inclusive).
        max_R: Largest number of rows a sequence may have (inclusive).
        C: Number of columns in every sequence.
        similarity_dim: Length of each similarity feature vector.
        seed: Optional seed for a private random generator. When None (the
            default) the global NumPy random state is used.

    Returns:
        A tuple ``(sequences, similarity_features)``. ``sequences`` is a list
        of ``num_instances`` float32 arrays of shape ``(R, C)`` with ``R``
        drawn per instance from ``[min_R, max_R]``. ``similarity_features``
        is a float32 array of shape ``(num_instances, similarity_dim)``.
        Both are drawn independently from a standard normal distribution.
    """
    # A seeded RandomState exposes the same randint/randn API as the module-level
    # functions, so the rest of the body is identical in both cases.
    rng = np.random if seed is None else np.random.RandomState(seed)

    sequences = []
    similarity_features = []

    for _ in range(num_instances):
        # randint's upper bound is exclusive, hence the +1 to include max_R.
        R = rng.randint(min_R, max_R + 1)
        sequences.append(rng.randn(R, C).astype(np.float32))

        # Generate similarity features
        similarity_features.append(rng.randn(similarity_dim).astype(np.float32))

    if not similarity_features:
        # np.array([]) would be float64 with shape (0,); return an empty array
        # with the same dtype and rank as the non-empty case instead.
        return sequences, np.empty((0, similarity_dim), dtype=np.float32)

    return sequences, np.array(similarity_features)

In [None]:
# Prefer the GPU when CUDA is available; otherwise run on the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

In [None]:
# Build the random test data set
sequences, similarity_features = generate_sample_data(
    num_instances=1000,
    min_R=5,
    max_R=20,
    C=32,
    similarity_dim=16,
)

# For each collection, show its size and the shapes of its first two entries
print(len(sequences), sequences[0].shape, sequences[1].shape, sep="\n")
print(
    len(similarity_features),
    similarity_features[0].shape,
    similarity_features[1].shape,
    sep="\n",
)


In [None]:
# Wrap the generated data in a dataset and batch it with the custom collate function
dataset = VariableLengthDataset(sequences, similarity_features)
train_loader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=True,
    collate_fn=collate_fn,
)


In [None]:
# Build the embedding model; input_dim uses the same value as C in the sample data
model = PermutationInvariantModel(
    input_dim=32, hidden_dim=128, embedding_dim=64,
    num_attention_heads=4, num_linear_layers=3, dropout=0.1,
)

# Report the total number of elements across all model parameters
print(f"Model has {sum(map(torch.numel, model.parameters()))} parameters")


In [None]:
# Run training on the selected device
train_model(
    model,
    train_loader,
    num_epochs=50,
    learning_rate=1e-3,
    device=device,
)


In [None]:
# Example inference
# Make sure the model is on the same device as the inputs moved below; this is
# a no-op if the model is already there.
model.to(device)
model.eval()
with torch.no_grad():
    # Use batch-specific names so the notebook-level `sequences` and
    # `similarity_features` (the raw data the dataset was built from) are not
    # overwritten by batch tensors; otherwise re-running the dataset cell after
    # this one would build a dataset from a single batch.
    batch_sequences, batch_masks, _batch_similarity = next(iter(train_loader))
    batch_sequences = batch_sequences.to(device)
    batch_masks = batch_masks.to(device)

    # The similarity features are not needed to compute embeddings.
    embeddings = model(batch_sequences, batch_masks)
    print(f"Generated embeddings shape: {embeddings.shape}")
    print(f"Sample embedding norm: {torch.norm(embeddings[0]).item():.4f}")

In [None]:
# Display the first embedding from the batch computed in the inference cell
embeddings[0]