# Test the embedding model

The embedding model code was generated by Claude.
Here I'm testing it to see how it works.

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import numpy as np
from typing import List, Tuple, Optional

from embedderv5 import *

In [None]:
# Pick the compute device: use the GPU when CUDA is available, else the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

In [None]:
# Build a synthetic dataset with the generator imported from embedderv5.
# NOTE(review): C=32 presumably is the per-row feature width (it equals the
# model's input_dim below) and min_R/max_R the row-count range -- confirm
# against generate_sample_data.
sequences, similarity_features = generate_sample_data(
    num_instances=1000,
    min_R=5,
    max_R=20,
    C=32,
    similarity_dim=16,
)

# Print the shape of every generated sequence (one line per instance).
for k, seq in enumerate(sequences):
    print('sequence %d shape:' % k, seq.shape)

# print(len(similarity_features))
# print(similarity_features[0].shape)
# print(similarity_features[1].shape)


In [None]:
# Wrap the generated data in the contrastive dataset, then batch it with the
# triplet-aware collate function.
dataset = ContrastivePairDataset(
    sequences,
    similarity_features,
    similarity_threshold=0.5,  # tune to the scale of the similarity features
    num_negatives=2,  # negatives sampled per anchor
)
train_loader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=True,
    collate_fn=triplet_collate_fn,
)


In [None]:
# Instantiate the embedding network under test.
# Hyperparameters are collected in one place so they are easy to tweak.
model_config = dict(
    input_dim=32,  # same value as C passed to generate_sample_data
    hidden_dim=128,
    embedding_dim=64,
    num_attention_heads=4,
    num_linear_layers=3,
    dropout=0.1,
)
model = PermutationInvariantModel(**model_config)

# Report the total parameter count as a quick sanity check on model size.
print(f"Model has {sum(p.numel() for p in model.parameters())} parameters")


In [None]:
# Run the training loop from embedderv5 on the triplet loader.
train_model(
    model,
    train_loader,
    num_epochs=20,
    learning_rate=1e-3,
    device=device,
)


In [None]:
# Inference smoke test: embed the anchors of one training batch.
model.eval()  # inference mode for the module
with torch.no_grad():  # no autograd bookkeeping needed for inspection
    sample_batch = next(iter(train_loader))

    # The 'anchor' entry unpacks into (sequences, masks, similarity features).
    anchor_seqs, anchor_masks, anchor_sims = sample_batch['anchor']
    anchor_seqs, anchor_masks = anchor_seqs.to(device), anchor_masks.to(device)

    # Embed the anchors and report the output shape and one embedding's norm.
    embeddings = model(anchor_seqs, anchor_masks)
    print(f"Generated embeddings shape: {embeddings.shape}")
    print(f"Sample embedding norm: {torch.norm(embeddings[0]).item():.4f}")


In [None]:
        
# Inference check on one batch: compare anchor-positive cosine similarity
# against anchor-negative cosine similarity.
model.eval()
with torch.no_grad():
    sample_batch = next(iter(train_loader))

    # --- Anchors -----------------------------------------------------------
    # The 'anchor' entry unpacks into (sequences, masks, similarity features).
    anchor_seqs, anchor_masks, anchor_sims = sample_batch['anchor']
    anchor_seqs, anchor_masks = anchor_seqs.to(device), anchor_masks.to(device)
    embeddings = model(anchor_seqs, anchor_masks)
    print(f"Generated embeddings shape: {embeddings.shape}")
    print(f"Sample embedding norm: {torch.norm(embeddings[0]).item():.4f}")

    # --- Positives ---------------------------------------------------------
    pos_seqs, pos_masks, pos_sims = sample_batch['positive']
    pos_seqs = pos_seqs.to(device)
    pos_masks = pos_masks.to(device)
    pos_embeddings = model(pos_seqs, pos_masks)
    print(f"Positive embeddings shape: {pos_embeddings.shape}")

    # --- Negatives ---------------------------------------------------------
    # The 'negatives' entry carries a fourth tensor, neg_batch_indices, which
    # is used below to select the negatives belonging to anchor i.
    neg_seqs, neg_masks, neg_sims, neg_batch_indices = sample_batch['negatives']
    neg_seqs = neg_seqs.to(device)
    neg_masks = neg_masks.to(device)
    neg_batch_indices = neg_batch_indices.to(device)
    neg_embeddings = model(neg_seqs, neg_masks)
    print(f"Negative embeddings shape: {neg_embeddings.shape}")

    # Anchor vs. positive: compared row by row.
    pos_similarities = F.cosine_similarity(embeddings, pos_embeddings, dim=1)
    print(f"Anchor-Positive similarities: {pos_similarities.mean().item():.4f} ± {pos_similarities.std().item():.4f}")

    # Anchor vs. negatives: collect similarities anchor by anchor.
    neg_similarities_all = []
    batch_size = embeddings.shape[0]
    for i in range(batch_size):
        neg_mask = neg_batch_indices == i
        if not neg_mask.any():
            # This anchor has no negatives in the batch.
            continue

        anchor_i = embeddings[i:i+1]  # keep a leading dim of 1
        negatives_i = neg_embeddings[neg_mask]  # rows selected by the mask
        neg_sims_i = F.cosine_similarity(
            anchor_i.expand_as(negatives_i), negatives_i, dim=1
        )
        neg_similarities_all.extend(neg_sims_i.cpu().tolist())

    if not neg_similarities_all:
        print("No negative samples found in this batch")
    else:
        neg_similarities = torch.tensor(neg_similarities_all)
        print(f"Anchor-Negative similarities: {neg_similarities.mean().item():.4f} ± {neg_similarities.std().item():.4f}")

        # The gap should be positive if the model is learning well.
        print(f"Positive vs Negative similarity difference: {pos_similarities.mean().item() - neg_similarities.mean().item():.4f}")


In [None]:
# Example inference: embed anchors, positives and negatives from one batch and
# report anchor-positive and anchor-negative cosine similarities.
#
# Fixes relative to the earlier draft of this cell:
#   * sample_batch['negatives'] unpacks into FOUR items (the last one being
#     neg_batch_indices), matching how it is unpacked in the cell above.
#   * `negs_seqs` typo (NameError) corrected to `neg_seqs`.
#   * The negative-embedding print no longer says "Positive".
#   * Each negative is compared against its own anchor (selected through
#     neg_batch_indices) instead of comparing two tensors with different
#     numbers of rows.
model.eval()
with torch.no_grad():
    sample_batch = next(iter(train_loader))
    # Unpack the dictionary structure from triplet data loader
    anchor_seqs, anchor_masks, anchor_sims = sample_batch['anchor']
    anchor_seqs = anchor_seqs.to(device)
    anchor_masks = anchor_masks.to(device)

    # Generate embeddings for anchor samples
    embeddings = model(anchor_seqs, anchor_masks)
    print(f"Generated embeddings shape: {embeddings.shape}")
    print(f"Sample embedding norm: {torch.norm(embeddings[0]).item():.4f}")

    # Embeddings for the positives (compared row by row with the anchors)
    pos_seqs, pos_masks, pos_sims = sample_batch['positive']
    pos_seqs, pos_masks = pos_seqs.to(device), pos_masks.to(device)
    pos_embeddings = model(pos_seqs, pos_masks)
    print(f"Positive embeddings shape: {pos_embeddings.shape}")

    # Check similarity between anchors and positives
    similarities = F.cosine_similarity(embeddings, pos_embeddings, dim=1)
    print(f"Anchor-Positive similarities: {similarities.mean().item():.4f} ± {similarities.std().item():.4f}")

    # Embeddings for the negatives
    neg_seqs, neg_masks, neg_sims, neg_batch_indices = sample_batch['negatives']
    neg_seqs, neg_masks = neg_seqs.to(device), neg_masks.to(device)
    neg_batch_indices = neg_batch_indices.to(device)
    neg_embeddings = model(neg_seqs, neg_masks)
    print(f"Negative embeddings shape: {neg_embeddings.shape}")

    # Check similarity between anchors and negatives
    if neg_embeddings.shape[0] > 0:
        # Gather, for every negative, the embedding of the anchor it belongs to.
        # NOTE(review): assumes neg_batch_indices holds integer anchor positions
        # within this batch (the cell above uses it as `neg_batch_indices == i`);
        # .long() makes it a valid index tensor either way.
        anchors_for_negatives = embeddings[neg_batch_indices.long()]
        similarities = F.cosine_similarity(anchors_for_negatives, neg_embeddings, dim=1)
        print(f"Anchor-Negative similarities: {similarities.mean().item():.4f} ± {similarities.std().item():.4f}")
    else:
        print("No negative samples found in this batch")

In [None]:
# Inspect the container type of the 'positive' entry of the most recently
# sampled batch (sample_batch is set by the inference cells above).
a = sample_batch['positive']
type(a)

In [None]:
# Look at the first element of the 'positive' entry and display its shape.
# NOTE(review): the cells above unpack this entry as (pos_seqs, pos_masks,
# pos_sims), so b is presumably the positive sequences tensor -- confirm.
b = a[0]
b.shape