In [1]:
import torch
import torch.nn.functional as F
from data_utils import GuidedGridMLMDataset, GuidedGridMLM_collate_fn
from torch.utils.data import DataLoader
from GridMLM_tokenizers import GuidedGridMLMTokenizer
from models import GuidedMLMH
from tqdm import tqdm
import numpy as np
from generate_utils import load_model

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Directory holding the training split (machine-specific absolute path).
train_dir = '/media/maindisk/data/hooktheory_hr/hooktheory_CA_train'

# Tokenizer configured with fixed_length=256; the dataset below is built on top of it.
tokenizer = GuidedGridMLMTokenizer(fixed_length=256)
train_dataset = GuidedGridMLMDataset(
    train_dir,
    tokenizer,
    512,  # NOTE(review): meaning of this positional argument is not visible here — confirm
    frontloading=True,
)

Loading data file.


In [3]:
# Run configuration forwarded to load_model in the next cell.
subfolder = 'unf_CA'
curriculum_type = 'random'
ablation = 'all'

# Preferred GPU for this analysis. Fall back gracefully so the notebook still
# runs on machines that have fewer GPUs (or none) instead of failing at load time.
# NOTE(review): assumes load_model accepts any valid torch device string,
# including 'cpu' — confirm against generate_utils.
preferred_gpu_index = 2
if not torch.cuda.is_available():
    device_name = 'cpu'
elif torch.cuda.device_count() > preferred_gpu_index:
    device_name = f'cuda:{preferred_gpu_index}'
else:
    device_name = 'cuda:0'

In [4]:
# Build the model from the configuration defined above; load_model is imported
# from generate_utils and receives the tokenizer created earlier.
model = load_model(
    curriculum_type,
    subfolder,
    ablation,
    device_name,
    tokenizer,
)

In [5]:
# Run every training example through the VAE and collect the guidance vector
# after it has been mapped by model.guidance_to_dmodel.
all_z_list = []

# Inference only: disable autograd so no graph is built for each forward pass.
# This answers the old "do we need to detach?" TODO — .tolist() already copies
# the values into plain Python lists, and no_grad avoids the wasted bookkeeping.
with torch.no_grad():
    for d in tqdm(train_dataset):
        # Wrapping the ids in a list adds a leading batch axis of size 1.
        input_ids = torch.LongTensor([d['input_ids']]).to(model.device)
        z, mu, logvar, recon_seq, z_proj = model.vae(input_ids)
        z_dmodel = model.guidance_to_dmodel(z)
        all_z_list.append(z_dmodel.tolist())

100%|██████████| 9105/9105 [00:50<00:00, 182.08it/s]


In [6]:
# Stack the collected per-example vectors into a single float32 tensor and
# report its shape as a sanity check.
all_z = torch.tensor(all_z_list, dtype=torch.float32)
print(all_z.shape)

torch.Size([9105, 1, 512])


In [None]:
def analyze_film_modulation(film_layer, all_z, threshold=0.1):
    """Print summary statistics of a FiLM layer's γ/β outputs for a set of guidance vectors.

    The layer's ``film_gamma`` and ``film_beta`` sub-modules are applied to
    ``all_z`` without tracking gradients. A sample is counted as "near-identity"
    when every β component is within ``threshold`` of 0 and, in addition, every
    γ component is within ``threshold`` of 1 *or* every γ component is within
    ``threshold`` of 0.
    NOTE(review): which of the two γ cases really leaves activations untouched
    depends on how the layer applies γ, which is not visible here — confirm.

    The criterion is evaluated element-wise over all non-sample axes, so the
    function accepts ``all_z`` shaped ``(N, D)`` as well as ``(N, 1, D)``; for
    the latter the result is the same as the previous implementation produced.

    Args:
        film_layer: Object exposing callable ``film_gamma`` and ``film_beta``.
        all_z: Tensor whose first axis indexes samples; must live on the same
            device as the layer's parameters.
        threshold: Maximum absolute deviation tolerated per component.

    Returns:
        Tuple ``(gamma_vals, beta_vals)`` exactly as returned by the two
        sub-modules (original shapes preserved).
    """
    with torch.no_grad():
        gamma_vals = film_layer.film_gamma(all_z)
        beta_vals = film_layer.film_beta(all_z)

        num_samples = all_z.size(0)

        # Collapse every axis after the sample axis so the checks below do not
        # depend on whether the input carries a singleton middle axis.
        gamma_flat = gamma_vals.flatten(start_dim=1)
        beta_flat = beta_vals.flatten(start_dim=1)

        # Per-component deviations
        gamma_diff_1 = (gamma_flat - 1.0).abs()  # Distance from identity
        gamma_diff_0 = gamma_flat.abs()  # Distance from cancelling
        beta_small = beta_flat.abs() < threshold

        near_identity = ((gamma_diff_1 < threshold) & beta_small).all(dim=1) | \
            ((gamma_diff_0 < threshold) & beta_small).all(dim=1)
        print('near_identity: ', near_identity.shape)
        percent_near_identity = 100 * near_identity.sum().item() / num_samples

        print(f"Layer: {film_layer.__class__.__name__}")
        print(f"γ values: min {gamma_vals.min():.3f}, max {gamma_vals.max():.3f}, mean {gamma_vals.mean():.3f}")
        print(f"β values: min {beta_vals.min():.3f}, max {beta_vals.max():.3f}, mean {beta_vals.mean():.3f}")
        print(f"Proportion near-identity FiLM: {percent_near_identity:.2f}%")

        return gamma_vals, beta_vals


In [12]:
# Transfer the guidance vectors to the model's device once, rather than
# repeating the same copy for every encoder layer.
all_z_on_device = all_z.to(model.device)

# Report FiLM modulation statistics for each encoder layer in turn.
for i, layer in enumerate(model.encoder.layers):
    print(f"\n=== Layer {i} ===")
    analyze_film_modulation(layer, all_z_on_device)



=== Layer 0 ===
gamma_vals:  torch.Size([9105, 1, 512])
gamma_diff:  torch.Size([9105, 512])
near_identity:  torch.Size([9105])
Layer: FiLMTransformerEncoderLayer
γ values: min -2.020, max 2.264, mean 0.030
β values: min -1.213, max 1.250, mean -0.002
Proportion near-identity FiLM: 0.00%

=== Layer 1 ===
gamma_vals:  torch.Size([9105, 1, 512])
gamma_diff:  torch.Size([9105, 512])
near_identity:  torch.Size([9105])
Layer: FiLMTransformerEncoderLayer
γ values: min -2.299, max 1.968, mean -0.006
β values: min -1.186, max 1.350, mean 0.004
Proportion near-identity FiLM: 0.00%

=== Layer 2 ===
gamma_vals:  torch.Size([9105, 1, 512])
gamma_diff:  torch.Size([9105, 512])
near_identity:  torch.Size([9105])
Layer: FiLMTransformerEncoderLayer
γ values: min -2.079, max 2.161, mean -0.025
β values: min -1.413, max 1.268, mean 0.003
Proportion near-identity FiLM: 0.00%

=== Layer 3 ===
gamma_vals:  torch.Size([9105, 1, 512])
gamma_diff:  torch.Size([9105, 512])
near_identity:  torch.Size([9105])
L