In [1]:
"""
Script for training SAE from a cache of scgpt activations and loading the custom fidelity function.

"""

import warnings
from pathlib import Path
import sys
sys.path.append(str(Path("./").resolve().parent))

import torch
from torch.utils.data import DataLoader

from sae.dictionary import AutoEncoder
from fidelity import get_loss_recovery_fn
from load_sharded_acts import LazyMultiDirectoryTokenDataset
from trainer import StandardTrainer
from training import train_run
from typing import Optional

warnings.filterwarnings("ignore", message="TypedStorage is deprecated")

def get_device() -> str:
    if torch.cuda.is_available():
        return "cuda"
    else:
        return "cpu"


def train_SAE_on_gfm_embeds(
    # Data paths and sources
    gfm_embd_dir: Path,
    eval_seq_path: Optional[Path] = None,
    # Core model architecture
    expansion_factor: int = 8,
    # Training configuration
    batch_size: int = 32,
    steps: int = 2_000,
    seed: int = 0,
    # Optimization parameters
    lr: float = 1e-3,
    warmup_steps: int = 50,
    resample_steps: int = 0,  # 0 to disable
    # Regularization
    l1_penalty: float = 1e-1,
    l1_annealing_pct: float = 0.05,
    # Evaluation settings
    eval_batch_size: int = 128,
    eval_steps: int = 1_000,
    # Logging and checkpointing
    save_dir: str = "models",
    log_steps: int = 100,
    save_steps: int = 50,
    max_ckpts_to_keep: int = 3,
    # Weights & Biases configuration
    use_wandb: bool = True,
    wandb_entity: str = "yunfei-hu-vanderbilt-university",
    wandb_project: str = "test_logging",
    wandb_name: str = "SAE",
):
    """
    Train a Sparse Autoencoder (SAE) using cached activation data from a language model.

    Args:
        # Data paths and sources
        gfm_embd_dir: Directory containing cached model embeddings
        eval_seq_path: Path to sequences for fidelity evaluation, if None, fidelity evaluation is disabled

        # Core model architecture
        expansion_factor: Factor by which to expand the dictionary size relative to input dimension

        # Training configuration
        batch_size: Number of samples per training batch
        steps: Total number of training steps
        seed: Random seed for reproducibility

        # Optimization parameters
        lr: Learning rate for optimizer
        warmup_steps: Number of warmup steps for learning rate scheduler
        resample_steps: Steps between dictionary resampling (0 to disable)

        # Regularization
        l1_penalty: Coefficient for L1 regularization
        l1_annealing_pct: Percentage of training during which to anneal L1 penalty

        # Evaluation settings
        eval_batch_size: Batch size for evaluation
        eval_steps: Frequency of evaluation steps

        # Logging and checkpointing
        save_dir: Directory to save model checkpoints and outputs
        log_steps: Frequency of logging
        save_steps: Frequency of saving checkpoints

        # Weights & Biases configuration
        use_wandb: Whether to use Weights & Biases logging
        wandb_entity: W&B username or team name
        wandb_project: W&B project name
        wandb_name: W&B run name
    """
    device = get_device()

    def collate_fn(batch):
        return torch.stack(batch).to(device)

    # Initialize dataset and dataloader
    acts_dataset = LazyMultiDirectoryTokenDataset(gfm_embd_dir)

    # Determine layer from dataset metadata
    layer = acts_dataset.datasets[0]["layer"]
    plm_name = acts_dataset.datasets[0]["plm_name"]
    print(f"Using activations from layer {layer} of {plm_name}")

    dataloader = DataLoader(
        acts_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn
    )
    print(f"Loaded dataset with {len(acts_dataset):,} tokens")

    # Configure resampling
    if resample_steps == 0:
        resample_steps = None

    # Setup trainer configuration
    trainer = StandardTrainer(
        activation_dim=acts_dataset.d_model,
        dict_size=acts_dataset.d_model * expansion_factor,
        warmup_steps=warmup_steps,
        resample_steps=resample_steps,
        lr=lr,
        l1_penalty=l1_penalty,
        l1_annealing_pct=l1_annealing_pct,
        seed=seed,
        wandb_name=wandb_name,
        layer=layer,
        plm_name=plm_name,
        device=device,
        steps=min(steps, len(dataloader)),
    )
    print(f"Training with config: {trainer.config}")

    # Initialize fidelity function if evaluation sequences provided
    if eval_seq_path is not None:
        fidelity_fn = get_loss_recovery_fn(
            esm_model_name=plm_name,
            layer_idx=int(layer),
            eval_seq_path=eval_seq_path,
            device=device,
            batch_size=eval_batch_size,
        )
    else:
        fidelity_fn = None

    # Train the SAE
    train_run(
        # Core training components
        data=dataloader,
        trainer=trainer,
        # Evaluation settings
        fidelity_fn=fidelity_fn,
        eval_steps=eval_steps,
        # Logging and checkpointing
        save_dir=save_dir,
        log_steps=log_steps,
        save_steps=save_steps,
        max_ckpts_to_keep=3,
        # Weights & Biases configuration
        use_wandb=use_wandb,
        wandb_entity=wandb_entity,
        wandb_project=wandb_project,
        additional_wandb_args={
            "eval_seq_path": eval_seq_path,
            "eval_steps": eval_steps,
            "batch_size": batch_size,
            "save_dir": save_dir,
        },
    )

    # print(save_dir)

In [4]:
# Loop through all 12 layers and train SAE on each
for layer in range(12):
    gfm_embd_dir = f"/maiziezhou_lab2/yunfei/Projects/interpTFM/activations_cosmx_lung_cancer/activations/layer_{layer}"
    save_dir = f"/maiziezhou_lab2/yunfei/Projects/interpTFM/activations_cosmx_lung_cancer/sae_latents/sae_output_layer{layer+1}"
    
    print(f"=== Training SAE on layer {layer} embeddings ===")
    train_SAE_on_gfm_embeds(
        gfm_embd_dir=gfm_embd_dir,
        save_dir=save_dir,
        wandb_entity="yunfei-hu-vanderbilt-university",
        steps=5000
    )
    print(f"Finished training SAE on layer {layer}, results saved to {save_dir}\n")


=== Training SAE on layer 0 embeddings ===
Loading dataset metadata


100%|██████████| 60/60 [00:00<00:00, 6452.28it/s]

Using activations from layer layer_0 of scgpt
Loaded dataset with 30,588,064 tokens





Training with config: {'dict_class': 'AutoEncoder', 'trainer_class': 'StandardTrainer', 'activation_dim': 512, 'dict_size': 4096, 'lr': 0.001, 'l1_penalty': 0.1, 'l1_annealing_steps': 250, 'steps': 5000, 'warmup_steps': 50, 'resample_steps': None, 'device': 'cuda', 'layer': 'layer_0', 'plm_name': 'scgpt', 'wandb_name': 'SAE', 'submodule_name': None}


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33myunfei-hu[0m ([33myunfei-hu-vanderbilt-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


100%|██████████| 5000/5000 [00:12<00:00, 395.05it/s]

Stopped training because reached max specified steps





0,1
act_mean,▇▇▇▇▆▇▇▇█▇▇▇▅▆▇▃█▇▃▃▂▇▆▃▂▁▇▅▂▂▃▄▁▆▆▇▅▃▂▄
act_std,████▅▇█▇▇█▇▇█▆▇▇▆▇█▆▅▆▇▇▆▅▃▇▆▆▅▆▁▇▇▇▆▆▅▅
frac_variance_explained,▁▅▆▆▆▇▇▇▇▇▇▇▇▇▇▇███ ███ ▇ █ ███
l0,█▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
l0_pct_nonzero,█▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
l1_penalty,▁▄▇█████████████████████████████████████
l2_loss,█▄▄▄▃▃▃▂▃▂▂▂▂▂▂▂▁▂▂▁▂▂▂▁▁▂▁▁▁▁▁▁▂▂▁▂▁▁▁▁
loss,█▅▅▄▄▃▃▃▃▃▃▃▃▃▃▂▂▂▁▁▂▂▂▁▁▁▂▁▁▁▁▁▁▂▂▂▂▁▁▁
lr,▁███████████████████████████████████████
mse_loss,█▃▃▂▂▂▂▂▁▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
act_mean,0.00389
act_std,0.92017
frac_variance_explained,
l0,5.0
l0_pct_nonzero,0.12207
l1_penalty,0.1
l2_loss,3.49993
loss,4.84832
lr,0.001
mse_loss,12.24949


Finished training SAE on layer 0, results saved to /maiziezhou_lab2/yunfei/Projects/interpTFM/activations_cosmx_lung_cancer/sae_latents/sae_output_layer1

=== Training SAE on layer 1 embeddings ===
Loading dataset metadata


100%|██████████| 60/60 [00:00<00:00, 7523.42it/s]

Using activations from layer layer_1 of scgpt
Loaded dataset with 30,588,064 tokens
Training with config: {'dict_class': 'AutoEncoder', 'trainer_class': 'StandardTrainer', 'activation_dim': 512, 'dict_size': 4096, 'lr': 0.001, 'l1_penalty': 0.1, 'l1_annealing_steps': 250, 'steps': 5000, 'warmup_steps': 50, 'resample_steps': None, 'device': 'cuda', 'layer': 'layer_1', 'plm_name': 'scgpt', 'wandb_name': 'SAE', 'submodule_name': None}





100%|██████████| 5000/5000 [00:11<00:00, 429.18it/s]

Stopped training because reached max specified steps





0,1
act_mean,▄▅▆▅▆▇▅▆▇▆█▆▅▄▄▆▄▅▆▃▅█▆▅▆▁▆▇▅▅▇▇▆▁▇▆▅▅▆▇
act_std,▇▇▇▇▆▇▇▇▇▇▇▇▅▆█▅▇█▅▅▅██▅▅▂▇▅▄▅▅▅▅▁▆▇▇▆▆▄
frac_variance_explained,▁▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇███ ███ ▇ █ ████
l0,█▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
l0_pct_nonzero,█▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
l1_penalty,▁▄▇█████████████████████████████████████
l2_loss,█▄▄▄▃▃▂▃▃▃▃▃▂▂▂▁▂▂▁▁▂▂▂▁▁▁▂▁▁▁▁▁▁▂▂▂▂▁▂▁
loss,█▅▄▄▃▃▃▃▃▃▃▃▃▃▃▂▂▂▁▁▂▂▁▁▁▂▁▁▁▁▁▁▂▂▂▂▂▁▂▂
lr,▁███████████████████████████████████████
mse_loss,█▃▂▂▂▂▂▁▂▂▂▂▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
act_mean,0.00483
act_std,0.88966
frac_variance_explained,
l0,5.0
l0_pct_nonzero,0.12207
l1_penalty,0.1
l2_loss,4.00122
loss,5.41861
lr,0.001
mse_loss,16.0098


Finished training SAE on layer 1, results saved to /maiziezhou_lab2/yunfei/Projects/interpTFM/activations_cosmx_lung_cancer/sae_latents/sae_output_layer2

=== Training SAE on layer 2 embeddings ===
Loading dataset metadata


100%|██████████| 60/60 [00:00<00:00, 7559.57it/s]

Using activations from layer layer_2 of scgpt
Loaded dataset with 30,588,064 tokens
Training with config: {'dict_class': 'AutoEncoder', 'trainer_class': 'StandardTrainer', 'activation_dim': 512, 'dict_size': 4096, 'lr': 0.001, 'l1_penalty': 0.1, 'l1_annealing_steps': 250, 'steps': 5000, 'warmup_steps': 50, 'resample_steps': None, 'device': 'cuda', 'layer': 'layer_2', 'plm_name': 'scgpt', 'wandb_name': 'SAE', 'submodule_name': None}





100%|██████████| 5000/5000 [00:11<00:00, 441.42it/s]

Stopped training because reached max specified steps





0,1
act_mean,▂▂▂▄▃▄▂▃▄▃▄▂▂▂▄▅▄▃▆▃▄▂▅▇▆▄▆▅▇▄▇▇▁▃▅▄▅▄▇█
act_std,▇▅▇▆▇▆▆█▇▇▇▇▄▆█▅██▄▄██▅▅▄▇▅▄▅▄▅▅▁▆█▇▆▅▅▅
frac_variance_explained,▁▆▆▆▇▇▇▇▇▇▇▇▇▇▇███ ███ █ ██▇█
l0,█▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
l0_pct_nonzero,█▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
l1_penalty,▁▄▇█████████████████████████████████████
l2_loss,█▄▄▃▃▃▂▃▃▃▃▂▂▂▃▂▂▂▁▁▂▂▂▁▁▂▂▁▁▁▁▂▂▂▂▂▁▁▁▂
loss,█▅▄▄▃▃▃▃▃▃▃▃▃▃▃▃▂▂▁▂▂▂▂▂▁▂▁▁▁▁▁▂▂▂▂▂▂▁▁▂
lr,▁███████████████████████████████████████
mse_loss,█▂▂▂▂▂▂▁▂▂▂▂▁▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
act_mean,0.00465
act_std,0.91629
frac_variance_explained,
l0,6.0
l0_pct_nonzero,0.14648
l1_penalty,0.1
l2_loss,5.34379
loss,6.86204
lr,0.001
mse_loss,28.55607


Finished training SAE on layer 2, results saved to /maiziezhou_lab2/yunfei/Projects/interpTFM/activations_cosmx_lung_cancer/sae_latents/sae_output_layer3

=== Training SAE on layer 3 embeddings ===
Loading dataset metadata


100%|██████████| 60/60 [00:00<00:00, 6871.21it/s]

Using activations from layer layer_3 of scgpt
Loaded dataset with 30,588,064 tokens
Training with config: {'dict_class': 'AutoEncoder', 'trainer_class': 'StandardTrainer', 'activation_dim': 512, 'dict_size': 4096, 'lr': 0.001, 'l1_penalty': 0.1, 'l1_annealing_steps': 250, 'steps': 5000, 'warmup_steps': 50, 'resample_steps': None, 'device': 'cuda', 'layer': 'layer_3', 'plm_name': 'scgpt', 'wandb_name': 'SAE', 'submodule_name': None}





100%|██████████| 5000/5000 [00:11<00:00, 436.38it/s]

Stopped training because reached max specified steps





0,1
act_mean,▂▃▁▂▂▂▂▂▃▂▂▂▃▂▂▁▅▃▂▇▄█▄▂▆▆▄▄▄▆▇▇▇▅▂▃▃▃▆▆
act_std,▇▇▅▇▆█▆▆█▇▇▇▇▄▆▆▅▇█▅▄▅██▅▄▃▇▅▄▅▅▄▁▆▆▇▅▄▄
frac_variance_explained,▁▆▆▇▇▇▇▇▇▇▇▇▇▇▇██▇ ██ █ █ ██▇█
l0,█▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
l0_pct_nonzero,█▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
l1_penalty,▁▄▇█████████████████████████████████████
l2_loss,█▄▃▃▃▃▂▃▃▃▃▂▂▂▃▂▂▂▁▁▂▂▁▁▂▂▁▁▁▁▂▁▂▂▂▂▂▂▁▂
loss,█▅▄▄▃▃▃▃▃▃▃▃▃▃▃▂▂▃▁▁▂▂▃▁▁▂▂▁▁▁▂▁▂▂▂▃▂▂▁▂
lr,▁███████████████████████████████████████
mse_loss,██▅▄▅▄▄▄▄▄▄▃▃▃▄▂▂▁▁▂▂▃▁▁▂▃▁▁▁▁▂▂▂▃▂▃▂▂▁▂

0,1
act_mean,0.00421
act_std,0.92976
frac_variance_explained,
l0,5.0
l0_pct_nonzero,0.12207
l1_penalty,0.1
l2_loss,5.78936
loss,7.35749
lr,0.001
mse_loss,33.51672


Finished training SAE on layer 3, results saved to /maiziezhou_lab2/yunfei/Projects/interpTFM/activations_cosmx_lung_cancer/sae_latents/sae_output_layer4

=== Training SAE on layer 4 embeddings ===
Loading dataset metadata


100%|██████████| 60/60 [00:00<00:00, 7422.23it/s]

Using activations from layer layer_4 of scgpt
Loaded dataset with 30,588,064 tokens
Training with config: {'dict_class': 'AutoEncoder', 'trainer_class': 'StandardTrainer', 'activation_dim': 512, 'dict_size': 4096, 'lr': 0.001, 'l1_penalty': 0.1, 'l1_annealing_steps': 250, 'steps': 5000, 'warmup_steps': 50, 'resample_steps': None, 'device': 'cuda', 'layer': 'layer_4', 'plm_name': 'scgpt', 'wandb_name': 'SAE', 'submodule_name': None}





100%|██████████| 5000/5000 [00:11<00:00, 447.12it/s]

Stopped training because reached max specified steps





0,1
act_mean,▂▄▂▃▂▂▂▃▃▃▃▁▂▃▂▂▆▄▇▄█▄▄▆█▆▄▅▆█▇▆▇▅▄▅▆▅▇▆
act_std,▇█▅▇▆█▆▆▇▇▇▇▇▅▆▅▆█▅▃▅▇█▅▄▃▇▃▅▅▃▅▁▆▇▇▅▃▅▃
frac_variance_explained,▁▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇███ ███ █ █ ████
l0,█▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
l0_pct_nonzero,█▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
l1_penalty,▁▄▇█████████████████████████████████████
l2_loss,█▄▄▃▃▃▃▃▃▃▃▃▃▃▃▂▂▂▁▂▂▂▂▂▂▂▂▁▁▂▂▂▂▂▂▃▂▂▂▂
loss,█▅▄▄▃▃▃▃▃▃▃▃▃▃▃▃▃▁▂▂▂▃▂▂▂▃▁▁▂▁▂▂▂▃▂▃▂▁▁▂
lr,▁███████████████████████████████████████
mse_loss,█▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▁▁▁▁

0,1
act_mean,0.00309
act_std,0.94383
frac_variance_explained,
l0,4.0
l0_pct_nonzero,0.09766
l1_penalty,0.1
l2_loss,5.12539
loss,6.42295
lr,0.001
mse_loss,26.26965


Finished training SAE on layer 4, results saved to /maiziezhou_lab2/yunfei/Projects/interpTFM/activations_cosmx_lung_cancer/sae_latents/sae_output_layer5

=== Training SAE on layer 5 embeddings ===
Loading dataset metadata


100%|██████████| 60/60 [00:00<00:00, 6964.00it/s]

Using activations from layer layer_5 of scgpt
Loaded dataset with 30,588,064 tokens
Training with config: {'dict_class': 'AutoEncoder', 'trainer_class': 'StandardTrainer', 'activation_dim': 512, 'dict_size': 4096, 'lr': 0.001, 'l1_penalty': 0.1, 'l1_annealing_steps': 250, 'steps': 5000, 'warmup_steps': 50, 'resample_steps': None, 'device': 'cuda', 'layer': 'layer_5', 'plm_name': 'scgpt', 'wandb_name': 'SAE', 'submodule_name': None}





100%|██████████| 5000/5000 [00:10<00:00, 464.44it/s]

Stopped training because reached max specified steps





0,1
act_mean,▃▁▃▃▄▂▂▃▃▃▂▂▄▁▄▇▃▄█▄▇▂▃▇▆▄▆█▇▇▅▇▂▄▃▃▅▇▇▇
act_std,▇▇▅▆▆▅▅▆▇▇▆▆▅▅█▄▆▇▅▃▆▆▇▅▄▄▇▅▃▅▆▃▆▁▆▆▇▃▄▄
frac_variance_explained,▁▆▆▇▇▇▇▇▇▇▇▇▇▇▇███ ███ █ █ ██▇█
l0,█▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
l0_pct_nonzero,█▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
l1_penalty,▁▄▇█████████████████████████████████████
l2_loss,█▄▄▃▃▃▃▂▃▃▃▃▃▃▃▂▂▁▂▂▂▂▂▂▂▂▁▁▁▂▂▂▂▂▂▃▂▁▂▂
loss,█▅▄▄▃▃▃▃▃▃▃▃▃▃▃▂▂▃▁▂▂▃▂▂▂▂▁▁▁▂▂▂▂▂▂▃▂▁▁▂
lr,▁███████████████████████████████████████
mse_loss,█▇▅▄▄▄▃▄▄▄▄▄▄▄▄▂▃▃▁▃▃▃▂▂▂▃▁▁▁▂▂▂▂▃▃▃▂▁▂▂

0,1
act_mean,0.00302
act_std,0.95176
frac_variance_explained,
l0,6.0
l0_pct_nonzero,0.14648
l1_penalty,0.1
l2_loss,6.11162
loss,7.68239
lr,0.001
mse_loss,37.35184


Finished training SAE on layer 5, results saved to /maiziezhou_lab2/yunfei/Projects/interpTFM/activations_cosmx_lung_cancer/sae_latents/sae_output_layer6

=== Training SAE on layer 6 embeddings ===
Loading dataset metadata


100%|██████████| 60/60 [00:00<00:00, 7496.30it/s]

Using activations from layer layer_6 of scgpt
Loaded dataset with 30,588,064 tokens
Training with config: {'dict_class': 'AutoEncoder', 'trainer_class': 'StandardTrainer', 'activation_dim': 512, 'dict_size': 4096, 'lr': 0.001, 'l1_penalty': 0.1, 'l1_annealing_steps': 250, 'steps': 5000, 'warmup_steps': 50, 'resample_steps': None, 'device': 'cuda', 'layer': 'layer_6', 'plm_name': 'scgpt', 'wandb_name': 'SAE', 'submodule_name': None}





100%|██████████| 5000/5000 [00:10<00:00, 461.05it/s]


Stopped training because reached max specified steps


0,1
act_mean,▃▃▃▃▃▂▂▂▂▃▂▁▂▄▂▃▆▂▃▇▆█▂▂▅▅█▃▆▆▇▅▆▄▂▂▅▆▆▇
act_std,▇▅▆▆▇▅▆▆█▆▆▅▅█▆▅▇▅▂▅▅▇▅▃▂▆▅▃▅▆▂▆▁▆▅▇▄▃▄▄
frac_variance_explained,▁▃▄▅▄▄▆▅▆▆▆▆▅▆▆▇▇ ▇▆▇ ▇ █ ▇▇▇▇
l0,█▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
l0_pct_nonzero,█▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
l1_penalty,▁▄▇█████████████████████████████████████
l2_loss,█▄▄▃▃▃▃▂▃▃▃▃▃▃▃▂▂▂▁▂▂▂▃▂▁▂▂▁▁▁▂▂▂▂▂▃▂▁▁▂
loss,█▅▄▄▃▃▃▃▃▃▃▃▃▃▃▂▃▃▁▂▂▃▃▂▁▂▁▁▁▁▂▂▂▃▂▃▂▁▁▂
lr,▁███████████████████████████████████████
mse_loss,█▇▆▄▅▅▄▄▄▄▅▄▄▄▄▂▃▃▁▂▂▃▂▂▂▃▁▁▁▁▂▂▂▃▃▄▂▁▁▂

0,1
act_mean,0.00273
act_std,0.96754
frac_variance_explained,
l0,6.0
l0_pct_nonzero,0.14648
l1_penalty,0.1
l2_loss,6.33029
loss,7.74156
lr,0.001
mse_loss,40.07251


Finished training SAE on layer 6, results saved to /maiziezhou_lab2/yunfei/Projects/interpTFM/activations_cosmx_lung_cancer/sae_latents/sae_output_layer7

=== Training SAE on layer 7 embeddings ===
Loading dataset metadata


100%|██████████| 60/60 [00:00<00:00, 7652.44it/s]

Using activations from layer layer_7 of scgpt
Loaded dataset with 30,588,064 tokens
Training with config: {'dict_class': 'AutoEncoder', 'trainer_class': 'StandardTrainer', 'activation_dim': 512, 'dict_size': 4096, 'lr': 0.001, 'l1_penalty': 0.1, 'l1_annealing_steps': 250, 'steps': 5000, 'warmup_steps': 50, 'resample_steps': None, 'device': 'cuda', 'layer': 'layer_7', 'plm_name': 'scgpt', 'wandb_name': 'SAE', 'submodule_name': None}





100%|██████████| 5000/5000 [00:11<00:00, 440.81it/s]

Stopped training because reached max specified steps





0,1
act_mean,▂▂▂▄▃▂▁▂▂▃▂▂▃▃▁▆▂▃█▄▇▃▃▆▆▃▇▇▆▇▅▇▄▃▂▃▅▅▆█
act_std,▇▇▆▇▆▇▅▆▅▆▆▆▆▆▆▃▄▇▅▁█▅▆▅▃▆▄▂▆▆▂▆▅▆▅▆▆▂▅▅
frac_variance_explained,▁▆▆▇▇▇▇▇▇▇▇▇▇▇▇███ ███ █ █ ███
l0,█▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
l0_pct_nonzero,█▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
l1_penalty,▁▄▇█████████████████████████████████████
l2_loss,█▄▄▃▃▃▃▂▃▃▂▃▃▃▃▃▂▂▂▁▁▂▂▂▂▂▂▂▁▂▂▂▂▂▂▂▂▁▁▂
loss,█▅▄▃▃▃▃▃▃▃▃▃▃▃▃▃▃▁▂▂▃▃▂▂▂▃▂▁▂▂▂▂▂▃▂▃▂▁▁▂
lr,▁███████████████████████████████████████
mse_loss,█▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▁▁▁▁▁▁

0,1
act_mean,0.00268
act_std,0.98829
frac_variance_explained,
l0,5.0
l0_pct_nonzero,0.12207
l1_penalty,0.1
l2_loss,7.84433
loss,9.23009
lr,0.001
mse_loss,61.53343


Finished training SAE on layer 7, results saved to /maiziezhou_lab2/yunfei/Projects/interpTFM/activations_cosmx_lung_cancer/sae_latents/sae_output_layer8

=== Training SAE on layer 8 embeddings ===
Loading dataset metadata


100%|██████████| 60/60 [00:00<00:00, 6926.82it/s]

Using activations from layer layer_8 of scgpt
Loaded dataset with 30,588,064 tokens
Training with config: {'dict_class': 'AutoEncoder', 'trainer_class': 'StandardTrainer', 'activation_dim': 512, 'dict_size': 4096, 'lr': 0.001, 'l1_penalty': 0.1, 'l1_annealing_steps': 250, 'steps': 5000, 'warmup_steps': 50, 'resample_steps': None, 'device': 'cuda', 'layer': 'layer_8', 'plm_name': 'scgpt', 'wandb_name': 'SAE', 'submodule_name': None}





100%|██████████| 5000/5000 [00:11<00:00, 449.89it/s]

Stopped training because reached max specified steps





0,1
act_mean,▄▄▅▅▇▄▃▅▅▅▄▅▂▄▄▅▃▇▁▇▅▆▆▅▃▄▄▆▄▆▆▇▆▅▅▆▅▄▅█
act_std,▂▃▅▃▂▃▂▂▃▂▂▁▃▃▁▁▂▂▂▄▄▃▂▂▃▆▂▄▅▂▅▃█▃▃▂▄▂▅▇
frac_variance_explained,▁▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇█▇█ █▇█ █ █ █▇█
l0,█▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
l0_pct_nonzero,█▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
l1_penalty,▁▄▇█████████████████████████████████████
l2_loss,█▃▃▃▂▂▃▂▃▃▂▃▃▂▃▂▂▂▁▂▂▂▁▂▂▂▂▁▂▁▂▂▂▂▂▂▁▁▁▂
loss,█▄▄▃▃▃▃▃▃▃▃▃▃▃▃▂▃▂▁▂▂▂▃▂▂▂▂▂▁▂▂▂▂▂▂▃▃▂▁▂
lr,▁███████████████████████████████████████
mse_loss,█▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▁▁▁▁

0,1
act_mean,0.00014
act_std,1.0396
frac_variance_explained,
l0,8.0
l0_pct_nonzero,0.19531
l1_penalty,0.1
l2_loss,8.16932
loss,10.03732
lr,0.001
mse_loss,66.73777


Finished training SAE on layer 8, results saved to /maiziezhou_lab2/yunfei/Projects/interpTFM/activations_cosmx_lung_cancer/sae_latents/sae_output_layer9

=== Training SAE on layer 9 embeddings ===
Loading dataset metadata


100%|██████████| 60/60 [00:00<00:00, 7016.23it/s]

Using activations from layer layer_9 of scgpt
Loaded dataset with 30,588,064 tokens
Training with config: {'dict_class': 'AutoEncoder', 'trainer_class': 'StandardTrainer', 'activation_dim': 512, 'dict_size': 4096, 'lr': 0.001, 'l1_penalty': 0.1, 'l1_annealing_steps': 250, 'steps': 5000, 'warmup_steps': 50, 'resample_steps': None, 'device': 'cuda', 'layer': 'layer_9', 'plm_name': 'scgpt', 'wandb_name': 'SAE', 'submodule_name': None}





100%|██████████| 5000/5000 [00:10<00:00, 464.32it/s]

Stopped training because reached max specified steps





0,1
act_mean,▆▄▅▆▆▅▃▃▅▄▆▂▅▄▄▆▂▅▅▅▅▆▅▆▆▁▇▄▂▅▄█▅▅▄▄▆▄▃▇
act_std,▂▄▅▂▄▂▂▃▅▃▄▁▅▃▂▂▃▄▃▃▂▂▅▃▂▅▄▃▅▄▁▆▄█▅▁▃▅▃▄
frac_variance_explained,▁▇▇▇██▇█▇█▇▇██▇████ ███ █ █ ███
l0,█▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
l0_pct_nonzero,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
l1_penalty,▁▄▇█████████████████████████████████████
l2_loss,█▃▃▃▂▂▂▂▂▂▂▃▂▂▂▂▂▂▂▁▂▂▂▂▁▂▂▁▁▁▂▂▂▂▂▂▁▁▁▂
loss,█▄▄▃▃▃▃▂▃▃▃▃▃▂▃▂▂▂▂▁▂▂▂▂▁▂▂▁▁▁▂▂▂▂▂▃▂▂▁▂
lr,▁███████████████████████████████████████
mse_loss,█▂▂▂▁▁▂▁▂▁▁▂▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
act_mean,0.00146
act_std,1.09142
frac_variance_explained,
l0,7.0
l0_pct_nonzero,0.1709
l1_penalty,0.1
l2_loss,7.29748
loss,9.42276
lr,0.001
mse_loss,53.25319


Finished training SAE on layer 9, results saved to /maiziezhou_lab2/yunfei/Projects/interpTFM/activations_cosmx_lung_cancer/sae_latents/sae_output_layer10

=== Training SAE on layer 10 embeddings ===
Loading dataset metadata


100%|██████████| 60/60 [00:00<00:00, 7790.31it/s]

Using activations from layer layer_10 of scgpt
Loaded dataset with 30,588,064 tokens
Training with config: {'dict_class': 'AutoEncoder', 'trainer_class': 'StandardTrainer', 'activation_dim': 512, 'dict_size': 4096, 'lr': 0.001, 'l1_penalty': 0.1, 'l1_annealing_steps': 250, 'steps': 5000, 'warmup_steps': 50, 'resample_steps': None, 'device': 'cuda', 'layer': 'layer_10', 'plm_name': 'scgpt', 'wandb_name': 'SAE', 'submodule_name': None}





100%|██████████| 5000/5000 [00:10<00:00, 464.73it/s]

Stopped training because reached max specified steps





0,1
act_mean,▆▅▅▆▆▅▃▅▅▅▆▃▄▄▆▄▅▆▇▃▅▆▅▇▄█▆▄▆▄▄▆▆▅▆▇▅▁▃▇
act_std,▃▅▆▄▆▅▅▇▄▅▄▅▄▄▃▃▆▄▂▄▆▆▂▅▅▅▃▆▅▁▇▅▇▇▇▅▅▂▆█
frac_variance_explained,▁▆▆▇▇█▇▇▇▇▇▇▇█▇█▇▇█ ███ █ ███▇
l0,█▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
l0_pct_nonzero,█▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
l1_penalty,▁▆██████████████████████████████████████
l2_loss,█▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▂▂▂▂▂▂▁▂▂▁▁▂▂▁▂▂▂▂▁▁▁
loss,█▃▃▃▂▂▂▂▂▂▃▃▂▂▂▂▂▂▂▁▂▂▂▂▂▂▁▂▁▁▂▂▂▂▂▂▂▁▁▁
lr,▁███████████████████████████████████████
mse_loss,█▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
act_mean,0.00741
act_std,1.17281
frac_variance_explained,
l0,8.0
l0_pct_nonzero,0.19531
l1_penalty,0.1
l2_loss,5.4065
loss,7.7136
lr,0.001
mse_loss,29.23024


Finished training SAE on layer 10, results saved to /maiziezhou_lab2/yunfei/Projects/interpTFM/activations_cosmx_lung_cancer/sae_latents/sae_output_layer11

=== Training SAE on layer 11 embeddings ===
Loading dataset metadata


100%|██████████| 60/60 [00:00<00:00, 7562.07it/s]

Using activations from layer layer_11 of scgpt
Loaded dataset with 30,588,064 tokens
Training with config: {'dict_class': 'AutoEncoder', 'trainer_class': 'StandardTrainer', 'activation_dim': 512, 'dict_size': 4096, 'lr': 0.001, 'l1_penalty': 0.1, 'l1_annealing_steps': 250, 'steps': 5000, 'warmup_steps': 50, 'resample_steps': None, 'device': 'cuda', 'layer': 'layer_11', 'plm_name': 'scgpt', 'wandb_name': 'SAE', 'submodule_name': None}





100%|██████████| 5000/5000 [00:10<00:00, 455.24it/s]

Stopped training because reached max specified steps





0,1
act_mean,▆▆▅▆▆██▇█▆▆▇▇▆▆▄█▆▃▆▃▇█▄▆▁▆▅▃▅▅▄▄▆█▅▄▆▅▃
act_std,▅▁▅▂▂▃▄▂▄▂▄▅▆▅▅▃▄▄█▄▂▃▅▃▄▃▄▂▂▆▁▃▂▂▂▃▃▇▂▁
frac_variance_explained,▁▆▇▇▇▇▇▇▇▇▇▇▇▇▇██▇█ ▇▇█ █ █ ▇▇█
l0,█▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
l0_pct_nonzero,█▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
l1_penalty,▁▄██████████████████████████████████████
l2_loss,█▃▃▂▂▂▂▂▂▂▃▂▂▂▂▂▂▂▁▂▂▂▁▁▂▂▁▁▁▁▂▂▂▂▂▂▂▂▁▂
loss,█▃▃▃▂▂▃▂▂▃▃▃▂▂▂▂▂▂▂▁▂▂▂▂▁▂▁▁▁▁▂▂▂▂▂▂▂▁▁▂
lr,▁███████████████████████████████████████
mse_loss,█▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
act_mean,0.01647
act_std,0.97184
frac_variance_explained,
l0,10.0
l0_pct_nonzero,0.24414
l1_penalty,0.1
l2_loss,4.72196
loss,6.72394
lr,0.001
mse_loss,22.29695


Finished training SAE on layer 11, results saved to /maiziezhou_lab2/yunfei/Projects/interpTFM/activations_cosmx_lung_cancer/sae_latents/sae_output_layer12



In [None]:
# train_SAE_on_gfm_embeds(gfm_embd_dir='/maiziezhou_lab2/yunfei/Projects/FM_temp/InterPLM/interplm/scgpt/activations/layer_4', 
#                         save_dir='/maiziezhou_lab2/yunfei/Projects/FM_temp/interGFM/sae/sae_output_layer4',
#                         wandb_entity="yunfei-hu-vanderbilt-university",
#                         steps=5000)